https://machinelearningmastery.com/simple-linear-regression-tutorial-for-machine-learning/ <br>
https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import *
import math
ss = SparkSession.builder.getOrCreate()

## Create a data frame

In [2]:
irisSchema = StructType([StructField("sepal_length", DoubleType(), True),
                         StructField("sepal_width", DoubleType(), True),
                         StructField("petal_length", DoubleType(), True),
                         StructField("petal_width", DoubleType(), True),
                         StructField("class", StringType(), True)
    
])

In [3]:
iris = ss.read.csv("../Data/iris.csv", schema=irisSchema)

In [4]:
iris.count()

150

## Split training and test set.

In [5]:
train, test = iris.randomSplit([0.9, 0.1])

In [6]:
train.cache()

DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, class: string]

In [7]:
test.write.saveAsTable("test")

## calculate covariance.

In [8]:
covariance = train.cov("sepal_width", "petal_width")

In [9]:
covariance

-0.11767654880330912

## calculate variance.

In [10]:
variance = train.select(variance("sepal_width")).first()[0]

## Estimate coefficients and apply the equation

# where petal_length = coeff_0 * sepal_length + coeff_1

### coeff_0 = covariance(x,y)/variance(x)


### coeff_1 = mean(y) – coeff_0 * mean(x)

In [11]:
coeff_0 = covariance/variance

In [12]:
coeff_1 = train.select(mean("petal_width")).first()[0]\
        - coeff_0 * train.select(mean("sepal_width")).first()[0]

In [13]:
coeff_0

-0.6093060733245946

In [14]:
coeff_1

3.038261587968603

In [21]:
test_output = ss.sql("SELECT sepal_width, petal_width, sepal_width * {0} + {1} AS prediction\
                      FROM test".format(coeff_0, coeff_1))

In [22]:
test_output.show()

+-----------+-----------+------------------+
|sepal_width|petal_width|        prediction|
+-----------+-----------+------------------+
|        3.4|        0.2|0.9666209386649811|
|        2.7|        1.2|1.3931351899921973|
|        2.7|        1.8|1.3931351899921973|
|        2.7|        1.9|1.3931351899921973|
|        2.8|        2.2|1.3322045826597382|
|        3.2|        2.3|   1.0884821533299|
|        3.1|        2.3|1.1494127606623596|
+-----------+-----------+------------------+



## rmse (root mean squre error) : https://en.wikipedia.org/wiki/Root-mean-square_deviation

In [26]:
test_output.rdd.map(lambda x : (x["prediction"] - x["petal_width"]) ** 2).sum()/test_output.count()

0.6560221641347653

In [27]:
ss.stop()