https://machinelearningmastery.com/simple-linear-regression-tutorial-for-machine-learning/ <br>
https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import *
import math
# Obtain the notebook-wide Spark session through the builder's getOrCreate().
ss = (
    SparkSession.builder
    .getOrCreate()
)

In [2]:
# Delete the "test" table if it exists, so the split cell below can create it afresh.
ss.sql(
    "DROP TABLE IF EXISTS test"
)

DataFrame[]

## Create a schema

In [3]:
# Explicit schema for the iris CSV: four nullable double measurements
# followed by the nullable string class label.
irisSchema = StructType(
    [
        StructField(measurement, DoubleType(), True)
        for measurement in ("sepal_length", "sepal_width", "petal_length", "petal_width")
    ]
    + [StructField("class", StringType(), True)]
)

## Create a data frame

In [4]:
# Load the CSV with the explicit schema; the file has no header row.
iris = (
    ss.read
    .schema(irisSchema)
    .option("header", False)
    .csv('../Data/iris.csv')
)

In [5]:
# Keep only the two columns used by the regression below:
# sepal_length (input) and petal_length (value being predicted).
iris = iris.select(['sepal_length', 'petal_length'])

In [6]:
# Preview the first five rows.
iris.show(n=5)

+------------+------------+
|sepal_length|petal_length|
+------------+------------+
|         5.1|         1.4|
|         4.9|         1.4|
|         4.7|         1.3|
|         4.6|         1.5|
|         5.0|         1.4|
+------------+------------+
only showing top 5 rows



## Split training and test set.

In [7]:
# A fixed seed makes the 90/10 split -- and every statistic, coefficient and
# error figure derived from it -- reproducible across kernel restarts.
train, test = iris.randomSplit([0.9, 0.1], seed=42)
# The training split is read several times below (covariance, variance, means).
train.cache()
# Overwrite mode lets this cell be re-run without first dropping the "test" table;
# the default save mode raises if the table already exists.
test.write.mode("overwrite").saveAsTable("test")

## calculate covariance.

In [8]:
# Covariance of the two columns on the training split, as a Python float.
covariance = train.stat.cov("sepal_length", "petal_length")
covariance

1.2941426893716979

## calculate variance.

In [9]:
# Call var_samp directly rather than variance(): the result column below is named
# var_samp(sepal_length), i.e. it is the same aggregate. A later cell rebinds the
# name `variance` to a float, so calling variance(...) here fails when this cell
# is re-run after that one.
train.select(var_samp("sepal_length")).first()

Row(var_samp(sepal_length)=0.6964932472108046)

In [10]:
# This assignment rebinds `variance` (imported from pyspark.sql.functions) to a
# float. Computing the value with var_samp -- the aggregate that variance() maps
# to -- keeps the cell re-runnable, since it no longer calls the name it overwrites.
# The variable name is kept because the coefficient cell below reads it.
variance = train.select(var_samp("sepal_length")).first()[0]
variance

0.6964932472108046

## Estimate coefficients and apply the equation
where `petal_length = coeff_0 * sepal_length + coeff_1`

### coeff_0 = covariance(x,y)/variance(x)


### coeff_1 = mean(y) - coeff_0 * mean(x)

In [11]:
# Slope of the fitted line: covariance(x, y) / variance(x),
# with x = sepal_length and y = petal_length on the training split.
coeff_0 = covariance/variance

In [12]:
# Intercept of the fitted line: mean(y) - coeff_0 * mean(x).
# Both means come from a single aggregation instead of two separate Spark jobs.
mean_petal_length, mean_sepal_length = train.select(
    mean("petal_length"), mean("sepal_length")
).first()
coeff_1 = mean_petal_length - coeff_0 * mean_sepal_length

In [13]:
# Score the held-out "test" table in SQL:
# prediction = sepal_length * coeff_0 + coeff_1, with both coefficients inlined as literals.
test_output = ss.sql(
    "SELECT sepal_length, petal_length, sepal_length * {0} + {1} AS prediction FROM test".format(
        coeff_0, coeff_1
    )
)

In [14]:
# Display actual petal_length next to the model's prediction for the test rows.
test_output.show()

+------------+------------+------------------+
|sepal_length|petal_length|        prediction|
+------------+------------+------------------+
|         4.8|         1.4|1.8387194299434748|
|         5.0|         1.2|2.2103361525906093|
|         5.0|         1.6|2.2103361525906093|
|         5.0|         1.6|2.2103361525906093|
|         5.0|         3.5|2.2103361525906093|
|         5.4|         1.3|  2.95356959788488|
|         5.4|         1.5|  2.95356959788488|
|         5.6|         4.5| 3.325186320532013|
|         5.7|         4.1| 3.510994681855581|
|         5.7|         4.2| 3.510994681855581|
|         6.0|         4.5| 4.068419765826282|
|         6.1|         4.7| 4.254228127149848|
|         6.3|         4.4| 4.625844849796983|
|         6.3|         4.7| 4.625844849796983|
|         6.5|         5.8| 4.997461572444117|
|         6.7|         4.4| 5.369078295091252|
|         6.7|         5.8| 5.369078295091252|
|         7.0|         4.7| 5.926503379061955|
|         7.7

## RMSE (root mean square error) : https://en.wikipedia.org/wiki/Root-mean-square_deviation

In [15]:
# Root mean square error of the predictions on the test rows.
# The mean squared error is computed in one Spark SQL aggregation rather than a
# Python-side RDD map/reduce followed by a separate count() job.
mse = test_output.selectExpr(
    "avg(power(prediction - petal_length, 2)) AS mse"
).first()[0]
if mse is None:
    # avg() returns NULL when there is nothing to average (e.g. an empty test split).
    raise ValueError("no test rows with both a prediction and a petal_length to score")
rmse = math.sqrt(mse)

In [16]:
# Show the RMSE; it is in the same units as petal_length.
rmse

0.8784876438719023