https://machinelearningmastery.com/simple-linear-regression-tutorial-for-machine-learning/ <br>
https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/

In [1]:
import math

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.sql.functions import *
# Reuse the active SparkSession if one exists, otherwise start a new one.
ss = SparkSession.builder.getOrCreate()

In [2]:
# Start from a clean slate: the held-out rows are saved under this table name later on.
ss.sql("DROP TABLE IF EXISTS test") # delete the test table, if it exists.

DataFrame[]

## Create a schema

In [3]:
# Column layout of the iris CSV: four double-valued measurements followed by
# the class label. Every column is declared nullable.
iris_fields = [
    ("sepal_length", DoubleType()),
    ("sepal_width", DoubleType()),
    ("petal_length", DoubleType()),
    ("petal_width", DoubleType()),
    ("class", StringType()),
]
irisSchema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in iris_fields]
)

## Create a data frame

In [4]:
# Load the headerless CSV with the explicit schema instead of inferring types.
iris_reader = ss.read.schema(irisSchema).option("header", False)
iris = iris_reader.csv('../Data/iris.csv')

In [5]:
# Keep only the two columns used by the regression:
# sepal_width (input) and petal_width (value to predict).
regression_columns = ['sepal_width', 'petal_width']
iris = iris.select(*regression_columns)

In [6]:
# Peek at the first five rows to confirm the two selected columns loaded as numbers.
iris.show(5)

+-----------+-----------+
|sepal_width|petal_width|
+-----------+-----------+
|        3.5|        0.2|
|        3.0|        0.2|
|        3.2|        0.2|
|        3.1|        0.2|
|        3.6|        0.2|
+-----------+-----------+
only showing top 5 rows



## Split training and test set.

In [8]:
# Hold out roughly 10% of the rows for evaluation. A fixed seed keeps the split,
# and every number derived from it below, identical across Restart & Run All.
SPLIT_SEED = 42
train, test = iris.randomSplit([0.9, 0.1], seed=SPLIT_SEED)

# train is scanned several times below (covariance, variance, means), so cache it.
train.cache()

# Persist the held-out rows as the "test" table queried by the prediction SQL.
# Overwrite mode lets this cell be re-run without dropping the table first.
test.write.mode("overwrite").saveAsTable("test")

## calculate covariance.

In [9]:
# Sample covariance between the input (sepal_width) and the value to predict
# (petal_width), computed on the training rows only.
covariance = train.stat.cov("sepal_width", "petal_width")
covariance

-0.1094772036474167

## calculate variance.

In [10]:
# Show the raw aggregation row for the sample variance (var_samp) of sepal_width.
# F.variance is used instead of the star-imported `variance` because a later cell
# rebinds the name `variance` to a float, which would make this cell fail on re-run.
train.select(F.variance("sepal_width")).first()

Row(var_samp(sepal_width)=0.17971124620060855)

In [11]:
# Sample variance of sepal_width on the training rows, unwrapped from the Row.
# This assignment shadows the star-imported `variance` function, so the
# aggregation is called through the module alias F; the cell can then be
# re-run without hitting "'float' object is not callable".
variance = train.select(F.variance("sepal_width")).first()[0]
variance

0.17971124620060855

## Estimate coefficients and apply the equation
where `petal_width = coeff_0 * sepal_width + coeff_1`

### coeff_0 = covariance(x,y)/variance(x)


### coeff_1 = mean(y) - coeff_0 * mean(x)

In [12]:
# Slope of the fitted line: covariance(x, y) / variance(x), with x = sepal_width and y = petal_width.
coeff_0 = covariance/variance

In [13]:
# Intercept of the fitted line: mean(y) - coeff_0 * mean(x).
# Both means come from a single aggregation over train instead of two separate jobs.
mean_petal_width, mean_sepal_width = train.select(
    mean("petal_width"), mean("sepal_width")
).first()
coeff_1 = mean_petal_width - coeff_0 * mean_sepal_width

In [14]:
# Score the held-out rows stored in the "test" table:
# prediction = sepal_width * coeff_0 + coeff_1.
prediction_query = "SELECT sepal_width, petal_width, sepal_width * {0} + {1} AS prediction FROM test"
test_output = ss.sql(prediction_query.format(coeff_0, coeff_1))

In [15]:
# Compare the actual petal_width with the model's prediction for each held-out row.
test_output.show()

+-----------+-----------+------------------+
|sepal_width|petal_width|        prediction|
+-----------+-----------+------------------+
|        2.6|        1.2|1.4817753347427745|
|        2.8|        1.9|  1.35993854827343|
|        2.8|        2.0|  1.35993854827343|
|        3.1|        0.2|1.1771833685694133|
|        3.2|        0.2|1.1162649753347411|
|        3.4|        0.2|0.9944281888653967|
|        3.4|        2.4|0.9944281888653967|
|        3.7|        0.2|  0.81167300916138|
|        4.4|        0.4| 0.385244256518674|
+-----------+-----------+------------------+



## RMSE (root mean square error): https://en.wikipedia.org/wiki/Root-mean-square_deviation

In [16]:
# Root mean square error on the held-out rows:
# sqrt(sum((prediction - actual)^2) / number_of_rows).
squared_errors = test_output.rdd.map(
    lambda row: (row["prediction"] - row["petal_width"]) ** 2
)
sum_squared_error = squared_errors.reduce(lambda total, err: total + err)
rmse = math.sqrt(sum_squared_error / test_output.count())

In [17]:
# Display the RMSE; it is in the same units as petal_width.
rmse

0.7857255019232553

In [18]:
# Shut down the SparkSession now that the analysis is finished.
ss.stop()