https://machinelearningmastery.com/simple-linear-regression-tutorial-for-machine-learning/ <br>
https://machinelearningmastery.com/implement-simple-linear-regression-scratch-python/

In [1]:
import math

from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
# Attach to the already-running SparkContext if there is one, otherwise start it.
sc = SparkContext.getOrCreate()
# SQL entry point bound to that context; later cells use it to read the CSV
# and to run queries against the saved "test" table.
sqlContext = SQLContext(sparkContext=sc)

In [2]:
# Drop the "test" table if it exists, so the split cell below can save it again.
sqlContext.sql("DROP TABLE IF EXISTS test")

DataFrame[]

In [3]:
# Explicit schema for the header-less iris CSV: four nullable double
# measurements followed by the nullable string class label.
irisSchema = StructType(
    [StructField(column, DoubleType(), True)
     for column in ("sepal_length", "sepal_width", "petal_length", "petal_width")]
    + [StructField("class", StringType(), True)]
)

In [4]:
# Load the iris CSV (no header row) using the explicit schema defined above.
iris = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .schema(irisSchema)
    .option('header', 'false')
    .load('../Data/iris.csv')
)

In [5]:
# Keep only the predictor (sepal_length) and the target (petal_length).
iris = iris.select(['sepal_length', 'petal_length'])

In [6]:
# Preview the two selected columns (the output below shows the top 20 rows).
iris.show()

+------------+------------+
|sepal_length|petal_length|
+------------+------------+
|         5.1|         1.4|
|         4.9|         1.4|
|         4.7|         1.3|
|         4.6|         1.5|
|         5.0|         1.4|
|         5.4|         1.7|
|         4.6|         1.4|
|         5.0|         1.5|
|         4.4|         1.4|
|         4.9|         1.5|
|         5.4|         1.5|
|         4.8|         1.6|
|         4.8|         1.4|
|         4.3|         1.1|
|         5.8|         1.2|
|         5.7|         1.5|
|         5.4|         1.3|
|         5.1|         1.4|
|         5.7|         1.7|
|         5.1|         1.5|
+------------+------------+
only showing top 20 rows



In [52]:
# Split into training (~90%) and test (~10%) sets.
# A fixed seed makes the split, and every number derived from it, reproducible
# across kernel restarts.
train, test = iris.randomSplit([0.9, 0.1], seed=42)
# The training split is aggregated several times below, so keep it in memory.
train.cache()
# Persist the test split as table "test" for the SQL prediction query.
# Overwrite mode lets this cell be re-run without first re-running the DROP cell.
test.write.mode("overwrite").saveAsTable("test")

In [54]:
# Covariance between the predictor (sepal_length) and the target (petal_length)
# on the training split.
covariance = train.stat.cov("sepal_length", "petal_length")
covariance

1.3008872549019572

In [55]:
# Variance of the predictor (sepal_length) on the training split.
# Computed through a SQL expression rather than the star-imported `variance`
# function: this cell rebinds the name `variance` to a float, so calling
# `variance(...)` would raise TypeError the second time the cell is run.
variance = train.selectExpr("variance(sepal_length)").first()[0]
variance

0.6969144880174262

In [58]:
# Estimate the coefficients of the fitted line
#   petal_length = coeff_0 * sepal_length + coeff_1
# where coeff_0 is the slope and coeff_1 is the intercept.

In [59]:
# Slope of the fitted line: coeff_0 = covariance(x, y) / variance(x),
# with x = sepal_length and y = petal_length.
coeff_0 = covariance/variance

In [60]:
# Intercept of the fitted line: coeff_1 = mean(y) - coeff_0 * mean(x),
# with x = sepal_length and y = petal_length.
# Both means come from a single aggregation, so only one Spark job runs
# instead of one per column.
petal_mean, sepal_mean = train.select(mean("petal_length"), mean("sepal_length")).first()
coeff_1 = petal_mean - coeff_0 * sepal_mean

In [68]:
# Score the saved test rows in SQL: prediction = sepal_length * slope + intercept.
prediction_query = "SELECT sepal_length, petal_length, sepal_length * {0} + {1} AS prediction FROM test"
test_output = sqlContext.sql(prediction_query.format(coeff_0, coeff_1))

In [90]:
# Show the actual petal_length next to the predicted value for the test rows.
test_output.show()

+------------+------------+------------------+
|sepal_length|petal_length|        prediction|
+------------+------------+------------------+
|         4.8|         1.4|1.8021366355539996|
|         5.2|         1.5|    2.548791943926|
|         5.2|         3.9|    2.548791943926|
|         5.6|         4.2|3.2954472522979987|
|         5.9|         4.2|3.8554387335769995|
|         6.0|         4.0| 4.042102560669999|
|         6.3|         5.1|    4.602094041949|
|         6.3|         5.6|    4.602094041949|
|         6.4|         5.6|    4.788757869042|
|         6.5|         4.6| 4.975421696134999|
|         6.7|         4.4|    5.348749350321|
|         6.7|         5.2|    5.348749350321|
|         6.8|         5.5|    5.535413177414|
|         7.4|         6.1|    6.655396139972|
+------------+------------+------------------+



In [88]:
# RMSE (root mean square error): https://en.wikipedia.org/wiki/Root-mean-square_deviation
# The mean squared error is aggregated inside Spark in a single job, instead of
# passing every row through a Python lambda and then running a second job for count().
mse = test_output.selectExpr("avg(pow(prediction - petal_length, 2))").first()[0]
if mse is None:
    # avg() over zero rows yields NULL, which means the test set is empty.
    raise ValueError("Cannot compute RMSE: the test set is empty")
rmse = math.sqrt(mse)

In [89]:
# Display the RMSE of the predictions on the test rows.
rmse

0.722144867982061