In [1]:
# Initialise findspark before any pyspark import so that the pyspark
# package can be located from this kernel.
import findspark
findspark.init()

In [5]:
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

In [3]:
# Build (or reuse) a local SparkSession for this notebook.
# NOTE: the warehouse directory below is a Windows-style path; adjust it
# when running on another platform.
spark = (
    SparkSession.builder
    .master("local")
    .appName("LinearRegression")
    .config("spark.sql.warehouse.dir", "file:///C:/temp")
    .getOrCreate()
)

In [4]:
# Load the raw regression data as an RDD of text lines.
# The path is relative to the directory the notebook kernel runs in.
inputLines = spark.sparkContext.textFile("../SparkData/regression.txt")

In [6]:
# Convert each comma-separated text line into an ML-friendly record:
# (label as float, single-element dense feature vector).
# Field 0 is used as the label and field 1 as the only feature.
data = (
    inputLines
    .map(lambda line: line.split(","))
    .map(lambda fields: (float(fields[0]), Vectors.dense(float(fields[1]))))
)

In [7]:
# Sanity-check the converted records.
# Note: top(5) returns the five LARGEST records in descending order
# (tuples compare on the label first), not the first five lines of the file.
data.top(5)

[(2.76, DenseVector([-2.8])),
 (2.71, DenseVector([-2.9])),
 (2.67, DenseVector([-2.51])),
 (2.62, DenseVector([-2.69])),
 (2.56, DenseVector([-2.6]))]

In [8]:
# convert RDD to a DataFrame, set appropriate column header names
# "label" and "features" are the default column names LinearRegression
# looks for, so no labelCol/featuresCol needs to be configured later.
colNames = ["label", "features"]
df = data.toDF(colNames)

In [11]:
# preview the first 5 rows of the DataFrame
df.show(5)

+-----+--------+
|label|features|
+-----+--------+
|-1.74|  [1.66]|
| 1.24| [-1.18]|
| 0.29|  [-0.4]|
|-0.13|  [0.09]|
|-0.39|  [0.38]|
+-----+--------+
only showing top 5 rows



In [12]:
# set up a 50/50 train test split
# A fixed seed makes the split -- and therefore every number produced
# further down (model coefficients, predictions) -- reproducible across
# kernel restarts. Without it, each run trains and tests on different rows.
# Rows are assigned randomly according to the weights, so the two halves
# are only approximately equal in size.
trainTest = df.randomSplit([0.5, 0.5], seed=42)
trainingDF, testDF = trainTest

In [13]:
# Configure the linear regression estimator (nothing is trained yet).
lir = LinearRegression(
    maxIter=10,           # cap on optimiser iterations
    regParam=0.3,         # regularisation strength
    elasticNetParam=0.8,  # elastic-net mixing: 0 = pure L2, 1 = pure L1
)

In [14]:
# train model using training data
# (fit returns the fitted model that is used below to score the test data)
model = lir.fit(trainingDF)

In [15]:
# predict values in the test section of data
# transform() adds a "prediction" column next to the original columns.
# The result is cached because it is consumed several times below
# (show, then separate selects of the prediction and label columns);
# caching lets those actions reuse the same computed rows.
fullPredictions = model.transform(testDF).cache()

In [20]:
# preview the first 5 rows, now including the model's "prediction" column
fullPredictions.show(5)

+-----+--------+-------------------+
|label|features|         prediction|
+-----+--------+-------------------+
|-3.54|  [3.44]| -2.449195925445587|
|-3.23|  [3.26]|-2.3218189659186192|
|-2.89|  [2.89]|-2.0599885491131866|
| -2.6|  [2.58]|-1.8406171188167424|
|-2.54|  [2.49]|-1.7769286390532588|
+-----+--------+-------------------+
only showing top 5 rows



In [16]:
# Pull the predicted values and the known ("ground truth") labels out of
# the scored DataFrame as two plain RDDs of floats.
predictions = (
    fullPredictions
    .select("prediction")
    .rdd
    .map(lambda row: row["prediction"])
)
labels = (
    fullPredictions
    .select("label")
    .rdd
    .map(lambda row: row["label"])
)

In [17]:
# Pair each prediction with its known label and bring the pairs to the driver
# as a list of (prediction, label) tuples.
# Both columns are selected from the same rows in a single pass, so the
# pairing does not depend on RDD.zip's requirement that the two RDDs have
# the same number of partitions and the same number of elements in each
# partition, and the scored data is only scanned once.
predictionAndLabel = (
    fullPredictions
    .select("prediction", "label")
    .rdd
    .map(lambda row: (row[0], row[1]))
    .collect()
)

In [18]:
# Print every (prediction, label) pair, one tuple per line.
for pair in predictionAndLabel:
    print(pair)

(-2.449195925445587, -3.54)
(-2.3218189659186192, -3.23)
(-2.0599885491131866, -2.89)
(-1.8406171188167424, -2.6)
(-1.7769286390532588, -2.54)
(-1.6707811727807858, -2.43)
(-1.7344696525442695, -2.36)
(-1.564633706508313, -2.27)
(-1.607092693017302, -2.26)
(-1.4938687289933308, -2.14)
(-1.3594152717148653, -2.12)
(-1.3947977604723563, -2.06)
(-1.4443332447328436, -2.0)
(-1.500945226744829, -1.98)
(-1.387721262720858, -1.94)
(-1.3947977604723563, -1.91)
(-1.4160272537268508, -1.87)
(-1.203732321181905, -1.83)
(-1.3169562852058763, -1.8)
(-1.239114809939396, -1.79)
(-1.281573796448385, -1.7)
(-1.1258908459154249, -1.68)
(-1.2249618144363996, -1.68)
(-1.2532678054423925, -1.68)
(-1.1541968369214177, -1.66)
(-1.2249618144363996, -1.66)
(-1.3169562852058763, -1.64)
(-1.1329673436669232, -1.63)
(-1.2320383121878977, -1.61)
(-1.1046613526609304, -1.6)
(-1.0692788639034394, -1.54)
(-1.1541968369214177, -1.54)
(-1.2532678054423925, -1.54)
(-1.203732321181905, -1.53)
(-1.0480493706489447, -1.47)