# Linear Regression

In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell talks to.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Load the customer CSV: the first line supplies the column names and
# Spark infers each column's type from the values.
data = spark.read.csv(
    './Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Show the column names and the types produced by schema inference.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [18]:
# Peek at the first record, printing one field per line.
first_row = data.head()
for field_value in first_row:
    print(field_value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [19]:
from pyspark.ml.feature import VectorAssembler

In [24]:
# Pack the four numeric customer metrics into a single vector column
# named 'features'.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [25]:
# Run the assembler over the raw data so each row gains a 'features' vector.
output = assembler.transform(data)

In [26]:
# Preview the assembled feature vectors.
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [28]:
# Keep only what the model needs: the feature vector and the column to predict.
final_data = output.select('features', 'Yearly Amount Spent')

In [29]:
# Preview the modelling table (features plus target).
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+--------------------+-------------------+
only showing top 20 rows

In [30]:
# Hold out roughly 30% of the rows for evaluation. The fixed seed makes the
# split reproducible, so split sizes, fitted coefficients and error metrics
# are comparable from one run of the notebook to the next.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [31]:
# Summary statistics for the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                346|
|   mean|  498.8024970897303|
| stddev|  81.92339670545118|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [34]:
# Summary statistics for the held-out test split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                154|
|   mean| 500.46334504057603|
| stddev|  73.35938408156609|
|    min| 298.76200786180766|
|    max|  689.7876041747194|
+-------+-------------------+



In [35]:
# Configure the estimator: read inputs from the assembled 'features' column
# (the library default, spelled out here) and learn 'Yearly Amount Spent'.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [36]:
# Fit the regression on the training split only; the test split stays unseen.
lrModel = lr.fit(train_data)

In [39]:
# Report the fitted weights (one per input feature) and the intercept.
print(
    'Coefficients: {}\nIntercept: {}'.format(
        lrModel.coefficients,
        lrModel.intercept,
    )
)

Coefficients: [25.285443037193343,38.27694359391803,0.16497056598757462,61.66678796929365]
Intercept: -1022.2995172838534


In [40]:
# Score the model on the held-out rows; the returned summary exposes the
# residuals and error metrics used in the following cells.
test_result = lrModel.evaluate(test_data)

In [42]:
# Per-row residuals on the test split.
test_result.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| 10.130890580638948|
|   4.20870838303847|
| 10.649360570605495|
| -4.547580464618022|
| 3.1694740206286838|
|-4.7662193866535745|
|-2.0026387765625486|
|   17.9452793045632|
|  7.139288388517741|
|-1.9526313416751577|
| -6.038853483845003|
| -4.691144657857649|
|  -16.8496727566465|
|-3.0443718451953714|
|  -9.07407174067447|
| -9.782587816161026|
|-17.192091228888387|
| 11.567604054287187|
|   5.99887057612807|
| -4.051169145598237|
+-------------------+
only showing top 20 rows



In [44]:
# Drop the target column to mimic scoring data whose true value is unknown.
unlabeled_data = test_data.select('features')

In [45]:
# Apply the fitted model; the result carries a 'prediction' column next to
# the features.
predictions = lrModel.transform(unlabeled_data)

In [48]:
# Preview the predicted values.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.7377203726281...|451.64985161559093|
|[31.0472221394875...| 388.2886908059829|
|[31.1695067987115...| 416.7071702316873|
|[31.2681042107507...|428.01811363844195|
|[31.4459724827577...| 481.7074909144999|
|[31.5147378578019...|  494.578707383115|
|[31.5761319713222...| 543.2292227658909|
|[31.6005122003032...|461.22757218653373|
|[31.6548096756927...| 468.1241353390308|
|[31.7216523605090...| 349.7295579735478|
|[31.7242025238451...| 509.4267407718055|
|[31.8124825597242...|397.50148964165487|
|[31.8164283341993...| 517.9721642603029|
|[31.8186165667690...|  449.463045215331|
|[31.8279790554652...|  449.076819287616|
|[31.8648325480987...| 449.6738682929747|
|[31.9048571310136...| 491.1419486517045|
|[31.9262720263601...| 380.6373293900392|
|[31.9480174211613...|455.92200631676974|
|[31.9673209478824...| 449.8010103852505|
+--------------------+------------------+
only showing top 20 rows

In [51]:
# Headline error metrics on the test split. RMSE is expressed in the same
# units as 'Yearly Amount Spent'; MSE is its square.
rmse = test_result.rootMeanSquaredError
mse = test_result.meanSquaredError
print('RMSE: {}\nMSE: {}'.format(rmse, mse))

RMSE: 9.437461201758461
MSE: 89.06567393469628


In [52]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()