In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)
spark

In [3]:
# Load the customer data: the first CSV row is the header and the column
# types are inferred from the values.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv('Ecommerce_Customers.csv')
df.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [4]:
# Print the column names and types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [5]:
# Number of rows loaded from the CSV.
df.count()

500

In [6]:
# Print every field of the first record on its own line (show() would
# truncate the long string values).
first_row = df.head(1)[0]
for value in first_row:
    print(value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [8]:
# List the column names as plain Python strings.
df.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [12]:
# Numeric columns used as model inputs; the string columns (Email, Address,
# Avatar) and the label column are deliberately left out.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# Packs the selected columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
assembler

VectorAssembler_83b894481e26

In [13]:
# Run the assembler over the raw frame; the result keeps every original
# column and gains the 'features' vector column.
output = assembler.transform(dataset=df)
output.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|[31.9262720263601...|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37

In [14]:
# Confirm the transformed frame has a 'features' vector column alongside the original columns.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
# Peek at the assembled feature vectors on their own (show() truncates each vector for display).
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [16]:
# Keep only what the regression needs: the feature vector and the label.
model_columns = ['features', 'Yearly Amount Spent']
final_df = output.select(model_columns)
final_df.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [17]:
# Verify the modelling frame holds only the feature vector and the numeric label.
final_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [39]:
# Summary statistics for the whole dataset. Only the numeric label column is
# summarised; the 'features' vector column does not appear in the result.
final_df.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [18]:
# Hold out roughly 30% of the rows for evaluation. The weights are target
# proportions, so the actual split sizes are approximate.
# A fixed seed makes the split reproducible; without it the membership of
# train_df/test_df (and therefore every metric computed from them) changes
# each time this cell is run.
train_df, test_df = final_df.randomSplit([0.7, 0.3], seed=42)
train_df.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[29.5324289670579...|  408.6403510726275|
|[30.4925366965402...|  282.4712457199145|
|[30.5743636841713...| 442.06441375806565|
|[30.8162006488763...|   266.086340948469|
|[30.8364326747734...|  467.5019004269896|
|[30.8794843441274...|  490.2065999848547|
|[30.9716756438877...|  494.6386097568927|
|[31.0472221394875...|  392.4973991890214|
|[31.0613251567161...|  487.5554580579016|
|[31.0662181616375...| 448.93329320767435|
|[31.1280900496166...|  557.2526867470547|
|[31.2606468698795...|  421.3266312569514|
|[31.2681042107507...|  423.4705331738239|
|[31.2834474760581...|  591.7810894256675|
|[31.3123495994443...|  463.5914180279406|
|[31.3584771924370...|  495.1759504494754|
|[31.3895854806643...|  410.0696110599829|
|[31.4252268808548...|  530.7667186547619|
|[31.4459724827577...| 484.87696493512857|
|[31.4474464941278...|   418.602742095224|
+----------

In [19]:
# Preview the held-out test split.
test_df.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[30.3931845423455...|  319.9288698031936|
|[30.7377203726281...|  461.7807421962299|
|[31.1239743499119...|  486.9470538397658|
|[31.1695067987115...|  427.3565308022928|
|[31.3091926408918...|  432.7207178399336|
|[31.3662121671876...|  430.5888825564849|
|[31.5147378578019...|  489.8124879964614|
|[31.5761319713222...|  541.2265839893283|
|[31.6610498227460...| 416.35835357990084|
|[31.8512531286083...|  472.9922466667984|
|[31.8745516945853...|  392.2852442462675|
|[31.9262720263601...|  392.2049334443264|
|[31.9480174211613...|  461.9208768928978|
|[31.9673209478824...| 445.74984123965226|
|[32.0123007682454...| 492.94505306595823|
|[32.0180740106320...|  357.7831107453153|
|[32.0478146331398...|  497.3895577588434|
|[32.0542618511847...|   561.874657668983|
|[32.0789475795693...| 357.86371863839173|
|[32.0961089938451...|  375.3984554102432|
+----------

In [20]:
# Label statistics for the training split; compare with the test split to
# check that both have a similar label distribution.
train_df.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                360|
|   mean|  499.7328334559645|
| stddev|  81.56730814469225|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [21]:
# Label statistics for the test split; compare with the training split to
# check that both have a similar label distribution.
test_df.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                140|
|   mean| 498.23713632248683|
| stddev|   73.4745540925845|
|    min|  304.1355915788555|
|    max|  689.2356997616951|
+-------+-------------------+



In [22]:
# Configure the linear regression estimator: which column holds the inputs,
# which holds the target, and what the output column will be called.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='prediction',
)
lr

LinearRegression_4e689d56ab6a

In [23]:
# Fit the regression on the training split only; the test split stays unseen.
lr_model = lr.fit(dataset=train_df)

# The training summary object (its repr alone carries no metrics).
lr_model.summary

<pyspark.ml.regression.LinearRegressionTrainingSummary at 0x1a349094290>

In [25]:
# Score the fitted model against the held-out test split; the returned
# summary object exposes the metrics inspected in the following cells.
test_results = lr_model.evaluate(dataset=test_df)
test_results

<pyspark.ml.regression.LinearRegressionSummary at 0x1a336d3b950>

In [26]:
# R² (coefficient of determination) on the held-out test split.
test_results.r2

0.9791715699156316

In [27]:
# Root mean squared error on the test split, in the same units as 'Yearly Amount Spent'.
test_results.rootMeanSquaredError

10.56595142497611

In [28]:
# Explained variance as reported by Spark.
# NOTE(review): the value is on the label's squared scale (thousands for this
# data), not a 0-1 ratio; use r2 when a normalised score is wanted.
test_results.explainedVariance

5119.827525974597

In [32]:
# Mean absolute error on the test split, in the same units as 'Yearly Amount Spent'.
test_results.meanAbsoluteError

8.294784606284084

In [33]:
# Mean squared error on the test split (the square of rootMeanSquaredError).
test_results.meanSquaredError

111.6393295149547

In [34]:
# Adjusted R² on the test split (R² penalised for the number of features).
test_results.r2adj

0.9785544312464651

In [35]:
# Degrees of freedom of the evaluation summary.
# NOTE(review): the displayed value is consistent with
# test rows - 4 features - 1 intercept; confirm against the PySpark docs.
test_results.degreesOfFreedom

135

In [36]:
# Two-element list of residual bounds.
# NOTE(review): presumably the smallest and largest weighted residual on the
# test split; confirm against the PySpark docs.
test_results.devianceResiduals

[-22.961041558106558, 31.609303440831127]

In [38]:
# Test rows with the model's prediction next to the actual label.
test_results.predictions.show()

+--------------------+-------------------+------------------+
|            features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.3931845423455...|  319.9288698031936| 330.5730593032961|
|[30.7377203726281...|  461.7807421962299|449.65833540662334|
|[31.1239743499119...|  486.9470538397658|507.16304890872743|
|[31.1695067987115...|  427.3565308022928|  414.989948477152|
|[31.3091926408918...|  432.7207178399336|428.86433891804336|
|[31.3662121671876...|  430.5888825564849| 426.5872330195284|
|[31.5147378578019...|  489.8124879964614| 494.4900952691537|
|[31.5761319713222...|  541.2265839893283| 542.1012275187734|
|[31.6610498227460...| 416.35835357990084| 416.8989198308209|
|[31.8512531286083...|  472.9922466667984|463.60840614563017|
|[31.8745516945853...|  392.2852442462675| 398.5194201664656|
|[31.9262720263601...|  392.2049334443264|380.27370708442936|
|[31.9480174211613...|  461.9208768928978|455.25951225968197|
|[31.967

In [40]:
# Simulate unseen data by dropping the label and keeping only the feature
# vectors of the test split.
features_column = test_df['features']
unlabeled_data = test_df.select(features_column)
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[30.3931845423455...|
|[30.7377203726281...|
|[31.1239743499119...|
|[31.1695067987115...|
|[31.3091926408918...|
|[31.3662121671876...|
|[31.5147378578019...|
|[31.5761319713222...|
|[31.6610498227460...|
|[31.8512531286083...|
|[31.8745516945853...|
|[31.9262720263601...|
|[31.9480174211613...|
|[31.9673209478824...|
|[32.0123007682454...|
|[32.0180740106320...|
|[32.0478146331398...|
|[32.0542618511847...|
|[32.0789475795693...|
|[32.0961089938451...|
+--------------------+
only showing top 20 rows



In [41]:
# Apply the fitted model to the unlabeled features; the result adds a
# 'prediction' column next to 'features'.
test_predictions = lr_model.transform(dataset=unlabeled_data)
test_predictions

DataFrame[features: vector, prediction: double]

In [42]:
# Show the predictions generated for the unlabeled feature vectors.
test_predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.3931845423455...| 330.5730593032961|
|[30.7377203726281...|449.65833540662334|
|[31.1239743499119...|507.16304890872743|
|[31.1695067987115...|  414.989948477152|
|[31.3091926408918...|428.86433891804336|
|[31.3662121671876...| 426.5872330195284|
|[31.5147378578019...| 494.4900952691537|
|[31.5761319713222...| 542.1012275187734|
|[31.6610498227460...| 416.8989198308209|
|[31.8512531286083...|463.60840614563017|
|[31.8745516945853...| 398.5194201664656|
|[31.9262720263601...|380.27370708442936|
|[31.9480174211613...|455.25951225968197|
|[31.9673209478824...| 450.1375565623557|
|[32.0123007682454...|488.64100053176185|
|[32.0180740106320...| 341.1051094606951|
|[32.0478146331398...|  479.636890380815|
|[32.0542618511847...|  555.790699880228|
|[32.0789475795693...|351.37953950150677|
|[32.0961089938451...|375.34254338444043|
+--------------------+------------

In [43]:
# Summary statistics of the predicted values; only the numeric 'prediction'
# column is summarised, the 'features' vector column is not.
test_predictions.describe().show()

+-------+-----------------+
|summary|       prediction|
+-------+-----------------+
|  count|              140|
|   mean|496.5564175283532|
| stddev|71.79008064801432|
|    min|313.5402425013658|
|    max|674.9175734413725|
+-------+-----------------+

