In [None]:
# Switch the working directory to /content, then recursively delete the
# sample_data directory (-f: no error if it does not exist).
%cd /content
!rm -rf sample_data

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 28 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 50.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=615b64e1f877faac1f191fff65618a26e02344c9e2e0d462ebd1088f411f65c9
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1
/content


In [2]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName('lrex')
    .getOrCreate()
)

In [24]:
# Load the customer CSV; the first row supplies column names and Spark infers
# each column's type from the values.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [25]:
# Print the column names and inferred types of the loaded DataFrame.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [27]:
# Preview the first five rows of the loaded DataFrame.
data.show(5)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [29]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [31]:
# List the DataFrame's column names (used to pick the feature columns).
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [32]:
# Numeric predictor columns that are packed into a single vector column.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [36]:
# Add the assembled 'features' vector column to the loaded data.
output = assembler.transform(data)

# Show the first five vectors in full (no truncation) to sanity-check the assembly.
features_preview = output.select('features')
features_preview.show(n=5, truncate=False)

+----------------------------------------------------------------------------+
|features                                                                    |
+----------------------------------------------------------------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]  |
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]  |
|[33.000914755642675,11.330278057777512,37.110597442120856,4.104543202376424]|
|[34.30555662975554,13.717513665142507,36.72128267790313,3.120178782748092]  |
|[33.33067252364639,12.795188551078114,37.53665330059473,4.446308318351434]  |
+----------------------------------------------------------------------------+
only showing top 5 rows



In [37]:
# Keep only the two columns used for modelling: the feature vector and the label.
model_columns = ['features', 'Yearly Amount Spent']
final_data = output.select(model_columns)
final_data.show(n=5)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
+--------------------+-------------------+
only showing top 5 rows



In [41]:
# Seed the 70/30 split so the train/test membership, and every metric computed
# from it below, is the same on each Restart & Run All instead of changing on
# every execution.
SPLIT_SEED = 42
train, test = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Summary statistics of the label column in each split, to check that the two
# partitions have broadly similar distributions.
train.describe().show()
test.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                351|
|   mean| 496.02247276415744|
| stddev|  77.97893874713361|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                149|
|   mean|   507.067994557558|
| stddev|  82.12198051637158|
|    min|  282.4712457199145|
|    max|  744.2218671047146|
+-------+-------------------+



In [43]:
# Configure the regressor: 'features' is the input vector column and
# 'Yearly Amount Spent' is the label to predict; then fit on the training split.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)
lr_model = lr.fit(train)

In [44]:
# Evaluate the fitted model on the held-out split and display its residuals.
test_results = lr_model.evaluate(test)
residuals_df = test_results.residuals
residuals_df.show(n=20)



+-------------------+
|          residuals|
+-------------------+
| -4.321664937766684|
|-3.7326399817574725|
|  5.073072654712064|
|-12.504577309447257|
| -7.481598174172177|
|-3.3649015480696107|
| 19.228463644774422|
| -5.392912768240649|
| -5.018409126514484|
| 3.7658722614647218|
|-17.676410457956536|
|-1.8580974896552789|
| 17.460981672011997|
| 3.0000643732653884|
|  8.169683117187049|
| -9.014031285132603|
| 1.6767207861757925|
| -2.065315570040866|
| 11.899128166360583|
| 12.568553039434676|
+-------------------+
only showing top 20 rows



In [46]:
# Root mean squared error reported by the evaluation summary.
test_results.rootMeanSquaredError

10.09172833170066

In [47]:
# R-squared (coefficient of determination) reported by the evaluation summary.
test_results.r2

0.9847967364013986

In [48]:
# Keep only the feature vectors so the frame resembles unlabeled input.
unlabeled_data = test.select(['features'])
unlabeled_data.show(n=5)

+--------------------+
|            features|
+--------------------+
|[30.4925366965402...|
|[30.8794843441274...|
|[31.0472221394875...|
|[31.0662181616375...|
|[31.1280900496166...|
+--------------------+
only showing top 5 rows



In [50]:
# Score the label-free rows; the result carries a 'prediction' column next to
# the features.
predictions = lr_model.transform(unlabeled_data)
predictions.show(n=20)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.4925366965402...| 286.7929106576812|
|[30.8794843441274...|493.93923996661215|
|[31.0472221394875...|387.42432653430933|
|[31.0662181616375...| 461.4378705171216|
|[31.1280900496166...| 564.7342849212268|
|[31.2681042107507...|426.83543472189353|
|[31.3123495994443...| 444.3629543831662|
|[31.5147378578019...|495.20540076470206|
|[31.5257524169682...|448.98403593639637|
|[31.5316044825729...|432.74973346789784|
|[31.5702008293202...| 563.6219025993614|
|[31.5761319713222...| 543.0846814789836|
|[31.6098395733896...|427.08456797909616|
|[31.7366356860502...|493.93338188226653|
|[31.8209982016720...| 416.5055978960263|
|[31.8279790554652...| 449.0167788320741|
|[31.8293464559211...| 383.4756172017992|
|[31.8530748017465...|461.35043903239284|
|[31.9096268275227...| 551.5469075068786|
|[31.9262720263601...| 379.6363804048917|
+--------------------+------------