In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
from pyspark import SparkFiles
spark.sparkContext.addFile('https://raw.githubusercontent.com/erwindrarusli/machine-learning-linear-regression/master/Ecommerce%20Customers')
df = spark.read.options(inferSchema='True', header='True').csv(SparkFiles.get('Ecommerce Customers'))

In [None]:
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg. Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [None]:
df.describe().toPandas()

Unnamed: 0,summary,Email,Address,Avatar,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,count,1000,1000,500,500.0,500.0,500.0,500.0,454.0
1,mean,,,33.058900610740785,31.130461514638228,14.350284262734618,33.95692625943925,48.60730499130784,499.9198577164192
2,stddev,,,0.9518717613059207,6.128380357402919,7.32783840726642,9.787360915468764,144.2169189098249,78.3257926883325
3,min,"APO AA 17032-7944""","AK 05665""",30.879484344127498,8.668349517101323,8.508152176032603,0.7895199078816915,0.2699010899842742,266.086340948469
4,max,zscott@wright.com,YellowGreen,YellowGreen,36.13966248879052,39.22071295098572,40.005181638101895,744.2218671047146,765.5184619388372


In [None]:
df = df.toDF(*(c.replace('.', '') for c in df.columns))

In [None]:
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [None]:
df.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [None]:
df.show(3)

+--------------------+------------------+------+------------------+------------------+-----------------+--------------------+-------------------+
|               Email|           Address|Avatar|Avg Session Length|       Time on App|  Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+------------------+------+------------------+------------------+-----------------+--------------------+-------------------+
|mstephenson@ferna...|  835 Frank Tunnel|  null|              null|              null|             null|                null|               null|
|         Wrightmouth|    MI 82180-9605"|Violet| 34.49726772511229|12.655651149166752|39.57766801952616|   4.082620632952961|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Common|  null|              null|              null|             null|                null|               null|
+--------------------+------------------+------+------------------+------------------+-----------------+--------------------

In [None]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['Avg Session Length', 'Time on App', 'Time on Website', 
                                       'Length of Membership'], outputCol='features', 
                            handleInvalid='skip')
output = assembler.transform(df)

In [None]:
from pyspark.ml.feature import Imputer
imputer = Imputer(inputCols=['Yearly Amount Spent'], outputCols=['label'], strategy='mean')
imputer_model = imputer.fit(output)
output = imputer_model.transform(output)

In [None]:
output.show()

+------------------+---------------+------------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+------------------+
|             Email|        Address|            Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|             label|
+------------------+---------------+------------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+------------------+
|       Wrightmouth| MI 82180-9605"|            Violet| 34.49726772511229|12.655651149166752| 39.57766801952616|   4.082620632952961|  587.9510539684005|[34.4972677251122...| 587.9510539684005|
|       Diazchester| CA 06566-8576"|         DarkGreen|31.926272026360156|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|[31.9262720263601...| 392.2049334443264|
|       Cobbborough| DC 99414-

In [None]:
output.select('features').head(1)

[Row(features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [None]:
final_data = output.select(['features', 'label'])

In [None]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [None]:
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                454|
|   mean| 499.91985771641924|
| stddev|   78.3257926883325|
|    min|   266.086340948469|
|    max|  765.5184619388372|
+-------+-------------------+



In [None]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                330|
|   mean|  498.5394375416201|
| stddev|  78.83140683885206|
|    min|   266.086340948469|
|    max|  765.5184619388372|
+-------+-------------------+



In [None]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                124|
|   mean|  503.5935565687075|
| stddev|  77.15923305636252|
|    min| 298.76200786180766|
|    max|  712.3963268096636|
+-------+-------------------+



In [None]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(labelCol='label')

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[10.0473147350711...|               null|
|[10.2565490312879...|               null|
|[10.8755595481892...|               null|
|[11.3062323446735...|               null|
|[11.4333799333344...|               null|
|[11.5847829995352...|               null|
|[11.5889485797124...|               null|
|[11.6089979362215...|               null|
|[11.6565920338517...|               null|
|[11.7329914615880...|               null|
|[11.7958866768891...|               null|
|[11.9176361831014...|               null|
|[12.0201120912087...|               null|
|[12.0641566321993...|               null|
|[12.2072984910507...|               null|
|[12.2289347111239...|               null|
|[12.2637176760343...|               null|
|[12.3519589730029...|               null|
|[12.3643416044551...|               null|
|[12.6451951408435...|               null|
+----------

In [None]:
lr_model = lr.fit(train_data)

In [None]:
test_results = lr_model.evaluate(test_data)

In [None]:
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-14.716836573391845|
| -27.80406118644663|
|  12.42701239418767|
|-13.763538848241694|
| -18.55178335596537|
| -47.32605803762925|
|-14.345477361109204|
|-2.1834813433608247|
|-40.366489393769314|
|-35.569958457724795|
| -63.84923426899053|
|-113.11063419196847|
|-43.980961474157255|
| -37.59590606624283|
|  33.02525723240569|
|-15.024476464584723|
| 11.170874370971319|
| 16.919349291227206|
|-19.063824701988608|
|  45.36688992325037|
+-------------------+
only showing top 20 rows



In [None]:
test_results.rootMeanSquaredError

64.86486237638898

In [None]:
test_results.r2

0.29380208300022304

In [None]:
lr_model.transform(test_data).show()

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|[10.8755595481892...|499.91985771641924| 514.6366942898111|
|[11.0313583404091...|499.91985771641924| 527.7239189028659|
|[11.2307433067610...|499.91985771641924| 487.4928453222316|
|[11.7958866768891...|499.91985771641924| 513.6833965646609|
|[12.3519589730029...|499.91985771641924| 518.4716410723846|
|[12.3643416044551...|499.91985771641924| 547.2459157540485|
|[12.4287369278682...|499.91985771641924| 514.2653350775284|
|[12.7520766109646...|499.91985771641924|502.10333905978007|
|[12.8779836962563...|499.91985771641924| 540.2863471101886|
|[13.0686385841824...|499.91985771641924|  535.489816174144|
|[13.1868128730011...|499.91985771641924| 563.7690919854098|
|[13.8913134217784...|499.91985771641924| 613.0304919083877|
|[13.9701810741304...|499.91985771641924| 543.9008191905765|
|[31.0472221394875...| 3

In [None]:
unlabeled_data = test_data.select('features')

In [None]:
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[10.8755595481892...|
|[11.0313583404091...|
|[11.2307433067610...|
|[11.7958866768891...|
|[12.3519589730029...|
|[12.3643416044551...|
|[12.4287369278682...|
|[12.7520766109646...|
|[12.8779836962563...|
|[13.0686385841824...|
|[13.1868128730011...|
|[13.8913134217784...|
|[13.9701810741304...|
|[31.0472221394875...|
|[31.1239743499119...|
|[31.3091926408918...|
|[31.3123495994443...|
|[31.3584771924370...|
|[31.3895854806644...|
|[31.5702008293202...|
+--------------------+
only showing top 20 rows



In [None]:
predictions = lr_model.transform(unlabeled_data)

In [None]:
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[10.8755595481892...| 514.6366942898111|
|[11.0313583404091...| 527.7239189028659|
|[11.2307433067610...| 487.4928453222316|
|[11.7958866768891...| 513.6833965646609|
|[12.3519589730029...| 518.4716410723846|
|[12.3643416044551...| 547.2459157540485|
|[12.4287369278682...| 514.2653350775284|
|[12.7520766109646...|502.10333905978007|
|[12.8779836962563...| 540.2863471101886|
|[13.0686385841824...|  535.489816174144|
|[13.1868128730011...| 563.7690919854098|
|[13.8913134217784...| 613.0304919083877|
|[13.9701810741304...| 543.9008191905765|
|[31.0472221394875...| 430.0933052552642|
|[31.1239743499119...| 453.9217966073601|
|[31.3091926408918...|447.74519430451835|
|[31.3123495994443...|452.42054365696924|
|[31.3584771924370...|478.25660115824815|
|[31.3895854806644...|429.13343576197155|
|[31.5702008293202...| 500.5786022181545|
+--------------------+------------