In [51]:
# Initialize pyspark
# NOTE: findspark.init() is deliberately called before `import pyspark`;
# presumably it locates the local Spark installation and makes pyspark
# importable — confirm against the findspark docs for this environment.
import findspark
findspark.init()
import pyspark

In [52]:
# Create the Spark session used throughout this notebook, or reuse one
# that already exists under the same application name.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('ECommerce')
    .getOrCreate()
)

In [53]:
# Import statements to setup ML
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [54]:
# Load the Ecommerce Customers CSV into a Spark DataFrame.
# The first row supplies the column names and Spark infers column types
# (the numeric columns come back as doubles, see the schema printed below).
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [55]:
# Display the first row of the dataframe as a Row object to sanity-check
# the column names and sample values that were read from the CSV
data.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [56]:
# Print the schema of the dataframe to confirm the column types that were
# inferred on load (string vs. double)
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



__Filtering out the string columns and converting the dataframe to an ML-acceptable format
             --->    i.e., ("label","features")__

In [57]:
# Keep only the numeric columns; the string columns (Email, Address, Avatar)
# are left out of the selection.
numeric_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
    'Yearly Amount Spent',
]
filtered_data = data.select(*numeric_columns)

In [58]:
# Combine the four predictor columns into a single vector column named
# 'features'. 'Yearly Amount Spent' is not included because it is used as
# the label column when the regression is configured.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [59]:
# Apply the assembler: the result keeps the original columns and gains the
# assembled 'features' vector column
output = assembler.transform(filtered_data)

In [60]:
# Inspect the first row to confirm the 'features' vector was added
output.head()

Row(Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))

In [61]:
# Keep only the target column and the assembled feature vector for modelling
final_data = output.select('Yearly Amount Spent','features')

In [62]:
# Show the first 3 rows without truncating the feature vectors
final_data.show(3, truncate=False)

+-------------------+----------------------------------------------------------------------------+
|Yearly Amount Spent|features                                                                    |
+-------------------+----------------------------------------------------------------------------+
|587.9510539684005  |[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]  |
|392.2049334443264  |[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]  |
|487.54750486747207 |[33.000914755642675,11.330278057777512,37.110597442120856,4.104543202376424]|
+-------------------+----------------------------------------------------------------------------+
only showing top 3 rows



#Splitting the resultant data into training data and testing data
#Training data is used to train the model
#Testing data is used to test the built model

In [63]:
# Split the data 70% / 30% into training and testing sets.
# A fixed seed makes the split reproducible across kernel restarts; without
# it every run produces a different partition, so the row counts, fitted
# coefficients and error metrics would change on each Restart & Run All.
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [64]:
# Summary statistics (count, mean, stddev, min, max) of the training split
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                346|
|   mean| 499.04984334315134|
| stddev|  74.47678689889574|
|    min| 298.76200786180766|
|    max|  712.3963268096637|
+-------+-------------------+



In [65]:
# Summary statistics (count, mean, stddev, min, max) of the testing split
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                154|
|   mean| 499.90761904262894|
| stddev|  89.49403436552568|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [66]:
# Configure the (not yet fitted) linear regression estimator: which column
# holds the feature vectors and which column holds the target to predict.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [67]:
# Fit the linear regression estimator on the training split; the result is
# the fitted model used for the summary, evaluation and predictions below
lrModel = lr.fit(train_data)

__Getting the training summary of the created model__

In [68]:
# Summary object of the fitted model; the residuals and error metrics read
# from it in the following cells describe the fit on the training data
training_summary = lrModel.summary

In [69]:
# Show the first 3 residuals of the model on the training data
training_summary.residuals.show(3)

+------------------+
|         residuals|
+------------------+
|-7.966971686418219|
| 18.33880509585248|
| -6.82537604508758|
+------------------+
only showing top 3 rows



In [70]:
# Report how well the model fits the training data.
# R^2 is the coefficient of determination (a goodness-of-fit score where
# higher is better), not an error metric, so it is labelled "R Squared"
# rather than "R Squared Error".
print("Mean Absolute Error: ",training_summary.meanAbsoluteError)
print("Mean Squared Error: ",training_summary.meanSquaredError)
print("Root Mean Squared Error: ",training_summary.rootMeanSquaredError)
print("R Squared: ",training_summary.r2)

Mean Absolute Error:  7.814183037105314
Mean Squared Error:  98.28621698120675
Root Mean Squared Error:  9.9139405375061
R Squared Error:  0.9822291680748632


In [71]:
# Evaluate the fitted model against the held-out test data; the returned
# object exposes the residuals and error metrics printed in later cells
test_results = lrModel.evaluate(test_data)

In [72]:
# Print the fitted coefficients (one per feature, in the order the feature
# columns were assembled) together with the intercept.
coefficients = lrModel.coefficients
intercept = lrModel.intercept
print('Coeffecients: {}, Intercept: {}'.format(coefficients, intercept))

Coeffecients: [25.272442103488725,38.51841326144159,0.9308878998904747,61.29038433041515], Intercept: -1051.393797660776


In [73]:
# Show the first 3 residuals of the model on the test data
test_results.residuals.show(3)

+-------------------+
|          residuals|
+-------------------+
| 1.0927957098668344|
| -18.66530847996762|
|-6.7734865565324185|
+-------------------+
only showing top 3 rows



In [74]:
# Evaluate the model on the test data by checking several metrics.
# R^2 is the coefficient of determination (a goodness-of-fit score where
# higher is better), not an error metric, so it is labelled "R Squared"
# rather than "R Squared Error".
print("Mean Absolute Error: ",test_results.meanAbsoluteError)
print("Mean Squared Error: ",test_results.meanSquaredError)
print("Root Mean Squared Error: ",test_results.rootMeanSquaredError)
print("R Squared: ",test_results.r2)

Mean Absolute Error:  8.082676997797558
Mean Squared Error:  100.86257606655617
Root Mean Squared Error:  10.043036197612562
R Squared Error:  0.9873243227311742


__Getting the predictions from the built model without the label column__

In [75]:
# Drop the label column so that only the feature vectors are passed to the
# model, as would be the case for data whose target value is unknown
unlabelled_data = test_data.select('features')

In [76]:
# Run the fitted model on the unlabelled features; the result carries the
# input 'features' column plus a 'prediction' column
predictions = lrModel.transform(unlabelled_data)

In [77]:
# Preview the first 5 predictions
predictions.show(5)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[32.8369407670213...|  255.577786580189|
|[30.8162006488763...| 284.7516494284366|
|[31.5171218025062...|282.69190720691813|
|[30.4925366965402...| 288.9120289574239|
|[33.6666156834513...|313.83639011044556|
+--------------------+------------------+
only showing top 5 rows



In [78]:
# Stop the Spark session created at the top of the notebook; this is the
# last step, so no cell after this one should use `spark`
spark.stop()