In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

In [2]:
from pyspark import SparkContext, SparkConf
sc = SparkContext.getOrCreate()
sc

In [3]:
spark = SparkSession.builder.appName("lin_reg").master("local").getOrCreate()

In [4]:
dataset = spark.read.csv("Ecommerce_Customers.csv", inferSchema=True, header=True)

In [5]:
dataset.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [6]:
dataset.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [7]:
dataset.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avg Session Length=34.49726773, Time on App=12.65565115, Time on Website=39.57766802, Length of Membership=4.082620633, Yearly Amount Spent=587.951054)

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
dataset.columns

['Email',
 'Address',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [10]:
assembler = VectorAssembler(inputCols=['Avg Session Length', 'Time on App',
                                       'Time on Website','Length of Membership'],
                            outputCol='features')

In [11]:
output = assembler.transform(dataset)

In [12]:
output.select("features").show()

+--------------------+
|            features|
+--------------------+
|[34.49726773,12.6...|
|[31.92627203,11.1...|
|[33.00091476,11.3...|
|[34.30555663,13.7...|
|[33.33067252,12.7...|
|[33.87103788,12.0...|
|[32.0215955,11.36...|
|[32.73914294,12.3...|
|[33.9877729,13.38...|
|[31.93654862,11.8...|
|[33.99257277,13.3...|
|[33.87936082,11.5...|
|[29.53242897,10.9...|
|[33.19033404,12.9...|
|[32.38797585,13.1...|
|[30.73772037,12.6...|
|[32.1253869,11.73...|
|[32.33889932,12.0...|
|[32.18781205,14.7...|
|[32.61785606,13.9...|
+--------------------+
only showing top 20 rows



In [13]:
final_data = output.select("features", "Yearly Amount Spent")

In [14]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [15]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                358|
|   mean| 503.70442906983214|
| stddev|    79.777548163862|
|    min|        256.6705823|
|    max|        765.5184619|
+-------+-------------------+



In [16]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                142|
|   mean|  488.2453065028171|
| stddev|   77.3092346924966|
|    min|        266.0863409|
|    max|        666.1255917|
+-------+-------------------+



In [17]:
from pyspark.ml.regression import LinearRegression

In [18]:
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [19]:
model = lr.fit(train_data)

In [20]:
import pandas as pd

In [21]:
pd.DataFrame({"Coefficients":model.coefficients}, index=['Avg Session Length', 'Time on App',
                                       'Time on Website','Length of Membership'])

Unnamed: 0,Coefficients
Avg Session Length,25.729174
Time on App,38.53065
Time on Website,0.383943
Length of Membership,60.880091


In [22]:
intercept = model.intercept
print ("The Intercept of the model is : %f" %intercept)

The Intercept of the model is : -1044.660618


In [23]:
res = model.evaluate(test_data)

In [24]:
unlabeled_data = test_data.select("features")

In [25]:
predictions = model.transform(unlabeled_data)

In [26]:
res.predictions.show()

+--------------------+-------------------+------------------+
|            features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.81620065,11.8...|        266.0863409|285.06362879977337|
|[30.83643267,13.1...|        467.5019004|471.93287941013523|
|[30.87948434,13.2...|           490.2066| 493.9929765269285|
|[31.1695068,13.97...|        427.3565308|  418.347461029088|
|[31.26810421,12.1...|        423.4705332|427.95699998181453|
|[31.28344748,12.7...|        591.7810894| 568.7892999192802|
|[31.30919264,11.9...|        432.7207178| 430.2796359505885|
|[31.57413802,12.9...|        544.4092722|  558.100164416745|
|[31.57613197,12.5...|         541.226584| 542.6281277918915|
|[31.72420252,13.1...|        503.3878873| 509.6123690735085|
|[31.81642833,14.2...|        501.1224915| 519.0742568933376|
|[31.81861657,11.2...|        446.4186734| 448.9250947775331|
|[31.82934646,11.2...|         385.152338|385.12897170378665|
|[31.853

In [27]:
print("MAE:", res.meanAbsoluteError)

MAE: 8.208810359078393


In [28]:
print("MSE:", res.meanSquaredError)

MSE: 107.59992652800518


In [29]:
print("RMSE:", res.rootMeanSquaredError)

RMSE: 10.373038442423955


In [30]:
print("R2", res.r2)

R2 0.9818691377496387


In [31]:
print("Adj R2: ", res.r2adj)

Adj R2:  0.9813397695087522
