In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
sc = SparkContext(master='local',appName='Chapter 6')
spark = SparkSession(sc)

In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import corr
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [5]:
data = spark.read.csv('./Ecommerce_Customers.csv',inferSchema=True,header=True)

In [6]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [10]:
data.toPandas().head()

Unnamed: 0,Email,Address,Avatar,Avg Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,mstephenson@fernandez.com,"835 Frank TunnelWrightmouth, MI 82180-9605",Violet,34.497268,12.655651,39.577668,4.082621,587.951054
1,hduke@hotmail.com,"4547 Archer CommonDiazchester, CA 06566-8576",DarkGreen,31.926272,11.109461,37.268959,2.664034,392.204933
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582Cobbborough, DC ...",Bisque,33.000915,11.330278,37.110597,4.104543,487.547505
3,riverarebecca@gmail.com,"1414 David ThroughwayPort Jason, OH 22070-1220",SaddleBrown,34.305557,13.717514,36.721283,3.120179,581.852344
4,mstephens@davidson-herman.com,"14023 Rodriguez PassagePort Jacobville, PR 372...",MediumAquaMarine,33.330673,12.795189,37.536653,4.446308,599.406092


In [11]:
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [12]:
assembler = VectorAssembler( inputCols=['Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership'], outputCol='features')

In [13]:
data_pre = assembler.transform(data)

In [20]:
data_pre[['features']].show(2)

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
+--------------------+
only showing top 2 rows



In [21]:
final_data = data_pre.select('features','Yearly Amount Spent')

In [23]:
final_data.show(2,truncate=False)

+--------------------------------------------------------------------------+-------------------+
|features                                                                  |Yearly Amount Spent|
+--------------------------------------------------------------------------+-------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]|587.9510539684005  |
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]|392.2049334443264  |
+--------------------------------------------------------------------------+-------------------+
only showing top 2 rows



In [24]:
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [25]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                354|
|   mean|  499.2120417089198|
| stddev|  74.69251741122945|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [26]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                146|
|   mean| 499.56134496121814|
| stddev|  89.81731143521556|
|    min|   266.086340948469|
|    max|  712.3963268096637|
+-------+-------------------+



In [27]:
lr = LinearRegression(featuresCol='features',labelCol='Yearly Amount Spent',predictionCol='Predict_Yearly Amount Spent')

In [29]:
lrModel = lr.fit(train_data)

In [30]:
print('coefficients: {} intercept: {}'.format(lrModel.coefficients,lrModel.intercept))

coefficients: [25.826271094091297,38.84249504685359,0.7027160031558892,61.26698980952605] intercept: -1065.2871761448077


In [31]:
test_results = lrModel.evaluate(test_data)

In [32]:
test_results.residuals.show(5)

+------------------+
|         residuals|
+------------------+
| 11.24288073384605|
|11.216752068121991|
|-17.22902729907935|
|-3.460911793601042|
| 7.827267590949759|
+------------------+
only showing top 5 rows



In [33]:
#check

In [35]:
test_results.rootMeanSquaredError

9.867392259564586

In [36]:
test_results.meanSquaredError

97.36543000411508

In [37]:
test_results.r2

0.9878473906381638

In [38]:
#predict

In [39]:
test_model = lrModel.transform(test_data)

In [41]:
test_model.select('Predict_Yearly Amount Spent','Yearly Amount Spent').show()

+---------------------------+-------------------+
|Predict_Yearly Amount Spent|Yearly Amount Spent|
+---------------------------+-------------------+
|         397.39747033878143|  408.6403510726275|
|          450.5639901281079|  461.7807421962299|
|         283.31536824754835|   266.086340948469|
|          493.6675117784557|  490.2065999848547|
|         486.81134216594296|  494.6386097568927|
|         492.86797206871324|  487.5554580579016|
|          461.1261256872465| 448.93329320767435|
|          564.1213854935861|  557.2526867470547|
|          417.4852528614533|  427.3565308022928|
|         425.65709348709674|   418.602742095224|
|          495.5535883535297|  489.8124879964614|
|         280.85144812347517|  275.9184206503857|
|          542.5682534591463|  541.2265839893283|
|          381.3288829639705|  376.3369007569242|
|           468.734037510103|  475.2634237275485|
|          416.8007374835663| 416.35835357990084|
|          349.5779536013497| 347.77692663187264|


In [42]:
#Luu tru & model

In [43]:
lrModel.save('lrModel_Ecommerce_Customers')

In [44]:
from pyspark.ml.regression import LinearRegressionModel

In [45]:
lrModel2 = LinearRegressionModel.load('lrModel_Ecommerce_Customers')

In [46]:
#Predict new values

In [47]:
unlabel_data = test_data.select('features')

In [48]:
predictions = lrModel2.transform(unlabel_data)

In [49]:
predictions.show(5)

+--------------------+---------------------------+
|            features|Predict_Yearly Amount Spent|
+--------------------+---------------------------+
|[29.5324289670579...|         397.39747033878143|
|[30.7377203726281...|          450.5639901281079|
|[30.8162006488763...|         283.31536824754835|
|[30.8794843441274...|          493.6675117784557|
|[30.9716756438877...|         486.81134216594296|
+--------------------+---------------------------+
only showing top 5 rows

