In [6]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook under the application name 'lrex'
# (getOrCreate reuses an already running session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName('lrex')
    .getOrCreate()
)

In [7]:
from pyspark.ml.regression import LinearRegression

In [8]:
# Read the sample regression dataset, which is stored in LIBSVM format.
data = (
    spark.read
    .format('libsvm')
    .load('sample_linear_regression_data.txt')
)

In [9]:
# Print a preview of the loaded DataFrame (columns: label, features).
data.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [26]:
# The column arguments are spelled out for clarity even though each one matches the
# estimator's default (featuresCol='features', labelCol='label',
# predictionCol='prediction'). Pointing them at columns that are not present in the
# DataFrame may raise an error when the model is fitted.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='prediction',
)

In [27]:
# Fit the linear regression on the complete sample dataset.
lrModel = lr.fit(data)

In [28]:
# Fitted weight of each input feature (the sample data has 10 features, hence 10 values).
lrModel.coefficients


DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [29]:
# Fitted intercept (bias) term of the linear model.
lrModel.intercept

0.14228558260358093

In [30]:
# Keep a handle on the fitted model's summary object; the metrics read in the
# following cells (r2, rootMeanSquaredError) come from it.
summary = lrModel.summary

In [31]:
# R-squared (coefficient of determination) of the fit.
# Note: this is the plain r2, not the adjusted one; the adjusted value is exposed
# separately as `summary.r2adj`.
summary.r2

0.027839179518600154

In [32]:
# Root mean squared error of the fit, taken from the model summary.
summary.rootMeanSquaredError

10.16309157133015

In [33]:
# Split the data into a training set (~70%) and a test set (~30%).
# randomSplit assigns rows at random, so the weights are only approximate. A fixed
# seed keeps the split, and every metric computed from it, reproducible across runs.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [34]:
# Display the two split DataFrames (the repr shows only their schemas, not rows).
train_data, test_data

(DataFrame[label: double, features: vector],
 DataFrame[label: double, features: vector])

In [35]:
# Summary statistics of the label column in each split.
# show() prints its table and returns None, so the two calls are issued as separate
# statements; combining them in a tuple expression makes the cell echo a stray
# `(None, None)` as its result.
train_data.describe().show()
test_data.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|               355|
|   mean|0.7816565196662694|
| stddev|10.141347459806367|
|    min|-23.51088409032297|
|    max| 27.78383192005107|
+-------+------------------+

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                146|
|   mean| -1.019087428505722|
| stddev| 10.662886865626113|
|    min|-28.571478869743427|
|    max| 26.903524792043335|
+-------+-------------------+



(None, None)

In [36]:
# Fit a fresh model using only the training split.
train_model = lr.fit(train_data)

In [37]:
# Evaluate the trained model against the held-out test split.
test_result = train_model.evaluate(test_data)

In [38]:
# Inspect the first 5 residuals of the model on the test split.
test_result.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
|-29.364398699947976|
|-28.441369104802952|
|-27.945213185436195|
|-25.197515631475447|
|-24.958280012632954|
+-------------------+
only showing top 5 rows



In [39]:
# Simulate deployment on unlabeled data: keep only the feature column of the
# test split so that no labels are available to the model.
unlabeled_data = test_data.select('features')

In [40]:
# Peek at the first 5 feature-only rows.
unlabeled_data.show(5)

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 5 rows



In [41]:
# Score the unlabeled rows with the trained model; transform() returns the input
# rows with a `prediction` column added.
Prediction = train_model.transform(unlabeled_data)

In [42]:
# Show the first 5 predictions next to their input features.
Prediction.show(5)

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|(10,[0,1,2,3,4,5,...| 0.7929198302045486|
|(10,[0,1,2,3,4,5,...|0.39535106702631945|
|(10,[0,1,2,3,4,5,...| 1.1397297569531228|
|(10,[0,1,2,3,4,5,...|-1.5386915511262758|
|(10,[0,1,2,3,4,5,...| 1.4708398916964436|
+--------------------+-------------------+
only showing top 5 rows



In [43]:
# Evaluation metrics for regression
# 1.) Mean Absolute Error (MAE) - mean of the absolute values of the errors
# 2.) Mean Squared Error (MSE) - mean of the squared errors; squaring penalizes large errors more heavily,
#     but the result is expressed in squared units of y
# 3.) Root Mean Squared Error (RMSE) - square root of the MSE, which brings the unit back to that of y
# 4.) R-squared / coefficient of determination - share of the variance in y that the model explains

In [51]:
# Work on a more realistic dataset stored as a CSV file. The first line of the
# file holds the column names, and column types are inferred from the values.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    inferSchema=True,
    header=True,
)

In [52]:
# Print column names and inferred types to confirm the numeric columns were read as double.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [76]:
# Preview the first 5 customer records.
data.show(5)

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [84]:
# head(2) returns the first two rows as a list of Row objects; the [1] subscript
# then picks the second of them (list indices start at 0).
data.head(2)[1]

Row(Email='hduke@hotmail.com', Address='4547 Archer CommonDiazchester, CA 06566-8576', Avatar='DarkGreen', Avg Session Length=31.92627202636016, Time on App=11.109460728682564, Time on Website=37.268958868297744, Length of Membership=2.66403418213262, Yearly Amount Spent=392.2049334443264)

In [85]:
# Read the individual field values of the second row (index 1), one per line
for item in data.head(2)[1]:
    print (item)

hduke@hotmail.com
4547 Archer CommonDiazchester, CA 06566-8576
DarkGreen
31.92627202636016
11.109460728682564
37.268958868297744
2.66403418213262
392.2049334443264


In [95]:
# Set up dataframe for machine learning 
#import Vector assemblers and vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [96]:
# For now only the numeric columns are used as features; the categorical columns
# are left for later. List the available column names to pick from.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [103]:
# Combine the four numeric input columns into a single vector column named
# "features", which is the input layout the regression estimator works with.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol="features",
)

In [111]:
# Apply the assembler: the result keeps the original columns and adds `features`.
output = assembler.transform(data)

In [112]:
# Inspect the first assembled row: the new `features` field holds the four numeric inputs as one vector.
output.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [113]:
# Keep only what the model needs: the feature vector and the target column.
final_data = output.select(
    'features',
    'Yearly Amount Spent',
)

In [114]:
# Check the two-column frame (features, target) that will be fed to the model.
final_data.show(2)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
+--------------------+-------------------+
only showing top 2 rows



In [122]:
# Split the assembled e-commerce data into a training set (~70%) and a test set (~30%).
# Without this step `train_data` / `test_data` would still refer to the split of the
# earlier libsvm sample data, which has no 'Yearly Amount Spent' column, and the
# model fit below would fail on a fresh kernel. The fixed seed keeps the split
# reproducible across runs.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

# Summary statistics of the target column in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                357|
|   mean|    502.36723515511|
| stddev|  81.04936298750846|
|    min|   266.086340948469|
|    max|  765.5184619388373|
+-------+-------------------+



In [123]:
# Build the linear model
lr=LinearRegression(labelCol='Yearly Amount Spent') # featuresCol and predictionCol are not passed because
# their default values, 'features' and 'prediction', are exactly what is needed here

In [124]:
# Fit the regression on the training split.
lr_model = lr.fit(train_data)

In [126]:
# Evaluate the fitted model on the held-out test split.
test_result = lr_model.evaluate(test_data)

In [129]:
# Inspect the first 5 residuals of the model on the test split.
test_result.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
| -4.824390048360385|
|  6.962207546450145|
| 4.1576174789205425|
|-12.864120647429615|
| 3.9626678820935695|
+-------------------+
only showing top 5 rows



In [130]:
# Root mean squared error on the test split, in the units of 'Yearly Amount Spent'.
test_result.rootMeanSquaredError

9.680846841856704

In [132]:
# Simulate deploying the model on unlabeled data, i.e. rows that carry only the
# features (independent variables) and no label / target value.
# To do so, keep just the feature column of the test split.
unlabeled_data = test_data.select('features')

In [133]:
# Peek at the first 2 feature-only rows.
unlabeled_data.show(2)

+--------------------+
|            features|
+--------------------+
|[30.4925366965402...|
|[30.9716756438877...|
+--------------------+
only showing top 2 rows



In [135]:
# Score the unlabeled rows; transform() returns them with a `prediction` column added.
prediction = lr_model.transform(unlabeled_data)

In [136]:
# Show the first 5 predicted yearly amounts next to their input features.
prediction.show(5)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.4925366965402...| 287.2956357682749|
|[30.9716756438877...| 487.6764022104426|
|[31.0472221394875...|388.33978171010085|
|[31.0662181616375...|461.79741385510397|
|[31.3584771924370...|491.21328256738184|
+--------------------+------------------+
only showing top 5 rows

