In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook runs on any machine; fall back to the original local path when
# the variable is not set.
SPARK_HOME = os.environ.get('SPARK_HOME', '/home/asif/spark-2.1.0-bin-hadoop2.7')
findspark.init(SPARK_HOME)

# pyspark is imported only after findspark.init() has been called, as in the
# original cell order.
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every cell below.
spark = SparkSession.builder.appName('LinearRegression').getOrCreate()

In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
# Load the e-commerce customer CSV: the first line is used as the header and
# column types are inferred from the data.
data = spark.read.csv('Ecommerce-Customers.csv', inferSchema=True, header=True)

In [4]:
# NOTE: only the last expression of a cell is echoed, so the count() result is
# computed here but not displayed in the output below.
data.count()
# First row, returned as a one-element list of Row objects.
data.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)]

In [5]:
# Print the column names and inferred types of the loaded DataFrame.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [6]:
# Print every field of the first row, one value per line.
first_row = data.head(1)[0]
for field_value in first_row:
    print(field_value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
print(len(data.columns))  # number of columns
data.columns  # list of the column names

8


['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

## From this customer data we will predict each customer's 'Yearly Amount Spent'

In [9]:
# VectorAssembler combines the listed input columns into one vector per row and
# writes it to the 'features' column. Spark ML expects the predictors in this
# single-vector-column format.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [10]:
# Append the assembled 'features' vector column to every row of the raw data.
dataFromAssembler = assembler.transform(data)

In [11]:
# Inspect the first assembled row to check the new 'features' column.
dataFromAssembler.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [12]:
# Keep only what the model needs: the feature vector and the target column.
final_data = dataFromAssembler.select('features','Yearly Amount Spent')
# Peek at the first row of the modelling frame.
final_data.head(1)

[Row(features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]), Yearly Amount Spent=587.9510539684005)]

In [13]:
# Summary statistics; only the numeric target column appears in the summary
# (the vector column is not summarised).
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



### Now, split the final formatted data into training and testing sets

In [14]:
# Split roughly 70% / 30% into training and test sets. The seed is fixed so the
# split, and every metric computed from it below, is reproducible after a
# kernel restart.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [15]:
# Summary statistics of the target in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                349|
|   mean| 499.62743561305433|
| stddev|  78.49546687102509|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [16]:
# Inspect the first ten rows of the test split.
test_data.head(10)

[Row(features=DenseVector([30.4925, 11.5629, 35.9766, 1.4816]), Yearly Amount Spent=282.4712457199145),
 Row(features=DenseVector([30.5744, 11.351, 37.0888, 4.0783]), Yearly Amount Spent=442.06441375806565),
 Row(features=DenseVector([30.8364, 13.1001, 35.9077, 3.3616]), Yearly Amount Spent=467.5019004269896),
 Row(features=DenseVector([30.9717, 11.7314, 36.0746, 4.4264]), Yearly Amount Spent=494.6386097568927),
 Row(features=DenseVector([31.0472, 11.1997, 38.6887, 3.0888]), Yearly Amount Spent=392.4973991890214),
 Row(features=DenseVector([31.0613, 12.3576, 36.166, 4.0893]), Yearly Amount Spent=487.5554580579016),
 Row(features=DenseVector([31.1695, 13.9702, 36.674, 1.7852]), Yearly Amount Spent=427.3565308022928),
 Row(features=DenseVector([31.3585, 12.8099, 36.5497, 3.6377]), Yearly Amount Spent=495.1759504494754),
 Row(features=DenseVector([31.3896, 10.9942, 38.0745, 3.4289]), Yearly Amount Spent=410.0696110599829),
 Row(features=DenseVector([31.446, 12.8465, 37.8692, 3.4201]), Yea

In [17]:
# Summary statistics of the target in the held-out test split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                151|
|   mean|  498.5896960287377|
| stddev|  81.43699458193946|
|    min|  282.4712457199145|
|    max|  689.7876041747194|
+-------+-------------------+



In [18]:
# Linear regression estimator with 'Yearly Amount Spent' as the label.
# featuresCol is not set, so the estimator's default feature column is used.
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [19]:
# Fit the regression on the training split only.
lr_model = lr.fit(train_data)

In [20]:
# Score the fitted model on the held-out test split; the returned summary
# exposes the residuals, RMSE and R2 inspected in the following cells.
test_results = lr_model.evaluate(test_data)

## A residual is the difference between the value actually observed at a data point (what is plotted in the scatter plot) and the value the regression equation predicts for that point. If the observed and predicted y-values agree, the residual is zero.

In [21]:
test_results.residuals.show()  # per-row residual on the test split: actual label minus predicted value

+-------------------+
|          residuals|
+-------------------+
| -4.674837757715579|
| 0.8554925608579538|
|  -4.91345033022634|
|  7.098866857990743|
| 5.5817468460162445|
| -6.257397579181372|
|  8.944776522190978|
|  3.595851979448639|
| 1.7070453244839427|
|  2.596325155062857|
| -4.769798547385676|
|-18.494594250884802|
|-2.1329588205195478|
| -5.123598422709449|
|  6.274538543712367|
| -19.12398684671217|
|-1.5201948450259692|
| -8.140892509186358|
| -6.165462392023414|
|  6.420588163523462|
+-------------------+
only showing top 20 rows



In [22]:
# Root mean squared error of the predictions on the test split.
test_results.rootMeanSquaredError

10.159401125116721

In [23]:
# R2 (coefficient of determination) on the test split.
test_results.r2

0.9843332674797292

## Now, predict from the features only (no labels)

In [24]:
# Drop the label to mimic unlabeled data: keep only the feature vectors.
unlabeled_data = test_data.select('features')
# First row of the unlabeled frame.
unlabeled_data.head(1)[0]

Row(features=DenseVector([30.4925, 11.5629, 35.9766, 1.4816]))

In [25]:
# transform() returns the input rows with a 'prediction' column added.
prediction = lr_model.transform(unlabeled_data)

In [26]:
# Display the predicted values next to their feature vectors.
prediction.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.4925366965402...| 287.1460834776301|
|[30.5743636841713...| 441.2089211972077|
|[30.8364326747734...|472.41535075721595|
|[30.9716756438877...|  487.539742898902|
|[31.0472221394875...|386.91565234300515|
|[31.0613251567161...|  493.812855637083|
|[31.1695067987115...|418.41175428010183|
|[31.3584771924370...|491.58009847002677|
|[31.3895854806643...|408.36256573549895|
|[31.4459724827577...| 482.2806397800657|
|[31.5257524169682...|448.73542535726756|
|[31.5702008293202...| 564.4400863922897|
|[31.5761319713222...| 543.3595428098479|
|[31.6253601348306...|381.46049917963364|
|[31.6548096756927...|468.98888518383615|
|[31.8164283341993...| 520.2464783503685|
|[31.8186165667690...| 447.9388682151616|
|[31.8854062999117...|398.24416548166187|
|[31.9453957483445...| 663.1853863296753|
|[31.9764800614612...|324.17385787057674|
+--------------------+------------