# In this example, we will build a linear regression model to predict how much a customer would spend. 

## Required data file: customers.csv

### Resource: 

### Spark MLlib classification and regression documentation
### https://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression


### Create a spark session

In [2]:
# These two lines are not needed for CCAST OnDemand
# import findspark
# findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [23]:
# Reuse the active Spark session if there is one; otherwise start one named 'lrex'.
spark = (
    SparkSession.builder
    .appName('lrex')
    .getOrCreate()
)

### Read and explore the data

In [5]:
# Load the customer records: the first line supplies the column names and
# the column types are inferred from the values.
data = spark.read.csv('customers.csv', header=True, inferSchema=True)

In [6]:
# Show the column names and the types that were inferred while reading the CSV.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [9]:
# Peek at the first record (returned as a one-element list of Row objects).
data.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)]

In [12]:
# Print every field of the first record on its own line.
first_row = data.head(1)[0]
for value in first_row:
    print(value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


### Transform the data

In [13]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [14]:
# List the column names so the numeric feature columns can be picked out below.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [15]:
# Numeric columns used as model inputs.
feature_cols = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# Pack the inputs into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [16]:
# Apply the assembler: `output` is `data` plus the new 'features' vector column.
output = assembler.transform(data)

In [17]:
# Confirm that the 'features' vector column was appended to the original columns.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [18]:
# Keep only what the model needs: the feature vector and the label.
final_data = output.select(['features', 'Yearly Amount Spent'])

### Split the data into a training set and a test set

In [19]:
# 70/30 train/test split. A fixed seed makes the split, and therefore every
# metric and prediction below, reproducible across Restart & Run All.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Summary statistics (count, mean, stddev, min, max) of the training set.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                346|
|   mean| 500.38262900847946|
| stddev|  80.54377252593278|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



### Build a linear regression model

In [24]:
from pyspark.ml.regression import LinearRegression

# Regress 'Yearly Amount Spent' on the assembled 'features' column.
# featuresCol is spelled out for readability; 'features' is also its default.
lr = LinearRegression(featuresCol='features', labelCol='Yearly Amount Spent')

In [25]:
# Fit the regression on the training split only.
lr_model = lr.fit(train_data)

### Evaluate the model

In [26]:
# Score the fitted model on the held-out test set; the result exposes
# residuals, RMSE and R-squared, which the next cells read.
# NOTE: the name `test_results` is rebound to a DataFrame in the
# "Output the prediction results" section further down.
test_results = lr_model.evaluate(test_data)

In [27]:
# Show the per-row residuals on the test set (first 20 rows by default).
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -6.287859243905871|
|0.08989789997571052|
| -4.939498673968899|
|-3.9165023101049314|
| 19.585689037098575|
| 3.6038199118160037|
| -7.338988315936035|
| -4.113112744666807|
|-14.439838375160662|
| 16.266565539155465|
| -11.66509308992886|
| -9.405292216463693|
|-14.048795403189615|
| -5.401623076059536|
| 5.3451597091835765|
| -17.86734895961149|
|  4.341801763331432|
|0.07344740146521644|
| -9.416827743210433|
| 17.445429404199558|
+-------------------+
only showing top 20 rows



In [29]:
# Root mean squared error on the test set.
rmse = test_results.rootMeanSquaredError
print("RMSE: %f" % rmse)

RMSE: 9.685310


In [31]:
# Coefficient of determination on the test set.
r_squared = test_results.r2
print("R-square: %f" % r_squared)

R-square: 0.983944


### Deploy the model on the test set

In [32]:
# Drop the label to mimic unseen data: only the feature vectors are kept.
unlabeled_data = test_data.select('features')

In [33]:
# Run the fitted model on the unlabeled features; this adds a 'prediction' column.
predictions = lr_model.transform(unlabeled_data)

In [34]:
# Show the feature vectors next to their predicted yearly spend.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.4925366965402...| 288.7591049638204|
|[30.5743636841713...|441.97451585808994|
|[30.8364326747734...| 472.4413991009585|
|[30.8794843441274...| 494.1231022949596|
|[31.3123495994443...|444.00572899084204|
|[31.3584771924370...| 491.5721305376594|
|[31.4474464941278...|425.94173041116005|
|[31.5171218025062...| 280.0315333950525|
|[31.5741380228732...| 558.8491105357475|
|[31.6098395733896...| 428.2789841119527|
|[31.8093003166791...|   548.43699245277|
|[31.8279790554652...| 449.4080397634052|
|[31.9365486184489...| 441.2481802985178|
|[31.9453957483445...| 662.4215470137115|
|[31.9480174211613...|456.57571718371423|
|[31.9563005605233...| 564.9932807068103|
|[31.9764800614612...|326.25264427076877|
|[32.0047530203648...| 463.6725337191642|
|[32.0085045178551...| 452.6140487719658|
|[32.0180740106320...|340.33768134111574|
+--------------------+------------------+
only showing top 20 rows

In [35]:
# Show the actual yearly spend for the same test rows, for comparison.
test_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[30.4925366965402...|  282.4712457199145|
|[30.5743636841713...| 442.06441375806565|
|[30.8364326747734...|  467.5019004269896|
|[30.8794843441274...|  490.2065999848547|
|[31.3123495994443...|  463.5914180279406|
|[31.3584771924370...|  495.1759504494754|
|[31.4474464941278...|   418.602742095224|
|[31.5171218025062...|  275.9184206503857|
|[31.5741380228732...|  544.4092721605869|
|[31.6098395733896...| 444.54554965110816|
|[31.8093003166791...|  536.7718993628412|
|[31.8279790554652...|  440.0027475469415|
|[31.9365486184489...|  427.1993848953282|
|[31.9453957483445...|  657.0199239376519|
|[31.9480174211613...|  461.9208768928978|
|[31.9563005605233...|  547.1259317471988|
|[31.9764800614612...|  330.5944460341002|
|[32.0047530203648...| 463.74598112062944|
|[32.0085045178551...|  443.1972210287554|
|[32.0180740106320...|  357.7831107453153|
+--------------------+-------------------+
only showing top 20 rows

### Output the prediction results

In [36]:
# Pair each actual value with its prediction by scoring the labelled test
# rows directly. Joining `test_data` to `predictions` on the 'features'
# vector column forces a needless shuffle and multiplies rows whenever two
# records share the same feature vector.
# NOTE: this rebinds `test_results` (previously the evaluation summary) to a
# DataFrame; the cells below rely on that name.
test_results = lr_model.transform(test_data).select('Yearly Amount Spent', 'prediction')

In [37]:
# Show actual versus predicted yearly spend side by side.
test_results.show()

+-------------------+------------------+
|Yearly Amount Spent|        prediction|
+-------------------+------------------+
|  282.4712457199145| 288.7591049638204|
| 442.06441375806565|441.97451585808994|
|  467.5019004269896| 472.4413991009585|
|  490.2065999848547| 494.1231022949596|
|  463.5914180279406|444.00572899084204|
|  495.1759504494754| 491.5721305376594|
|   418.602742095224|425.94173041116005|
|  275.9184206503857| 280.0315333950525|
|  544.4092721605869| 558.8491105357475|
| 444.54554965110816| 428.2789841119527|
|  536.7718993628412|   548.43699245277|
|  440.0027475469415| 449.4080397634052|
|  427.1993848953282| 441.2481802985178|
|  657.0199239376519| 662.4215470137115|
|  461.9208768928978|456.57571718371423|
|  547.1259317471988| 564.9932807068103|
|  330.5944460341002|326.25264427076877|
| 463.74598112062944| 463.6725337191642|
|  443.1972210287554| 452.6140487719658|
|  357.7831107453153|340.33768134111574|
+-------------------+------------------+
only showing top 20 rows

In [38]:
# optional: write the dataframe to a csv file
# mode='overwrite' lets this cell be re-run: the default save mode raises an
# error when the 'output' path already exists. Any previous contents of
# 'output' are replaced.
# header=True writes the column names as the first line of the CSV output.
test_results.write.csv('output', mode='overwrite', header=True)