In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the notebook's Spark session (or reuse one that already exists),
# registered under the application name 'Cust'.
Spark = (
    SparkSession.builder
    .appName('Cust')
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression

In [5]:
# Load the customer CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
Data = Spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [8]:
# Preview the first 40 rows of the loaded data.
Data.show(n=40)

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [10]:
# Print the column names and the types assigned when the CSV was read.
Data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [11]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [13]:
# Pack the four numeric predictor columns into a single vector column named
# 'Independent Feature'.
feature = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='Independent Feature',
)

In [15]:
# Apply the assembler: `output` is `Data` plus the 'Independent Feature' column.
output = feature.transform(Data)

In [16]:
# Preview the assembled frame, including the new 'Independent Feature' column.
output.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent| Independent Feature|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [17]:
# Print the schema of the assembled frame to check the added vector column.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- Independent Feature: vector (nullable = true)



In [18]:
# Keep only the two columns the regression uses: the assembled feature vector
# and the 'Yearly Amount Spent' label.
finalized_features = output.select(
    'Independent Feature',
    'Yearly Amount Spent',
)

In [20]:
# Preview the feature-vector / label pairs that will be split and modelled.
finalized_features.show()

+--------------------+-------------------+
| Independent Feature|Yearly Amount Spent|
+--------------------+-------------------+
|[34.49726773,12.6...|         587.951054|
|[31.92627203,11.1...|        392.2049334|
|[33.00091476,11.3...|        487.5475049|
|[34.30555663,13.7...|         581.852344|
|[33.33067252,12.7...|         599.406092|
|[33.87103788,12.0...|        637.1024479|
|[32.0215955,11.36...|        521.5721748|
|[32.73914294,12.3...|        549.9041461|
|[33.9877729,13.38...|         570.200409|
|[31.93654862,11.8...|        427.1993849|
|[33.99257277,13.3...|        492.6060127|
|[33.87936082,11.5...|        522.3374046|
|[29.53242897,10.9...|        408.6403511|
|[33.19033404,12.9...|        573.4158673|
|[32.38797585,13.1...|        470.4527333|
|[30.73772037,12.6...|        461.7807422|
|[32.1253869,11.73...|        457.8476959|
|[32.33889932,12.0...|        407.7045475|
|[32.18781205,14.7...|        452.3156755|
|[32.61785606,13.9...|        605.0610388|
+----------

In [21]:
# Split the rows into a training part (weight 0.75) and a held-out part
# (weight 0.25); the weights are targets, so the actual sizes are approximate.
# A fixed seed keeps the split -- and therefore the fitted coefficients and the
# evaluation output -- repeatable across Restart & Run All, provided the input
# file and its partitioning are unchanged.
# NOTE(review): `test_date` looks like a typo for `test_data`; the name is kept
# because the evaluation cell refers to it.
train_data, test_date = finalized_features.randomSplit([0.75, 0.25], seed=42)

In [22]:
# Configure a linear regression that reads the assembled feature vector and
# predicts 'Yearly Amount Spent', then fit it on the training split.
# `regress` ends up bound to the fitted model.
regress = LinearRegression(
    featuresCol='Independent Feature',
    labelCol='Yearly Amount Spent',
).fit(train_data)

In [23]:
# Display the fitted coefficient vector (four values, one per assembled input column).
regress.coefficients

DenseVector([25.8269, 39.064, 0.6889, 61.5305])

In [24]:
# Display the fitted intercept term.
regress.intercept

-1068.7812231228709

In [25]:
# Evaluate the fitted model on the held-out split (`test_date`, the 0.25-weight
# part of the random split) and keep the resulting summary object.
Pred_result = regress.evaluate(test_date)

In [26]:
# Show the held-out rows with the model's `prediction` column beside the actual label.
Pred_result.predictions.show()

+--------------------+-------------------+------------------+
| Independent Feature|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.57436368,11.3...|        442.0644138| 440.7708355908012|
|[31.04722214,11.1...|        392.4973992|387.28460779354737|
|[31.12397435,12.3...|        486.9470538| 507.3438687430721|
|[31.12809005,13.2...|        557.2526867| 564.2935682120171|
|[31.30919264,11.9...|        432.7207178| 428.7151492036037|
|[31.36621217,11.1...|        430.5888826|425.70369059402196|
|[31.5261979,12.04...|        409.0945262| 417.7321918063433|
|[31.73663569,10.7...|        496.9334463| 492.9140779936288|
|[31.81248256,10.8...|         392.810345|394.67387685254175|
|[31.8209982,10.77...|         424.675281| 416.0297169897606|
|[31.90962683,11.3...|        563.4460357| 550.6360487488989|
|[31.92627203,11.1...|        392.2049334|379.35223490361886|
|[31.95490386,10.9...|        439.9978799|430.69145096816715|
|[32.030