In [1]:
import os

import findspark

# Make the Spark installation importable before pyspark is imported. Prefer the
# SPARK_HOME environment variable so the notebook runs on other machines, and
# fall back to the original machine-specific install path when it is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/asif/spark-2.1.0-bin-hadoop2.7'))

from pyspark.sql import SparkSession  # entry point used below to read DataFrames

# Reuse an existing session if there is one; otherwise create one named 'RegressionTree'.
spark = SparkSession.builder.appName('RegressionTree').getOrCreate()


In [2]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
# Load the customer dataset: the first line supplies the column names and the
# column types are inferred from the data.
data = spark.read.csv(
    'Ecommerce-Customers.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Number of rows loaded from the CSV.
data.count()

500

In [5]:
# Show the first row, then list every column with its type.
print(data.head(1))
data.printSchema()

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)]
root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [6]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [7]:
# Column names, used to pick the numeric inputs for the feature vector.
data.columns 

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [8]:
# Combine the four numeric customer metrics into one vector column called
# 'features'; the string columns (Email, Address, Avatar) are left out.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)


In [9]:
# Add the assembled 'features' vector column to the data and show one row.
dataFromAssembler = assembler.transform(data)
dataFromAssembler.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [10]:
# Keep only what the models need: the feature vector and the regression target.
final_data = dataFromAssembler.select(
    'features',
    'Yearly Amount Spent',
)

In [11]:
# Split roughly 70/30 into training and held-out evaluation rows. The fixed seed
# keeps the partition the same across runs, so results can be reproduced instead
# of changing on every kernel restart.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

# Decision tree regression

In [12]:
# Single regression tree on the assembled features (maxDepth=15, maxBins=32).
dtr = DecisionTreeRegressor(
    featuresCol="features",
    labelCol='Yearly Amount Spent',
    maxDepth=15,
    maxBins=32,
)

In [13]:
# Fit the decision tree on the training split only.
dtr_model = dtr.fit(train_data)

In [14]:
# Score the held-out rows; the result gains a 'prediction' column.
test_results = dtr_model.transform(test_data)

In [15]:
# Confirm the scored frame contains features, label and prediction.
test_results.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- prediction: double (nullable = true)



In [16]:
# Compare predicted and actual spend side by side for the first five test rows.
test_results.select("prediction", "Yearly Amount Spent", "features").show(5)

+------------------+-------------------+--------------------+
|        prediction|Yearly Amount Spent|            features|
+------------------+-------------------+--------------------+
| 494.6386097568927| 442.06441375806565|[30.5743636841713...|
|352.55010816300023|   266.086340948469|[30.8162006488763...|
| 467.5019004269896|  490.2065999848547|[30.8794843441274...|
| 495.1759504494754| 448.93329320767435|[31.0662181616375...|
| 545.9454921414049|  557.2526867470547|[31.1280900496166...|
+------------------+-------------------+--------------------+
only showing top 5 rows



In [17]:
# Root-mean-squared error of the decision tree's predictions on the test split.
evaluator = RegressionEvaluator(
    labelCol='Yearly Amount Spent',
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(test_results)
rmse

30.63152885331705

# Random forest regression

In [18]:
# Random forest of 200 trees (maxDepth=15, maxBins=32). The forest is a randomized
# estimator, so an explicit seed is passed: refitting on the same training data
# then yields the same model instead of depending on a per-session default.
rtr = RandomForestRegressor(
    featuresCol="features",
    labelCol='Yearly Amount Spent',
    maxDepth=15,
    maxBins=32,
    numTrees=200,
    seed=42,
)
rtr_model = rtr.fit(train_data)

# Score the held-out rows (this replaces the previous model's test_results) and
# confirm the 'prediction' column is present.
test_results = rtr_model.transform(test_data)
test_results.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- prediction: double (nullable = true)



In [19]:
# Compare the random forest's predictions with actual spend for five test rows.
test_results.select("prediction", "Yearly Amount Spent", "features").show(5)

+------------------+-------------------+--------------------+
|        prediction|Yearly Amount Spent|            features|
+------------------+-------------------+--------------------+
|471.25442331859574| 442.06441375806565|[30.5743636841713...|
|331.20034621211033|   266.086340948469|[30.8162006488763...|
| 500.3278647592669|  490.2065999848547|[30.8794843441274...|
| 472.9877855739715| 448.93329320767435|[31.0662181616375...|
| 559.1316920382626|  557.2526867470547|[31.1280900496166...|
+------------------+-------------------+--------------------+
only showing top 5 rows



In [20]:
# Root-mean-squared error of the random forest's predictions on the test split.
evaluator = RegressionEvaluator(
    labelCol='Yearly Amount Spent',
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(test_results)
rmse

19.455856791067728

# Gradient-boosted tree regression

In [21]:
# Gradient-boosted trees: 200 boosting iterations of depth-5 trees (maxBins=32).
gbtr = GBTRegressor(
    featuresCol="features",
    labelCol='Yearly Amount Spent',
    maxDepth=5,
    maxBins=32,
    maxIter=200,
)
gbtr_model = gbtr.fit(train_data)

# Score the held-out rows (this replaces the previous model's test_results) and
# confirm the 'prediction' column is present.
test_results = gbtr_model.transform(test_data)
test_results.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- prediction: double (nullable = true)



In [22]:
# Compare the boosted model's predictions with actual spend for five test rows.
test_results.select("prediction", "Yearly Amount Spent", "features").show(5)

+------------------+-------------------+--------------------+
|        prediction|Yearly Amount Spent|            features|
+------------------+-------------------+--------------------+
|480.47789649141544| 442.06441375806565|[30.5743636841713...|
|  333.079246561134|   266.086340948469|[30.8162006488763...|
| 535.8892222027864|  490.2065999848547|[30.8794843441274...|
| 454.4855092744397| 448.93329320767435|[31.0662181616375...|
| 556.4848906830747|  557.2526867470547|[31.1280900496166...|
+------------------+-------------------+--------------------+
only showing top 5 rows



In [23]:
# Root-mean-squared error of the gradient-boosted model on the test split.
evaluator = RegressionEvaluator(
    labelCol='Yearly Amount Spent',
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(test_results)
rmse

28.93823575504471