In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml import *
import pyspark.sql.functions as f
from pyspark.ml.feature import VectorAssembler
from datetime import timedelta
from pyspark.sql.window import Window
import numpy as np

# Import all the data

First, let's set up the Spark application and declare the schema of the CSV files.

In [4]:
spark_application_name = "Projet_AHSG"
spark = SparkSession.builder.appName(spark_application_name).getOrCreate()

# Explicit schema for the stock CSV files: one (column name, Spark type) pair
# per column, in the order the fields are declared in the schema.
column_types = [
    ("Date", TimestampType()),
    ("High", FloatType()),
    ("Low", FloatType()),
    ("Open", FloatType()),
    ("Close", FloatType()),
    ("Volume", FloatType()),
    ("Adj Close", FloatType()),
    ("company_name", StringType()),
]
col = [StructField(field_name, field_type) for field_name, field_type in column_types]
schema = StructType(col)

We chose to use the Apple dataset

In [5]:
# Read the Apple quotes with the explicit schema; the first CSV line is a header row.
apple = spark.read.csv(
    "stocks_data/APPLE.csv",
    schema=schema,
    header=True,
    sep=",",
)

apple.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------------------+-------+-------+-------+-------+----------+---------+------------+
|               Date|   High|    Low|   Open|  Close|    Volume|Adj Close|company_name|
+-------------------+-------+-------+-------+-------+----------+---------+------------+
|2017-01-03 00:00:00|29.0825|  28.69|  28.95|29.0375|1.151276E8| 27.27764|       APPLE|
|2017-01-04 00:00:00|29.1275|28.9375|28.9625| 29.005| 8.44724E7|27.247108|       APPLE|
|2017-01-05 00:00:00| 29.215|28.9525|  28.98|29.1525| 8.87744E7|27.385668|       APPLE|
|2017-01-06 00:00:00|  29.54|29.1175| 29.195|29.4775|1.270076E8|27.690971|       APPLE|
|2017-01-09 00:00:00|29.8575| 29.485|29.4875|29.7475|1.342476E8|27.944603|       APPLE|
|2017-01-10 00:00:00| 29.845| 29.575|29.6925|29.7775| 9.78484E7|27.972786|       APPLE|
|2017-01-11 00:00:00|29.9825|  29.65| 29.685|29.9375|1.103544E8|28.123089|       APPLE|
|2017-01-12 00:00:00| 29.825|29.5525| 29.725|29.8125|1.083448E8|28.005665|       APPLE|
|2017-01-13 00:00:00| 29.905|29.

                                                                                

# Cleaning the data

We are going to keep only certain columns and add a new column containing the Adj Close of the next day (this is the value we want to predict)

In [6]:
columnsToKeep = ['Date', 'Open', 'Close', 'Volume', 'Adj Close']

# Global window ordered by date. There is no partition key because the frame
# holds a single time series, so Spark moves every row to one partition
# (hence the "No Partition Defined" warning).
w = Window.orderBy(f.col("Date"))

# "Prediction Adj Close" is the Adj Close of the *next* row in date order
# (lead by one row). The last row has no successor and gets a null label;
# na.drop() removes it, together with any other row containing a null.
apple = (
    apple.select(columnsToKeep)
    .withColumn("Prediction Adj Close", f.lead("Adj Close", 1).over(w))
    .na.drop()
)

# Cache the final, windowed frame (not the intermediate one) and materialise it
# once, so later actions reuse the stored result instead of re-running the
# single-partition window on every show()/fit()/transform().
apple.cache().count()

apple.show()

22/06/20 15:38:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:20 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:21 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+-------+-------+----------+---------+--------------------+
|               Date|   Open|  Close|    Volume|Adj Close|Prediction Adj Close|
+-------------------+-------+-------+----------+---------+--------------------+
|2017-01-03 00:00:00|  28.95|29.0375|1.151276E8| 27.27764|           27.247108|
|2017-01-04 00:00:00|28.9625| 29.005| 8.44724E7|27.247108|           27.385668|
|2017-01-05 00:00:00|  28.98|29.1525| 8.87744E7|27.385668|           27.690971|
|2017-01-06 00:00:00| 29.195|29.4775|1.270076E8|27.690971|           27.944603|
|2017-01-09 00:00:00|29.4875|29.7475|1.342476E8|27.944603|           27.972786|
|2017-01-10 00:00:00|29.6925|29.7775| 9.78484E7|27.972786|           28.123089|
|2017-01-11 00:00:00| 29.685|29.9375|1.103544E8|28.123089|           28.005665|
|2017-01-12 00:00:00| 29.725|29.8125|1.083448E8|28.005665|            27.95635|
|2017-01-13 00:00:00|29.7775|  29.76|1.044476E8| 27.95635|             28.1818|
|2017-01-17 00:00:00| 29.585|   30.0|1.3

We check for missing data

In [7]:
# One aggregate per column: count the rows in which that column is null.
# `when(...)` without `otherwise` yields null for non-matching rows, and
# `count` skips nulls, so only the null rows of the column are counted.
null_counts = [
    f.count(f.when(f.isnull(c), c)).alias(c)
    for c in apple.columns
]
apple.select(null_counts).show()

22/06/20 15:38:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----+----+-----+------+---------+--------------------+
|Date|Open|Close|Volume|Adj Close|Prediction Adj Close|
+----+----+-----+------+---------+--------------------+
|   0|   0|    0|     0|        0|                   0|
+----+----+-----+------+---------+--------------------+



No missing values remain (rows containing nulls were already removed by `.na.drop()` in the cleaning step), so we can move on to our models

# Linear Regression

We need to split our dataset into two datasets: train (80%) and test (20%).

In [8]:
# 80/20 random split with a fixed seed.
# NOTE(review): rows are sampled at random rather than by date, so test days are
# interleaved with training days; consider a chronological split for time series.
trainDF, testDF = apple.randomSplit(weights=[0.8, 0.2], seed=42)

## Creation of the VectorAssembler

First we need to transform our data with a VectorAssembler, which combines the feature columns into a single vector column, so that we can use the Linear Regression model

In [9]:
# Columns that are packed, in this order, into the single "features" vector.
feature_cols = ['Open', 'Close', 'Volume', 'Adj Close']
vecAssembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Apply the assembler to the training split and preview the resulting vectors.
vecTrainDF = vecAssembler.transform(trainDF)
vecTrainDF.select("features").show(10)

22/06/20 15:38:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------------------+
|            features|
+--------------------+
|[28.9500007629394...|
|[28.9624996185302...|
|[29.1949996948242...|
|[29.4874992370605...|
|[29.6924991607666...|
|[29.7250003814697...|
|[29.5849990844726...|
|[30.0,29.99749946...|
|[29.8500003814697...|
|[30.1124992370605...|
+--------------------+
only showing top 10 rows



Now we can create our model

In [10]:
from pyspark.ml.regression import LinearRegression

# Regress the next-day Adj Close (label) on the assembled feature vector.
lr = LinearRegression(
    labelCol="Prediction Adj Close",
    featuresCol="features",
)
lrModel = lr.fit(vecTrainDF)

22/06/20 15:38:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:26 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:27 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 1

## Inspect the model

Here are the coefficients of our model

In [11]:
# Take the feature names from the assembler that built the "features" vector
# instead of repeating the list by hand: the coefficients are in the assembler's
# input order, so the labels cannot drift if the feature set changes.
feature_names = vecAssembler.getInputCols()
for name, c in zip(feature_names, lrModel.coefficients):
    print(f"The coefficient for {name} is {c}")

print(f"The intercept value of the model is {lrModel.intercept}")

The coefficient for Open is -0.030349073227634984
The coefficient for Close is -0.2995791658656899
The coefficient for Volume is -1.776606778327005e-10
The coefficient for Adj Close is 1.3261763243384581
The intercept value of the model is 0.7839625536161241


Let's take a look at the Root Mean Square Error of the model on the training data

In [12]:
# NOTE(review): `lrModel.summary` is the summary of the fit itself, so this is
# presumably the in-sample (training) RMSE, not a test-set score — confirm.
training_rmse = lrModel.summary.rootMeanSquaredError
print("The Root Mean Square Error is ", training_rmse)

The Root Mean Square Error is  1.3645789823437324


## Pipeline

We create a pipeline with our VectorAssembler and our Linear Regression model

In [13]:
from pyspark.ml import Pipeline

# Chain feature assembly and regression so that a raw frame can be fitted
# (and later scored) in a single call.
pipeline_stages = [vecAssembler, lr]
pipeline = Pipeline(stages=pipeline_stages)
pipelineModel = pipeline.fit(trainDF)

22/06/20 15:38:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 1

Now we are going to observe the predictions.

In [14]:
# Score the held-out split; the fitted pipeline appends a "prediction" column.
predDF = pipelineModel.transform(testDF)

# Show the inputs next to the actual next-day value and the model's estimate.
preview_columns = ["Date", "Open", "Close", "Adj Close", "Prediction Adj Close", "prediction"]
predDF.select(*preview_columns).show(10)

22/06/20 15:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+-------+-------+---------+--------------------+------------------+
|               Date|   Open|  Close|Adj Close|Prediction Adj Close|        prediction|
+-------------------+-------+-------+---------+--------------------+------------------+
|2017-01-05 00:00:00|  28.98|29.1525|27.385668|           27.690971|27.473417290438125|
|2017-01-11 00:00:00| 29.685|29.9375|28.123089|           28.005665|28.190967997910864|
|2017-01-13 00:00:00|29.7775|  29.76| 27.95635|             28.1818|28.021260662770693|
|2017-01-23 00:00:00|   30.0|  30.02| 28.20059|           28.174757|28.263408814091097|
|2017-01-31 00:00:00|30.2875|30.3375|28.498844|            30.23672|28.535810202849714|
|2017-02-06 00:00:00|32.2825|32.5725|30.598392|             30.8896|30.605962518115074|
|2017-02-14 00:00:00|33.3675| 33.755|31.846703|           31.962276| 31.86972578691473|
|2017-02-23 00:00:00| 34.345|34.1325| 32.20287|           32.233524| 32.20814782057938|
|2017-03-09 00:00:00| 34.685|  3

Let's see the average (signed) difference between the actual next-day Adj Close and the value predicted by our model

In [15]:
# Compute both sums and the row count in ONE aggregation instead of three
# separate Spark actions (two sums + count()) over the same frame.
# NOTE: this is a signed mean (prediction minus actual), so positive and negative
# errors cancel out; it measures the bias, not the typical size of the error.
prediction_adj_close_sum, prediction_sum, row_count = predDF.agg(
    f.sum('Prediction Adj Close'),
    f.sum('prediction'),
    f.count(f.lit(1)),
).first()
print("The average difference between the real value and the prediction is ", (prediction_sum - prediction_adj_close_sum) / row_count)

22/06/20 15:38:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


The average difference between the real value and the prediction is  0.02203868434511571



# Baseline Model

Now that we've seen a Linear Regression model, let's compare it with a baseline model that always predicts the average label of the training set, scored with a RegressionEvaluator

In [16]:
from pyspark.ml.evaluation import RegressionEvaluator

# Baseline: always predict the mean next-day Adj Close seen in the training split.
avgAdjClose = trainDF.agg(f.avg("Prediction Adj Close")).first()[0]
print("The average Adj Close is ", avgAdjClose)

# NOTE: this rebinds `predDF`, replacing the linear-regression predictions
# created earlier with the constant-prediction frame.
predDF = testDF.withColumn("avgPrediction", f.lit(avgAdjClose))

# RMSE between the constant prediction and the actual next-day Adj Close.
regressionMeanEvaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Prediction Adj Close",
    predictionCol="avgPrediction",
)

22/06/20 15:38:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


The average Adj Close is  55.41336029587807


Let's observe the RMSE for the Baseline Model

In [17]:
# Score the constant-mean baseline held in `predDF`.
baseline_rmse = regressionMeanEvaluator.evaluate(predDF)
print(f"The RMSE for predicting the average prediction adj close is: {baseline_rmse:.2f}")

22/06/20 15:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 15:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


The RMSE for predicting the average prediction adj close is: 23.58
