In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml import *
import pyspark.sql.functions as f
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
import matplotlib.pyplot as plt
from datetime import timedelta
from pyspark.sql.window import Window
import numpy as np

# Import all the data

In [13]:
spark_application_name = "Projet_AHSG"
spark = (SparkSession.builder.appName(spark_application_name).getOrCreate())
col = [StructField("Date",TimestampType()), StructField("High",FloatType()), StructField("Low",FloatType()), StructField("Open",FloatType()), StructField("Close",FloatType()), StructField("Volume",FloatType()), StructField("Adj Close",FloatType()), StructField("company_name",StringType())]
schema = StructType(col)

In [14]:
apple = spark.read.schema(schema).csv("stocks_data/APPLE.csv", header=True, sep=',')
apple = spark.read.schema(schema).csv("stocks_data/TESLA.csv", header=True, sep=',')

apple.show()

+-------------------+------+------+------+------+---------+---------+------------+
|               Date|  High|   Low|  Open| Close|   Volume|Adj Close|company_name|
+-------------------+------+------+------+------+---------+---------+------------+
|2017-01-03 00:00:00|44.066|42.192|42.972|43.398|2.96165E7|   43.398|       TESLA|
|2017-01-04 00:00:00|  45.6|42.862| 42.95|45.398|5.60675E7|   45.398|       TESLA|
|2017-01-05 00:00:00|45.496| 44.39|45.284| 45.35|2.95585E7|    45.35|       TESLA|
|2017-01-06 00:00:00|46.062| 45.09|45.386|45.802|2.76395E7|   45.802|       TESLA|
|2017-01-09 00:00:00|46.384|  45.6|45.794|46.256|1.98975E7|   46.256|       TESLA|
|2017-01-10 00:00:00|  46.4|45.378|  46.4|45.974|   1.83E7|   45.974|       TESLA|
|2017-01-11 00:00:00|45.996|45.336|45.814|45.946| 1.8254E7|   45.946|       TESLA|
|2017-01-12 00:00:00| 46.14|45.116|45.812|45.918| 1.8951E7|   45.918|       TESLA|
|2017-01-13 00:00:00| 47.57|45.918|  46.0| 47.55| 3.0465E7|    47.55|       TESLA|
|201

# Cleaning the data

We are going to keep only certain columns nd add a new column with the Adj Close of the next day

In [15]:
columnsToKeep = ['Date', 'Open', 'Close', 'Volume', 'Adj Close']

apple = apple.select(columnsToKeep)
apple.cache().count()

w = Window().partitionBy().orderBy(f.col("Date"))
apple = apple.select("*", f.lag("Adj Close", offset=-1).over(w).alias("Prediction Adj Close")).na.drop()

apple.show()

22/06/20 14:57:39 WARN CacheManager: Asked to cache already cached data.
22/06/20 14:57:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+------+------+---------+---------+--------------------+
|               Date|  Open| Close|   Volume|Adj Close|Prediction Adj Close|
+-------------------+------+------+---------+---------+--------------------+
|2017-01-03 00:00:00|42.972|43.398|2.96165E7|   43.398|              45.398|
|2017-01-04 00:00:00| 42.95|45.398|5.60675E7|   45.398|               45.35|
|2017-01-05 00:00:00|45.284| 45.35|2.95585E7|    45.35|              45.802|
|2017-01-06 00:00:00|45.386|45.802|2.76395E7|   45.802|              46.256|
|2017-01-09 00:00:00|45.794|46.256|1.98975E7|   46.256|              45.974|
|2017-01-10 00:00:00|  46.4|45.974|   1.83E7|   45.974|              45.946|
|2017-01-11 00:00:00|45.814|45.946| 1.8254E7|   45.946|              45.918|
|2017-01-12 00:00:00|45.812|45.918| 1.8951E7|   45.918|               47.55|
|2017-01-13 00:00:00|  46.0| 47.55| 3.0465E7|    47.55|              47.116|
|2017-01-17 00:00:00| 47.34|47.116|2.30875E7|   47.116|              47.672|

We check for missing data

In [16]:
apple.select([f.count(f.when(f.isnull(c), c)).alias(c) for c in apple.columns]).show()

22/06/20 14:57:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:41 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----+----+-----+------+---------+--------------------+
|Date|Open|Close|Volume|Adj Close|Prediction Adj Close|
+----+----+-----+------+---------+--------------------+
|   0|   0|    0|     0|        0|                   0|
+----+----+-----+------+---------+--------------------+



# Linear Regression

We need to split our dataset to 2 datasets: train (80%) and test (20%).

In [17]:
trainDF, testDF = apple.randomSplit([.8, .2], seed=42)

## Creation of the VectorAssembler

In [18]:
vecAssembler = VectorAssembler(inputCols=['Open', 'Close', 'Volume', 'Adj Close'], outputCol="features")

vecTrainDF = vecAssembler.transform(trainDF)

vecTrainDF.select("features").show(10)

22/06/20 14:57:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+--------------------+
|            features|
+--------------------+
|[42.9720001220703...|
|[42.9500007629394...|
|[45.3860015869140...|
|[45.7939987182617...|
|[46.4000015258789...|
|[45.8120002746582...|
|[47.3400001525878...|
|[47.3300018310546...|
|[49.4500007629394...|
|[49.0919990539550...|
+--------------------+
only showing top 10 rows



In [19]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol="features", labelCol="Prediction Adj Close")
lrModel = lr.fit(vecTrainDF)

22/06/20 14:57:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 1

## Inspect the model

In [20]:
for c, name in zip(lrModel.coefficients, ['Open', 'Close', 'Volume', 'Adj Close']):
    print(f"The coefficient for {name} is {c}")

print(f"The intercept value of the model is {lrModel.intercept}")

The coefficient for Open is -0.050785793917411164
The coefficient for Close is 0.5301098096178442
The coefficient for Volume is -2.692036739690856e-09
The coefficient for Adj Close is 0.5301098096178442
The intercept value of the model is -0.2656314043805144


Let's take a look at the Root Mean Square Error

In [21]:
print("The Root Mean Square Error is ", lrModel.summary.rootMeanSquaredError)

The Root Mean Square Error is  6.926493437023043


## Pipeline

In [22]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[vecAssembler, lr])
pipelineModel = pipeline.fit(trainDF)

22/06/20 14:57:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:44 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 1

In [23]:
predDF = pipelineModel.transform(testDF)

predDF.select("Date", "Open", "Close", "Adj Close", "Prediction Adj Close", "prediction").show(10)

22/06/20 14:57:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+------+------+---------+--------------------+------------------+
|               Date|  Open| Close|Adj Close|Prediction Adj Close|        prediction|
+-------------------+------+------+---------+--------------------+------------------+
|2017-01-05 00:00:00|45.284| 45.35|    45.35|              45.802|45.435970230316826|
|2017-01-11 00:00:00|45.814|45.946|   45.946|              45.918| 46.07137755574065|
|2017-01-13 00:00:00|  46.0| 47.55|    47.55|              47.116|47.729651261917496|
|2017-01-23 00:00:00| 49.17|49.784|   49.784|              50.922|49.934905361855755|
|2017-01-31 00:00:00|49.848|50.386|   50.386|              49.848| 50.56762231728801|
|2017-02-06 00:00:00|  50.2|51.554|   51.554|              51.496|51.795532953819276|
|2017-02-14 00:00:00|55.806|56.196|   56.196|              55.952| 56.38144966798079|
|2017-02-23 00:00:00|  52.8|51.198|   51.198|                51.4| 51.13324342293169|
|2017-03-09 00:00:00|49.526| 48.98|    48.98|         


# Baseline Model

In [24]:
from pyspark.ml.evaluation import RegressionEvaluator

avgAdjClose = trainDF.select(f.avg("Prediction Adj Close")).first()[0]
print(avgAdjClose)
predDF = testDF.withColumn("avgPrediction", f.lit(avgAdjClose))

regressionMeanEvaluator = RegressionEvaluator(predictionCol="avgPrediction", labelCol="Prediction Adj Close", metricName="rmse")

print(f"The RMSE for predicting the average prediction adj close is: {regressionMeanEvaluator.evaluate(predDF):.2f}")

22/06/20 14:57:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


107.79907364943296
The RMSE for predicting the average prediction adj close is: 101.55


22/06/20 14:57:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/06/20 14:57:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
