"""
@Author: Samarth BM

@Date: 2021-12-04

@Last Modified by: Samarth BM

@Title : Read the cleaned stock data from HDFS and fit a linear regression model on it to predict the stock closing price.

"""

In [30]:
# NOTE(review): wildcard imports pull every public name of these modules into the
# notebook namespace; pyspark.sql.functions exports names such as `sum`, `min` and
# `max`, which shadow the Python builtins of the same name.
from pyspark.sql import *
# Reuse the active SparkSession if there is one, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()
from pyspark.sql.functions import *

In [31]:
# Load the cleaned stock CSV: the first row is the header and the column types
# are inferred from the data instead of being read as plain strings.
stock_data = spark.read.csv("CleanedStockData", header=True, inferSchema=True)

In [32]:
# Preview the first 20 rows (long cell values are truncated for display).
stock_data.show(n=20, truncate=True)

+-------------------+------+------+------+------+--------+
|               time|  open|  high|   low| close|  volume|
+-------------------+------+------+------+------+--------+
|2021-12-01 19:31:00|117.03|117.03|117.03|117.03|   150.0|
|2021-12-01 19:03:00|117.43|117.43|117.43|117.43|   250.0|
|2021-12-01 18:26:00| 117.1| 117.1| 117.1| 117.1|   438.0|
|2021-12-01 18:01:00|117.15|117.15|117.15|117.15|   100.0|
|2021-12-01 17:48:00|116.95|116.95|116.95|116.95|   151.0|
|2021-12-01 17:17:00|116.93|116.93|116.93|116.93|   600.0|
|2021-12-01 17:15:00|116.93|116.93|116.93|116.93|   194.0|
|2021-12-01 17:11:00|116.96|116.96|116.96|116.96|   105.0|
|2021-12-01 17:10:00|116.99|116.99|116.99|116.99|   116.0|
|2021-12-01 17:01:00| 117.0| 117.0| 117.0| 117.0|   305.0|
|2021-12-01 16:48:00| 117.2| 117.2| 117.2| 117.2|   100.0|
|2021-12-01 16:39:00|117.01|117.01|117.01|117.01|   514.0|
|2021-12-01 16:26:00|117.43|117.43|117.43|117.43|   125.0|
|2021-12-01 16:17:00| 117.0| 117.0| 117.0| 117.0|   250.

In [33]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window

Storing all the independent variables in a single vector column named `features`

In [34]:
# Pack the three price columns into one vector column, which is the input
# format the LinearRegression estimator reads through `featuresCol`.
assembler = VectorAssembler(
    inputCols=["open", "high", "low"],
    outputCol="features",
)
output = assembler.transform(stock_data)

# Display the assembled vectors in full, without truncating long values.
output.select("features").show(truncate=False)

+----------------------+
|features              |
+----------------------+
|[117.03,117.03,117.03]|
|[117.43,117.43,117.43]|
|[117.1,117.1,117.1]   |
|[117.15,117.15,117.15]|
|[116.95,116.95,116.95]|
|[116.93,116.93,116.93]|
|[116.93,116.93,116.93]|
|[116.96,116.96,116.96]|
|[116.99,116.99,116.99]|
|[117.0,117.0,117.0]   |
|[117.2,117.2,117.2]   |
|[117.01,117.01,117.01]|
|[117.43,117.43,117.43]|
|[117.0,117.0,117.0]   |
|[116.92,116.92,116.92]|
|[116.92,116.92,116.92]|
|[116.92,116.92,116.92]|
|[116.92,116.92,116.67]|
|[117.03,117.07,116.85]|
|[117.1,117.11,117.02] |
+----------------------+
only showing top 20 rows



In [35]:
# Keep only the timestamp, the feature vector and the label, ordered oldest-first
# (ascending order is the default).
final_data = output.select("time", "features", "close").orderBy("time")

In [36]:
# Preview the first 20 rows of the time-ordered modelling frame.
final_data.show(n=20, truncate=True)

+-------------------+--------------------+-------------+
|               time|            features|        close|
+-------------------+--------------------+-------------+
|2021-11-02 04:05:00|[118.987568658,11...|118.987568658|
|2021-11-02 04:15:00|[119.100755406,11...|119.100755406|
|2021-11-02 06:35:00|[118.657440643,11...|118.657440643|
|2021-11-02 06:42:00|[118.657440643,11...|118.657440643|
|2021-11-02 06:43:00|[118.657440643,11...|118.657440643|
|2021-11-02 06:48:00|[118.629143956,11...|118.629143956|
|2021-11-02 07:12:00|[118.846085223,11...|118.846085223|
|2021-11-02 07:26:00|[119.034729803,11...|119.034729803|
|2021-11-02 07:32:00|[119.034729803,11...|119.298832214|
|2021-11-02 07:38:00|[119.24223884,119...| 119.24223884|
|2021-11-02 07:41:00|[119.081890948,11...|119.034729803|
|2021-11-02 08:00:00|[119.034729803,11...|119.034729803|
|2021-11-02 08:01:00|[119.034729803,11...|119.119619864|
|2021-11-02 08:09:00|[119.270535527,11...|119.308264443|
|2021-11-02 08:18:00|[119.66668

Creating training data and test data by using a random split

In [37]:
#trainData,testData=final_data.randomSplit([0.75,0.25])

In [38]:
# regressor=LinearRegression(featuresCol="features", labelCol="close")
# regressor=regressor.fit(trainData)
# print("Coefficients: " + str(regressor.coefficients))
# print("Intercept: " + str(regressor.intercept))

In [39]:
# predict = regressor.transform(testData)
# predict.show(5)

In [40]:
# from pyspark.ml.evaluation import RegressionEvaluator
# regression_Evaluator = RegressionEvaluator(
# predictionCol="prediction",
# labelCol="close",
# metricName="rmse")
# rmse = regression_Evaluator.evaluate(predict)
# print(f"RMSE is {rmse:}")

Creating train data and test data with a chronological split based on the percent rank of `time`

In [41]:
# Rank every row by its position in time: percent_rank() yields a value between
# 0.0 (earliest row) and 1.0 (latest row). The window has no partition key, so
# Spark evaluates it on a single partition.
time_ordered_window = Window.partitionBy().orderBy("time")
data = final_data.withColumn("rank", percent_rank().over(time_ordered_window))

In [42]:
# Chronological hold-out: rows whose percent rank is at most 0.8 (the earlier part
# of the series) form the training set, and the later rows form the test set.
# The helper `rank` column is dropped once the split is made.
train_data = data.filter("rank <= .8").drop("rank")
test_data = data.filter("rank > .8").drop("rank")

In [43]:
# Number of rows in the training split (this triggers a Spark job).
train_data.count()

21/12/05 18:51:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


7457

In [44]:
# Number of rows in the held-out test split (this triggers a Spark job).
test_data.count()

21/12/05 18:51:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


1865

In [45]:
#test_data.write.parquet("TestData")


Fitting linear regression on the training data to get the coefficients and intercept

In [46]:
# Fit a linear regression of `close` on the assembled feature vector, using the
# training rows only, then report the learned parameters.
linear_regression = LinearRegression(featuresCol='features', labelCol='close')
linear_model = linear_regression.fit(train_data)
print(f"Coefficients: {linear_model.coefficients!s}")
print(f"Intercept: {linear_model.intercept!s}")

21/12/05 18:51:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/12/05 18:51:54 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/12/05 18:51:55 WARN Instrumentation: [0938d18e] regParam is zero, which might cause numerical instability and overfitting.
21/12/05 18:51:56 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Coefficients: [-0.46548467268012933,0.7584587176498301,0.7068613986940332]
Intercept: 0.01795461473737477




Using the above model on the test data to predict the output

In [47]:
# Score the held-out rows; transform() appends a `prediction` column to them.
predictions = linear_model.transform(test_data)
predictions.show(n=20, truncate=True)


21/12/05 18:51:58 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+--------------------+--------+------------------+
|               time|            features|   close|        prediction|
+-------------------+--------------------+--------+------------------+
|2021-11-24 13:00:00|[116.6598,116.69,...|  116.69|116.68146295873287|
|2021-11-24 13:01:00|[116.67,116.67,11...|  116.65|116.65461859901134|
|2021-11-24 13:02:00|[116.67,116.67,11...|116.6567|116.64401567803094|
|2021-11-24 13:03:00|[116.64,116.65,11...|116.6405|116.63220812287791|
|2021-11-24 13:04:00|[116.66,116.66,11...| 116.645|116.64815455156817|
|2021-11-24 13:05:00|[116.655,116.69,1...|  116.69|116.67344779488067|
|2021-11-24 13:06:00|[116.7,116.715,11...|  116.71|116.71012777105987|
|2021-11-24 13:07:00|[116.71,116.745,1...|116.7389|116.73529529984951|
|2021-11-24 13:08:00|[116.73,116.75,11...|  116.75|116.74391512795803|
|2021-11-24 13:09:00|[116.74,116.76,11...| 116.755|116.75391348239468|
|2021-11-24 13:10:00|[116.76,116.79,11...|  116.79| 116.7811413477451|
|2021-



In [54]:
# Keep only the timestamp, the actual close and the predicted close for export.
selected_data = predictions.select("time", "close", "prediction")
selected_data.show()

# Write the comparison table as CSV with a header row.
# The save mode is set to "overwrite" explicitly: the default mode raises an error
# when the target directory already exists, so the cell would fail on every re-run
# of the notebook after the first successful export.
# NOTE(review): the destination is an absolute, user-specific local path; consider
# making it configurable.
selected_data.write.mode("overwrite").option("header", True).csv("file:///home/samarth/SelectedData")

21/12/05 18:56:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/12/05 18:56:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+--------+------------------+
|               time|   close|        prediction|
+-------------------+--------+------------------+
|2021-11-24 13:00:00|  116.69|116.68146295873287|
|2021-11-24 13:01:00|  116.65|116.65461859901134|
|2021-11-24 13:02:00|116.6567|116.64401567803094|
|2021-11-24 13:03:00|116.6405|116.63220812287791|
|2021-11-24 13:04:00| 116.645|116.64815455156817|
|2021-11-24 13:05:00|  116.69|116.67344779488067|
|2021-11-24 13:06:00|  116.71|116.71012777105987|
|2021-11-24 13:07:00|116.7389|116.73529529984951|
|2021-11-24 13:08:00|  116.75|116.74391512795803|
|2021-11-24 13:09:00| 116.755|116.75391348239468|
|2021-11-24 13:10:00|  116.79| 116.7811413477451|
|2021-11-24 13:11:00|  116.75|116.76046162427708|
|2021-11-24 13:12:00|  116.72|116.70340123810696|
|2021-11-24 13:13:00|  116.69|116.68513188496829|
|2021-11-24 13:14:00| 116.725|116.70736742385076|
|2021-11-24 13:15:00|  116.66|116.68219539614616|
|2021-11-24 13:16:00|116.6043|116.60847132000099|




Using root mean square error (RMSE) to measure the accuracy of the predictions

In [48]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the `prediction` column with the actual `close` values on the test
# rows and summarise the error as a single RMSE figure.
regression_Evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="close",
    metricName="rmse",
)
rmse = regression_Evaluator.evaluate(predictions)
print(f"RMSE is {rmse:}")

21/12/05 18:51:59 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


RMSE is 0.0479839992428812




In [49]:
#linear_model.save("hdfs://localhost:9000/StockDataProject/StockDataModel")

In [50]:
from pyspark.ml.regression import LinearRegressionModel

# Reload a previously persisted model from the relative path 'StockDataModel'
# and score the same test rows with it.
# NOTE(review): this rebinds `output`, which earlier in the notebook held the
# VectorAssembler result.
lrcvModel = LinearRegressionModel.load('StockDataModel')
output = lrcvModel.transform(test_data)
output.show(n=20, truncate=True)

21/12/05 18:52:01 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+--------------------+--------+------------------+
|               time|            features|   close|        prediction|
+-------------------+--------------------+--------+------------------+
|2021-11-24 13:00:00|[116.6598,116.69,...|  116.69|116.68183314022377|
|2021-11-24 13:01:00|[116.67,116.67,11...|  116.65| 116.6572241598078|
|2021-11-24 13:02:00|[116.67,116.67,11...|116.6567|116.64635080298864|
|2021-11-24 13:03:00|[116.64,116.65,11...|116.6405|116.63394920360098|
|2021-11-24 13:04:00|[116.66,116.66,11...| 116.645|116.65084842065444|
|2021-11-24 13:05:00|[116.655,116.69,1...|  116.69|116.67322990760411|
|2021-11-24 13:06:00|[116.7,116.715,11...|  116.71|116.71181097028789|
|2021-11-24 13:07:00|[116.71,116.745,1...|116.7389| 116.7352623978591|
|2021-11-24 13:08:00|[116.73,116.75,11...|  116.75| 116.7451743536033|
|2021-11-24 13:09:00|[116.74,116.76,11...| 116.755|116.75517454502973|
|2021-11-24 13:10:00|[116.76,116.79,11...|  116.79|116.78153810072766|
|2021-

