In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import HashingTF, Tokenizer, VectorAssembler
import pyspark.sql.functions as F
from pyspark.sql.window import Window
# isnan, when, count, datediff, mean, lag, 
# col, month, year, weekofyear, date_format, dayofmonth, avg, stddev, desc, sum

In [None]:
import os

# Absolute path of the companion notebook, resolved against the current
# working directory of the kernel.
pwd = os.getcwd()
exploration_abs_path = pwd + "/exploration.ipynb"
exploration_abs_path

In [None]:
%run {pwd + "/exploration.ipynb"}

In [None]:
# Spark session used by the rest of the notebook.
spark = create_spark_session("Spark_Application_Name")

# Load every stock CSV, keeping one DataFrame per file.
stock_files = (
    'AMAZON.csv', 'APPLE.csv', 'FACEBOOK.csv', 'GOOGLE.csv',
    'MICROSOFT.csv', 'TESLA.csv', 'ZOOM.csv',
)
all_datasets = []
for file_name in stock_files:
    print(f"{file_name}:")
    all_datasets.append(load_data(spark, 'stocks_data/' + file_name))

# Later cells work on `df`, which is the last file loaded (ZOOM.csv).
df = all_datasets[-1]

Input:  
    - 4xHigh  
    - 4xOpen  
    - 4xClose  
    - 4xLow  
    - RSI (not implemented)  
    - Volatility  
    - Momentum  
    - Upside/Downside  
    - IMI  
    - MFI  
  
  
polynomial regression  
quantile regression  
recurrent neural network  

In [None]:
# Window over the rows that share the same 'Year' value, sorted by 'Date'.
# It is the frame used to build the lagged columns and the label.
yearly_rows = Window.partitionBy('Year')
win = yearly_rows.orderBy('Date')

In [None]:
# Add lagged copies of the price columns: <name>1, <name>2 and <name>3 hold the
# value found 1, 2 and 3 rows earlier in the window (range(1, 4) stops at 3).
# 'Year' is derived from 'Date' because the window partitions on it.
price_columns = ["High", "Low", "Open", "Close"]
df_with_lag = df.withColumn('Year', F.year('Date'))
for offset in range(1, 4):
    for name in price_columns:
        lagged_value = F.lag(F.col(name), offset).over(win)
        df_with_lag = df_with_lag.withColumn(f"{name}{offset}", lagged_value)

In [None]:
# Add a 'Date3' column: the 'Date' value three rows earlier in the window.
# The names `s` and `i` stay defined at notebook level after this cell.
s = 'Date'
i = 3
df_with_lag = df_with_lag.withColumn(s + str(i), F.lag(F.col(s), i).over(win))

In [None]:
# Sanity check: Close next to its three lagged copies.
close_and_lags = ['Close'] + [f'Close{k}' for k in range(1, 4)]
df_with_lag.select(close_and_lags).show()

In [None]:
# NOTE(review): format_date is not used in this cell; it may be read as a
# global by helpers loaded from exploration.ipynb -- confirm before removing.
format_date = "%Y-%m-%d %H:%M:%S"

# Compute the volatility and preview it next to Close and Date.
vol = compute_volatility(df_with_lag)
vol.select('Volatility', 'Close', 'Date').show()

In [None]:
# Compute momentum on the lagged frame, then expose the helper's
# 'daily_average' output column under the name 'Momentum'.
momentum_raw = compute_momentum(df_with_lag)
df_with_momentum = momentum_raw.withColumnRenamed('daily_average', 'Momentum')

In [None]:
# Label = the Close value of the following row in the window (next Close).
# The last row of each window partition has no following row, so its label is null.
next_close = F.lead(F.col('Close'), 1).over(win)
df_labeled = df_with_momentum.withColumn('label', next_close)

In [None]:
# toDF with the existing column names yields an equivalent DataFrame; the
# result is not assigned, so this cell only displays it as output.
df_labeled.toDF(*df_labeled.columns)

In [None]:
# Candidate feature columns: current prices, their three lags, and Momentum.
features = [
    "High", "High1", "High2", "High3",
    "Low", "Low1", "Low2", "Low3",
    "Open", "Open1", "Open2", "Open3",
    "Close", "Close1", "Close2", "Close3",
    "Momentum"
]

# Drop the rows that contain null values. Nulls appear where a lag or the
# next-Close label reaches outside the window partition; replacing them with 0
# would inject zero prices (and zero targets) into the training data.
final_df = df_labeled.select(features + ['label']).dropna()
final_df.show()

In [None]:
# Vectorise the model inputs. Only the current-row prices are used here; the
# full `features` list (lags + Momentum) is not plugged in.
model_inputs = ["High", "Low", "Open", "Close"]
assembler = VectorAssembler(inputCols=model_inputs, outputCol="features")

# Linear regression reading the assembled vector and the 'label' column
# (these are the estimator's default column names, spelled out).
lr = LinearRegression(featuresCol="features", labelCol="label")

# Pipeline chaining feature assembly and the regressor.
pipeline = Pipeline(stages=[assembler, lr])

In [None]:
#df_labeled = df_with_lag.withColumn('label', F.col('Close'))

In [None]:
# Display the training DataFrame as the cell output.
final_df

In [None]:
# Fit the pipeline (feature assembly + linear regression) on the prepared DataFrame.
model = pipeline.fit(final_df)

In [None]:
# Apply the fitted model to `df`, the raw frame loaded earlier.
# NOTE(review): final_df was derived from this same `df`, so this is not a
# held-out evaluation, and `df` carries no 'label' column to compare against.
predDF = model.transform(df)

In [None]:
# Preview the rows returned by the fitted model.
predDF.show()

In [None]:
# Compare each row's Open/Close with its label (the following Close).
df_labeled.select('Open', 'Close', 'label').show()