### Imports

In [33]:
import numpy as np
import pandas as pd
import yfinance as yf
import tensorflow as tf
import plotly.express as px
import plotly.graph_objects as go
from tensorflow import keras
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from datetime import datetime
from copy import deepcopy

### Loading and Processing Data

## Stock Price Forecasting

### Preparing Data

In [None]:
sp500 = sp500_og.copy()[["Close"]]
sp500.index = sp500.index.map(lambda ts: ts.date())
sp500.tail()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2023-07-17,4522.790039
2023-07-18,4554.97998
2023-07-19,4565.720215
2023-07-20,4534.870117
2023-07-21,4536.339844


In [None]:
def window_data(data, n=3):
    windowed_data = pd.DataFrame()
    for i in range(n, 0, -1):
        windowed_data[f'Target-{i}'] = data['Close'].shift(i)
    windowed_data['Target'] = data['Close']
    return windowed_data.dropna()

In [None]:
windowed = window_data(sp500) # gets us the values before the target, and then the target that we actually predict
windowed.head()

Unnamed: 0_level_0,Target-3,Target-2,Target-1,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-05,359.690002,358.76001,355.670013,352.200012
1990-01-08,358.76001,355.670013,352.200012,353.790009
1990-01-09,355.670013,352.200012,353.790009,349.619995
1990-01-10,352.200012,353.790009,349.619995,347.309998
1990-01-11,353.790009,349.619995,347.309998,348.529999


In [None]:
def df_to_date_X_y(df):
    dates = df.index
    dates = pd.to_datetime(dates)
    df = df.to_numpy()
    middle_matrix = df[:, :-1]
    X = middle_matrix.reshape((len(dates), middle_matrix.shape[1], 1))
    X = X.astype(np.float32)
    y = df[:, -1].astype(np.float32)
    return (dates, X, y)

In [None]:
(dates, X, y) = df_to_date_X_y(windowed)
dates.shape, X.shape, y.shape

((8450,), (8450, 3, 1), (8450,))

In [None]:
q_80 = int(len(dates)*.8)
dates_train, X_train, y_train = dates[:q_80], X[:q_80], y[:q_80]
dates_test, X_test, y_test = dates[q_80:], X[q_80:], y[q_80:]

### Creating and Training First Model

In [None]:
model = keras.Sequential([
        keras.layers.Input((3,1)),
        keras.layers.LSTM(units=64, return_sequences=True), # LSTM model is good for predicting series (i.e forecasting stocks
        keras.layers.LSTM(units=32),
        keras.layers.Dense(units=128, activation="relu"),
        keras.layers.Dense(units=128, activation="relu"),
        keras.layers.Dense(units=64, activation="relu"),
        keras.layers.Dense(units=1, activation="linear")
    ])

In [None]:
def compile_fit_model(model, X_train, y_train):
    model.compile(optimizer=keras.optimizers.legacy.Adam(learning_rate=0.001),
              loss=keras.losses.MeanSquaredError(),
              metrics=["mean_absolute_error"])

    model.fit(x=X_train, y=y_train, epochs=112)

In [None]:
compile_fit_model(model, X_train, y_train)

Epoch 1/112


Epoch 2/112
Epoch 3/112
Epoch 4/112
Epoch 5/112
Epoch 6/112
Epoch 7/112
Epoch 8/112
Epoch 9/112
Epoch 10/112
Epoch 11/112
Epoch 12/112
Epoch 13/112
Epoch 14/112
Epoch 15/112
Epoch 16/112
Epoch 17/112
Epoch 18/112
Epoch 19/112
Epoch 20/112
Epoch 21/112
Epoch 22/112
Epoch 23/112
Epoch 24/112
Epoch 25/112
Epoch 26/112
Epoch 27/112
Epoch 28/112
Epoch 29/112
Epoch 30/112
Epoch 31/112
Epoch 32/112
Epoch 33/112
Epoch 34/112
Epoch 35/112
Epoch 36/112
Epoch 37/112
Epoch 38/112
Epoch 39/112
Epoch 40/112
Epoch 41/112
Epoch 42/112
Epoch 43/112
Epoch 44/112
Epoch 45/112
Epoch 46/112
Epoch 47/112
Epoch 48/112
Epoch 49/112
Epoch 50/112
Epoch 51/112
Epoch 52/112
Epoch 53/112
Epoch 54/112
Epoch 55/112
Epoch 56/112
Epoch 57/112
Epoch 58/112
Epoch 59/112
Epoch 60/112
Epoch 61/112
Epoch 62/112
Epoch 63/112
Epoch 64/112
Epoch 65/112
Epoch 66/112
Epoch 67/112
Epoch 68/112
Epoch 69/112
Epoch 70/112
Epoch 71/112
Epoch 72/112
Epoch 73/112
Epoch 74/112
Epoch 75/112
Epoch 76/112
Epoch 77/112
Epoch 78/112
Epoch 7

### Plotting Observations

In [None]:
def plot_train(X_train, y_train):
    train_predictions = model.predict(X_train).flatten()
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=dates_train, y=train_predictions, mode="lines", name="Training Predictions"))
    fig.add_trace(go.Scatter(x=dates_train, y=y_train, mode="lines", name="Training Observations"))
    fig.update_layout(xaxis_title="Dates", yaxis_title="Value")
    fig.show()

In [None]:
plot_train(X_train, y_train)



In [None]:
def plot_test(X_test, y_test):
    test_predictions = model.predict(X_test).flatten()
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=dates_test, y=test_predictions, mode="lines", name="Testing Predictions"))
    fig.add_trace(go.Scatter(x=dates_test, y=y_test, mode="lines", name="Testing Observations"))
    fig.update_layout(xaxis_title="Dates", yaxis_title="Value")
    fig.show()

In [None]:
plot_test(X_test, y_test)

 1/53 [..............................] - ETA: 0s



#### Change Model to Predict Recursively for 3 Years

In [None]:
start_date = pd.Timestamp("2020-07-23").date()
windowed_yr = windowed.loc[windowed.index >= start_date]
windowed_yr.head()

Unnamed: 0_level_0,Target-3,Target-2,Target-1,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-07-23,3251.840088,3257.300049,3276.02002,3235.659912
2020-07-24,3257.300049,3276.02002,3235.659912,3215.629883
2020-07-27,3276.02002,3235.659912,3215.629883,3239.409912
2020-07-28,3235.659912,3215.629883,3239.409912,3218.439941
2020-07-29,3215.629883,3239.409912,3218.439941,3258.439941


In [None]:
(dates, X, y) = df_to_date_X_y(windowed_yr)
dates.shape, X.shape, y.shape

((754,), (754, 3, 1), (754,))

In [None]:
q_80 = int(len(dates)*.8)
dates_train, X_train, y_train = dates[:q_80], X[:q_80], y[:q_80]
dates_test, X_test, y_test = dates[q_80:], X[q_80:], y[q_80:]

In [None]:
compile_fit_model(model, X_train, y_train)

Epoch 1/112
Epoch 2/112
Epoch 3/112
Epoch 4/112
Epoch 5/112
Epoch 6/112
Epoch 7/112
Epoch 8/112
Epoch 9/112
Epoch 10/112
Epoch 11/112
Epoch 12/112
Epoch 13/112
Epoch 14/112
Epoch 15/112
Epoch 16/112
Epoch 17/112
Epoch 18/112
Epoch 19/112
Epoch 20/112
Epoch 21/112
Epoch 22/112
Epoch 23/112
Epoch 24/112
Epoch 25/112
Epoch 26/112
Epoch 27/112
Epoch 28/112
Epoch 29/112
Epoch 30/112
Epoch 31/112
Epoch 32/112
Epoch 33/112
Epoch 34/112
Epoch 35/112
Epoch 36/112
Epoch 37/112
Epoch 38/112
Epoch 39/112
Epoch 40/112
Epoch 41/112
Epoch 42/112
Epoch 43/112
Epoch 44/112
Epoch 45/112
Epoch 46/112
Epoch 47/112
Epoch 48/112
Epoch 49/112
Epoch 50/112
Epoch 51/112
Epoch 52/112
Epoch 53/112
Epoch 54/112
Epoch 55/112
Epoch 56/112
Epoch 57/112
Epoch 58/112
Epoch 59/112
Epoch 60/112
Epoch 61/112
Epoch 62/112
Epoch 63/112
Epoch 64/112
Epoch 65/112
Epoch 66/112
Epoch 67/112
Epoch 68/112
Epoch 69/112
Epoch 70/112
Epoch 71/112
Epoch 72/112
Epoch 73/112
Epoch 74/112
Epoch 75/112
Epoch 76/112
Epoch 77/112
Epoch 78

In [None]:
plot_train(X_train, y_train)



In [None]:
plot_test(X_test, y_test)



### Long-Term Recursive Prediction

In [None]:
recursive_predictions = []
for target_date in dates_test:
    last_window = deepcopy(X_train[-1])
    next_pred = model.predict(np.array([last_window])).flatten()
    recursive_predictions.append(next_pred)
    last_window[-1] = next_pred



In [None]:
def plot_recursive(y_test, recursive_predictions):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=dates_test, y=recursive_predictions, mode="lines", name="Testing Predictions"))
    fig.add_trace(go.Scatter(x=dates_test, y=y_test, mode="lines", name="Testing Observations"))
    fig.update_layout(xaxis_title="Dates", yaxis_title="Value")
    fig.show()

In [None]:
plot_recursive(y_test, recursive_predictions)

In [None]:
# yea, that didnt't work... looks like it's hard to predict stocks
# let's try atleast predicting if the stock will go up or down

In [None]:
model = RandomForestClassifier(n_estimators=200, min_samples_split=50, random_state=1)
predictions_new = backtest(sp500, model, new_predictors)

In [None]:
predictions_new["Target"].value_counts

<bound method IndexOpsMixin.value_counts of Date
2003-11-14 00:00:00-05:00    0
2003-11-17 00:00:00-05:00    0
2003-11-18 00:00:00-05:00    1
2003-11-19 00:00:00-05:00    0
2003-11-20 00:00:00-05:00    1
                            ..
2023-07-14 00:00:00-04:00    1
2023-07-17 00:00:00-04:00    1
2023-07-18 00:00:00-04:00    1
2023-07-19 00:00:00-04:00    0
2023-07-20 00:00:00-04:00    1
Name: Target, Length: 4952, dtype: int64>

In [None]:
precision_score(predictions_new["Target"], predictions_new["Predictions"])

0.5691554467564259

In [2]:
# this model will be trained on historical data of the S&P 500 Index as a general indicator of the market
# NOTE: data is up to 2020, this is so model is more generalizable without data from COVID years
sp500_og = yf.Ticker("^GSPC")
sp500_og = sp500_og.history(period="max")

In [3]:
sp500_og = sp500_og.drop(["Dividends", "Stock Splits"], axis=1)
sp500_og = sp500_og.loc["1990-01-01":].copy() # only use data after 1990
sp500_og.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1990-01-02 00:00:00-05:00,353.399994,359.690002,351.980011,359.690002,162070000
1990-01-03 00:00:00-05:00,359.690002,360.589996,357.890015,358.76001,192330000
1990-01-04 00:00:00-05:00,358.76001,358.76001,352.890015,355.670013,177000000
1990-01-05 00:00:00-05:00,355.670013,355.670013,351.350006,352.200012,158530000
1990-01-08 00:00:00-05:00,352.200012,354.23999,350.540009,353.790009,140110000


In [4]:
px.line(sp500_og, y="Close")

## Stock Price Movement Prediction 

### Preparing Data

In [5]:
sp500 = sp500_og.copy()
sp500["Tomorrow"] = sp500["Close"].shift(-1)
sp500["Target"] = (sp500["Tomorrow"] > sp500["Close"]).astype(int) # we want to predict if price goes up or down tommorow

In [6]:
# X_train, X_test, y_train, y_test = train_test_split(x=pd.get_dummies(sp500), y=sp500["Target"], test_size=.2)

### Creating and Testing First Model

In [7]:
model = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=1) # by locking random state we can more easily control our model
# NOTE: this data will be re-split later, this is just a start
train = sp500.iloc[:100] # all but last 100 rows
test = sp500.iloc[-100:] # only last 100 rows
predictors = ["Close", "Volume", "Open", "High", "Low"]

In [8]:
model.fit(X=train[predictors], y=train["Target"])
preds = model.predict(test[predictors])
preds = pd.Series(preds, index=test.index, name="Predictions")
precision_score(test["Target"], preds)
combined = pd.concat([test["Target"], preds], axis=1)

In [9]:
px.line(combined)

### Backtesting Model

In [10]:
def predict(train, test, predictors, model):
    model.fit(X=train[predictors], y=train["Target"])
    preds = model.predict_proba(test[predictors])[:,1]
    preds[preds >= .6] = 1
    preds[preds < .6] = 0
    preds = pd.Series(preds, index=test.index, name="Predictions")
    precision_score(test["Target"], preds)
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [11]:
pred1 = predict(train, test, predictors, model)
px.line(pred1)


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



In [12]:
def backtest(data, model, predictors, start=2500, step=250): # start with 10 years and predict one year at a time
    all_preds = []
    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i+step)].copy()
        preds = predict(train, test, predictors, model)
        all_preds.append(preds)
    return pd.concat(all_preds)

In [13]:
predictions = backtest(sp500, model, predictors)


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



In [14]:
predictions["Predictions"].value_counts()

Predictions
0.0    5044
1.0     909
Name: count, dtype: int64

In [15]:
precision_score(predictions["Target"], predictions["Predictions"]) # model is not very good right now :(

0.5324532453245324

#### Adding Predictors

In [16]:
horizons = [2,5,60,250,1000] # look at means at each of these horizons
new_predictors = []
for h in horizons:
    rolling_avgs = sp500.rolling(h).mean()
    
    ratio_col = f"Close_Ratio_{h}"
    sp500[ratio_col] = sp500["Close"] / rolling_avgs["Close"]
    trend_col = f"Trend_{h}"
    sp500[trend_col] = sp500.shift(1).rolling(h).sum()["Target"]

    new_predictors += [ratio_col, trend_col]

In [17]:
sp500 = sp500.dropna()
sp500.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomorrow,Target,Close_Ratio_2,Trend_2,Close_Ratio_5,Trend_5,Close_Ratio_60,Trend_60,Close_Ratio_250,Trend_250,Close_Ratio_1000,Trend_1000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2023-07-14 00:00:00-04:00,4514.609863,4527.759766,4499.560059,4505.419922,3647450000,4522.790039,1,0.999488,1.0,1.008537,4.0,1.058002,32.0,1.117622,120.0,1.184298,532.0
2023-07-17 00:00:00-04:00,4508.859863,4532.850098,4504.899902,4522.790039,3538240000,4554.97998,1,1.001924,1.0,1.007318,4.0,1.060553,33.0,1.121161,121.0,1.18839,533.0
2023-07-18 00:00:00-04:00,4521.779785,4562.299805,4514.589844,4554.97998,4090010000,4565.720215,1,1.003546,2.0,1.009285,4.0,1.066329,34.0,1.128448,121.0,1.196367,533.0
2023-07-19 00:00:00-04:00,4563.870117,4578.430176,4557.47998,4565.720215,4115670000,4534.870117,0,1.001178,2.0,1.007487,4.0,1.067044,34.0,1.130431,121.0,1.198702,534.0
2023-07-20 00:00:00-04:00,4554.379883,4564.740234,4527.560059,4534.870117,3761770000,4536.339844,1,0.99661,1.0,0.999584,3.0,1.058194,33.0,1.122197,120.0,1.190127,534.0


### Updating Model and Re-Predicting

In [None]:
model = RandomForestClassifier(n_estimators=200, min_samples_split=50, random_state=1)
predictions_new = backtest(sp500, model, new_predictors)

In [None]:
predictions_new["Target"].value_counts

In [None]:
precision_score(predictions_new["Target"], predictions_new["Predictions"])

In [None]:
# this model is pretty decent at predicting the movements of the stock market