In [11]:
# In this tutorial, we'll learn how to predict tomorrow's S&P 500 index price using historical data. We'll also learn how to avoid common pitfalls that cause most stock price models to overfit to past data and then fail in the real world.

# We'll start by downloading S&P 500 prices using a package called yfinance. Then, we'll clean up the data with pandas, and get it ready for machine learning.

# We'll train a random forest model and make predictions using backtesting.  Then, we'll improve the model by adding predictors.  We'll end with next steps you can use to improve the model on your own.

In [12]:
import yfinance as yf
# Fetch the complete price history for the "^GSPC" ticker in one chained call,
# so `sp500` only ever refers to the resulting DataFrame.
sp500 = yf.Ticker("^GSPC").history(period="max")
sp500

YFRateLimitError: Too Many Requests. Rate limited. Try after a while.

In [ ]:
# The index is what lets us select and slice rows of the DataFrame easily.
sp500.index

In [ ]:
# Plot the "Close" column against the frame's index.  A title and a y-axis label
# let the figure stand on its own, and the trailing semicolon keeps the Axes repr
# out of the cell output.
sp500.plot.line(
    y="Close",
    use_index=True,
    title="S&P 500 closing price",
    ylabel="Close",
);

In [ ]:
# These columns are appropriate for individual stocks, not an index, so we don't need them.
# drop(..., errors="ignore") returns a new frame and is safe to re-run, whereas `del`
# raises KeyError the second time the cell is executed (or if a column is absent).
sp500 = sp500.drop(columns=["Dividends", "Stock Splits"], errors="ignore")

In [ ]:
# Some people try to predict the exact stock price, like whether it will be $17 or $18 tomorrow. But even if you're good at that, you can still lose money. What really matters when trading is not the exact price, but whether the price will go up or down. You could predict the price closely and still be wrong about the direction, which is what actually affects profits.

In [ ]:
# Pair every row with the next row's close: shifting by -1 pulls each "Close"
# value up one row, which leaves NaN in the final row.
sp500["Tomorrow"] = sp500["Close"].shift(periods=-1)
sp500

In [ ]:
# Target answers "is tomorrow's price greater than today's price?", converted to
# int (1 = yes, 0 = no) for ML.
#
# Rows without a "Tomorrow" value (the final row, after the -1 shift) cannot be
# labelled: NaN > Close evaluates to False, which would silently mark them as a
# down day.  Drop them first so no made-up label reaches training or scoring.
# .copy() keeps the later column assignment free of SettingWithCopyWarning.
sp500 = sp500.dropna(subset=["Tomorrow"]).copy()
sp500["Target"] = (sp500["Tomorrow"] > sp500["Close"]).astype(int)
sp500

In [ ]:
# With stock market data, very old history may not be useful because the market may have shifted fundamentally since then, so we remove all the historical data before 1990.
sp500 = sp500.loc["1990-01-01":].copy()
# Without .copy(), you can sometimes get a pandas SettingWithCopyWarning when you subset a DataFrame and later assign back to it.

In [None]:
# A RandomForestClassifier is a great default model for most ML because:
#   1) random forests train a bunch of individual decision trees with randomized
#      parameters and then average the results of those trees, so they are
#      resistant to overfitting
#   2) they run relatively quickly
#   3) they can pick up non-linear tendencies in the data
#
# n_estimators      - number of individual decision trees to train.  Higher is generally
#                     more accurate (up to a point), but we set it low so the program
#                     runs quickly.
# min_samples_split - protects against overfitting; decision trees tend to overfit if
#                     they are built too deeply.  The higher we set it, the less accurate
#                     the model will be, but the less it will overfit.
# random_state      - seeds the random numbers so they come in the same sequence every
#                     run.  Re-running the model gives the same results, which helps when
#                     you are improving your model and want to be sure a change in error
#                     came from something you did rather than from randomness.
from sklearn.ensemble import RandomForestClassifier

predictors = ["Open", "High", "Low", "Close", "Volume"]

# The last 100 rows are held out for testing; every row before them is training data.
train, test = sp500.iloc[:-100], sp500.iloc[-100:]

model = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=1)
model.fit(train[predictors], train["Target"])

In [ ]:
from sklearn.metrics import precision_score
import pandas as pd

# Predict on the held-out rows and wrap the raw array in a Series that shares the
# test index.  The Series is named so it gets a readable "Predictions" label when
# concatenated with the targets or plotted (an unnamed Series shows up as column 0).
predictions = pd.Series(
    model.predict(test[predictors]),
    index=test.index,
    name="Predictions",
)

In [ ]:
# Precision: of the rows the model predicted as 1, the fraction whose Target is really 1.
precision_score(y_true=test["Target"], y_pred=predictions)

In [ ]:
# Line up the actual targets and the predictions column-wise, then plot both.
combined = pd.concat(objs=[test["Target"], predictions], axis="columns")
combined.plot()

In [ ]:
# Building a backtesting system
def predict(train, test, predictors, model):
    """Fit ``model`` on ``train`` and return targets next to predictions for ``test``.

    Args:
        train: DataFrame holding the ``predictors`` columns and a "Target" column.
        test: DataFrame with the same columns; its index is reused for the output.
        predictors: list of column names fed to the model.
        model: object exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        DataFrame indexed like ``test`` with two columns: "Target" (the actual
        values) and "Predictions" (the model output).
    """
    model.fit(train[predictors], train["Target"])
    predicted = pd.Series(
        model.predict(test[predictors]),
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test["Target"], predicted], axis=1)

In [ ]:
def backtest(data, model, predictors, start=2500, step=250):
    """Walk forward through ``data``, refitting and predicting one window at a time.

    For each window the model is trained on every row before position ``i`` and
    then used to predict rows ``i`` to ``i + step``; ``i`` starts at ``start`` and
    advances by ``step``.  The fitting and predicting is delegated to the
    notebook-level ``predict`` function.

    Args:
        data: DataFrame with the predictor columns and "Target"
            (rows are assumed to be in chronological order).
        model: estimator handed to ``predict`` for every window.
        predictors: list of column names used as model inputs.
        start: number of rows used to train the first model.  A trading year has
            around 250 days, so the default of 2500 means "take about 10 years
            of data and train the first model with it".
        step: number of rows predicted per window before the training set grows;
            the default of 250 is about one trading year.

    Returns:
        The per-window frames returned by ``predict``, concatenated in order.

    Raises:
        ValueError: if ``step`` is not positive, or if ``data`` has no rows past
            ``start`` (there would be nothing to predict).
    """
    n_rows = data.shape[0]
    if step < 1:
        raise ValueError(f"step must be a positive number of rows, got {step}")
    if n_rows <= start:
        # Without this guard the loop never runs and pd.concat([]) fails with a
        # much less helpful "No objects to concatenate".
        raise ValueError(
            f"backtest needs more than start={start} rows to have anything to "
            f"predict, but data has only {n_rows}"
        )

    all_predictions = []
    for i in range(start, n_rows, step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i:(i + step)].copy()
        all_predictions.append(predict(train, test, predictors, model))
    return pd.concat(all_predictions)

In [ ]:
# Backtest the first model with the original price/volume predictors.
preds = backtest(data=sp500, model=model, predictors=predictors)

In [ ]:
# Evaluating the error of the predictions.
# A notebook cell only renders its last expression, so display() is used to show
# all three results instead of silently discarding the first two.
display(
    # how many rows were predicted as each class
    preds["Predictions"].value_counts(),
    # precision of the backtested predictions
    precision_score(preds["Target"], preds["Predictions"]),
    # share of each actual Target value, as a baseline to compare precision against
    preds["Target"].value_counts() / preds.shape[0],
)

In [ ]:
# Build two extra predictors per horizon (a window length in rows):
#   Close_ratio_<h> - today's close divided by the mean close over the last h rows
#   Trend_<h>       - sum of Target over the h rows *before* today
# The rolling statistics are computed on just the one column each feature needs,
# rather than rolling over the whole frame and then picking a single column.
horizons = [2, 5, 60, 250, 1000]
new_predictors = []
for horizon in horizons:
    ratio_column = f"Close_ratio_{horizon}"
    sp500[ratio_column] = sp500["Close"] / sp500["Close"].rolling(horizon).mean()

    trend_column = f"Trend_{horizon}"
    # shift(1) so a row's trend only counts earlier rows' targets, never its own
    sp500[trend_column] = sp500["Target"].shift(1).rolling(horizon).sum()

    new_predictors += [ratio_column, trend_column]

In [ ]:
# Inspect the frame with the new ratio/trend columns added by the loop over horizons.
sp500

In [ ]:
# Remove every row that has a NaN in any column, e.g. the leading rows where a
# rolling window does not yet have enough history.
sp500 = sp500.dropna(axis=0, how="any")
sp500

In [ ]:
# Improving our model: twice as many trees and a smaller min_samples_split than
# the first model, with the same random seed.
model = RandomForestClassifier(
    random_state=1,
    n_estimators=200,
    min_samples_split=50,
)

In [ ]:
def predict(train, test, predictors, model, threshold=0.6):
    """Fit ``model`` and predict 1 only when it is confident enough.

    This redefinition replaces the earlier ``predict`` in the notebook, so
    ``backtest`` (which looks the function up by name) uses this version from
    here on.  Instead of the model's default class decision, a row is predicted
    as 1 only if the probability in column 1 of ``predict_proba`` reaches
    ``threshold``.

    Args:
        train: DataFrame holding the ``predictors`` columns and a "Target" column.
        test: DataFrame with the same columns; its index is reused for the output.
        predictors: list of column names fed to the model.
        model: object exposing ``fit(X, y)`` and ``predict_proba(X)``.
        threshold: minimum probability required to predict 1.  Defaults to 0.6,
            the value that was previously hard-coded.

    Returns:
        DataFrame indexed like ``test`` with columns "Target" (actual values)
        and "Predictions" (1.0 / 0.0 floats).
    """
    model.fit(train[predictors], train["Target"])
    # Column 1 is taken as the probability of class 1
    # (assumes the model's classes are ordered [0, 1]).
    up_probability = model.predict_proba(test[predictors])[:, 1]
    # One vectorised comparison; the array returned by the model is not modified.
    predictions = (up_probability >= threshold).astype(float)
    predictions = pd.Series(predictions, index=test.index, name="Predictions")
    return pd.concat([test["Target"], predictions], axis=1)

In [ ]:
# Backtest the improved model using only the new ratio/trend predictors.
predictions = backtest(sp500, model, new_predictors)

# A notebook cell only renders its last expression, so display() is used to show
# both the prediction counts and the precision.
display(
    predictions["Predictions"].value_counts(),
    precision_score(predictions["Target"], predictions["Predictions"]),
)