In [None]:
import yfinance as yf

In [None]:
sp500 = yf.Ticker("^GSPC")

sp500 = sp500.history(period="max")


In [None]:
sp500

In [None]:
sp500.index


In [None]:
sp500.plot.line(y="Close", use_index=True)

In [None]:
sp500.drop(columns=["Dividends", "Stock Splits"], inplace=True)

In [None]:
sp500["Tomorrow"] = sp500["Close"].shift(-1)
sp500

In [None]:
sp500["Target"] = (sp500["Tomorrow"] > sp500["Close"]).astype(int)
sp500

In [None]:
sp500 = sp500.loc["1990-01-01":].copy()
sp500

In [None]:
train = sp500.iloc[:-100]
test = sp500.iloc[-100:]
predictors = ["Close", "Volume", "Open", "High", "Low"]

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=1)
model.fit(train[predictors], train["Target"])

In [None]:
from sklearn.metrics import precision_score
import pandas as pd

preds = model.predict(test[predictors])
preds = pd.Series(preds, index=test.index)

In [None]:
precision = precision_score(test["Target"], preds, zero_division=0)
print(f"Precision Score: {precision}")

In [None]:
combined = pd.concat([test["Target"], preds.rename("Predictions")], axis=1)
combined.plot()

In [None]:
def predict(train, test, predictors, model):
    model.fit(train[predictors], train["Target"])
    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [106]:
def backtest(data, model, predictors, start=2500, step=250):
    all_predictions = []
    
    for i in range(start, data.shape[0], step):
        train = data.iloc[0:i].copy()
        test = data.iloc[i: (i+step)].copy()
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)
    return pd.concat(all_predictions)

In [107]:
predictions = backtest(sp500, model, predictors)

In [108]:
predictions["Predictions"].value_counts()

Predictions
0    3588
1    2596
Name: count, dtype: int64

In [110]:
precision_score(predictions["Target"], predictions["Predictions"])

0.5288906009244992

In [111]:
predictions["Target"].value_counts() / predictions.shape[0]


Target
1    0.534929
0    0.465071
Name: count, dtype: float64

In [116]:
horizons = [2, 5, 60, 250, 1000]
new_predictors = []

for horizon in horizons:
    rolling_averages = sp500.rolling(horizon).mean()

    ratio_column = f"Close_Ratio_{horizon}"
    sp500[ratio_column] = sp500["Close"] / rolling_averages["Close"]

    trend_column = f"Trend_{horizon}"
    sp500[trend_column] = sp500.shift(1).rolling(horizon).sum()["Target"]

    new_predictors += [ratio_column, trend_column]


In [117]:
sp500

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Tomorrow,Target,Close_Ratio_2,Trend_2,Close_Ratio_5,Trend_5,Close_Ratio_60,Trend_60,Close_Ratio_250,Trend_250,Close_Ratio_1000,Trend_1000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1990-01-02 00:00:00-05:00,353.399994,359.690002,351.980011,359.690002,162070000,358.760010,0,,,,,,,,,,
1990-01-03 00:00:00-05:00,359.690002,360.589996,357.890015,358.760010,192330000,355.670013,0,0.998706,,,,,,,,,
1990-01-04 00:00:00-05:00,358.760010,358.760010,352.890015,355.670013,177000000,352.200012,0,0.995675,0.0,,,,,,,,
1990-01-05 00:00:00-05:00,355.670013,355.670013,351.350006,352.200012,158530000,353.790009,1,0.995098,0.0,,,,,,,,
1990-01-08 00:00:00-05:00,352.200012,354.239990,350.540009,353.790009,140110000,349.619995,0,1.002252,1.0,0.993731,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-14 00:00:00-04:00,5424.080078,5432.390137,5403.750000,5431.600098,3438650000,5473.229980,1,0.999803,1.0,1.005015,4.0,1.042196,33.0,1.143578,139.0,1.288757,531.0
2024-06-17 00:00:00-04:00,5431.109863,5488.500000,5420.399902,5473.229980,3447840000,5487.029785,1,1.003818,1.0,1.008521,4.0,1.049406,33.0,1.151291,140.0,1.297899,531.0
2024-06-18 00:00:00-04:00,5476.149902,5490.379883,5471.319824,5487.029785,3544330000,5473.169922,0,1.001259,2.0,1.006919,4.0,1.051203,34.0,1.153106,141.0,1.300407,532.0
2024-06-20 00:00:00-04:00,5499.990234,5505.529785,5455.560059,5473.169922,3847060000,5464.620117,0,0.998735,1.0,1.002457,3.0,1.047694,34.0,1.149140,140.0,1.296379,531.0


In [118]:
model = RandomForestClassifier(n_estimators=200, min_samples_split=50, random_state=1)

In [119]:
def predict(train, test, predictors, model):
    model.fit(train[predictors], train["Target"])
    preds = model.predict_proba(test[predictors])[:,1]
    preds[preds >=.6]= 1
    preds[preds<.6]=0
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [120]:
predictions = backtest(sp500, model, new_predictors)

In [121]:
predictions["Predictions"].value_counts()

Predictions
0.0    5166
1.0    1018
Name: count, dtype: int64

In [123]:
precision_score(predictions["Target"], predictions["Predictions"] )

0.5432220039292731