In [3]:
!pip install yfinance


Collecting yfinance
  Obtaining dependency information for yfinance from https://files.pythonhosted.org/packages/e6/b3/388ab967a387cc92926f70e97688dd9a7189b29a0773db815ffc5289e2b5/yfinance-0.2.31-py2.py3-none-any.whl.metadata
  Downloading yfinance-0.2.31-py2.py3-none-any.whl.metadata (11 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.3.8-py311-none-any.whl (14 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.17.0.tar.gz (2.9 MB)
     ---------------------------------------- 0.0/2.9 MB ? eta -:--:--
     - -------------------------------------- 0.1/2.9 MB 2.4 MB/s eta 0:00:02
     ------- -------------------------------- 0.5/2.9 MB 5.6 MB/s eta 0:00:01
     -------------- ------------------------- 1.1/2.9 MB 7.6 MB/s eta 0:00:01
     ------------------------ --------------- 1.8/2.9 MB 9.4 MB/s eta 0:00:01
     ----------------------

In [41]:
import yfinance as yf
import pandas as pd

In [40]:
snp500 = yf.Ticker("^GSPC")

In [42]:
snp500 = snp500.history(period="max")

In [43]:
snp500.drop(columns=["Dividends", "Stock Splits"], inplace=True)
snp500["Next Day"] = snp500["Close"].shift(-1)
snp500["Price Change"] = (snp500["Next Day"] > snp500["Close"]).astype(int)

In [44]:
snp500 = snp500.loc["2000-01-01":]

In [45]:
from sklearn.ensemble import RandomForestClassifier 
clf = RandomForestClassifier(n_estimators=1000, min_samples_split=100, random_state = 1)

testing_data = snp500[-100:]
training_data = snp500[:-100]

In [46]:
predictors = ["Open", "High", "Low", "Close", "Volume"]
clf.fit(training_data[predictors], training_data["Price Change"])

In [47]:
from sklearn.metrics import precision_score
preds = pd.Series(clf.predict(testing_data[predictors]), index=testing_data.index)

In [48]:
precision_score(testing_data["Price Change"], preds)

0.49295774647887325

In [49]:
def backtesting_function(data, classifier, predictors, start=2500, step=250):
    predictions = []

    for i in range(start, len(data), step):
        training_data = data.iloc[0:i].copy()
        testing_data = data.iloc[i:i+step].copy()
        predictions.append(prediction_function(training_data, testing_data, predictors, classifier))

    return pd.concat(predictions)

In [50]:
def prediction_function(training_data, testing_data, predictors, classifier):
    classifier.fit(training_data[predictors], training_data["Price Change"])
    predictions = classifier.predict(testing_data[predictors])
    predictions = pd.Series(predictions, index=testing_data.index, name="Predicted Price Change")
    result = pd.concat([testing_data["Price Change"], predictions], axis=1)
    return result


In [51]:
prediction_values = backtesting_function(snp500, clf, predictors)

In [52]:
precision_score(prediction_values["Predicted Price Change"], prediction_values["Price Change"])

0.5121052631578947

In [53]:
past_days_to_train = [2, 5, 60, 250, 1000] #2 days ago, 1 week ago, 12 weeks ago, 50 weeks ago, 200 weeks ago
new_predictors = []
for days in past_days_to_train:
    rolling_average = snp500.rolling(days).mean()
    ratio_column = f"Close_Ratio_{rolling_average}"
    snp500[ratio_column] = snp500["Close"] / rolling_average["Close"]
    
    trend_column = f"Trend_{days}"
    snp500[trend_column] = snp500.shift(1).rolling(days).sum()["Price Change"]
    
    new_predictors+= [ratio_column, trend_column]

In [54]:
snp500