# About this notebook
This notebook looks at SPY ticker values from a dataset and uses various ML models to train and predict the future prices

## Install required libraries

In [None]:
# !pip install scikit-learn
# !pip install xgboost

# Model 1 - Random Forest Classifier
* Install scikit-learn for this one
* Use the RandomForestClassifier model from sklearn.ensemble
* Load the CSV file of the ticker as a data frame and index it by dates
* Define required features:
    * Return - fractional change of stock values at close from previous day
    * Volatility - Rolling standard deviation of Returns; rolling window set to 10
    * Momentum - Rate of change of stock from n days ago
    * SMA_10 - 10 day rolling average of closing prices
    * SMA_50 - 50 day rolling average of closing prices
    * SMA ratio - Ratio of differences rolling means (i.e., ratio of (mean_10 - mean_50) to mean_50)
* Define the target as follows:
    * Set Future Return (5) = (Closing price at [n + 5 days] - Closing price at [n days])/Closing price at [n days]
    * Set Signal as the binary classification target: if the future return is > 0, price is expected to go up, signal is 1; else the signal is 0
* Train and optimize the Random Forest Classifier
* Prediction results to be compared with XGBoost model

In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Load SPY data
spy_path = "../data/spy.csv"
# Read data and index it by dates
df = pd.read_csv(spy_path, parse_dates=["Date"], index_col="Date")

# Basic preprocessing
df = df[["Close", "Volume", "Open"]].dropna()

# Assign relevant features:
# Return - fractional change of stock values at close from previous day
# Volatility - Rolling standard deviation of Returns; rolling window set to 10
# Momentum - Rate of change of stock from 10 days ago
# SMA_10 - 10 day rolling average of closing prices
# SMA_50 - 50 day rolling average of closing prices
# SMA ratio - Ratio of differences rolling means (i.e., ratio of (mean_10 - mean_50) to mean_50)
df["Return"] = df["Close"].pct_change()
df["Volatility"] = df["Return"].rolling(window=10).std()
df["Momentum"] = df["Close"] / df["Close"].shift(10) - 1
df["SMA_10"] = df["Close"].rolling(window=10).mean()
df["SMA_50"] = df["Close"].rolling(window=50).mean()
# df["SMA_ratio"] = df["SMA_10"] / df["SMA_50"] - 1
df = df.dropna()

# Label generation: Predict if return over next 5 days is positive
df["FutureReturn5"] = df["Close"].shift(-5) / df["Close"] - 1
df["Signal"] = (df["FutureReturn5"] > 0).astype(int)
df = df.dropna()

# Features and target
# features = ["Return", "Volatility", "Momentum", "SMA_ratio"]
features = ["Return", "Volatility", "Momentum", ]
X = df[features]
y = df["Signal"]


# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Metrics
# report = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

# Confusion matrix
# conf_matrix = confusion_matrix(y_test, y_pred)
# # Feature importance
feature_importances = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)
# feature_importances
# import ace_tools as tools; tools.display_dataframe_to_user(name="Classification Report", dataframe=pd.DataFrame(report).T)

# (conf_matrix, feature_importances)


              precision    recall  f1-score   support

           0       0.41      0.28      0.33       138
           1       0.63      0.75      0.68       224

    accuracy                           0.57       362
   macro avg       0.52      0.51      0.51       362
weighted avg       0.54      0.57      0.55       362



# Model 2 - XGBoost
* Install the xgboost library

In [68]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Reuse your features and target
# features = ["Return", "Volatility", "Momentum", "SMA_ratio"]
features = ["Return", "Volatility", "Momentum", ]
X = df[features]
y = df["Signal"]

# Train-test split (keep it consistent)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

# Instantiate XGBoost classifier
model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.34      0.17      0.23       138
           1       0.61      0.79      0.69       224

    accuracy                           0.56       362
   macro avg       0.47      0.48      0.46       362
weighted avg       0.51      0.56      0.51       362



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
