In [7]:
import yfinance as yf
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

#Gathers monthly prices and returns for stocks and SPY 
target_stocks = ['AAPL', 'MSFT', 'AMZN', 'GOOGL', 'NVDA', 'META', 'TSLA', 'JPM', 'V', 'NFLX', 'COST', 'MA', 'WMT', 'BAC', 'UNH', 'KO', 'ORCL', 'CRM', 'CSCO', 'PLTR', 'WFC', 'DIS', 'T', 'UBER']
spy = ["SPY"]
stock_data = yf.download(target_stocks + spy, start="2015-01-01", end="2025-01-01", auto_adjust=True, progress=False)["Close"]
monthly_prices = stock_data.resample("ME").last()
monthly_returns = monthly_prices.pct_change()
stock_ret = monthly_returns.drop(columns='SPY')
spy_ret = monthly_returns['SPY']

momentum = stock_ret.rolling(3).mean()
volatility = stock_ret.rolling(3).std()
sharpe = momentum / volatility #Sharpe estimate doesn't subtract RFR
excess_ret = stock_ret.sub(spy_ret, axis=0)

df = pd.concat([momentum.stack().rename("mom"), 
                 volatility.stack().rename('vol'), 
                 sharpe.stack().rename('sharpe'), 
                 excess_ret.stack().rename('excess_ret')], axis=1).reset_index().rename(columns={'level1':'Ticker'})
df['target'] = df.groupby('Ticker')['excess_ret'].shift(-1) #Shows next months excess return for each stock
df = df.dropna()

train_cutoff = '2021-01-01'
train_df = df[df['Date'] < train_cutoff]
test_df = df[df['Date'] >= train_cutoff]

#Separate training and testing periods
X_train = train_df[['mom', 'vol', 'sharpe']]
Y_train = train_df['target']
X_test = test_df[['mom', 'vol', 'sharpe']]
Y_test = test_df['target']

model = xgb.XGBRegressor(
    n_estimators = 300, #200-400
    max_depth = 4, #3-5
    learning_rate = 0.05, #0.02-0.05
    subsample = 0.8, #0.7-0.9
    colsample_bytree = 0.8, #0.8-1
    objective = 'reg:squarederror',
    eval_metric = 'rmse',
    random_state = 21
)
model.fit(X_train, Y_train)

test_df = test_df.copy() #Prevents from writing into view
test_df['pred'] = model.predict(X_test)
test_df['rank'] = test_df.groupby('Date')['pred'].rank(ascending=False)
test_df['quintile'] = pd.qcut(test_df['rank'], 5, labels=False) + 1

performance = test_df.groupby('quintile')['target'].mean()
print(performance)

top_stocks = (
    test_df[test_df['rank'] <= 10].sort_values(['Date', 'rank']).loc[:, ['Date','Ticker', 'pred', 'target', 'rank']]
)
print(top_stocks.head(10))

quintile
1    0.014982
2    0.002966
3    0.003481
4    0.008422
5    0.004202
Name: target, dtype: float64
           Date Ticker      pred    target  rank
1554 2021-01-31   TSLA  0.365247 -0.176546   1.0
1552 2021-01-31   PLTR  0.100699 -0.348442   2.0
1558 2021-01-31    WFC  0.040368  0.186534   3.0
1539 2021-01-31   COST  0.039055 -0.086758   4.0
1544 2021-01-31    JPM  0.037930  0.115973   5.0
1555 2021-01-31   UBER  0.034165 -0.011705   6.0
1542 2021-01-31    DIS  0.031330  0.096295   7.0
1551 2021-01-31   ORCL  0.014131  0.039711   8.0
1553 2021-01-31      T  0.013825 -0.053652   9.0
1536 2021-01-31   AAPL  0.013800 -0.107517  10.0
