<a href="https://colab.research.google.com/github/Radyko/MLQuantStrategy/blob/main/AlgorithminTradingUnsupervisedLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas_ta

In [12]:
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
warnings.filterwarnings('ignore')

# Current S&P 500 constituents: the first table on the Wikipedia page.
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

# Swap the dot in share-class tickers for a dash before querying Yahoo.
# regex=False treats '.' as a literal dot; interpreted as a regular
# expression it would match (and replace) every character of the symbol.
sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-', regex=False)

symbols_list = sp500['Symbol'].unique().tolist()

end_date = '2024-08-01'

# Look back 8 * 365 calendar days from the end date.
start_date = pd.to_datetime(end_date) - pd.DateOffset(days=365*8)

# auto_adjust=False keeps the separate 'Adj Close' column that the feature
# calculations further down read; the default of this flag has differed
# between yfinance releases, so it is pinned explicitly.
# .stack() moves the ticker level of the columns into the row index.
df = yf.download(tickers=symbols_list,
                 start=start_date,
                 end=end_date,
                 auto_adjust=False).stack()

df.index.names = ['date', 'ticker']

df.columns = df.columns.str.lower()

df


[*********************100%%**********************]  503 of 503 completed


Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-08-03,A,44.222771,47.090000,47.180000,46.900002,47.099998,1508900.0
2016-08-03,AAL,32.271374,33.480000,33.959999,33.169998,33.470001,8254200.0
2016-08-03,AAPL,24.263586,26.447500,26.459999,26.192499,26.202499,120810400.0
2016-08-03,ABBV,47.158062,66.570000,67.000000,66.220001,66.809998,6355100.0
2016-08-03,ABT,38.829514,44.950001,45.090000,44.830002,45.070000,7783100.0
...,...,...,...,...,...,...,...
2024-07-31,XYL,133.500000,133.500000,135.429993,132.600006,134.880005,2439500.0
2024-07-31,YUM,132.830002,132.830002,134.029999,131.610001,132.759995,1943100.0
2024-07-31,ZBH,111.349998,111.349998,113.080002,110.120003,111.510002,1592800.0
2024-07-31,ZBRA,351.190002,351.190002,359.690002,348.989990,357.779999,634400.0


# 2. Calculate features and technical indicators for each stock.


*   Garman-Klass Volatility
*   RSI
*   Bollinger Bands
*   ATR
*   MACD
*   Dollar Volume

Garman-Klass volatility is an approximation used to measure the intraday volatility of a given asset.

$$\text{Garman-Klass Volatility} = \frac{\left(\ln(\text{High}) - \ln(\text{Low})\right)^2}{2} - \left(2\ln(2) - 1\right)\left(\ln(\text{Adj Close}) - \ln(\text{Open})\right)^2$$


In [13]:
# Garman-Klass volatility: half the squared log high/low range minus
# (2*ln(2) - 1) times the squared log open-to-close move.
# NOTE(review): the second term pairs 'adj close' with the unadjusted 'open';
# confirm whether the unadjusted 'close' was intended here.
log_range = np.log(df['high']) - np.log(df['low'])
log_move = np.log(df['adj close']) - np.log(df['open'])
df['garman_klass_vol'] = (log_range**2)/2 - (2*np.log(2)-1)*(log_move**2)

# 20-period RSI, computed separately for each ticker (index level 1).
df['rsi'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))

def calculate_bbands(x):
    """Return 20-period Bollinger Bands of ``log1p(x)``.

    ``pandas_ta.bbands`` may hand back ``None``; callers slice the result
    with ``.iloc``, which cannot work on ``None``, so that case is mapped
    to an empty DataFrame instead.
    """
    log_prices = np.log1p(pd.Series(x))
    bands = pandas_ta.bbands(close=log_prices, length=20)
    if bands is None:
        return pd.DataFrame()
    return bands

def _bband_column(prices, position):
    """Return column ``position`` of the Bollinger Bands for ``prices``, or None if there are none."""
    # Evaluate the bands once per group instead of once for the emptiness
    # check and again for the column selection.
    bands = calculate_bbands(prices)
    return None if bands.empty else bands.iloc[:, position]


# Columns 0, 1 and 2 of the bands frame are stored as the lower, middle and
# upper band respectively.
for position, column in enumerate(['bb_lower', 'bb_middle', 'bb_upper']):
    df[column] = df.groupby(level=1)['adj close'].transform(
        lambda x, position=position: _bband_column(x, position))

def compute_atr(stock_data):
    """Return the 14-period ATR of one ticker, standardised to zero mean and unit std.

    Args:
        stock_data: rows of a single ticker with 'high', 'low' and 'close' columns.

    Returns:
        The ATR series with its mean subtracted and divided by its standard
        deviation, or None when pandas_ta does not produce an ATR.
    """
    atr = pandas_ta.atr(high=stock_data['high'],
                        low=stock_data['low'],
                        close=stock_data['close'],
                        length=14)
    # Same guard as compute_macd: without it a None result would raise
    # AttributeError on .sub() and abort the whole groupby.
    if atr is None:
        return None
    return atr.sub(atr.mean()).div(atr.std())


# group_keys=False keeps the original (date, ticker) index on the result so
# it aligns with df on assignment.
df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_atr)

def compute_macd(close):
    """Return the standardised first column of ``pandas_ta.macd`` for one ticker.

    Args:
        close: price series of a single ticker.

    Returns:
        The first column of the MACD output with its mean subtracted and
        divided by its standard deviation, or None when pandas_ta returns no
        result.
    """
    # macd is parameterised by fast/slow/signal periods (library defaults are
    # used here); it has no `length` parameter, so none is passed.
    macd_result = pandas_ta.macd(close=close)
    if macd_result is None:
        return None
    macd = macd_result.iloc[:, 0]
    return macd.sub(macd.mean()).div(macd.std())


df['macd'] = df.groupby(level=1, group_keys=False)['adj close'].apply(compute_macd)

# Dollar volume (adjusted close times volume), expressed in millions.
df['dollar_volume'] = df['adj close'].mul(df['volume']).div(1e6)

df

Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd,dollar_volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-08-03,A,44.222771,47.090000,47.180000,46.900002,47.099998,1508900.0,-0.001517,,,,,,,66.727739
2016-08-03,AAL,32.271374,33.480000,33.959999,33.169998,33.470001,8254200.0,-0.000237,,,,,,,266.374373
2016-08-03,AAPL,24.263586,26.447500,26.459999,26.192499,26.202499,120810400.0,-0.002231,,,,,,,2931.293535
2016-08-03,ABBV,47.158062,66.570000,67.000000,66.220001,66.809998,6355100.0,-0.046807,,,,,,,299.694200
2016-08-03,ABT,38.829514,44.950001,45.090000,44.830002,45.070000,7783100.0,-0.008564,,,,,,,302.213987
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-31,XYL,133.500000,133.500000,135.429993,132.600006,134.880005,2439500.0,0.000182,44.378010,4.888463,4.928355,4.968247,1.892789,-0.278173,325.673250
2024-07-31,YUM,132.830002,132.830002,134.029999,131.610001,132.759995,1943100.0,0.000166,53.203270,4.838324,4.866018,4.893712,0.721489,-0.686611,258.101977
2024-07-31,ZBH,111.349998,111.349998,113.080002,110.120003,111.510002,1592800.0,0.000351,52.383683,4.666365,4.702034,4.737703,-0.568448,0.190412,177.358278
2024-07-31,ZBRA,351.190002,351.190002,359.690002,348.989990,357.779999,634400.0,0.000322,67.599872,5.729809,5.790869,5.851929,0.521713,0.873963,222.794938


# 3. Aggregate to monthly level and filter the top 150 most liquid stocks for each month.



*  To reduce training time and experiment with features and strategies, we convert the business-daily data to month-end frequency.



In [24]:
# Columns carried forward as month-end snapshots: everything except the raw
# OHLC prices, volume and dollar volume.
last_cols = [c for c in df.columns.unique(0) if c not in ['dollar_volume', 'volume', 'open', 'high', 'low', 'close']]

# Monthly mean of the daily dollar volume, per ticker.
monthly_dollar_volume = (df.unstack('ticker')['dollar_volume']
                         .resample('M').mean()
                         .stack('ticker')
                         .to_frame('dollar_volume'))

# Last observation of each month for the remaining columns.
month_end_features = df.unstack()[last_cols].resample('M').last().stack('ticker')

data = pd.concat([monthly_dollar_volume, month_end_features], axis=1).dropna()

# 60-month (5-year) rolling mean of monthly dollar volume per ticker. With the
# default min_periods, the first 59 months of each ticker are NaN.
data['dollar_volume'] = data['dollar_volume'].unstack('ticker').rolling(5*12).mean().stack()

# Rank 1 is the most liquid ticker within each month.
data['dollar_vol_rank'] = data.groupby('date')['dollar_volume'].rank(ascending=False)

# Keep ranks 1..150 inclusive; a strict `< 150` would retain only 149 tickers.
data = data[data['dollar_vol_rank'] <= 150].drop(['dollar_vol_rank', 'dollar_volume'], axis=1)

data

Unnamed: 0_level_0,Unnamed: 1_level_0,adj close,garman_klass_vol,rsi,bb_lower,bb_middle,bb_upper,atr,macd
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-08-31,AAL,19.940001,0.000342,47.500676,2.968642,3.040327,3.112012,-0.387914,-0.258989
2021-08-31,AAPL,149.329407,-0.000139,61.340698,4.961857,4.989564,5.017272,0.119032,0.448019
2021-08-31,ABBV,107.355637,-0.005356,61.441944,4.618325,4.658887,4.699448,-0.757495,0.396425
2021-08-31,ABT,119.564293,-0.001141,62.864694,4.748958,4.774887,4.800817,-0.050727,1.067105
2021-08-31,ACN,322.019958,-0.000796,70.692928,5.710939,5.749764,5.788590,-0.330353,1.229811
...,...,...,...,...,...,...,...,...,...
2024-07-31,VZ,40.520000,0.000144,51.735733,3.687073,3.728617,3.770161,-0.191153,-0.152062
2024-07-31,WFC,58.890114,0.000024,50.270815,4.063394,4.095216,4.127037,0.633665,0.131169
2024-07-31,WMT,68.639999,0.000074,51.666269,4.241554,4.261496,4.281437,0.787756,0.648607
2024-07-31,XOM,118.589996,0.000021,57.851744,4.716281,4.755448,4.794615,0.597315,0.679436


*   Calculate the 5-year rolling average of dollar volume for each stock before filtering.

