# **Unsupervised Learning Trading Strategy**

### **1. Download/Load S&P 500 stock price data**

In [6]:
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings

# Install a global "ignore" filter: every Python warning raised after this line
# is suppressed (presumably to keep library deprecation/future warnings out of
# the cell outputs).
# NOTE(review): this also hides warnings that may point at real problems —
# consider restricting the filter to specific categories or modules.
warnings.filterwarnings("ignore")



In [20]:
# --- Step 1: ticker universe -------------------------------------------------
# Take the first table on the Wikipedia constituents page (a browser-like
# User-Agent header is sent with the request) and swap "." for "-" in the
# symbols — presumably to match Yahoo Finance's ticker spelling; verify.
sp500 = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
    storage_options={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
    },
)[0].assign(
    Symbol=lambda table: table["Symbol"].str.replace(".", "-", regex=False)
)

# De-duplicated symbols, in order of first appearance.
symbols_list = sp500["Symbol"].drop_duplicates().tolist()


# --- Step 2: date window -----------------------------------------------------
# Fixed end date; the start is 8 * 365 calendar days earlier (leap days are
# not compensated for).
end_date = "2025-12-15"
start_date = pd.to_datetime(end_date) - pd.Timedelta(days=8 * 365)


# --- Steps 3 & 4: download, then reshape to the course format -----------------
df = (
    yf.download(
        tickers=symbols_list,
        start=start_date,
        end=end_date,
        auto_adjust=False,
    )
    .stack()                              # wide → long: one row per (date, ticker)
    .rename_axis(["date", "ticker"])      # name the two index levels
    .rename(columns=str.lower)            # lowercase column names
)

df.head()


[*********************100%***********************]  503 of 503 completed


Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-12-18,A,63.697105,67.660004,68.510002,67.480003,67.889999,2306300.0
2017-12-18,AAPL,41.316132,44.105,44.299999,43.715,43.720001,117684400.0
2017-12-18,ABBV,69.745361,98.190002,99.099998,97.550003,97.669998,4768300.0
2017-12-18,ABT,49.003803,56.400002,56.52,55.419998,55.52,6053900.0
2017-12-18,ACGL,28.961201,30.456667,30.853333,30.416668,30.726667,1212300.0


### **2. Calculate features and technical indicators for each stock**

- Garman-Klass Volatility  
- RSI (relative strength index)
- Bollinger Bands (lower, middle, and upper bands)
- ATR (Average true range)
- MACD  
- Dollar Volume  

#### Garman-Klass Volatility

$$
\sigma_{GK}^2 =
\frac{(\ln(H) - \ln(L))^2}{2}
- (2\ln(2) - 1)(\ln(C) - \ln(O))^2
$$

Where:
- $H$ = High  
- $L$ = Low  
- $C$ = Adjusted Close  
- $O$ = Open  


In [30]:
INDICATOR_WINDOW = 20  # look-back, in rows per ticker, shared by RSI and Bollinger Bands
BB_COLUMNS = ["bb_low", "bb_mid", "bb_high"]


def _rsi(prices: pd.Series, length: int = INDICATOR_WINDOW) -> pd.Series:
    """Return the RSI of a single ticker's price series.

    Falls back to an all-NaN series (same index as ``prices``) when pandas_ta
    returns ``None`` instead of a result — which it appears to do for series
    shorter than ``length`` — so one short-history ticker cannot abort the cell.
    """
    rsi = pandas_ta.rsi(close=prices, length=length)
    if rsi is None:
        return pd.Series(np.nan, index=prices.index)
    return rsi


def _log_bollinger_bands(prices: pd.Series, length: int = INDICATOR_WINDOW) -> pd.DataFrame:
    """Return Bollinger Bands of ``log1p(prices)`` for a single ticker.

    The first three columns of the pandas_ta result are selected by position
    and labelled ``bb_low``, ``bb_mid``, ``bb_high`` (the same positional
    mapping this cell has always used). The bands are computed once per ticker
    instead of once per band. An all-NaN frame is returned when pandas_ta
    yields ``None``.
    """
    bands = pandas_ta.bbands(close=np.log1p(prices), length=length)
    if bands is None:
        return pd.DataFrame(np.nan, index=prices.index, columns=BB_COLUMNS)
    bands = bands.iloc[:, :3].copy()
    bands.columns = BB_COLUMNS
    return bands


# Garman-Klass volatility, per the formula in the markdown cell above (C = adjusted close).
# This assignment must stay live: the column is part of the frame shown below, and a
# commented-out line only "works" while an old kernel still holds the column.
# NOTE(review): high/low/open are raw prices whereas "adj close" is adjusted, so the
# second term can dominate and make the estimate negative (as in the displayed rows) —
# confirm that mixing the two price bases is intended.
df["garman_klass_vol"] = (
    (np.log(df["high"]) - np.log(df["low"])) ** 2 / 2
    - (2 * np.log(2) - 1) * (np.log(df["adj close"]) - np.log(df["open"])) ** 2
)

# RSI per ticker (index level 1), on adjusted close.
df["rsi"] = df.groupby(level=1)["adj close"].transform(_rsi)

# Bollinger Bands per ticker on log1p(adjusted close). Each per-ticker frame keeps its
# (date, ticker) index, so the column assignments below align row-by-row with df.
bollinger = pd.concat(
    [_log_bollinger_bands(prices) for _, prices in df.groupby(level=1)["adj close"]]
)
for column in BB_COLUMNS:
    df[column] = bollinger[column]

df.head()



Unnamed: 0_level_0,Price,adj close,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,bb_mid,bb_high
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-12-18,A,63.697105,67.660004,68.510002,67.480003,67.889999,2306300.0,-0.001455,,,,
2017-12-18,AAPL,41.316132,44.105,44.299999,43.715,43.720001,117684400.0,-0.001147,,,,
2017-12-18,ABBV,69.745361,98.190002,99.099998,97.550003,97.669998,4768300.0,-0.04368,,,,
2017-12-18,ABT,49.003803,56.400002,56.52,55.419998,55.52,6053900.0,-0.005828,,,,
2017-12-18,ACGL,28.961201,30.456667,30.853333,30.416668,30.726667,1212300.0,-0.001251,,,,


### **3. Calculate monthly returns**