# Unsupervised Learning Trading Strategy

* Download/Load SP500 stocks prices data.
* Calculate different features and indicators on each stock.
* Aggregate on monthly level and filter top 150 most liquid stocks.
* Calculate Monthly Returns for different time-horizons.
* Download Fama-French Factors and Calculate Rolling Factor Betas.
* For each month fit a K-Means Clustering Algorithm to group similar assets based on their features.
* For each month select assets based on the cluster and form a portfolio based on Efficient Frontier max sharpe ratio optimization.
* Visualize Portfolio returns and compare to SP500 returns.

# All Packages Needed:
* pandas, numpy, matplotlib, statsmodels, pandas_datareader, datetime, yfinance, pandas_ta, xgboost, sklearn, PyPortfolioOpt

## 1. Download/Load SP500 stocks prices data.

In [1]:
from statsmodels.regression.rolling import RollingOLS
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf
import pandas_ta
import warnings
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

In [2]:
# Build the ticker universe from the Wikipedia constituents table and download
# roughly eight years of daily prices for it.

warnings.filterwarnings('ignore')

sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

# Swap '.' for '-' in the symbols (e.g. BRK.B -> BRK-B). regex=False keeps the
# dot a literal character regardless of the pandas version's default.
sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-', regex=False)

symbols_list = sp500['Symbol'].unique().tolist()

end_date = '2024-06-30'

# Eight 365-day years before end_date; `days=` makes the unit explicit.
start_date = pd.to_datetime(end_date)-pd.DateOffset(days=365*8)

# stack() moves the ticker column level into the row index.
df = yf.download(tickers=symbols_list,
                 start=start_date,
                 end=end_date).stack()

df.index.names = ['date', 'ticker']

df.columns = df.columns.str.lower()

df

[                       0%                       ]

[                       0%                       ]  2 of 503 completed

[                       1%                       ]  3 of 503 completed

[                       1%                       ]  4 of 503 completed

[                       1%                       ]  5 of 503 completed

[                       1%                       ]  6 of 503 completed

[                       1%                       ]  7 of 503 completed

[*                      2%                       ]  8 of 503 completed

[*                      2%                       ]  9 of 503 completed

[*                      2%                       ]  10 of 503 completed

[*                      2%                       ]  11 of 503 completed

[*                      2%                       ]  12 of 503 completed

[*                      3%                       ]  13 of 503 completed

[*                      3%                       ]  14 of 503 completed

[*                      3%                       ]  15 of 503 completed

[*                      3%                       ]  16 of 503 completed

[*                      3%                       ]  17 of 503 completed

[**                     4%                       ]  18 of 503 completed

[**                     4%                       ]  19 of 503 completed

[**                     4%                       ]  20 of 503 completed

[**                     4%                       ]  21 of 503 completed

[**                     4%                       ]  22 of 503 completed

[**                     5%                       ]  23 of 503 completed

[**                     5%                       ]  24 of 503 completed

[**                     5%                       ]  25 of 503 completed

[**                     5%                       ]  26 of 503 completed

[**                     5%                       ]  27 of 503 completed

[***                    6%                       ]  28 of 503 completed

[***                    6%                       ]  29 of 503 completed

[***                    6%                       ]  30 of 503 completed

[***                    6%                       ]  31 of 503 completed

[***                    6%                       ]  31 of 503 completed

[***                    7%                       ]  33 of 503 completed

[***                    7%                       ]  34 of 503 completed

[***                    7%                       ]  35 of 503 completed

[***                    7%                       ]  36 of 503 completed

[***                    7%                       ]  37 of 503 completed

[***                    7%                       ]  37 of 503 completed

[****                   8%                       ]  39 of 503 completed

[****                   8%                       ]  40 of 503 completed

[****                   8%                       ]  41 of 503 completed

[****                   8%                       ]  42 of 503 completed

[****                   9%                       ]  43 of 503 completed

[****                   9%                       ]  44 of 503 completed

[****                   9%                       ]  45 of 503 completed

[****                   9%                       ]  46 of 503 completed

[****                   9%                       ]  47 of 503 completed

[*****                 10%                       ]  48 of 503 completed

[*****                 10%                       ]  49 of 503 completed

[*****                 10%                       ]  50 of 503 completed

[*****                 10%                       ]  51 of 503 completed

[*****                 10%                       ]  52 of 503 completed

[*****                 11%                       ]  53 of 503 completed

[*****                 11%                       ]  54 of 503 completed

[*****                 11%                       ]  55 of 503 completed

[*****                 11%                       ]  56 of 503 completed

[*****                 11%                       ]  57 of 503 completed

[******                12%                       ]  58 of 503 completed

[******                12%                       ]  59 of 503 completed

[******                12%                       ]  60 of 503 completed

[******                12%                       ]  61 of 503 completed[******                12%                       ]  61 of 503 completed

[******                13%                       ]  63 of 503 completed

[******                13%                       ]  64 of 503 completed[******                13%                       ]  64 of 503 completed

[******                13%                       ]  66 of 503 completed

[******                13%                       ]  67 of 503 completed

[*******               14%                       ]  68 of 503 completed

[*******               14%                       ]  69 of 503 completed

[*******               14%                       ]  70 of 503 completed

[*******               14%                       ]  71 of 503 completed[*******               14%                       ]  71 of 503 completed

[*******               15%                       ]  73 of 503 completed

[*******               15%                       ]  74 of 503 completed

[*******               15%                       ]  75 of 503 completed

[*******               15%                       ]  76 of 503 completed

[*******               15%                       ]  77 of 503 completed[*******               15%                       ]  77 of 503 completed

[********              16%                       ]  79 of 503 completed

[********              16%                       ]  80 of 503 completed

[********              16%                       ]  81 of 503 completed

[********              16%                       ]  82 of 503 completed

[********              17%                       ]  83 of 503 completed

[********              17%                       ]  84 of 503 completed

[********              17%                       ]  85 of 503 completed

[********              17%                       ]  86 of 503 completed

[********              17%                       ]  86 of 503 completed

[********              17%                       ]  88 of 503 completed

[*********             18%                       ]  89 of 503 completed

[*********             18%                       ]  90 of 503 completed

[*********             18%                       ]  91 of 503 completed

[*********             18%                       ]  92 of 503 completed

[*********             18%                       ]  93 of 503 completed

[*********             19%                       ]  94 of 503 completed

[*********             19%                       ]  95 of 503 completed

[*********             19%                       ]  96 of 503 completed

[*********             19%                       ]  97 of 503 completed

[*********             19%                       ]  98 of 503 completed

[**********            20%                       ]  99 of 503 completed

[**********            20%                       ]  100 of 503 completed

[**********            20%                       ]  100 of 503 completed

[**********            20%                       ]  102 of 503 completed

[**********            20%                       ]  103 of 503 completed

[**********            21%                       ]  104 of 503 completed

[**********            21%                       ]  105 of 503 completed

[**********            21%                       ]  106 of 503 completed

[**********            21%                       ]  107 of 503 completed

[**********            21%                       ]  108 of 503 completed

[***********           22%                       ]  109 of 503 completed

[***********           22%                       ]  109 of 503 completed

[***********           22%                       ]  111 of 503 completed

[***********           22%                       ]  112 of 503 completed

[***********           22%                       ]  113 of 503 completed

[***********           23%                       ]  114 of 503 completed

[***********           23%                       ]  115 of 503 completed

KeyboardInterrupt: 

In [None]:
# df1 and df3 are further names for the same downloaded frame;
# df2 is an independent copy of it.
df1 = df3 = df
df2 = df.copy()

In [None]:
# Use one fixed trading day to enumerate the tickers present in the download.
reference_day_rows = df.loc[pd.IndexSlice['2024-06-06', :], :]
num, colu = reference_day_rows.shape
num
# Selecting by date alone leaves the ticker level as the index.
ticker = df.loc['2024-06-06', :].index
ticker

In [None]:
# Alias (not a copy) of the downloaded price frame.
df_pred=df

In [None]:
features = ['Year', 'Month', 'Week', 'Day of the month', 'log_return', 'sma_50', 'sma_200', 'rsi']
targets = ['close', 'adj close', 'volume', 'high', 'low', 'open']

# Number of future calendar days forecast for every ticker.
forecast_horizon = 150

# One forecast frame per ticker. They are merged with the history once, after
# the loop, so that every ticker's forecast is kept (re-assigning `df` inside
# the loop would retain only the last ticker).
predicted_frames = []

for symbol in ticker[:num]:

    # Daily history of this ticker, flattened so 'date' and 'ticker' are columns.
    df_aapl = df1.loc[pd.IndexSlice[:, symbol], :].reset_index()

    close_change = df_aapl['close'].diff()
    df_aapl['log_return'] = np.log(df_aapl['close'] / df_aapl['close'].shift(1))
    df_aapl['sma_50'] = df_aapl['close'].rolling(window=50).mean()
    df_aapl['sma_200'] = df_aapl['close'].rolling(window=200).mean()
    # NOTE(review): this oscillator uses the 14-day mean/std of price changes,
    # not the conventional average-gain/average-loss RSI - confirm it is intended.
    df_aapl['rsi'] = 100 - (100 / (1 + (close_change.rolling(window=14).mean() / close_change.rolling(window=14).std())))

    # Zero-filled placeholder rows for the forecast window, starting the day
    # after the last observed date.
    last_date = df_aapl['date'].max()
    new_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=forecast_horizon)
    new_df = pd.DataFrame({
        'date': new_dates,
        'ticker': symbol,
        'adj close': 0,
        'close': 0,
        'high': 0,
        'low': 0,
        'open': 0,
        'volume': 0,
        'log_return': 0,
        'sma_200': 0,
        'sma_50': 0,
        'rsi': 0,
    })
    df_aapl = pd.concat([df_aapl, new_df], ignore_index=True)

    # Calendar features are available for both history and placeholder rows.
    df_aapl['Month'] = df_aapl['date'].dt.month
    df_aapl['Year'] = df_aapl['date'].dt.year
    df_aapl['Week'] = df_aapl['date'].dt.isocalendar().week
    df_aapl['Day of the month'] = df_aapl['date'].dt.day

    df_aapl.set_index('date', inplace=True)
    X = df_aapl[features]
    y = df_aapl[targets]

    # Train on the observed history, predict the appended placeholder rows.
    train_size = len(df_aapl) - forecast_horizon
    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train = y.iloc[:train_size]

    # Fit one XGBoost regressor per target column and predict the forecast window.
    predictions = {}
    for target in targets:
        model = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1)
        model.fit(X_train, y_train[target])
        predictions[target] = model.predict(X_test)

    # Shape the forecast like the price frame: a (date, ticker) MultiIndex.
    predicted_df = pd.DataFrame(predictions, index=X_test.index)
    predicted_df['ticker'] = symbol
    predicted_df.set_index(['ticker', predicted_df.index], inplace=True)
    predicted_df.index.names = ['ticker', 'date']
    predicted_df = predicted_df.reorder_levels(['date', 'ticker'])

    predicted_frames.append(predicted_df)

# History snapshot (df2) followed by the forecasts of every ticker.
df = pd.concat([df2, *predicted_frames], ignore_index=False)


In [None]:
# Both names refer to the same combined history-plus-forecast frame.
df_final = df_final1 = df

In [None]:
# Order the rows by their index (date, then ticker).
df = df.sort_index()
df

In [None]:
# df_final = df_final.sortlevel()
# Collapse rows that share the same (date, ticker) key by summing them; the
# result is also sorted by that key.
# NOTE(review): summing only leaves values intact if each key is unique - confirm.
df_final = df_final.groupby(level=[0,1]).sum()
df_final

## 2. Calculate features and technical indicators for each stock.

* Garman-Klass Volatility
* RSI
* Bollinger Bands
* ATR
* MACD
* Dollar Volume

\begin{equation}
\text{Garman-Klass Volatility} = \frac{(\ln(\text{High}) - \ln(\text{Low}))^2}{2} - (2\ln(2) - 1)(\ln(\text{Adj Close}) - \ln(\text{Open}))^2
\end{equation}

In [None]:
# Garman-Klass volatility from the log high/low range and the log open-to-adjusted-close move.
log_range = np.log(df['high'])-np.log(df['low'])
log_open_to_close = np.log(df['adj close'])-np.log(df['open'])
df['garman_klass_vol'] = (log_range**2)/2-(2*np.log(2)-1)*(log_open_to_close**2)

# 20-period RSI of the adjusted close, computed separately for each ticker.
df['rsi'] = df.groupby(level=1)['adj close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))

# Bollinger bands on log1p(adjusted close): columns 0, 1 and 2 of the
# pandas_ta.bbands output become bb_low, bb_mid and bb_high.
for position, band in enumerate(['bb_low', 'bb_mid', 'bb_high']):
    df[band] = df.groupby(level=1)['adj close'].transform(
        lambda x: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:,position])

def compute_atr(stock_data):
    """Return the 14-period ATR of one ticker, standardised to a z-score.

    stock_data must provide 'high', 'low' and 'close' columns. The ATR series
    is centred on its own mean and divided by its own standard deviation.
    """
    raw_atr = pandas_ta.atr(high=stock_data['high'],
                            low=stock_data['low'],
                            close=stock_data['close'],
                            length=14)
    return (raw_atr - raw_atr.mean()) / raw_atr.std()

# Apply per ticker (index level 1) and keep the original row index.
per_ticker = df.groupby(level=1, group_keys=False)
df['atr'] = per_ticker.apply(compute_atr)

def compute_macd(close):
    """Return the first column of pandas_ta.macd for `close`, as a z-score.

    The series is centred on its own mean and scaled by its own standard
    deviation. (Presumably that first column is the MACD line - confirm.)
    """
    macd_line = pandas_ta.macd(close=close, length=20).iloc[:,0]
    return (macd_line - macd_line.mean()) / macd_line.std()

# Standardised MACD of the adjusted close, per ticker.
df['macd'] = df.groupby(level=1, group_keys=False)['adj close'].apply(compute_macd)

# Traded value per row, in millions.
df['dollar_volume'] = df['adj close'].mul(df['volume']).div(1e6)

df

In [None]:
# Alias (not a copy) of the frame that now carries the indicator columns.
df_feature = df

## 3. Aggregate to monthly level and filter top 150 most liquid stocks for each month.

* To reduce training time and experiment with features and strategies, we convert the business-daily data to month-end frequency.

In [None]:
# Columns taken as month-end snapshots: everything except the raw price/volume
# fields and dollar_volume, which is averaged over the month instead.
not_snapshotted = ['dollar_volume', 'volume', 'open', 'high', 'low', 'close']
last_cols = [c for c in df.columns.unique(0) if c not in not_snapshotted]

# Mean dollar volume per ticker and month.
monthly_dollar_volume = (df.unstack('ticker')['dollar_volume']
                         .resample('M')
                         .mean()
                         .stack('ticker')
                         .to_frame('dollar_volume'))

# Last available value of every remaining column per ticker and month.
month_end_values = df.unstack()[last_cols].resample('M').last().stack('ticker')

data = pd.concat([monthly_dollar_volume, month_end_values], axis=1).dropna()

data

In [None]:
# Alias (not a copy) of the monthly frame.
data_copy = data

* Calculate the 5-year rolling average of dollar volume for each stock before filtering.

In [None]:
# Smooth liquidity per ticker: rolling mean of monthly dollar volume over up to
# 60 months, requiring at least 12 months of data.
data['dollar_volume'] = (data.loc[:, 'dollar_volume'].unstack('ticker').rolling(5*12, min_periods=12).mean().stack())

# Rank tickers within each month, 1 = highest dollar volume.
data['dollar_vol_rank'] = (data.groupby('date')['dollar_volume'].rank(ascending=False))

# Keep ranks 1..150 inclusive (the top 150); a strict `< 150` would stop at 149.
# The helper columns are no longer needed afterwards.
data = data[data['dollar_vol_rank']<=150].drop(['dollar_volume', 'dollar_vol_rank'], axis=1)

data

data_copy2 = data

## 4. Calculate Monthly Returns for different time horizons as features.

* To capture time series dynamics that reflect, for example, momentum patterns, we compute historical returns using the method .pct_change(lag), that is, returns over various monthly periods as identified by lags.

In [None]:
def calculate_returns(df, outlier_cutoff=0.005, lags=(1, 2, 3, 6, 9, 12)):
    """Add per-period compounded return columns for several look-back horizons.

    For every lag a column ``return_{lag}m`` is added. It holds the
    ``lag``-period percentage change of 'adj close', clipped to the
    [outlier_cutoff, 1 - outlier_cutoff] quantile range of that series and
    converted to a per-period geometric rate.

    Parameters
    ----------
    df : DataFrame with an 'adj close' column (one ticker's rows).
    outlier_cutoff : quantile used on both tails when clipping.
    lags : iterable of look-back lengths, in rows.

    Returns
    -------
    A new DataFrame. The input is not modified, because mutating the group
    handed over by ``groupby.apply`` is not supported by pandas.
    """
    df = df.copy()

    for lag in lags:

        df[f'return_{lag}m'] = (df['adj close']
                              .pct_change(lag)
                              .pipe(lambda x: x.clip(lower=x.quantile(outlier_cutoff),
                                                     upper=x.quantile(1-outlier_cutoff)))
                              .add(1)
                              .pow(1/lag)
                              .sub(1))
    return df
    
    
# Compute the return features per ticker, then drop rows with any missing value.
returns_by_ticker = data.groupby(level=1, group_keys=False).apply(calculate_returns)
data = returns_by_ticker.dropna()

data
data_final = data

## 5. Download Fama-French Factors and Calculate Rolling Factor Betas.

* We will introduce the Fama-French data to estimate the exposure of assets to common risk factors using linear regression.

* The five Fama-French factors, namely market risk, size, value, operating profitability, and investment, have been shown empirically to explain asset returns and are commonly used to assess the risk/return profile of portfolios. Hence, it is natural to include past factor exposures as financial features in models.

* We can access the historical factor returns using the pandas-datareader and estimate historical exposures using the RollingOLS rolling linear regression.

In [None]:
# Fetch the Fama-French five-factor dataset; the first table of the result is
# used, without its 'RF' column.
ff_tables = web.DataReader('F-F_Research_Data_5_Factors_2x3',
                           'famafrench',
                           start='2010')
factor_data = ff_tables[0].drop('RF', axis=1)

# Period index -> timestamps, re-stamped to month end; values divided by 100.
factor_data.index = factor_data.index.to_timestamp()
factor_data = factor_data.resample('M').last().div(100)
factor_data.index.name = 'date'

# Strip any timezone from the feature dates before joining on them.
data = data.reset_index()
if data['date'].dt.tz is not None:
    data['date'] = data['date'].dt.tz_localize(None)
data = data.set_index(['date', 'ticker'])

# Attach each stock's 1-month return to the factor returns of the same date.
factor_data = factor_data.join(data['return_1m']).sort_index()

factor_data

* Filter out stocks with fewer than 10 months of data.

In [None]:
# Number of monthly rows available per ticker.
observations = factor_data.groupby(level=1).size()

# Tickers with at least 10 rows.
valid_stocks = observations[observations >= 10]

row_tickers = factor_data.index.get_level_values('ticker')
has_enough_history = row_tickers.isin(valid_stocks.index)
factor_data = factor_data[has_enough_history]

factor_data

* Calculate Rolling Factor Betas.

In [None]:
def _rolling_factor_betas(x):
    """Rolling OLS of one ticker's 1-month return on the factor columns.

    Uses a window of up to 24 rows and returns the fitted parameters without
    the intercept.
    """
    regressors = sm.add_constant(x.drop('return_1m', axis=1))
    rolling_model = RollingOLS(endog=x['return_1m'],
                               exog=regressors,
                               window=min(24, x.shape[0]),
                               min_nobs=len(x.columns)+1)
    fitted = rolling_model.fit(params_only=True)
    return fitted.params.drop('const', axis=1)

# One rolling regression per ticker (index level 1).
betas = factor_data.groupby(level=1, group_keys=False).apply(_rolling_factor_betas)

betas

* Join the rolling factors data to the main features dataframe.

In [None]:
factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']

# Shift each ticker's betas down by one row before joining them to the features.
lagged_betas = betas.groupby('ticker').shift()
data = data.join(lagged_betas)

# Fill missing betas with the ticker's own mean beta.
filled_betas = data.groupby('ticker', group_keys=False)[factors].apply(lambda x: x.fillna(x.mean()))
data.loc[:, factors] = filled_betas

# The adjusted close is not a feature; rows with any gap are discarded.
data = data.drop('adj close', axis=1).dropna()

data.info()

In [None]:
# Display the feature table.
data

### At this point we have to decide on what ML model and approach to use for predictions etc.


## 6. For each month fit a K-Means Clustering Algorithm to group similar assets based on their features.

### K-Means Clustering
* You may want to initialize predefined centroids for each cluster based on your research.

* For visualization purpose of this tutorial we will initially rely on the ‘k-means++’ initialization.

* Then we will pre-define our centroids for each cluster.

In [None]:
# RSI levels that seed the four clusters.
target_rsi_values = [30, 45, 55, 70]

# Feature columns handed to KMeans: every column of `data` except a 'cluster'
# label left over from an earlier run of the clustering cell.
feature_columns = data.columns.drop('cluster', errors='ignore')

# One centroid per target RSI value, all zeros except in the RSI column.
# Width and RSI position come from the actual columns, so the seed stays
# correct if the column order or count differs from a hard-coded layout.
initial_centroids = np.zeros((len(target_rsi_values), len(feature_columns)))

initial_centroids[:, feature_columns.get_loc('rsi')] = target_rsi_values

initial_centroids

In [None]:
from sklearn.cluster import KMeans

# data = data.drop('cluster', axis=1)

def get_clusters(df):
    """Label the rows of one month's feature frame with a KMeans cluster id.

    Fits a 4-cluster KMeans, seeded with the module-level `initial_centroids`,
    on all columns of `df`, then writes the labels into a new 'cluster'
    column. `df` is modified in place and returned.
    """
    kmeans = KMeans(n_clusters=4,
                    random_state=0,
                    init=initial_centroids)
    df['cluster'] = kmeans.fit(df).labels_
    return df

# Cluster each month separately.
monthly_groups = data.dropna().groupby('date', group_keys=False)
data = monthly_groups.apply(get_clusters)

data

In [None]:
def plot_clusters(data):
    """Scatter column 0 against column 6 of `data`, one colour per cluster.

    `data` must carry a 'cluster' column with labels 0-3. The figure is shown
    immediately; nothing is returned.
    """
    colours = ['red', 'green', 'blue', 'black']

    # Split the rows by cluster label before drawing anything.
    members = [data[data['cluster']==label] for label in range(len(colours))]

    for label, (subset, colour) in enumerate(zip(members, colours)):
        plt.scatter(subset.iloc[:,0] , subset.iloc[:,6] , color = colour, label=f'cluster {label}')

    plt.legend()
    plt.show()
    return


In [None]:
plt.style.use('ggplot')

# One cluster scatter plot per month in the feature table.
month_ends = data.index.get_level_values('date').unique().tolist()

for month_end in month_ends:

    plt.title(f'Date {month_end}')

    plot_clusters(data.xs(month_end, level=0))

### Apply pre-defined centroids.

## 7. For each month select assets based on the cluster and form a portfolio based on Efficient Frontier max sharpe ratio optimization

* First we will filter only stocks corresponding to the cluster we choose based on our hypothesis.

* Momentum is persistent and my idea would be that stocks clustered around RSI 70 centroid should continue to outperform in the following month - thus I would select stocks corresponding to cluster 3.


In [None]:
# Rows assigned to cluster 3, with the ticker moved out of the index for now.
selected = data[data['cluster']==3].copy().reset_index(level=1)

# Move every date forward by one day.
selected.index = selected.index+pd.DateOffset(1)

filtered_df = selected.reset_index().set_index(['date', 'ticker'])

dates = filtered_df.index.get_level_values('date').unique().tolist()

# Map each shifted date ('YYYY-MM-DD') to the tickers selected for it.
fixed_dates = {d.strftime('%Y-%m-%d'): filtered_df.xs(d, level=0).index.tolist()
               for d in dates}

fixed_dates

### Define portfolio optimization function

* We will define a function which optimizes portfolio weights using PyPortfolioOpt package and EfficientFrontier optimizer to maximize the sharpe ratio.

* To optimize the weights of a given portfolio we would need to supply last 1 year prices to the function.

* Apply single-stock weight bounds as a diversification constraint (minimum half of the equal weight, maximum 10% of the portfolio).

In [None]:
!pip install PyPortfolioOpt

In [None]:
from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt import risk_models
from pypfopt import expected_returns

def optimize_weights(prices, lower_bound=0):
    """Return max-Sharpe portfolio weights for the assets in `prices`.

    Expected returns and the sample covariance are both estimated from
    `prices` with a frequency of 252. Every weight is constrained to
    [lower_bound, 0.1] and the problem is solved with the 'SCS' solver.
    The result is whatever `EfficientFrontier.clean_weights()` returns.
    """
    mu = expected_returns.mean_historical_return(prices=prices,
                                                 frequency=252)

    sigma = risk_models.sample_cov(prices=prices,
                                   frequency=252)

    frontier = EfficientFrontier(expected_returns=mu,
                                 cov_matrix=sigma,
                                 weight_bounds=(lower_bound, .1),
                                 solver='SCS')

    # Run the optimisation; its direct return value is not needed because the
    # weights are read back through clean_weights() below.
    frontier.max_sharpe()

    return frontier.clean_weights()


* Select daily prices for the shortlisted stocks only (sliced from the data already loaded; the fresh download below is left commented out).

In [None]:
# Every ticker that appears anywhere in the clustered feature table.
stocks = data.index.get_level_values('ticker').unique().tolist()

# Fresh download for these tickers, currently disabled:
# new_df = yf.download(tickers=stocks,
#                      start=data.index.get_level_values('date').unique()[0]-pd.DateOffset(months=12),
#                      end=data.index.get_level_values('date').unique()[-1])

# new_df

In [None]:
# Window: from 12 months before the first date in df_final to its last date.
all_dates = df_final.index.get_level_values('date').unique()
start_date = all_dates[0] - pd.DateOffset(months=12)
end_date = all_dates[-1]

# Capitalise the two column names used downstream.
# NOTE(review): the index mapping renames index *labels* 0 and 1, not the level
# names - it looks like a no-op here; confirm.
df_final = df_final.rename(index={0: "Date", 1: "Ticker"}, columns ={'adj close': 'Adj Close', 'volume': 'Volume'})

# Rows inside the window for the shortlisted tickers, pivoted to one column per ticker.
window_rows = df_final.loc[pd.IndexSlice[start_date:end_date, stocks], :]
new_df = window_rows.unstack()
new_df

* Calculate daily returns for each stock which could land up in our portfolio.

* Then loop over each month start, select the stocks for the month and calculate their weights for the next month.

* If the maximum sharpe ratio optimization fails for a given month, apply equally-weighted weights.

* Calculate each day's portfolio return.

In [None]:
# Daily log returns per ticker: first difference of the log adjusted close.
returns_dataframe = np.log(new_df['Adj Close']).diff()
returns_dataframe

In [None]:
# Display the rebalancing dates.
fixed_dates.keys()

In [None]:
# Debug listing: for every rebalancing date, print the holding window, the
# selected tickers and the one-year optimisation window that precedes it.
for start_date in fixed_dates.keys():
    month_start = pd.to_datetime(start_date)
    end_date = (month_start+pd.offsets.MonthEnd(0)).strftime('%Y-%m-%d')
    cols = fixed_dates[start_date]
    optimization_start_date = (month_start-pd.DateOffset(months=12)).strftime('%Y-%m-%d')
    optimization_end_date = (month_start-pd.DateOffset(days=1)).strftime('%Y-%m-%d')

    for value in (start_date, end_date, cols, optimization_start_date, optimization_end_date):
        print(value)

In [None]:


# Daily strategy returns, one frame per rebalancing month; concatenated once
# after the loop instead of growing a DataFrame on every iteration.
monthly_returns = []

for start_date in fixed_dates.keys():

    # Best effort per month: a failure is reported and that month is skipped.
    try:

        end_date = (pd.to_datetime(start_date)+pd.offsets.MonthEnd(0)).strftime('%Y-%m-%d')

        cols = fixed_dates[start_date]

        # One year of prices ending the day before the holding period starts.
        optimization_start_date = (pd.to_datetime(start_date)-pd.DateOffset(months=12)).strftime('%Y-%m-%d')

        optimization_end_date = (pd.to_datetime(start_date)-pd.DateOffset(days=1)).strftime('%Y-%m-%d')

        optimization_df = new_df[optimization_start_date:optimization_end_date]['Adj Close'][cols]

        n_assets = len(optimization_df.columns)

        try:
            # Lower bound per asset: half of the equal weight.
            weights = optimize_weights(prices=optimization_df,
                                       lower_bound=round(1/(n_assets*2),4))

            weights = pd.DataFrame(weights, index=pd.Series(0))

        except Exception:
            # `except Exception` (not a bare except) so Ctrl-C still interrupts.
            print(f'Max Sharpe Optimization failed for {start_date}, Continuing with Equal-Weights')

            # Fallback: a single row (label 0) of equal weights, one column per ticker.
            weights = pd.DataFrame([1/n_assets for _ in range(n_assets)],
                                   index=optimization_df.columns.tolist(),
                                   columns=pd.Series(0)).T

        temp_df = returns_dataframe[start_date:end_date]

        # Long format (date, ticker) with each ticker's return next to its weight.
        temp_df = temp_df.stack().to_frame('return').reset_index(level=0)\
                   .merge(weights.stack().to_frame('weight').reset_index(level=0, drop=True),
                          left_index=True,
                          right_index=True)\
                   .reset_index().set_index(['date', 'ticker']).unstack().stack()

        temp_df.index.names = ['date', 'ticker']

        temp_df['weighted_return'] = temp_df['return']*temp_df['weight']

        # Portfolio return per day = sum of the weighted stock returns.
        temp_df = temp_df.groupby(level=0)['weighted_return'].sum().to_frame('Strategy Return')

        monthly_returns.append(temp_df)

    except Exception as e:
        print(e)

portfolio_df = pd.concat(monthly_returns, axis=0) if monthly_returns else pd.DataFrame()

# Keep one row per date. Deduplicating on the index, rather than on values,
# keeps days that merely happen to share the same return.
portfolio_df = portfolio_df.loc[~portfolio_df.index.duplicated(keep='first')]

portfolio_df

## 8. Visualize Portfolio returns and compare to SP500 returns.

In [None]:
# Benchmark: SPY prices from 2015 up to today.
spy = yf.download(tickers='SPY',
                  start='2015-01-01',
                  end=dt.date.today())

# Daily log returns of the adjusted close, labelled for the comparison plot.
spy_log_prices = np.log(spy[['Adj Close']])
spy_ret = spy_log_prices.diff().dropna().rename({'Adj Close':'SPY Buy&Hold'}, axis=1)

# Keep only the dates present in both the strategy and the benchmark.
portfolio_df = portfolio_df.merge(spy_ret,
                                  left_index=True,
                                  right_index=True)

portfolio_df

In [None]:
# Attach the benchmark only if it is not there yet. Merging spy_ret a second
# time would produce suffixed duplicate columns ('SPY Buy&Hold_x' / '_y').
if 'SPY Buy&Hold' not in portfolio_df.columns:
    portfolio_df = portfolio_df.merge(spy_ret,
                                      left_index=True,
                                      right_index=True)

portfolio_df

In [None]:
import matplotlib.ticker as mtick

plt.style.use('ggplot')

# Compound the daily returns: sum log1p of each return, then convert back.
compounded_log_growth = np.log1p(portfolio_df).cumsum()
portfolio_cumulative_return = np.exp(compounded_log_growth)-1

# NOTE(review): the plot stops at a fixed date (2023-09-29) - confirm this
# cutoff still matches the data range.
portfolio_cumulative_return[:'2023-09-29'].plot(figsize=(16,6))

plt.title('Unsupervised Learning Trading Strategy Returns Over Time')

# Show the y axis as percentages (1.0 == 100%).
plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(1))

plt.ylabel('Return')

plt.show()

In [None]:
# `os` is not imported at the top of the notebook, so import it here.
import os

# Create a 'results' directory in the current working directory if needed.
os.makedirs('results', exist_ok=True)

In [None]:

# `os` is not imported at the top of the notebook, so import it here.
import os

# Write the shortlisted tickers to ../results/ML_result.csv, one per row.
stocks_df = pd.DataFrame(stocks, columns=['stock'])
results_dir = os.path.join('..', 'results')

os.makedirs(results_dir, exist_ok=True)

file_path = os.path.join(results_dir, 'ML_result.csv')

stocks_df.to_csv(file_path, index=False)

print(f"saved to {file_path}")
