In [2]:
import os

import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from ray.optimizer.strategies import *
from ray.optimizer.portfolios import *
from ray.utils.file_utilities import s3_download

### Reference:

- https://github.com/zhuodannychen/Portfolio-Optimization
- https://pypi.org/project/yfinance/
- https://www.kenwuyang.com/en/post/portfolio-optimization-with-python/
- https://lumos-datascience.medium.com/mean-variance-portfolio-optimization-using-python-8485fccb9f8b
- https://pyportfolioopt.readthedocs.io/en/latest/RiskModels.html


In [3]:
# refresh data
path = "CRSP/crsp_2018-2023_clean_3.parquet"
file_path = s3_download(path)

# load from local
# file_path = "/home/rcao/repos/capstone/data/crsp_2018-2023_clean_3.parquet"

file_path

'/Users/puzheng/repo/capstone-2024-summer/data/crsp_2018-2023_clean_3.parquet'

In [4]:
df = pd.read_parquet(file_path)
# df["date"] = pd.to_datetime(df["date"])
df.dtypes

date                              datetime64[ns]
permno_id                                 object
ncusip_id                                 object
cusip_id                                  object
ticker                                    object
share_code                                object
share_code_type                           object
share_code_detail                         object
exchange_code                             object
company_name                              object
primary_exchange                          object
trading_status                            object
security_status                           object
naics                                     object
naics_sector                              object
naics_sector_name                         object
naics_secondary                           object
ask_or_high_price                        float64
ask_price                                float64
bid_or_low_price                         float64
bid_price           

In [5]:
df_pivot = df.pivot(index="date", columns="ticker", values="return")
df_pivot.head()

# delete columns with NaN values in them
df_pivot = df_pivot.dropna(axis=1)
df_pivot.columns

Index(['A', 'AAL', 'AAPL', 'ABBV', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', 'ADM',
       ...
       'WYNN', 'XEL', 'XOM', 'XRAY', 'XYL', 'YUM', 'ZBH', 'ZBRA', 'ZION',
       'ZTS'],
      dtype='object', name='ticker', length=482)

In [6]:
date_cutoff = pd.Timestamp("2023-10-01")

In [7]:
train = df_pivot[:date_cutoff]
train_span = len(train)
train_span

1446

In [8]:
test = df_pivot[date_cutoff:]
test_span = len(test)
test_span

63

In [9]:
train.index

DatetimeIndex(['2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05',
               '2018-01-08', '2018-01-09', '2018-01-10', '2018-01-11',
               '2018-01-12', '2018-01-16',
               ...
               '2023-09-18', '2023-09-19', '2023-09-20', '2023-09-21',
               '2023-09-22', '2023-09-25', '2023-09-26', '2023-09-27',
               '2023-09-28', '2023-09-29'],
              dtype='datetime64[ns]', name='date', length=1446, freq=None)

Reference: https://github.com/zhuodannychen/Portfolio-Optimization


In [10]:
# TICKERS = ['QQQ', 'TQQQ', 'TMF']
# Technology, Consumers, Industrial, Materials, Financials, Energy, Healthcare
# TICKERS = ["VGT", "VDC", "VIS", "VAW", "VFH", "VDE", "VHT"]
TICKERS = df_pivot.columns.tolist()
TOTAL_BALANCE = 10000
start_date = "2018-01-02"
end_date = "2023-09-29"


# Calculating mean (expected returns), covariance (expected volatility), and correlation
hist_mean = train.mean(axis=0).to_frame()
hist_mean.columns = ["mu"]
hist_cov = train.cov()  # to be updated later from the ML generated cosine similarity
hist_corr = train.corr()
print(hist_mean.T)
print(hist_cov)
print(hist_corr)

ticker         A       AAL      AAPL      ABBV       ABT      ACGL       ACN  \
mu      0.000549 -0.000276  0.001211  0.000634  0.000567  0.000869  0.000694   

ticker     ADBE       ADI       ADM  ...      WYNN       XEL       XOM  \
mu      0.00101  0.000776  0.000689  ...  0.000215  0.000347  0.000649   

ticker      XRAY       XYL      YUM       ZBH      ZBRA      ZION       ZTS  
mu     -0.000183  0.000436  0.00049  0.000178  0.000908  0.000236  0.000788  

[1 rows x 482 columns]
ticker         A       AAL      AAPL      ABBV       ABT      ACGL       ACN  \
ticker                                                                         
A       0.000332  0.000211  0.000203  0.000123  0.000186  0.000161  0.000198   
AAL     0.000211  0.001384  0.000241  0.000106  0.000140  0.000311  0.000237   
AAPL    0.000203  0.000241  0.000411  0.000117  0.000178  0.000157  0.000226   
ABBV    0.000123  0.000106  0.000117  0.000304  0.000130  0.000119  0.000125   
ABT     0.000186  0.000140  0.

In [11]:
# add SPY as a benchmark

symbols = ["SPY"]
index_data = yf.download(tickers=" ".join(symbols), start="2017-12-31", end="2023-12-30")["Adj Close"]
index = pd.DataFrame(index_data)
index = index.rename(columns={"Adj Close": "SnP500"})
index.describe()

ind_ret = np.log(index["SnP500"]) - np.log(index["SnP500"].shift(1))
ind_ret = ind_ret.dropna()

[*********************100%%**********************]  1 of 1 completed


### simulate randomized portfolios


In [12]:
# # simulate randomized portfolios
# n_portfolios = 2000
# portfolio_returns = []
# portfolio_stds = []

# for i in range(n_portfolios):
#     weights = np.random.rand(len(TICKERS))
#     weights = weights / sum(weights)
#     port_return = portfolio_return(weights, hist_mean)
#     port_std = portfolio_std(weights, hist_cov)
#     sharpe_ratio = portfolio_sharpe(port_return, port_std)

#     portfolio_returns.append(port_return)
#     portfolio_stds.append(port_std)

In [13]:
# ------------ Optimized portfolios ------------------#
equally_weighted_weights = np.array(equal_weight(len(TICKERS)))
gmv_weights = np.array(minimum_variance(train))
max_sharpe_weights = np.array(max_sharpe(train))

port_weight_dict = {
    "Equally Weighted": equally_weighted_weights,
    "Global Minimum Variance": gmv_weights,
    "Max Sharpe Ratio": max_sharpe_weights,
}

In [14]:
portfolio_return_dict = {k: portfolio_return(v, train) for k, v in port_weight_dict.items()}
portfolio_return_dict["Index"] = ind_ret[:date_cutoff]

In [15]:
for name, ret in portfolio_return_dict.items():
    print(f"---------- {name} ----------")
    # print("Weights:", equally_weighted_weights)
    annulized_return = ret.mean() * 250
    annulized_vol = ret.std() * np.sqrt(250)

    print("Annualized Return:", annulized_return)
    print("Volatility:", annulized_vol)
    print("Sharpe Ratio:", annulized_return / annulized_vol)
    if name != "Index" and name != "Equally Weighted":
        print_non_zero_weights(TICKERS, port_weight_dict[name])

    print()

---------- Equally Weighted ----------
Annualized Return: 0.14271964117640304
Volatility: 0.21859089857128777
Sharpe Ratio: 0.6529075186076821

---------- Global Minimum Variance ----------
Annualized Return: 0.08163487433439956
Volatility: 0.13594219052718717
Sharpe Ratio: 0.6005116882243658
ABBV: 0.0010908768996342137
BMY: 0.06945019550274804
CBOE: 0.04335526007761363
CHKP: 0.058054007082789044
CHRW: 0.031458441234898445
CLX: 0.03976572853050882
CMG: 0.008577121377573766
CPB: 0.020400693710766717
DGX: 0.015115277396893537
DPZ: 0.055381761904963335
EA: 0.02704020960755207
FSLR: 0.003742955133719826
GILD: 0.019154304476296565
GIS: 0.004494088662602888
HLT: 0.05480722777920478
HRL: 0.059563323986494686
INCY: 0.0015573572193704251
JNJ: 0.04062789910306309
K: 0.05552329696477618
KR: 0.04548151283572144
MCD: 0.006502306101395308
MO: 0.00678960254607105
MRK: 0.02525084818622406
NEM: 0.06649279256843088
PANW: 0.002189533902102171
PFE: 0.008330659593143781
PSA: 0.0212952244891898
SJM: 0.04894

In [16]:
# ----------- Efficient Frontier ------#
# this is slow for a large portfolio. Decided to use the PyPortfolioOpt library to generate this (see Baseline-Pypfopt.ipynb)

# target_returns = np.linspace(0.06, 0.17, 100)
# efficient_frontier_risk = []
# for ret in target_returns:
#     optimal = minimize(
#         fun=portfolio_std,
#         args=hist_cov,
#         x0=equally_weighted_weights,
#         bounds=[BOUND for x in range(len(TICKERS))],
#         constraints=(
#             {"type": "eq", "fun": lambda x: portfolio_return(x, hist_mean) - ret},
#             {"type": "eq", "fun": lambda weights: np.sum(weights) - 1},
#         ),
#         method="SLSQP",
#     )
#     efficient_frontier_risk.append(optimal["fun"])

## In Sample Results


In [17]:
portfolio_performance(port_weight_dict, train, index=ind_ret[date_cutoff:], verbose=True)

---------- Equally Weighted ----------
Annualized Return: 14.27%
Volatility:  21.86%
Sharpe Ratio:  0.65
Total Return:  98.63%

---------- Global Minimum Variance ----------
Annualized Return: 8.16%
Volatility:  13.59%
Sharpe Ratio:  0.60
Total Return:  52.00%

---------- Max Sharpe Ratio ----------
Annualized Return: 38.92%
Volatility:  21.83%
Sharpe Ratio:  1.78
Total Return:  725.98%

---------- Index ----------
Annualized Return: 43.69%
Volatility:  12.03%
Sharpe Ratio:  3.63
Total Return:  11.43%



In [21]:
# Print out portfolio value over time
date_range = train.index
equally_weighted_train = portfolio_culmulative_return(equally_weighted_weights, train)
gmv_train = portfolio_culmulative_return(gmv_weights, train)
max_sharpe_train = portfolio_culmulative_return(max_sharpe_weights, train)
index_train = ind_ret[:date_cutoff].add(1).cumprod().subtract(1).multiply(100)

back = pd.DataFrame(
    {
        "EqualWeight": equally_weighted_train,
        "MinVar": gmv_train,
        "MaxSharpe": max_sharpe_train,
        "SnP500": index_train,
    }
)
back = back.drop(back.tail(1).index)

back.interpolate(method="linear", inplace=True)

fig = px.line(back, x=back.index, y=back.columns, title="Training Portfolio Performance (2018 Jan - 2023 Sep)")
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Cumulative Return in %")

fig.show()

### Out Of Sample Testing


In [19]:
# Print out portfolio value over time
date_range = test.index
index_ret = ind_ret[date_cutoff:]

back = pd.DataFrame(
    {
        "EqualWeight": portfolio_culmulative_return(equally_weighted_weights, test),
        "MinVar": portfolio_culmulative_return(gmv_weights, test),
        "MaxSharpe": portfolio_culmulative_return(max_sharpe_weights, test),
        "SnP500": index_ret.add(1).cumprod().subtract(1).multiply(100),
    }
)
back = back.drop(back.tail(1).index)

back.interpolate(method="linear", inplace=True)

fig = px.line(back, x=back.index, y=back.columns, title="Testing Portfolio Performance (2023 Oct-Dec)")
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Cumulative Return in %")

fig.show()

In [20]:
portfolio_performance(port_weight_dict, test, index=index_ret, verbose=True)

---------- Equally Weighted ----------
Annualized Return: 46.47%
Volatility:  14.93%
Sharpe Ratio:  3.11
Total Return:  12.10%

---------- Global Minimum Variance ----------
Annualized Return: 27.32%
Volatility:  10.96%
Sharpe Ratio:  2.49
Total Return:  6.96%

---------- Max Sharpe Ratio ----------
Annualized Return: 38.75%
Volatility:  15.44%
Sharpe Ratio:  2.51
Total Return:  9.92%

---------- Index ----------
Annualized Return: 43.69%
Volatility:  12.03%
Sharpe Ratio:  3.63
Total Return:  11.43%

