In [3]:
import os

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import pyarrow.parquet as pq
import seaborn as sns

In [4]:
# Fetch the cleaned 2023 CSV from S3 into the local data directory.
s3 = boto3.client("s3")
bucket = "capstone-bucket-4-friends"

csv_key = "crsp_2023_clean.csv"
file_path = f"../data/{csv_key}"

# The download writes to a local path, so make sure the parent directory
# exists on a fresh checkout before asking boto3 to write into it.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Reuse the local copy when it is already present so that Restart & Run All
# does not hit S3 every time. Delete the file to force a refresh.
if not os.path.exists(file_path):
    s3.download_file(bucket, csv_key, file_path)

In [5]:
# Read the downloaded CSV and convert the `date` column to datetime in one
# chained expression (the column keeps its original position).
df = pd.read_csv(file_path).assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

In [6]:
# Sanity check on the loaded frame: show the dtype of every column
# (after the load step, `date` should be a datetime column rather than object).
df.dtypes

PERMNO             int64
date      datetime64[ns]
PRC              float64
RET              float64
dtype: object

In [7]:
# Non-null observation count per PERMNO for each remaining column; equal counts
# across columns within a row mean that security has no missing values.
df.groupby("PERMNO").count()

Unnamed: 0_level_0,date,PRC,RET
PERMNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10026,250,250,250
10028,250,250,250
10032,250,250,250
10044,250,250,250
10065,250,250,250
...,...,...,...
93426,250,250,250
93427,250,250,250
93429,250,250,250
93434,250,250,250


In [44]:
# Pick a reproducible random subset of securities.
#
# Sampling must be done over the *distinct* PERMNOs: `df["PERMNO"]` has one
# entry per (security, date) row, so sampling it directly can return the same
# PERMNO several times and leave fewer than the requested number of securities
# after the `isin` filter. A fixed seed keeps the sample stable across re-runs.
N_SAMPLED_PERMNOS = 100
SAMPLE_SEED = 42

unique_permnos = df["PERMNO"].drop_duplicates()
sampled_permno = unique_permnos.sample(
    # Cap at the number of available securities so a small dataset does not raise.
    n=min(N_SAMPLED_PERMNOS, len(unique_permnos)),
    random_state=SAMPLE_SEED,
).tolist()

In [45]:
# Keep only rows belonging to the sampled securities, then reshape long -> wide:
# one row per date, one column per PERMNO, cell values taken from RET.
in_sample = df["PERMNO"].isin(sampled_permno)
df_sample = df.loc[in_sample]
df_pivot = df_sample.pivot(values="RET", columns="PERMNO", index="date")
df_pivot.head()

PERMNO,11174,11618,13138,13326,13361,13369,13739,13877,13947,14081,...,91313,91320,91758,91945,92170,92471,92933,93204,93419,93426
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-03,-0.031276,0.001691,0.000329,-0.022388,0.00394,-0.011194,0.041997,-0.030293,0.00289,-0.000856,...,0.0003,-0.013853,0.010788,0.004715,0.031537,-0.003272,-0.004839,-0.041488,-0.027749,0.011902
2023-01-04,0.003429,0.012025,0.000658,0.012344,0.01182,0.008113,0.018099,-0.012015,0.096542,0.014567,...,-0.003599,0.024634,-0.003557,0.015086,0.012102,0.001768,0.005155,0.05036,0.02537,-0.001278
2023-01-05,-0.008542,-0.034813,0.000657,-0.005134,-0.00267,0.006363,0.017179,-0.00747,0.03548,-0.008446,...,0.010235,0.005365,0.009282,-0.010898,0.000629,0.011344,-0.002161,-0.027793,-0.053608,0.012033
2023-01-06,0.043653,0.028294,0.001314,0.046767,0.016064,0.040543,0.021442,0.029232,-0.002538,0.024915,...,-0.021156,0.044862,-0.013088,0.027379,0.010063,0.030409,0.010416,0.0323,0.045752,0.010625
2023-01-09,-0.036874,0.007141,0.0,0.013865,0.003623,-0.012332,0.017829,-0.008673,-0.040712,0.001454,...,0.003044,-0.001513,-0.006093,0.005525,0.00934,-0.005806,0.005587,-0.015045,-0.015625,0.013517


In [38]:
# Boundary date separating the training window from the test window.
date_cutoff = pd.Timestamp(year=2023, month=10, day=1)

In [46]:
# Training window: every row dated strictly before the cutoff.
#
# A label slice (`df_pivot[:date_cutoff]`) includes the end label, and the test
# split starts at the cutoff inclusively, so a cutoff that falls on a trading
# day would put that day in both sets. A strict `<` mask keeps them disjoint.
train = df_pivot.loc[df_pivot.index < date_cutoff]
# Number of training rows; displayed as the cell output.
train_span = len(train)
train_span

187

In [47]:
# Hold-out window: rows from the cutoff date onward (label slice, start inclusive).
test = df_pivot.loc[date_cutoff:]
# Number of hold-out rows; displayed as the cell output.
test_span = test.shape[0]
test_span

63

In [48]:
# Display the training index to check which dates ended up in the training split.
train.index

DatetimeIndex(['2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06',
               '2023-01-09', '2023-01-10', '2023-01-11', '2023-01-12',
               '2023-01-13', '2023-01-17',
               ...
               '2023-09-18', '2023-09-19', '2023-09-20', '2023-09-21',
               '2023-09-22', '2023-09-25', '2023-09-26', '2023-09-27',
               '2023-09-28', '2023-09-29'],
              dtype='datetime64[ns]', name='date', length=187, freq=None)

Reference: https://lumos-datascience.medium.com/mean-variance-portfolio-optimization-using-python-8485fccb9f8b

In [49]:
from pypfopt import expected_returns, risk_models
from pypfopt.cla import CLA
from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt.expected_returns import ema_historical_return

# from pypfopt.plotting import plot_efficient_frontier, plot_weights
from pypfopt.risk_models import exp_cov

# Inputs for the optimiser, both estimated from the training returns with an
# exponential weighting whose span equals the number of training rows.
# `returns_data=True` tells PyPortfolioOpt that `train` holds returns, not prices.
mu = ema_historical_return(
    train,
    returns_data=True,
    span=train_span,
)
Sigma = exp_cov(
    train,
    returns_data=True,
    span=train_span,
)

In [51]:

# TODO: efficient-frontier sweep — solve `efficient_return(target)` on a fresh
# EfficientFrontier for each target return in a grid and collect the volatility.

# Risk-free rate used for the max-Sharpe optimisation and for reporting on it.
RISK_FREE_RATE = 0.009

# Each optimisation gets its own EfficientFrontier instance: PyPortfolioOpt
# expects a new instance per problem, and recent versions raise
# InstantiationError when an already-solved instance is asked to solve a
# different objective.
#
# NOTE(review): the saved output of this cell is an ArpackNoConvergence raised
# while CVXPY tried to certify the covariance matrix as positive semi-definite.
# Using separate instances is not known to resolve that; if it persists,
# inspect `Sigma` (NaNs, conditioning) — TODO confirm.

# Minimum-volatility portfolio.
ef_min_vol = EfficientFrontier(mu, Sigma)
ef_min_vol.min_volatility()
# portfolio_performance() returns (expected return, volatility, Sharpe ratio);
# unpack once instead of calling it twice.
min_vol_ret, min_vol_vol, _ = ef_min_vol.portfolio_performance()

# Maximum-Sharpe portfolio. Kept under the name `ef` so that later cells that
# read `ef` see the max-Sharpe solution, as they would have before.
ef = EfficientFrontier(mu, Sigma)
ef.max_sharpe(risk_free_rate=RISK_FREE_RATE)
# Report with the same risk-free rate that was used in the optimisation.
max_sharpe_ret, max_sharpe_vol, _ = ef.portfolio_performance(
    risk_free_rate=RISK_FREE_RATE
)


ArpackNoConvergence: ARPACK error -1: ARPACK error -1: No convergence (1001 iterations, 0/1 eigenvectors converged)


        CVXPY note: This failure was encountered while trying to certify
        that a matrix is positive semi-definite (see [1] for a definition).
        In rare cases, this method fails for numerical reasons even when the matrix is
        positive semi-definite. If you know that you're in that situation, you can
        replace the matrix A by cvxpy.psd_wrap(A).

        [1] https://en.wikipedia.org/wiki/Definite_matrix
        

In [None]:
import yfinance as yf

# Benchmark tickers to download.
symbols = ["SPY"]
# Download daily data for the benchmark and keep the adjusted close.
# `auto_adjust=False` is passed explicitly because the "Adj Close" column only
# exists when prices are not auto-adjusted, and recent yfinance releases
# auto-adjust by default (which would make the lookup below raise KeyError).
index_data = yf.download(
    tickers=" ".join(symbols),
    start="2022-12-30",
    end="2023-12-30",
    auto_adjust=False,
)["Adj Close"]

In [None]:
# Summary statistics of the downloaded benchmark series. `describe` must be
# called; without parentheses the cell only shows the bound-method repr.
index_data.describe()