In [None]:
pip install hmmlearn




In [None]:
pip install yfinance



In [None]:
import os
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yfinance as yf
from hmmlearn import hmm
warnings.filterwarnings('ignore')

In [None]:
# Analysis window. These two strings are passed as `start`/`end` to every
# yf.download() call and to the pd.date_range() that builds the aligned frame.
# NOTE(review): yfinance may treat `end` as exclusive — confirm against its docs.
start_date = '2015-01-01'
end_date = '2024-12-31'

In [None]:
def getstocktickers():
    """Return the first 60 symbols of a hard-coded list of stock tickers.

    The list below holds 65 symbols; only the leading 60 are returned, in
    order, as a fresh list on every call.
    """
    max_tickers = 60
    candidates = """
        AAPL MSFT AMZN GOOGL META NVDA TSLA BRK-B UNH JNJ
        JPM V PG MA HD CVX MRK KO PEP ABBV COST AVGO
        WMT BAC DIS CSCO VZ ABT CRM ADBE CMCSA XOM ACN
        NFLX TMO MCD NKE LLY NEE INTC DHR ORCL AMD IBM
        TXN QCOM LOW UNP PM HON AMGN PFE COP RTX CAT
        BMY SBUX GS INTU MDT LIN AMT MS UPS BLK
    """.split()
    return candidates[:max_tickers]

In [None]:
def _download_prices(tickers, start, end):
    """Download price history for each ticker via yf.download().

    A ticker whose download raises, or that comes back with zero rows, is
    reported on stdout and left out of the result, so one bad symbol does not
    abort the whole batch.

    Returns a dict mapping ticker -> the DataFrame returned by yf.download().
    """
    frames = {}
    for ticker in tickers:
        try:
            data = yf.download(ticker, start=start, end=end)
        except Exception as e:
            print(f"Error downloading {ticker}: {e}")
            continue
        if len(data) > 0:
            frames[ticker] = data
        else:
            # Previously dropped silently; later cells index by ticker, so say so here.
            print(f"No data returned for {ticker}; skipping")
    return frames


# Download price history for the stock tickers
sp = getstocktickers()
spdata = _download_prices(sp, start_date, end_date)

# Get cryptocurrency data using Yahoo Finance
crypto = ['BTC-USD', 'ETH-USD']
cryptodata = _download_prices(crypto, start_date, end_date)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [None]:
# SPY is not in the `sp` ticker list, so it is downloaded separately and stored
# alongside the individual stocks under the 'SPY' key of `spdata`.
spdata['SPY'] = yf.download('SPY', start=start_date, end=end_date)

[*********************100%***********************]  1 of 1 completed


In [None]:
def calculatefeatures(df):
    """Derive return, volatility and volume features from a price frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Close' column; a 'Volume' column is optional.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns plus:
        - 'daily_return': 1-period percentage change of 'Close'
        - 'rolling_return_30d': 30-period percentage change of 'Close'
        - 'rolling_vol_10d': 10-period rolling std of 'daily_return'
        - 'volume_change', 'rolling_vol_ratio': only when 'Volume' is present
          (1-period change, and volume divided by its 20-period rolling mean)
        Rows containing any NaN are dropped, which removes the warm-up rows
        of the rolling windows.
    """
    # Work on a copy: the caller's frame (e.g. the raw download kept in a dict)
    # must not gain feature columns as a side effect, and re-running the cell
    # must give the same result.
    df = df.copy()

    # Returns
    df['daily_return'] = df['Close'].pct_change()
    df['rolling_return_30d'] = df['Close'].pct_change(30)

    # Volatility
    df['rolling_vol_10d'] = df['daily_return'].rolling(10).std()

    # Volume features
    if 'Volume' in df.columns:
        df['volume_change'] = df['Volume'].pct_change()
        df['rolling_vol_ratio'] = df['Volume'] / df['Volume'].rolling(20).mean()

    return df.dropna()


# Other cells in this notebook call the function by its snake_case name.
calculate_features = calculatefeatures

In [None]:
# Compute the feature frame for every downloaded stock (and SPY).
# The helper is defined as `calculatefeatures`; calling it under any other
# name raises NameError on a fresh kernel.
spprocessed = {}
for ticker, data in spdata.items():
    spprocessed[ticker] = calculatefeatures(data)

btcprocessed = calculatefeatures(cryptodata['BTC-USD'])
ethprocessed = calculatefeatures(cryptodata['ETH-USD'])

# dataframe with daily returns
# The first column assigned fixes the index, so BTC/ETH are aligned to the
# dates present in the SPY feature frame.
returns = pd.DataFrame()
returns['SPY'] = spprocessed['SPY']['daily_return']
returns['BTC'] = btcprocessed['daily_return']
returns['ETH'] = ethprocessed['daily_return']

# cross-market correlations (30-row rolling window)
returns['SPY_BTC_corr'] = returns['SPY'].rolling(30).corr(returns['BTC'])
returns['SPY_ETH_corr'] = returns['SPY'].rolling(30).corr(returns['ETH'])


In [None]:
# Market regime detection using HMM
def detectregime(returns, n_components=3, random_state=42):
    """Label each observation of a return series with an HMM-derived regime.

    A Gaussian HMM is fitted to the non-NaN values of ``returns``. The hidden
    states are then ranked by the mean of the observations assigned to them,
    so regime 0 is the lowest-mean state and ``n_components - 1`` the highest.

    Parameters
    ----------
    returns : pandas.Series
        Return series; NaN entries are dropped before fitting.
    n_components : int, default 3
        Number of hidden states.
    random_state : int or None, default 42
        Seed forwarded to ``hmm.GaussianHMM`` so repeated runs produce the
        same regimes. Pass ``None`` to leave the model unseeded.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``returns.dropna()`` with two columns:
        ``'regimes'`` (integer rank of the state) and ``'regime_name'``
        ('Bear' / 'Sideways' / 'Bull' when ``n_components == 3``, otherwise
        generic ``'Regime <rank>'`` labels).
    """
    observed = returns.dropna()
    X = observed.values.reshape(-1, 1)

    # Fit HMM
    model = hmm.GaussianHMM(
        n_components=n_components,
        covariance_type="full",
        n_iter=1000,
        random_state=random_state,
    )
    model.fit(X)

    # Predict hidden states
    hiddenstates = model.predict(X)

    # Mean observation per state; a state that was never predicted gets NaN,
    # which argsort places last.
    smeans = np.array([
        X[hiddenstates == i].mean() if np.any(hiddenstates == i) else np.nan
        for i in range(n_components)
    ])

    # rank_of_state[s] is the position of state s when states are ordered by
    # ascending mean. This works for any number of states, not only three.
    sstates = np.argsort(smeans)
    rank_of_state = np.empty(n_components, dtype=int)
    rank_of_state[sstates] = np.arange(n_components)
    regimes = rank_of_state[hiddenstates]

    if n_components == 3:
        names = {0: 'Bear', 1: 'Sideways', 2: 'Bull'}
    else:
        names = {rank: f'Regime {rank}' for rank in range(n_components)}

    regime = pd.DataFrame(index=observed.index)
    regime['regimes'] = regimes
    regime['regime_name'] = regime['regimes'].map(names)

    return regime

In [None]:
# Detect market regimes from SPY daily returns
marketregimes = detectregime(returns['SPY'])

# Align data and handle weekend gaps: the frame is indexed by every calendar
# day in the window, so trading-day series leave NaN on the days they lack.
adata = pd.DataFrame(index=pd.date_range(start=start_date, end=end_date))

# Add S&P 500 data (the processed frames live in `spprocessed`)
adata['SPY_close'] = spprocessed['SPY']['Close']
adata['SPY_return_30d'] = spprocessed['SPY']['rolling_return_30d']
adata['SPY_vol_10d'] = spprocessed['SPY']['rolling_vol_10d']

for ticker in sp:
    # Skip SPY (already added) and any ticker whose download was dropped.
    if ticker == 'SPY' or ticker not in spprocessed:
        continue
    adata[f'{ticker}_close'] = spprocessed[ticker]['Close']
    adata[f'{ticker}_return_30d'] = spprocessed[ticker]['rolling_return_30d']
    adata[f'{ticker}_vol_10d'] = spprocessed[ticker]['rolling_vol_10d']




In [None]:
# Add crypto data: close, 30-period return and 10-period volatility per coin,
# aligned to the calendar-day index of `adata`.
adata['BTC_close'] = btcprocessed['Close']
adata['BTC_return_30d'] = btcprocessed['rolling_return_30d']
adata['BTC_vol_10d'] = btcprocessed['rolling_vol_10d']

adata['ETH_close'] = ethprocessed['Close']
adata['ETH_return_30d'] = ethprocessed['rolling_return_30d']
adata['ETH_vol_10d'] = ethprocessed['rolling_vol_10d']


In [None]:
# Add market regime. detectregime() returns the numeric label in a column
# named 'regimes' and the text label in 'regime_name'.
adata['market_regime'] = marketregimes['regimes']
adata['regime_name'] = marketregimes['regime_name']

# Flag weekends for special handling in RL environment
adata['is_weekend'] = adata.index.dayofweek >= 5

# Forward fill missing values (for weekends and holidays). Rows before a
# column's first valid value stay NaN because there is nothing to carry forward.
adata = adata.ffill()


In [None]:
# Tally the missing values in each column of the aligned frame and print them.
nan_analysis = adata.isnull().sum()
print("NaN values by column:", nan_analysis, sep="\n")

NaN values by column:
SPY_close            47
SPY_return_30d       47
SPY_vol_10d          47
AAPL_close           47
AAPL_return_30d      47
                   ... 
ETH_return_30d     1073
ETH_vol_10d        1073
market_regime        47
regime_name          47
is_weekend            0
Length: 192, dtype: int64


In [None]:
# For every '*_close' column, find the first index label that holds a value.
# Columns with no valid value at all are left out of the mapping.
close_firsts = (
    (name, adata[name].first_valid_index())
    for name in adata.columns
    if name.endswith('_close')
)
first_valid_dates = {name: first for name, first in close_firsts if first is not None}

print("First valid dates for each asset:")
for asset_column, first_date in first_valid_dates.items():
    print(f"{asset_column}: {first_date}")

First valid dates for each asset:
SPY_close: 2015-02-17 00:00:00
AAPL_close: 2015-02-17 00:00:00
MSFT_close: 2015-02-17 00:00:00
AMZN_close: 2015-02-17 00:00:00
GOOGL_close: 2015-02-17 00:00:00
META_close: 2015-02-17 00:00:00
NVDA_close: 2015-02-17 00:00:00
TSLA_close: 2015-02-17 00:00:00
BRK-B_close: 2015-02-17 00:00:00
UNH_close: 2015-02-17 00:00:00
JNJ_close: 2015-02-17 00:00:00
JPM_close: 2015-02-17 00:00:00
V_close: 2015-02-17 00:00:00
PG_close: 2015-02-17 00:00:00
MA_close: 2015-02-17 00:00:00
HD_close: 2015-02-17 00:00:00
CVX_close: 2015-02-17 00:00:00
MRK_close: 2015-02-17 00:00:00
KO_close: 2015-02-17 00:00:00
PEP_close: 2015-02-17 00:00:00
ABBV_close: 2015-02-17 00:00:00
COST_close: 2015-02-17 00:00:00
AVGO_close: 2015-02-17 00:00:00
WMT_close: 2015-02-17 00:00:00
BAC_close: 2015-02-17 00:00:00
DIS_close: 2015-02-17 00:00:00
CSCO_close: 2015-02-17 00:00:00
VZ_close: 2015-02-17 00:00:00
ABT_close: 2015-02-17 00:00:00
CRM_close: 2015-02-17 00:00:00
ADBE_close: 2015-02-17 00:00:

In [None]:
# Common start date: the latest of the per-asset first-valid dates, i.e. the
# first day by which every '*_close' column has had a valid value.
common_start_date = max(first_valid_dates.values())
print(f"\nRecommended common start date: {common_start_date}")

# Slice from that date onward; copy so later edits do not touch `adata`.
clean_data = adata.loc[common_start_date:].copy()

# Verify NaN values
nan_counts = clean_data.isna().sum()
print("\nNaN counts in cleaned dataset:", nan_counts[nan_counts > 0], sep="\n")

# Impute only when something is still missing: forward fill first, then
# backward fill whatever remains at the start.
if (nan_counts > 0).any():
    clean_data = clean_data.ffill().bfill()
    remaining = clean_data.isna().sum()
    print("\nAfter imputation, NaN counts:", remaining[remaining > 0], sep="\n")


Recommended common start date: 2017-12-09 00:00:00

NaN counts in cleaned dataset:
Series([], dtype: int64)


In [None]:
# Bare expression: the notebook renders the cleaned frame as the cell's output.
clean_data

Unnamed: 0,SPY_close,SPY_return_30d,SPY_vol_10d,AAPL_close,AAPL_return_30d,AAPL_vol_10d,MSFT_close,MSFT_return_30d,MSFT_vol_10d,AMZN_close,...,MDT_vol_10d,BTC_close,BTC_return_30d,BTC_vol_10d,ETH_close,ETH_return_30d,ETH_vol_10d,market_regime,regime_name,is_weekend
2017-12-09,235.169754,0.038690,0.004711,39.800705,0.079848,0.008663,77.819023,0.073929,0.017360,58.099998,...,0.012498,15178.200195,1.124733,0.105945,473.502014,0.475617,0.037869,2.0,Bull,True
2017-12-10,235.169754,0.038690,0.004711,39.800705,0.079848,0.008663,77.819023,0.073929,0.017360,58.099998,...,0.012498,15455.400391,1.335309,0.106311,441.721008,0.476079,0.042822,2.0,Bull,True
2017-12-11,235.878296,0.033371,0.004637,40.576191,0.062808,0.011191,78.808418,0.022050,0.017652,58.445999,...,0.010486,16936.800781,1.664024,0.107192,515.135986,0.637010,0.067349,2.0,Bull,False
2017-12-12,236.294617,0.039065,0.003736,40.348232,0.033572,0.011179,79.132042,0.025269,0.017313,58.254002,...,0.010009,17415.400391,1.926924,0.106564,651.431030,1.115668,0.103800,2.0,Bull,False
2017-12-13,236.268005,0.037332,0.003708,40.482189,0.022771,0.008712,78.919350,0.031241,0.016063,58.206501,...,0.009926,16408.199219,1.501444,0.111847,702.767029,1.218919,0.103744,2.0,Bull,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-27,593.225464,-0.000297,0.012133,255.309296,0.135350,0.012133,429.668457,0.014562,0.014135,223.750000,...,0.008445,94164.859375,-0.018733,0.026862,3328.916992,-0.089776,0.036789,2.0,Bull,False
2024-12-28,593.225464,-0.000297,0.012133,255.309296,0.135350,0.012133,429.668457,0.014562,0.014135,223.750000,...,0.008445,95163.929688,-0.005107,0.022154,3397.902344,-0.050815,0.032827,2.0,Bull,True
2024-12-29,593.225464,-0.000297,0.012133,255.309296,0.135350,0.012133,429.668457,0.014562,0.014135,223.750000,...,0.008445,93530.226562,-0.040337,0.021433,3349.513428,-0.067895,0.028143,2.0,Bull,True
2024-12-30,586.455811,-0.005309,0.012546,251.923019,0.105074,0.013197,423.979858,-0.002834,0.014422,221.300003,...,0.009326,92643.210938,-0.039460,0.021360,3356.392578,-0.094263,0.027510,2.0,Bull,False


In [None]:
# to_csv() does not create missing parent directories, so make sure data/
# exists before writing (no-op when it is already there).
os.makedirs("data", exist_ok=True)
clean_data.to_csv("data/clean_full_data.csv")

In [None]:
# directory for figures
import os
# exist_ok=True replaces the separate exists() check: one call, and no window
# between checking and creating.
os.makedirs('figures', exist_ok=True)

In [None]:
# Figure: Market Regimes
# SPY close as a line, with each run of consecutive identical regime labels
# shaded in that regime's colour.
fig, ax = plt.subplots(figsize=(12, 6))

# Plot S&P 500
ax.plot(adata.index, adata['SPY_close'], 'b-')

if 'market_regime' in adata.columns:
    # A new run id starts wherever the label differs from the previous row.
    regime_changes = adata['market_regime'].ne(adata['market_regime'].shift()).cumsum()
    regime_groups = adata.groupby(regime_changes)

    colors = ['red', 'yellow', 'green']
    for _, run in regime_groups:
        if len(run) == 0:
            continue
        label = run['market_regime'].iloc[0]
        # Skip unlabelled rows and labels that have no colour assigned.
        if pd.isna(label) or not (label < len(colors)):
            continue
        ax.axvspan(run.index[0], run.index[-1], alpha=0.2, color=colors[int(label)])

# Set labels and title
ax.set_title('S&P 500 with Market Regimes (Red=Bear, Yellow=Sideways, Green=Bull)')
ax.set_xlabel('Date')
ax.set_ylabel('S&P 500 Price')
ax.grid(True)

fig.tight_layout()
fig.savefig('figures/market_regimes.png')
plt.close(fig)

# Simple Figure: Rolling Correlation
plt.figure(figsize=(12, 6))

# Get correlation data, dropping NaN values.
# The rolling correlations are stored in the `returns` frame under
# 'SPY_BTC_corr'.
corr_data = returns[['SPY_BTC_corr']].dropna()

# Plot correlation
plt.plot(corr_data.index, corr_data['SPY_BTC_corr'], 'b-')

# Add reference lines at zero and +/-0.5
plt.axhline(y=0, color='r', linestyle='-', alpha=0.5)
plt.axhline(y=0.5, color='g', linestyle='--', alpha=0.5)
plt.axhline(y=-0.5, color='g', linestyle='--', alpha=0.5)

# Set labels and title
plt.title('30-day Rolling Correlation: S&P 500 vs Bitcoin')
plt.xlabel('Date')
plt.ylabel('Correlation Coefficient')
plt.ylim(-1, 1)
plt.grid(True)

plt.tight_layout()
plt.savefig('figures/correlation.png')
plt.close()

# Simple Figure: Volatility Comparison
# 10-period rolling volatility of SPY and BTC on shared axes, restricted to
# the dates where both series have a value.
fig, ax = plt.subplots(figsize=(12, 6))

# Get volatility data, dropping NaN values
vol_data = adata[['SPY_vol_10d', 'BTC_vol_10d']].dropna()

# Plot volatility
ax.plot(vol_data.index, vol_data['SPY_vol_10d'], 'b-', label='S&P 500')
ax.plot(vol_data.index, vol_data['BTC_vol_10d'], 'orange', label='Bitcoin')

# Set labels and title
ax.set_title('10-day Rolling Volatility Comparison')
ax.set_xlabel('Date')
ax.set_ylabel('Volatility (Standard Deviation)')
ax.legend()
ax.grid(True)

fig.tight_layout()
fig.savefig('figures/volatility.png')
plt.close(fig)