In [10]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize
import matplotlib.pyplot as plt


In [11]:
# Load S&P 500 tickers
sp500_tickers = pd.read_html("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")[0]['Symbol'].tolist()

# Download stock data
data = yf.download(sp500_tickers, start="2010-01-01", end="2023-01-01")['Adj Close']

# Remove stocks with insufficient data
data.dropna(axis=1, inplace=True)


[*********************100%***********************]  503 of 503 completed

7 Failed downloads:
['BF.B']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2010-01-01 -> 2023-01-01)')
['SW', 'GEV', 'KVUE', 'VLTO', 'SOLV']: YFPricesMissingError('$%ticker%: possibly delisted; no price data found  (1d 2010-01-01 -> 2023-01-01) (Yahoo error = "Data doesn\'t exist for startDate = 1262322000, endDate = 1672549200")')
['BRK.B']: YFTzMissingError('$%ticker%: possibly delisted; no timezone found')


In [12]:
# Daily returns
daily_returns = data.pct_change()

# Calculate moving averages (e.g., 10-day, 50-day)
ma_10 = data.rolling(window=10).mean()
ma_50 = data.rolling(window=50).mean()

# Calculate volume and liquidity proxies
average_daily_volume = data.rolling(window=20).mean()

# Combine features into a DataFrame
features = pd.concat([
    daily_returns.mean(axis=1),
    ma_10 / ma_50,
    average_daily_volume,
], axis=1)


  features = pd.concat([


In [13]:
# Resample to monthly data
monthly_data = data.resample('M').last()
monthly_returns = monthly_data.pct_change()

# Filter top 150 liquid stocks
liquidity = data.resample('M').mean()  # Example: mean price as liquidity proxy
top_150 = liquidity.rank(axis=1, ascending=False).le(150).iloc[-1]

monthly_data = monthly_data[top_150]
monthly_returns = monthly_returns[top_150]


  monthly_data = data.resample('M').last()
  liquidity = data.resample('M').mean()  # Example: mean price as liquidity proxy


In [14]:
# Rolling returns for different time horizons
monthly_returns_3m = monthly_data.pct_change(periods=3)
monthly_returns_6m = monthly_data.pct_change(periods=6)


In [24]:
import pandas as pd
import numpy as np

# Load Fama-French data and treat the first column as the index
ff_data = pd.read_csv("ff5_model_south_korean.csv", index_col=0)

# Rename the index column to "Date" and parse it as datetime
ff_data.index.name = "Date"
ff_data.index = pd.to_datetime(ff_data.index, format='%b-%y')

# Rename columns for consistency
ff_data.rename(columns={"Mrt-Rf": "Mkt-RF"}, inplace=True)

# Select relevant factors and drop missing values
ff_factors = ff_data[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']].dropna()

# Display the cleaned data for verification
print(ff_factors.head())


              Mkt-RF       SMB       HML       RMW       CMA
Date                                                        
2006-09-01  0.016351  0.024321  0.006425 -0.008711  0.016506
2006-10-01 -0.009289 -0.013282  0.036378  0.024852  0.013272
2006-11-01  0.047880  0.027534  0.073978  0.053048  0.009446
2006-12-01  0.012965 -0.040579  0.048690  0.036586 -0.000362
2007-01-01 -0.054119  0.002354  0.021145 -0.001954 -0.013445


In [26]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

# Check for missing values in the features
print(features.isnull().sum())  # Check missing values in features

# Impute missing values with the median (or other strategy)
imputer = SimpleImputer(strategy='median')  # 'mean', 'median', or other strategies
features_imputed = imputer.fit_transform(features)

# Standardize features for clustering
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_imputed)

# Apply K-Means
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(scaled_features)

# Add cluster labels to the data
monthly_data['Cluster'] = clusters

# Optionally, check the resulting clusters
print(monthly_data[['Cluster']].head())


0    3272
dtype: int64




ValueError: Found array with 0 feature(s) (shape=(3272, 0)) while a minimum of 1 is required by StandardScaler.

In [None]:
def max_sharpe_ratio(weights, mean_returns, cov_matrix, risk_free_rate=0.01):
    portfolio_return = np.sum(mean_returns * weights)
    portfolio_std = np.sqrt(np.dot(weights.T, np.dot(cov_matrix, weights)))
    return -(portfolio_return - risk_free_rate) / portfolio_std

def efficient_frontier(mean_returns, cov_matrix):
    num_assets = len(mean_returns)
    args = (mean_returns, cov_matrix, 0.01)
    constraints = ({'type': 'eq', 'fun': lambda x: np.sum(x) - 1})
    bounds = tuple((0, 1) for _ in range(num_assets))

    result = minimize(max_sharpe_ratio, num_assets * [1. / num_assets], args=args, 
                      method='SLSQP', bounds=bounds, constraints=constraints)
    return result.x

# Select assets in a specific cluster
cluster_assets = monthly_data[monthly_data['Cluster'] == 0].columns[:-1]
cluster_returns = monthly_returns[cluster_assets]
mean_returns = cluster_returns.mean()
cov_matrix = cluster_returns.cov()

# Optimize portfolio
weights = efficient_frontier(mean_returns, cov_matrix)


In [None]:
# Calculate portfolio returns
portfolio_returns = (weights * monthly_returns[cluster_assets]).sum(axis=1)

# Compare to S&P 500
sp500_returns = monthly_returns.mean(axis=1)

# Plot
plt.figure(figsize=(12, 6))
plt.plot((1 + portfolio_returns).cumprod(), label='Portfolio')
plt.plot((1 + sp500_returns).cumprod(), label='S&P 500')
plt.legend()
plt.title("Portfolio vs. S&P 500")
plt.show()
