In [7]:
import yfinance as yf
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from ta.trend import MACD
from ta.volatility import BollingerBands
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import os
import warnings
warnings.simplefilter("ignore", category=FutureWarning)

In [5]:
tickers = [
    'AAPL', 'MSFT', 'NVDA', 'AMZN', 'GOOG', 'JNJ', 'PFE', 'MRK', 'GSK', 'ABBV',
    'JPM', 'C', 'BAC', 'GS', 'AXP', 'KO', 'PG', 'WMT', 'COST', 'CL',
    'XOM', 'CVX', 'OXY', 'SLB', 'NEE', 'BA', 'CAT', 'DE', 'GE', 'UPS',
    'TSLA', 'MCD', 'HD', 'LVMUY', 'NKE', 'DUK', 'SO', 'AEP', 'EXC', 'FCX',
    'BHP', 'LIN', 'ALB', 'NEM', 'T', 'VZ', 'TMUS', 'S', 'XLF', 'XLK'
]

train_start = '2020-01-01'
train_end = '2024-03-30'
test_start = '2024-04-01'
test_end = '2025-04-01'

lambda_trend = 0.05
initial_investment = 10000

def download_data(name, tickers, start_date, end_date):
    os.makedirs('data', exist_ok=True)
    file_path = f'data/{name}.csv'
    if not os.path.exists(file_path):
        prices = yf.download(tickers, start=start_date, end=end_date, group_by='ticker', auto_adjust=True)
        prices.to_csv(file_path)

def load_data(name):
    file_path = f'data/{name}.csv'
    return pd.read_csv(file_path, header=[0, 1], index_col=0, parse_dates=True)

download_data('train_prices', tickers, train_start, train_end)
download_data('test_prices', tickers, test_start, test_end)

def get_returns(prices):
    if isinstance(prices.columns, pd.MultiIndex):
        close_prices = prices.xs('Close', level=1, axis=1)
    else:
        close_prices = prices[['Close']]
    close_prices = close_prices.ffill()
    monthly_prices = close_prices.resample('M').last()
    return monthly_prices.pct_change().dropna()

def extract_features(ticker, start_date, end_date):
    df = yf.Ticker(ticker).history(start=start_date, end=end_date)
    if df.empty:
        raise ValueError(f"No historical data for {ticker}")
    df.index = pd.to_datetime(df.index)
    shares_outstanding = yf.Ticker(ticker).info.get('sharesOutstanding', np.nan)
    df['IntradayReturn'] = (df['Close'] - df['Open']) / df['Open']
    df['TurnoverRatio'] = df['Volume'] / shares_outstanding if shares_outstanding else np.nan
    df['MarketCap'] = df['Close'] * shares_outstanding if shares_outstanding else np.nan
    df['MACD'] = MACD(df['Close']).macd()
    bb = BollingerBands(df['Close'])
    df['BB_Distance'] = (df['Close'] - bb.bollinger_lband()) / (bb.bollinger_hband() - bb.bollinger_lband())
    treasury = yf.Ticker('^TNX').history(start=start_date, end=end_date)
    if not treasury.empty:
        treasury.index = treasury.index.tz_localize(None)
        df.index = df.index.tz_localize(None)
        df['InterestRate'] = treasury['Close'].reindex(df.index, method='ffill') / 100
    else:
        df['InterestRate'] = np.nan
    monthly = pd.DataFrame()
    monthly['Open'] = df['Open'].resample('M').first()
    monthly['Close'] = df['Close'].resample('M').last()
    monthly['MonthlyReturn'] = (monthly['Close'] - monthly['Open']) / monthly['Open']
    monthly['MACD'] = df['MACD'].resample('M').last()
    monthly['BB_Distance'] = df['BB_Distance'].resample('M').last()
    monthly['MAD'] = df['IntradayReturn'].resample('M').apply(lambda x: np.mean(np.abs(x - x.mean())))
    monthly['TurnoverRatio'] = df['TurnoverRatio'].resample('M').mean()
    monthly['MarketCap'] = df['MarketCap'].resample('M').mean()
    monthly['InterestRate'] = df['InterestRate'].resample('M').last()
    monthly['Month'] = monthly.index.month
    monthly.dropna(inplace=True)
    monthly['Target'] = monthly['MonthlyReturn'].shift(-1)
    return monthly.dropna(subset=['Target'])

def enet_regression(df):
    features = ['MACD', 'BB_Distance', 'MAD', 'TurnoverRatio', 'MarketCap', 'InterestRate', 'Month']
    train_df = df.iloc[:int(len(df) * 0.7)]
    scaler_X = StandardScaler()
    X_train = scaler_X.fit_transform(train_df[features])
    scaler_y = StandardScaler()
    y_train = scaler_y.fit_transform(train_df['Target'].values.reshape(-1, 1)).ravel()
    model = ElasticNet(alpha=0.01, l1_ratio=0.7, random_state=42, selection='cyclic')
    model.fit(X_train, y_train)
    return model.coef_

def trend_score(ticker, features, window=3):
    try:
        df = extract_features(ticker, train_start, train_end)
    except:
        return 0.5
    scores = []
    for feature in features:
        if feature not in df.columns:
            continue
        recent = df[feature].dropna().iloc[-window:]
        if len(recent) < window:
            continue
        trend = (recent.iloc[-1] - recent.iloc[0]) / (abs(recent.iloc[0]) + 1e-6)
        if feature in ['MACD', 'TurnoverRatio'] and trend > 0:
            scores.append(1)
        elif feature in ['MarketCap', 'MAD', 'InterestRate', 'BB_Distance'] and trend < 0:
            scores.append(1)
        elif feature == 'Month':
            scores.append(0.5)
    return np.mean(scores) if scores else 0.5

def portfolio(returns, weights, initial_investment=10000):
    portfolio_returns = returns @ weights
    cumulative_returns = (1 + portfolio_returns).cumprod()
    return portfolio_returns, cumulative_returns * initial_investment

importance = []
for ticker in tickers:
    try:
        monthly_df = extract_features(ticker, train_start, train_end)
        coef = enet_regression(monthly_df)
        features_list = ['MACD', 'BB_Distance', 'MAD', 'TurnoverRatio', 'MarketCap', 'InterestRate', 'Month']
        sorted_features = [features_list[i] for i in np.argsort(np.abs(coef))[::-1]]
        importance.append({'Ticker': ticker, 'Top Features': sorted_features[:3]})
    except:
        continue
importance_df = pd.DataFrame(importance)
if importance_df.empty:
    raise ValueError("importance_df is empty. Check if extract_features() failed for all tickers.")
print(importance_df.head())

train_prices = load_data('train_prices')
monthly_returns = get_returns(train_prices)
mu = monthly_returns.mean()
Sigma = monthly_returns.cov()
tickers_with_data = monthly_returns.columns
tickers_with_features = importance_df['Ticker']
common_tickers = [ticker for ticker in tickers_with_data if ticker in tickers_with_features.values]
mu = mu[common_tickers]
Sigma = Sigma.loc[common_tickers, common_tickers]

trend_scores_1, trend_scores_2, trend_scores_3 = [], [], []
for ticker in common_tickers:
    top_features = importance_df[importance_df['Ticker'] == ticker]['Top Features'].values[0]
    trend_scores_1.append(trend_score(ticker, top_features[:1]))
    trend_scores_2.append(trend_score(ticker, top_features[:2]))
    trend_scores_3.append(trend_score(ticker, top_features[:3]))
trend_scores_1 = np.array(trend_scores_1)
trend_scores_2 = np.array(trend_scores_2)
trend_scores_3 = np.array(trend_scores_3)

n_assets = len(common_tickers)
target_return = mu.mean()
x0 = np.ones(n_assets) / n_assets
bounds = [(0, 1)] * n_assets
constraints = [{'type': 'eq', 'fun': lambda x: np.sum(x) - 1}, 
               {'type': 'ineq', 'fun': lambda x: x @ mu.values - target_return}]

pure_result = minimize(lambda x: x.T @ Sigma @ x, x0, bounds=bounds, constraints=constraints)
top1_result = minimize(lambda x: x.T @ Sigma @ x - lambda_trend * (trend_scores_1 @ x), x0, bounds=bounds, constraints=constraints)
top2_result = minimize(lambda x: x.T @ Sigma @ x - lambda_trend * (trend_scores_2 @ x), x0, bounds=bounds, constraints=constraints)
top3_result = minimize(lambda x: x.T @ Sigma @ x - lambda_trend * (trend_scores_3 @ x), x0, bounds=bounds, constraints=constraints)

pure_weights = pure_result.x
top1_weights = top1_result.x
top2_weights = top2_result.x
top3_weights = top3_result.x

test_prices = load_data('test_prices')
test_returns = get_returns(test_prices)
test_returns = test_returns[common_tickers]

pure_returns, pure_value = portfolio(test_returns, pure_weights, initial_investment)
top1_returns, top1_value = portfolio(test_returns, top1_weights, initial_investment)
top2_returns, top2_value = portfolio(test_returns, top2_weights, initial_investment)
top3_returns, top3_value = portfolio(test_returns, top3_weights, initial_investment)

plt.figure(figsize=(14, 7))
plt.plot(pure_value, label='Pure Markowitz', linewidth=2)
plt.plot(top1_value, label='Trend-Embedded Top-1', linewidth=2)
plt.plot(top2_value, label='Trend-Embedded Top-2', linewidth=2)
plt.plot(top3_value, label='Trend-Embedded Top-3', linewidth=2)
plt.title('Portfolio Value Growth ($10,000 Investment)', fontsize=16)
plt.xlabel('Date')
plt.ylabel('Portfolio Value ($)')
plt.legend()
plt.grid(True)
plt.show()

ValueError: importance_df is empty. Check if extract_features() failed for all tickers.