In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
import alphalens as al
import cvxpy as cp
import itertools
from datetime import datetime, timedelta
import nasdaqdatalink

# Configure Nasdaq Data Link credentials from a local file named "key"
# (presumably the API key; keeping it in a file avoids hard-coding it here).
nasdaqdatalink.read_key(filename="key")

# Step 1: Get S&P 500 Companies
def get_sp500_tickers():
    """Scrape the S&P 500 constituents table from Wikipedia.

    Returns:
        DataFrame indexed by 'Symbol' with a single 'GICS Sector' column,
        taken from the first HTML table on the page.
    """
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    wanted_columns = ['Symbol', 'GICS Sector']
    constituents = pd.read_html(url)[0]
    return constituents[wanted_columns].set_index('Symbol')

# Step 2: Universe Selection
def filter_universe(meta, tickers, exchange_select=None, currency_select=None, delisted_select=None, cap_select=None):
    """Narrow a ticker universe by exchange, currency, delisting flag and cap bucket.

    Args:
        meta: metadata frame indexed by ticker with 'exchange', 'currency',
            'isdelisted' and 'scalemarketcap' columns.
        tickers: index labels to start from (all must exist in ``meta``).
        exchange_select, currency_select, delisted_select, cap_select:
            lists of accepted values for the respective column; a falsy
            value (None / empty) disables that filter.

    Returns:
        (list of surviving tickers, the corresponding slice of ``meta``).
        Rows come out grouped by the order of the accepted values of the
        last filter that was applied.
    """
    def _keep_matching(frame, column, accepted):
        # A disabled filter passes the frame through untouched.
        if not accepted:
            return frame
        labels = []
        for value in accepted:
            labels.extend(frame[frame[column] == value].index)
        return frame.loc[labels]

    stages = (
        ('exchange', exchange_select),
        ('currency', currency_select),
        ('isdelisted', delisted_select),
        ('scalemarketcap', cap_select),
    )

    selected = meta.loc[tickers]  # restrict to the requested tickers first
    for column, accepted in stages:
        selected = _keep_matching(selected, column, accepted)

    return selected.index.tolist(), selected

# Step 3: Fetch OHLCV Data
def get_stock_data(tickers, start, end):
    """Download price/volume history for ``tickers`` between ``start`` and ``end``.

    Returns whatever ``yf.download`` produces, unmodified. Note that a later
    cell redefines ``get_stock_data``; once that cell has run, this version
    is shadowed.
    """
    return yf.download(tickers, start=start, end=end)

def dollar_volume_universe(tickers_num, ohlcv, sma_period):
    """Pick the most liquid tickers by smoothed dollar volume.

    Args:
        tickers_num: maximum number of tickers to return.
        ohlcv: frame whose 'Close' and 'Volume' entries are ticker-wide tables.
        sma_period: window length of the simple moving average.

    Returns:
        Ticker labels sorted by descending value of the latest
        ``sma_period``-row average of Close * Volume; tickers whose latest
        average is NaN are excluded.
    """
    traded_value = ohlcv['Close'] * ohlcv['Volume']

    # Only the most recent smoothed value per ticker is used for ranking.
    latest_average = traded_value.rolling(window=sma_period).mean().iloc[-1, :]

    ranking = pd.DataFrame({'dv': latest_average}).dropna()
    ranking = ranking.sort_values(by='dv', ascending=False)

    return list(ranking.iloc[:tickers_num].index)


# Step 5: Sector Filtering
def filter_by_sector(universe, meta_ex_cu_de_cap, sec_to_drop):
    """Remove tickers whose sector is listed in ``sec_to_drop``.

    Args:
        universe: iterable of ticker labels, in the order to be preserved.
        meta_ex_cu_de_cap: metadata frame indexed by ticker with a 'sector'
            column.
        sec_to_drop: sector names to exclude. A ``None`` / NaN entry means
            "drop tickers whose sector is missing" (either None or NaN in
            the metadata).

    Returns:
        List of the tickers that survive, in their original order. Tickers
        that cannot be looked up in the metadata are kept, because there is
        no sector to filter on.
    """
    def _is_missing(value):
        # Covers None, pd.NA and float NaN without tripping on strings.
        return value is None or value is pd.NA or (isinstance(value, float) and np.isnan(value))

    drop_missing = any(_is_missing(sec) for sec in sec_to_drop)
    named_sectors = [sec for sec in sec_to_drop if not _is_missing(sec)]

    kept = []
    for ticker in universe:
        try:
            sector = meta_ex_cu_de_cap.loc[ticker, 'sector']
        except KeyError:
            # Unknown ticker (or no 'sector' column): nothing to filter on.
            kept.append(ticker)
            continue

        # A duplicated ticker label yields a Series; judge it by its first row.
        if isinstance(sector, pd.Series):
            sector = sector.iloc[0]

        if _is_missing(sector):
            if not drop_missing:
                kept.append(ticker)
        elif sector not in named_sectors:
            kept.append(ticker)

    return kept


In [None]:
# Analysis window shared by every download in this notebook.
start_date = '2020-01-01'
end_date = '2024-01-01'

# Starting universe: current S&P 500 constituents (index = ticker symbol).
sp500_tickers = get_sp500_tickers()
tickers = sp500_tickers.index.tolist()


# Fetch metadata from Nasdaq Data Link
meta = nasdaqdatalink.get_table('SHARADAR/TICKERS', table='SF1', paginate=True)


# Index the metadata by ticker and keep only constituents it knows about,
# so that the label lookup inside filter_universe cannot fail.
meta = meta.set_index(['ticker'])
tickers = [t for t in tickers if t in meta.index]

# Stage 1: listed on the selected exchanges, USD-denominated, not delisted,
# mid cap or larger.
filtered_tickers,meta_ex_cu_de_cap = filter_universe(meta, tickers, exchange_select=['NYSE','NASDAQ','BATS'], currency_select=['USD'], delisted_select=['N'], cap_select=['6 - Mega', '5 - Large', '4 - Mid'])
# Stage 2: the 50 tickers with the highest 20-row average dollar volume.
ohlcv_data = get_stock_data(filtered_tickers, start_date, end_date)
second_universe = dollar_volume_universe(50, ohlcv_data, 20)


# Stage 3: drop the listed sectors; the None entry targets tickers without a sector.
final_universe = filter_by_sector(second_universe, meta_ex_cu_de_cap, sec_to_drop=['Financial Services',None])
print("Final Universe:", final_universe)


In [None]:
# Work with the first five tickers of the final universe only.
short_universe= final_universe[:5]
print(short_universe)

In [None]:
# Download price/volume history for the reduced universe over the analysis window.
ohlcv_short_universe= get_stock_data(short_universe, start_date, end_date)

In [17]:
# First six rows of the downloaded frame, kept as a small sample.
# NOTE(review): `data` is not passed to the run_alpha_generation call in the
# last cell shown here — confirm whether it is still needed.
data= ohlcv_short_universe[:6]

In [13]:
def get_benchmark(start, end):
    """Download the S&P 500 index (^GSPC) to use as the benchmark.

    Returns:
        (benchmark, dfm) where ``dfm`` is the downloaded frame with
        lower-cased OHLCV column names and its index named 'date', and
        ``benchmark`` is the percentage change of ``dfm['close']`` with the
        index localized to UTC.
    """
    lowercase_names = {
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume': 'volume',
    }
    dfm = yf.download('^GSPC', start, end).rename(columns=lowercase_names)
    print(dfm.head())
    dfm.index.name = 'date'

    benchmark = dfm['close'].pct_change()
    # Only the returns get a UTC-localized index; dfm keeps its original index.
    benchmark.index = benchmark.index.tz_localize('UTC')
    return benchmark, dfm

In [None]:
# Benchmark returns plus the underlying frame (dfm) for the analysis window.
benchmark_data,dfm = get_benchmark(start_date, end_date)
# Only the first six return observations are kept for display.
benchmark_data_short= benchmark_data[:6]
print("Benchmark Data:", benchmark_data_short)

In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import statsmodels.api as sm
import yfinance as yf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import Huber as huber_loss
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from collections import defaultdict


# Improved data preparation function with better ticker handling
def _stack_ticker_tables(data):
    """Turn per-field wide tables (field -> ticker columns) into one long frame."""
    frames = []
    for ticker in data['Close'].columns:
        ticker_data = pd.DataFrame({
            column: data[column][ticker]
            for column in ('Open', 'High', 'Low', 'Close', 'Volume')
            if column in data
        })
        ticker_data['Ticker'] = ticker
        ticker_data['Date'] = ticker_data.index
        frames.append(ticker_data)

    if not frames:
        raise ValueError("No tickers found in the 'Close' table")

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(frames)


def prepare_yf_data_for_modeling(data):
    """Convert Yahoo Finance data to the long (Date, Ticker)-indexed format used by the model.

    Args:
        data: one of
            * a DataFrame with MultiIndex columns (field, ticker),
            * a dict whose 'Close' entry is a ticker-wide DataFrame,
            * a flat single-ticker DataFrame, or a dict convertible to one.

    Returns:
        DataFrame indexed by ('Date', 'Ticker') holding the available
        Open/High/Low/Close/Volume columns plus an integer 'Ticker_Code'.
        ``result.attrs['ticker_mapping']`` maps each code back to its ticker.
        Single-ticker input without a 'Ticker' column is labelled 'SINGLE'.

    Raises:
        TypeError: if ``data`` is neither a DataFrame nor a dict.
        ValueError: if multi-ticker input contains no tickers.
    """
    if isinstance(data, dict):
        if 'Close' in data and isinstance(data['Close'], pd.DataFrame):
            # Dictionary of ticker-wide tables (multi-ticker case)
            result_df = _stack_ticker_tables(data)
        else:
            # Simple dictionary (single-ticker case)
            result_df = pd.DataFrame(data)
            if 'Ticker' not in result_df.columns:
                result_df['Ticker'] = 'SINGLE'
    elif isinstance(data, pd.DataFrame):
        if isinstance(data.columns, pd.MultiIndex):
            result_df = _stack_ticker_tables(data)
        else:
            result_df = data.copy()
            if 'Ticker' not in result_df.columns:
                result_df['Ticker'] = 'SINGLE'
    else:
        raise TypeError(f"Expected DataFrame or dict, got {type(data)}")

    # Make sure a 'Date' column exists before building the MultiIndex.
    if 'Date' not in result_df.columns:
        if isinstance(result_df.index, pd.DatetimeIndex):
            # Rename the index first so it becomes 'Date' whatever it was
            # called (None, 'date', ...); otherwise the dates would be lost
            # and replaced by row numbers.
            result_df = result_df.rename_axis('Date').reset_index()
        else:
            result_df['Date'] = result_df.index

    # Create ticker encoding mapping
    result_df['Ticker_Code'], ticker_mapping = pd.factorize(result_df['Ticker'])

    # Set MultiIndex
    result_df = result_df.set_index(['Date', 'Ticker'])

    print(f"Prepared data with shape: {result_df.shape}")
    print(f"Columns: {result_df.columns.tolist()}")
    print(f"Tickers found: {ticker_mapping.tolist()}")

    # Store ticker mapping for later decoding
    result_df.attrs['ticker_mapping'] = dict(enumerate(ticker_mapping))

    return result_df

def get_stock_data(tickers, start, end):
    """Download OHLCV history and tag flat (single-ticker) results with the ticker.

    When the download comes back as a DataFrame without MultiIndex columns,
    a copy is returned with two extra columns: 'Ticker' (the requested
    symbol, or the first one if a sequence was given) and 'Ticker_Code'
    (always 0). Anything else is returned unchanged.

    This definition replaces the earlier ``get_stock_data`` in the notebook.
    """
    downloaded = yf.download(tickers, start=start, end=end)

    has_flat_columns = (
        isinstance(downloaded, pd.DataFrame)
        and not isinstance(downloaded.columns, pd.MultiIndex)
    )
    if not has_flat_columns:
        return downloaded

    symbol = tickers if isinstance(tickers, str) else tickers[0]
    tagged = downloaded.copy()
    tagged['Ticker'] = symbol
    tagged['Ticker_Code'] = 0  # a lone ticker always gets code 0
    return tagged

def calculate_indicators(df):
    """Add MA_50, RSI and MACD columns, computed ticker by ticker.

    Args:
        df: frame with a 'Ticker' index level and a 'Close' column.

    Returns:
        A copy of ``df`` with the three indicator columns filled in. The
        moving-average window is capped at the ticker's row count and the
        RSI period at one less than that.

    Raises:
        ValueError: if 'Close' is missing.
    """
    if 'Close' not in df.columns:
        raise ValueError("DataFrame must contain 'Close' column")

    # Work on a copy so the caller's frame is left untouched.
    result_df = df.copy()
    ticker_level = df.index.get_level_values('Ticker')

    for ticker in ticker_level.unique():
        closes = df.xs(ticker, level='Ticker')['Close']
        n_rows = len(closes)
        rows = ticker_level == ticker  # boolean mask of this ticker's rows

        moving_average = closes.rolling(window=min(50, n_rows)).mean()
        rsi = calculate_rsi(closes, period=min(14, n_rows - 1))
        macd_hist = calculate_macd(closes)

        result_df.loc[rows, 'MA_50'] = moving_average.values
        result_df.loc[rows, 'RSI'] = rsi.values
        result_df.loc[rows, 'MACD'] = macd_hist.values

    return result_df

def calculate_rsi(series, period=14):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Args:
        series: price series.
        period: rolling window length.

    Returns:
        Series of RSI values on a 0-100 scale; NaN until ``period``
        observations are available.
    """
    change = series.diff()
    ups = change.where(change > 0, 0)
    downs = -change.where(change < 0, 0)

    mean_up = ups.rolling(window=period).mean()
    mean_down = downs.rolling(window=period).mean()

    # A zero average loss is swapped for a tiny positive number so the
    # ratio stays finite.
    safe_mean_down = mean_down.replace(0, np.nan).fillna(0.00001)
    rs = mean_up / safe_mean_down
    return 100 - (100 / (1 + rs))

def calculate_macd(series, short_window=12, long_window=26, signal_window=9):
    """MACD histogram: (short EMA - long EMA) minus its own signal-line EMA.

    All exponential averages use ``adjust=False``.
    """
    def _ema(values, span):
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, short_window) - _ema(series, long_window)
    signal_line = _ema(macd_line, signal_window)
    return macd_line - signal_line

def prepare_data(df, look_back=5):
    """Build sliding-window samples for the LSTM from the indicator frame.

    NOTE: ``df`` is modified in place — a float 'Next Day Return' column is
    added and every row containing a NaN is dropped. Later pipeline stages
    that reuse the same frame therefore see the trimmed data.

    Args:
        df: frame indexed by ('Date', 'Ticker') with at least 'Open',
            'Close', 'MA_50', 'RSI', 'MACD' and 'Ticker_Code' columns.
        look_back: number of consecutive rows per input window.

    Returns:
        (X, y, ticker_codes, tickers) as numpy arrays, where
        ``X[k]`` holds ``look_back`` rows of the five features,
        ``y[k]`` is the 'Next Day Return' (in percent) of the row right
        after that window, and ``ticker_codes[k]`` / ``tickers[k]`` identify
        the ticker the sample came from.
        Returns ``(None, None, None, None)`` when no sample can be built.
    """
    # Percentage change from each row's close to the following row's close,
    # computed within each ticker so that values never leak across tickers.
    df['Next Day Return'] = df.groupby(level='Ticker')['Close'].transform(
        lambda closes: closes.pct_change().shift(-1) * 100
    )
    df.dropna(inplace=True)

    if df.empty:
        print("Error: No data available after calculating indicators and returns")
        return None, None, None, None

    features = ['Open', 'Close', 'MA_50', 'RSI', 'MACD']
    X = []
    y = []
    ticker_codes = []
    tickers = []

    for ticker in df.index.get_level_values('Ticker').unique():
        ticker_df = df.xs(ticker, level='Ticker')

        if len(ticker_df) <= look_back:
            print(f"Warning: Not enough data points for ticker {ticker}. Skipping.")
            continue

        # Best effort per ticker: a failure on one ticker must not abort the rest.
        try:
            if not all(feature in ticker_df.columns for feature in features):
                print(f"Warning: Missing features for ticker {ticker}. Skipping.")
                continue

            if ticker_df[features].isna().all().any():
                print(f"Warning: All NaN values in at least one feature for ticker {ticker}. Skipping.")
                continue

            # The code is constant per ticker, so the first row is enough.
            ticker_code = ticker_df['Ticker_Code'].iloc[0]

            ticker_features = ticker_df[features].values
            ticker_target = ticker_df['Next Day Return'].values

            for i in range(look_back, len(ticker_features)):
                window = ticker_features[i - look_back:i]
                # Skip any sample with a gap in its inputs or its target.
                if np.isnan(window).any() or np.isnan(ticker_target[i]):
                    continue

                X.append(window)
                y.append(ticker_target[i])
                ticker_codes.append(ticker_code)
                tickers.append(ticker)
        except Exception as e:
            print(f"Error processing ticker {ticker}: {e}")
            continue

    if len(X) == 0:
        print("Error: No data points generated after filtering")
        return None, None, None, None

    X = np.array(X)
    y = np.array(y)
    ticker_codes = np.array(ticker_codes)
    tickers = np.array(tickers)

    print(f"Prepared data with shapes X:{X.shape}, y:{y.shape}")

    return X, y, ticker_codes, tickers


def train_lstm_model(X_train, y_train, epochs=300, batch_size=32):
    """Train a stacked bidirectional LSTM regressor on windowed features.

    The inputs are standardised with the mean/std taken over the sample
    axis. Those statistics are stored on the returned model as
    ``model.mean`` and ``model.std``; callers must apply the same
    ``(X - model.mean) / (model.std + 1e-8)`` transform before predicting.

    Args:
        X_train: 3-D array of input windows.
        y_train: 1-D array of targets, one per window.
        epochs: upper bound on training epochs (early stopping may end sooner).
        batch_size: mini-batch size.

    Returns:
        The fitted Keras model, or ``None`` if the inputs are invalid or
        training raises.
    """
    if X_train is None or y_train is None:
        print("Error: No training data available")
        return None
    if len(X_train) == 0 or len(y_train) == 0:
        print("Error: Empty training data")
        return None
    if len(X_train.shape) != 3:
        print(f"Error: Expected 3D array for X_train, got shape {X_train.shape}")
        return None

    try:
        # Data normalization (statistics over the sample axis)
        mean = X_train.mean(axis=0)
        std = X_train.std(axis=0)
        X_train_normalized = (X_train - mean) / (std + 1e-8)

        model = Sequential([
            # Bidirectional LSTM layers for better pattern recognition
            Bidirectional(LSTM(100, return_sequences=True,
                             kernel_regularizer=l2(1e-5),
                             recurrent_regularizer=l2(1e-5),
                             dropout=0.2,
                             recurrent_dropout=0.2),
                         input_shape=(X_train.shape[1], X_train.shape[2])),

            BatchNormalization(),

            Bidirectional(LSTM(50, return_sequences=True,
                             kernel_regularizer=l2(1e-5),
                             recurrent_regularizer=l2(1e-5),
                             dropout=0.2,
                             recurrent_dropout=0.2)),

            BatchNormalization(),

            Bidirectional(LSTM(25, return_sequences=False,
                             kernel_regularizer=l2(1e-5),
                             recurrent_regularizer=l2(1e-5),
                             dropout=0.2,
                             recurrent_dropout=0.2)),

            BatchNormalization(),

            Dense(50, activation='relu', kernel_regularizer=l2(1e-5)),
            Dropout(0.2),

            Dense(25, activation='relu', kernel_regularizer=l2(1e-5)),
            Dropout(0.2),

            Dense(1, activation='linear')
        ])

        optimizer = Adam(learning_rate=0.001)

        # `huber_loss` is the Huber loss *class* (see the import alias), so it
        # has to be instantiated; passing the bare class is not a valid loss
        # on every Keras version.
        model.compile(optimizer=optimizer,
                     loss=huber_loss(),
                     metrics=['mae', 'mse'])

        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        )

        reduce_lr = ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=5,
            min_lr=1e-6
        )

        # Train with validation split
        # NOTE(review): samples arrive grouped by ticker, so the held-out 20%
        # may cover only the last ticker(s) — confirm this is intended.
        history = model.fit(
            X_train_normalized,
            y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=[early_stopping, reduce_lr],
            verbose=1
        )

        # Store normalization parameters as model attributes
        model.mean = mean
        model.std = std

        return model  # Return only the model

    except Exception as e:
        print(f"Error during model training: {e}")
        return None

def _cap_and_normalize(raw_weights, cap):
    """Scale non-negative weights to sum to 1 while keeping each one at or below ``cap``."""
    weights = np.asarray(raw_weights, dtype=float).copy()
    total = weights.sum()
    if weights.size == 0 or not total > 0:
        return weights

    weights /= total
    # A cap below 1/n cannot be met while staying fully invested; relax it
    # to the closest feasible value (which yields equal weights).
    cap = max(cap, 1.0 / weights.size)

    capped = np.zeros(weights.size, dtype=bool)
    for _ in range(weights.size):
        over = (weights > cap + 1e-12) & ~capped
        if not over.any():
            break
        capped |= over
        weights[capped] = cap
        free_total = weights[~capped].sum()
        if free_total <= 0:
            # Nothing left to absorb the excess: stay under-invested rather
            # than breach the cap.
            break
        # Hand the remaining budget to the uncapped names, pro rata.
        weights[~capped] *= (1.0 - cap * capped.sum()) / free_total
    return weights


def generate_signals_and_weights(predictions, ticker_codes, tickers, ticker_mapping, max_position_size=0.25):
    """Turn per-sample predictions into one signal and signed weight per ticker.

    For every ticker the predictions are averaged; the signal is BUY / SELL
    when the average is above / below +/- half of the predictions' standard
    deviation, otherwise HOLD. Weights are proportional to
    ``|avg| * |avg| / std``, limited to ``max_position_size`` per ticker,
    expressed in percent, negative for SELL and zero for HOLD.

    Args:
        predictions: predicted returns, one per sample.
        ticker_codes: integer ticker code of each sample.
        tickers: ticker label of each sample (array or list).
        ticker_mapping: accepted for interface compatibility; not used.
        max_position_size: largest allowed absolute weight as a fraction
            (default 0.25). If it is smaller than 1/n for n tickers it is
            relaxed to 1/n.

    Returns:
        DataFrame with columns Ticker_Code, Ticker, Predicted_Return,
        Signal, Weight — or ``None`` if anything goes wrong (the error is
        printed).
    """
    try:
        # Group predictions by ticker and remember each ticker's code from
        # its first sample.
        ticker_predictions = {}
        first_code = {}
        for position, (ticker, pred) in enumerate(zip(tickers, predictions)):
            ticker_predictions.setdefault(ticker, []).append(pred)
            if ticker not in first_code:
                first_code[ticker] = ticker_codes[position]

        portfolio_data = []
        for ticker, preds in ticker_predictions.items():
            avg_prediction = np.mean(preds)
            pred_std = np.std(preds)  # Measure of prediction uncertainty

            # Signal only when the average clears half a standard deviation.
            confidence_threshold = 0.5 * pred_std
            if avg_prediction > confidence_threshold:
                signal = 'BUY'
            elif avg_prediction < -confidence_threshold:
                signal = 'SELL'
            else:
                signal = 'HOLD'

            confidence_score = abs(avg_prediction) / (pred_std + 1e-8)

            portfolio_data.append({
                'Ticker_Code': first_code[ticker],
                'Ticker': ticker,
                'Predicted_Return': avg_prediction,
                'Prediction_Std': pred_std,
                'Confidence_Score': confidence_score,
                'Signal': signal,
                'Prediction_Count': len(preds)
            })

        portfolio = pd.DataFrame(portfolio_data)

        if len(portfolio) == 0:
            raise ValueError("No portfolio data generated")

        portfolio['Abs_Return'] = np.abs(portfolio['Predicted_Return'])
        portfolio['Risk_Adjusted_Return'] = portfolio['Abs_Return'] * portfolio['Confidence_Score']

        total_risk_adj_return = portfolio['Risk_Adjusted_Return'].sum()

        if total_risk_adj_return > 0:
            # Clipping and then renormalising once can push a weight back
            # above the cap, so the cap is enforced iteratively instead.
            portfolio['Raw_Weight'] = _cap_and_normalize(
                portfolio['Risk_Adjusted_Return'].to_numpy(), max_position_size
            )

            direction = np.where(
                portfolio['Signal'] == 'BUY', 1,
                np.where(portfolio['Signal'] == 'SELL', -1, 0)
            )
            portfolio['Weight'] = direction * portfolio['Raw_Weight'] * 100
        else:
            # Equal weighting over the most confident names.
            max_positions = min(len(portfolio), 10)  # Maximum number of positions
            weight = 100 / max_positions

            portfolio = portfolio.sort_values('Confidence_Score', ascending=False)
            # Rank by position after sorting; the index labels still carry
            # the pre-sort order and must not be used here.
            in_top = np.arange(len(portfolio)) < max_positions
            portfolio['Weight'] = np.where(
                in_top,
                np.where(
                    portfolio['Signal'] == 'BUY', weight,
                    np.where(portfolio['Signal'] == 'SELL', -weight, 0)
                ),
                0
            )

        # Keep original columns for output consistency
        portfolio = portfolio[['Ticker_Code', 'Ticker', 'Predicted_Return', 'Signal', 'Weight']]
        return portfolio

    except Exception as e:
        print(f"Error in signal generation: {e}")
        return None

def backtest_portfolio(portfolio, benchmark_df, stock_data, initial_capital=10000):
    """Simulate the portfolio's value over the benchmark's dates.

    The same percentage weights are applied to every day's returns, and
    each day's value is the previous value times (1 + weighted return).

    Args:
        portfolio: frame with 'Ticker' and 'Weight' (percent) columns.
        benchmark_df: frame with a 'close' column; its index defines the dates.
        stock_data: frame with a 'Ticker' index level and a 'Close' column.
        initial_capital: starting portfolio value.

    Returns:
        Frame with 'Benchmark', one close-price column per ticker and
        'Portfolio', restricted to dates where every series has data;
        ``None`` on any error (the error is printed).
    """
    try:
        symbols = portfolio['Ticker'].values
        fractions = portfolio['Weight'].values / 100  # percent -> fraction

        # Benchmark first, then each ticker aligned to the benchmark dates.
        df_prices = pd.DataFrame(index=benchmark_df.index)
        df_prices['Benchmark'] = benchmark_df['close']
        for symbol in symbols:
            closes = stock_data.xs(symbol, level='Ticker')['Close']
            df_prices[symbol] = closes.reindex(benchmark_df.index)

        # Keep only dates on which every series is available.
        df_prices = df_prices.dropna()
        if len(df_prices) == 0:
            raise ValueError("No overlapping dates found between stocks and benchmark")

        weighted_returns = df_prices[symbols].pct_change() * fractions

        # Value path: starts at the initial capital and compounds day by day.
        growth = [initial_capital]
        for row in range(1, len(df_prices)):
            growth.append(1 + weighted_returns.iloc[row].sum())
        df_prices['Portfolio'] = np.cumprod(np.array(growth, dtype=float))

        return df_prices
    except Exception as e:
        print(f"Error in backtesting: {e}")
        return None

def _compute_performance_metrics(df_prices, initial_capital, risk_free_rate):
    """Compute return, risk and benchmark-relative metrics from the price frame."""
    portfolio_daily_returns = df_prices['Portfolio'].pct_change().dropna()
    benchmark_daily_returns = df_prices['Benchmark'].pct_change().dropna()

    portfolio_growth = df_prices['Portfolio'].iloc[-1] / initial_capital
    benchmark_growth = df_prices['Benchmark'].iloc[-1] / df_prices['Benchmark'].iloc[0]

    # Time period in years, assuming 252 rows per year
    years = len(df_prices) / 252

    portfolio_std = portfolio_daily_returns.std() * np.sqrt(252)
    sharpe = (np.mean(portfolio_daily_returns) * 252 - risk_free_rate) / portfolio_std

    # Maximum drawdown: worst fall from a running peak
    rolling_max = df_prices['Portfolio'].cummax()
    drawdowns = (df_prices['Portfolio'] - rolling_max) / rolling_max

    # Beta from population covariance / variance; alpha as the annualised
    # return in excess of what beta alone would predict.
    covariance = np.cov(portfolio_daily_returns, benchmark_daily_returns, ddof=0)[0, 1]
    variance = np.var(benchmark_daily_returns, ddof=0)
    beta = covariance / variance
    expected_return = risk_free_rate + beta * (np.mean(benchmark_daily_returns) * 252 - risk_free_rate)
    actual_return = np.mean(portfolio_daily_returns) * 252

    return {
        'portfolio_total_return': (portfolio_growth - 1) * 100,
        'benchmark_total_return': (benchmark_growth - 1) * 100,
        'portfolio_cagr': ((portfolio_growth ** (1 / years)) - 1) * 100,
        'benchmark_cagr': ((benchmark_growth ** (1 / years)) - 1) * 100,
        'max_drawdown': drawdowns.min() * 100,
        'sharpe_ratio': sharpe,
        'win_rate': (portfolio_daily_returns > 0).mean() * 100,
        'alpha': (actual_return - expected_return) * 100,
        'beta': beta
    }


def evaluate_performance(df_prices, initial_capital=10000, risk_free_rate=0.02):
    """Report portfolio performance against the benchmark and plot both curves.

    Args:
        df_prices: frame with 'Portfolio' and 'Benchmark' columns, one row
            per trading day.
        initial_capital: starting portfolio value used for total return and
            CAGR. It must equal the capital the 'Portfolio' series was
            simulated with, otherwise those two figures are distorted.
        risk_free_rate: annual risk-free rate used for Sharpe and alpha.

    Returns:
        Dict of metrics (returns, CAGR, max drawdown and win rate in
        percent; Sharpe ratio; alpha in percent; beta), or ``None`` if
        anything fails (the error is printed).
    """
    try:
        metrics = _compute_performance_metrics(df_prices, initial_capital, risk_free_rate)

        # Print metrics
        print("\n📊 PORTFOLIO PERFORMANCE REPORT 📊")
        print(f"Portfolio Total Return: {metrics['portfolio_total_return']:.2f}%")
        print(f"Benchmark Total Return: {metrics['benchmark_total_return']:.2f}%")
        print(f"Portfolio CAGR: {metrics['portfolio_cagr']:.2f}%")
        print(f"Benchmark CAGR: {metrics['benchmark_cagr']:.2f}%")
        print(f"Max Drawdown: {metrics['max_drawdown']:.2f}%")
        print(f"Sharpe Ratio: {metrics['sharpe_ratio']:.2f}")
        print(f"Win Rate: {metrics['win_rate']:.2f}%")
        print(f"Alpha: {metrics['alpha']:.2f}%")
        print(f"Beta: {metrics['beta']:.2f}")

        # Plotting: the benchmark is rescaled to start at the portfolio's
        # first value so the two curves are directly comparable.
        plt.figure(figsize=(12,6))
        plt.plot(df_prices.index, df_prices['Portfolio'], label='Portfolio', linewidth=2)
        plt.plot(df_prices.index,
                df_prices['Benchmark'] * (df_prices['Portfolio'].iloc[0] / df_prices['Benchmark'].iloc[0]),
                label='Benchmark (Normalized)',
                linestyle='--')
        plt.title('Portfolio vs Benchmark Performance')
        plt.xlabel('Date')
        plt.ylabel('Value ($)')
        plt.legend()
        plt.grid(True)
        plt.show()

        return metrics

    except Exception as e:
        print(f"Error in performance evaluation: {e}")
        return None


def run_alpha_generation(tickers, start_date, end_date, data=None, benchmark_data_short=None, dfm=None, initial_capital=1000):
    """Run the full pipeline: data -> indicators -> LSTM -> portfolio -> backtest -> report.

    Args:
        tickers: symbols to download when ``data`` is not supplied.
        start_date, end_date: download window.
        data: optional pre-downloaded price data; skips the stock download.
        benchmark_data_short, dfm: optional pre-fetched benchmark objects.
            Both must be given to skip the benchmark download; ``dfm`` needs
            a 'close' column.
        initial_capital: starting capital, used consistently for both the
            backtest and the performance report.

    Returns:
        (portfolio, df_prices, performance_metrics); all three are ``None``
        when a stage fails (a message is printed).
    """
    try:
        # 1. Get the data if not provided
        stock_data = get_stock_data(tickers, start_date, end_date) if data is None else data

        if benchmark_data_short is None or dfm is None:
            # Fetch benchmark data if not provided
            benchmark_data_short = yf.download('^GSPC', start=start_date, end=end_date)
            dfm = benchmark_data_short.reset_index().rename(columns={'Date': 'date', 'Close': 'close'}).set_index('date')

        # 2. Prepare the data for modeling
        prepared_data = prepare_yf_data_for_modeling(stock_data)
        prepared_data = calculate_indicators(prepared_data)

        # Get ticker mapping from prepared data
        ticker_mapping = prepared_data.attrs.get('ticker_mapping', {})

        # 3. Prepare data for LSTM
        X, y, ticker_codes, tickers_array = prepare_data(prepared_data)
        if X is None or len(X) == 0:
            print("Data preparation failed. Cannot proceed with model training.")
            return None, None, None

        # 4. Train the model
        model = train_lstm_model(X, y)
        if model is None:
            print("Model training failed. Cannot proceed with portfolio construction.")
            return None, None, None

        # 5. Generate predictions and create portfolio
        try:
            # The model was fitted on standardised inputs, so the same
            # transform has to be applied before predicting.
            mean = getattr(model, 'mean', None)
            std = getattr(model, 'std', None)
            if mean is not None and std is not None:
                model_inputs = (X - mean) / (std + 1e-8)
            else:
                model_inputs = X

            predictions = model.predict(model_inputs).flatten()
            portfolio = generate_signals_and_weights(predictions, ticker_codes, tickers_array, ticker_mapping)

            print("\n📊 Portfolio Construction:")
            print(portfolio)

            # 6. Backtest the portfolio with the same capital the report uses;
            # otherwise total return and CAGR are computed against the wrong base.
            df_prices = backtest_portfolio(portfolio, dfm, prepared_data, initial_capital=initial_capital)

            # 7. Evaluate performance
            performance_metrics = evaluate_performance(df_prices, initial_capital)

            return portfolio, df_prices, performance_metrics
        except Exception as e:
            print(f"Error during portfolio construction or backtesting: {e}")
            return None, None, None

    except Exception as e:
        print(f"Error in alpha generation pipeline: {e}")
        import traceback
        traceback.print_exc()
        return None, None, None

In [None]:
# Run the whole pipeline on the reduced universe. No pre-downloaded data or
# benchmark is passed, so run_alpha_generation fetches both itself.
tickers= short_universe
portfolio, performance_data, metrics = run_alpha_generation(tickers, start_date, end_date)