# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import statsmodels.api as sm

# Build a synthetic price table: 200 consecutive daily dates, the first 100
# rows labelled AAPL and the remaining 100 labelled MSFT, with uniformly
# random Open/Close prices between 100 and 200.
data = dict(
    Date=pd.date_range(start="2023-01-01", periods=200, freq="D"),
    Ticker=["AAPL"] * 100 + ["MSFT"] * 100,
    Open=np.random.uniform(low=100, high=200, size=200),
    Close=np.random.uniform(low=100, high=200, size=200),
)
df = pd.DataFrame(data)

# Step 1: Data Preparation
def calculate_indicators(df):
    """Add MA_50, RSI and MACD columns, computed separately for each ticker.

    Args:
        df: DataFrame with at least 'Ticker' and 'Close' columns. It is not
            modified; every group is copied before columns are added.

    Returns:
        A new DataFrame holding the per-ticker groups stacked in the order
        ``groupby`` yields them (sorted by ticker), with the original index
        labels and three extra columns: 'MA_50', 'RSI' and 'MACD'. When there
        are no groups, an empty frame with the same columns is returned.
    """
    enriched_groups = []

    for _, group in df.groupby('Ticker'):
        group = group.copy()
        closes = group['Close']
        # The windows shrink for tickers with fewer rows than the nominal
        # 50 / 14 periods.
        group['MA_50'] = closes.rolling(window=min(50, len(group))).mean()
        group['RSI'] = calculate_rsi(closes, period=min(14, len(group) - 1))
        group['MACD'] = calculate_macd(closes)
        enriched_groups.append(group)

    if not enriched_groups:
        # pd.concat rejects an empty list, so build the empty result directly
        # while still exposing the indicator columns to callers.
        return df.iloc[:0].assign(MA_50=np.nan, RSI=np.nan, MACD=np.nan)

    # Concatenate once: growing a frame inside the loop copies all previous
    # rows on every iteration and starts from an empty, column-less frame.
    return pd.concat(enriched_groups)

def calculate_rsi(series, period=14):
    """Return the Relative Strength Index of *series* from simple rolling means.

    Gains and losses are the positive and negative parts of the one-step
    differences; each is averaged over a rolling window of ``period`` changes.

    Args:
        series: pandas Series of prices.
        period: Number of price changes per averaging window.

    Returns:
        A Series aligned with *series*, with values in [0, 100]. Entries are
        NaN until ``period`` price changes are available. A window with no
        losses gives exactly 100, a window with no gains gives 0, and a window
        with no price movement at all gives the neutral value 50.
    """
    delta = series.diff()
    # clip() keeps the leading NaN produced by diff(), so the undefined first
    # change is not counted as a zero move inside the first window.
    gain = delta.clip(lower=0)
    loss = (-delta).clip(lower=0)
    avg_gain = gain.rolling(window=period).mean()
    avg_loss = loss.rolling(window=period).mean()

    # 100 * gain / (gain + loss) is algebraically the same as
    # 100 - 100 / (1 + gain / loss) but stays finite when loss is zero,
    # so no epsilon substitution is needed.
    total_move = avg_gain + avg_loss
    rsi = 100 * avg_gain / total_move.where(total_move != 0)
    return rsi.mask(total_move == 0, 50.0)

def calculate_macd(series, short_window=12, long_window=26, signal_window=9):
    """Return the MACD histogram of *series*.

    The MACD line is the short-span EMA minus the long-span EMA; the signal
    line is an EMA of the MACD line. The value returned is their difference
    (MACD line minus signal line), not the MACD line itself.

    Args:
        series: pandas Series of prices.
        short_window: Span of the fast EMA.
        long_window: Span of the slow EMA.
        signal_window: Span of the EMA applied to the MACD line.

    Returns:
        A Series aligned with *series*.
    """
    def _ema(values, span):
        # Recursive (adjust=False) exponential moving average.
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, short_window) - _ema(series, long_window)
    return macd_line - _ema(macd_line, signal_window)

# Attach the indicator columns, then give every ticker an integer code
# (factorize numbers the tickers in order of first appearance).
df = calculate_indicators(df)
df['Ticker Encoding'], ticker_mapping = df['Ticker'].factorize()
print(f"Ticker mapping: {dict(enumerate(ticker_mapping))}")

# Step 2: Model Training
def prepare_data(df, look_back=5):
    """Build sliding-window training samples from the indicator frame.

    For every ticker the next-day percentage change of 'Close' is computed,
    rows containing any NaN are discarded, and each run of ``look_back``
    consecutive remaining rows becomes one sample.

    Args:
        df: DataFrame with the columns 'Ticker', 'Ticker Encoding', 'Open',
            'Close', 'MA_50', 'RSI' and 'MACD'. The frame is left untouched;
            all work happens on a copy.
        look_back: Number of rows per sample window. Must be at least 1.

    Returns:
        ``(X, y, ticker_encodings, tickers)`` as NumPy arrays, where ``X`` has
        shape ``(samples, look_back, 5)`` and the other three have one entry
        per sample. ``(None, None, None, None)`` is returned, after printing
        a message, when no sample could be built.
    """
    no_data = (None, None, None, None)

    if look_back < 1:
        print(f"Error: look_back must be at least 1, got {look_back}")
        return no_data

    # Work on a copy so the caller's frame keeps all of its rows and gains no
    # extra column.
    data = df.copy()
    if not data.empty:
        # Return from each row's close to the following row's close, per ticker.
        data['Next Day Return'] = data.groupby('Ticker')['Close'].transform(
            lambda close: close.pct_change().shift(-1) * 100
        )
    # Drops indicator warm-up rows and each ticker's last row (no next close).
    data = data.dropna()

    if data.empty:
        print("Error: No data available after calculating indicators and returns")
        return no_data

    features = ['Open', 'Close', 'MA_50', 'RSI', 'MACD']
    missing_features = [name for name in features if name not in data.columns]

    X = []
    y = []
    ticker_encodings = []
    tickers = []  # Actual ticker symbols, one per sample

    for ticker, ticker_df in data.groupby('Ticker'):
        if len(ticker_df) <= look_back:
            print(f"Warning: Not enough data points for ticker {ticker}. Skipping.")
            continue

        if missing_features:
            print(f"Warning: Missing features for ticker {ticker}. Skipping.")
            continue

        # Deliberate best-effort boundary: a ticker with unusable data (for
        # example non-numeric features or no encoding column) is reported and
        # skipped so the remaining tickers are still processed.
        try:
            # The float cast rejects non-numeric columns; NaN checks are not
            # needed here because dropna() above removed every NaN row.
            ticker_features = ticker_df[features].to_numpy(dtype=float)
            ticker_target = ticker_df['Next Day Return'].to_numpy(dtype=float)
            # All rows of one ticker are expected to share the same encoding.
            ticker_encoding = ticker_df['Ticker Encoding'].values[0]

            n_rows = len(ticker_features)
            n_samples = n_rows - look_back

            # NOTE(review): the window covers rows [end - look_back, end) while
            # the target is row `end`'s own next-day return, so row `end`'s
            # features are never part of its sample. Confirm this one-row gap
            # is intended.
            X.extend(
                ticker_features[end - look_back:end]
                for end in range(look_back, n_rows)
            )
            y.extend(ticker_target[look_back:])
            ticker_encodings.extend([ticker_encoding] * n_samples)
            tickers.extend([ticker] * n_samples)
        except Exception as e:
            print(f"Error processing ticker {ticker}: {e}")
            continue

    if not X:
        print("Error: No data points generated after filtering")
        return no_data

    X = np.array(X)
    y = np.array(y)
    ticker_encodings = np.array(ticker_encodings)
    tickers = np.array(tickers)

    print(f"Prepared data for {np.unique(tickers)} with shapes X:{X.shape}, y:{y.shape}")

    return X, y, ticker_encodings, tickers

def train_lstm_model(X_train, y_train, epochs=1, batch_size=32):
    """Fit a two-layer LSTM regressor on windowed samples.

    Args:
        X_train: Array-like of shape (samples, time steps, features).
        y_train: Array-like with one target per sample.
        epochs: Number of training epochs passed to ``fit``.
        batch_size: Batch size passed to ``fit``.

    Returns:
        The fitted Keras model, or None (after printing the reason) when the
        inputs are missing, empty, wrongly shaped or inconsistent, or when
        building or fitting the model raises.
    """
    if X_train is None or y_train is None:
        print("Error: No training data available")
        return None

    # Accept plain sequences as well as arrays: the shape checks below need
    # ndarray attributes.
    try:
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
    except (TypeError, ValueError) as e:
        print(f"Error: Training data could not be converted to arrays: {e}")
        return None

    if X_train.size == 0 or y_train.size == 0:
        print("Error: Empty training data")
        return None

    # Verify that X_train has 3 dimensions: (samples, time steps, features)
    if X_train.ndim != 3:
        print(f"Error: Expected 3D array for X_train, got shape {X_train.shape}")
        return None

    # Comparing the leading shape entries also covers a 0-d y_train.
    if X_train.shape[:1] != y_train.shape[:1]:
        print(f"Error: Sample count mismatch between X_train {X_train.shape} and y_train {y_train.shape}")
        return None

    # Top-level boundary for the training step: any framework error is
    # reported and turned into a None result so the caller can stop cleanly.
    try:
        model = Sequential()
        model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
        model.add(LSTM(50, return_sequences=False))
        model.add(Dense(1))

        model.compile(optimizer='adam', loss='mse')
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)

        return model
    except Exception as e:
        print(f"Error during model training: {e}")
        return None

# Main pipeline: each stage runs only if the previous one succeeded; otherwise a message is printed and the run stops
X, y, ticker_encodings, tickers = prepare_data(df)

if X is not None and len(X) > 0:
    model = train_lstm_model(X, y)
    
    if model is not None:
        # Step 3: Portfolio Construction
        def generate_signals_and_weights(predictions, ticker_encodings, tickers):
            """Aggregate per-sample predictions into one signal and weight per ticker.

            Args:
                predictions: Sequence of predicted returns, one per sample.
                ticker_encodings: Sequence of ticker codes, one per sample.
                tickers: Sequence of ticker symbols, one per sample. Arrays
                    and plain lists are both accepted.

            Returns:
                DataFrame with one row per ticker, in order of first
                appearance, and the columns 'Encoding', 'Ticker',
                'Predicted Return' (mean prediction), 'Signal' ('BUY', 'SELL'
                or 'HOLD') and 'Weight' (signed percentage; the absolute
                weights of BUY/SELL rows sum to 100 whenever any mean
                prediction is non-zero).

            Raises:
                ValueError: If no predictions were supplied.
            """
            # Single pass: collect the predictions of each ticker and remember
            # the first encoding seen for it.
            predictions_by_ticker = {}
            encoding_by_ticker = {}
            for ticker, encoding, pred in zip(tickers, ticker_encodings, predictions):
                predictions_by_ticker.setdefault(ticker, []).append(pred)
                encoding_by_ticker.setdefault(ticker, encoding)

            if not predictions_by_ticker:
                raise ValueError("No portfolio data generated")

            rows = []
            for ticker, preds in predictions_by_ticker.items():
                avg_prediction = np.mean(preds)
                if avg_prediction > 0:
                    signal = 'BUY'
                elif avg_prediction < 0:
                    signal = 'SELL'
                else:
                    signal = 'HOLD'

                rows.append({
                    'Encoding': encoding_by_ticker[ticker],
                    'Ticker': ticker,
                    'Predicted Return': avg_prediction,
                    'Signal': signal,
                })

            portfolio = pd.DataFrame(rows)

            # Weight magnitude is each ticker's share of the total absolute
            # predicted return, expressed in percent.
            abs_return = portfolio['Predicted Return'].abs()
            total_abs_return = abs_return.sum()
            if total_abs_return > 0:
                magnitude = abs_return / total_abs_return * 100
            else:
                # Default to equal weights if all predicted returns are 0
                magnitude = 100 / len(portfolio)

            # BUY rows get +magnitude, SELL rows -magnitude, HOLD rows 0.
            is_buy = portfolio['Signal'] == 'BUY'
            is_sell = portfolio['Signal'] == 'SELL'
            portfolio['Weight'] = np.select(
                [is_buy, is_sell], [magnitude, -magnitude], default=0
            )

            return portfolio[['Encoding', 'Ticker', 'Predicted Return', 'Signal', 'Weight']]

        # Step 4: Backtesting
        def backtest_portfolio(portfolio, df_prices, initial_capital=10000):
            """Value a buy-and-hold (long/short) portfolio over a price history.

            Positions are opened at the prices in the first row of
            *df_prices* and held unchanged. Each row's portfolio value is
            ``initial_capital`` plus the profit or loss of every position, so
            capital that is not allocated (HOLD rows, tickers missing from
            the price data) stays in the total as cash.

            Args:
                portfolio: DataFrame with 'Ticker' and 'Weight' columns;
                    weights are signed percentages (negative means short).
                df_prices: DataFrame with one price column per ticker.
                initial_capital: Starting capital.

            Returns:
                The same *df_prices* object, with a 'Portfolio Value' column
                added in place.

            Raises:
                ValueError: If none of the portfolio tickers has a price
                    column, or if *df_prices* has no rows.
            """
            requested = portfolio['Ticker'].values
            fractions = portfolio['Weight'].values / 100

            # Keep only positions whose ticker has a price column.
            held = [
                (ticker, fraction)
                for ticker, fraction in zip(requested, fractions)
                if ticker in df_prices.columns
            ]
            if len(held) != len(requested):
                missing_tickers = set(requested) - {ticker for ticker, _ in held}
                print(f"Warning: {len(requested) - len(held)} tickers not found in price data")
                print(f"Missing tickers: {missing_tickers}")
                print(f"Available columns in price data: {df_prices.columns.tolist()}")

            if not held:
                raise ValueError("No tickers from portfolio found in price data")
            if len(df_prices) == 0:
                raise ValueError("Price data contains no rows")

            tickers = [ticker for ticker, _ in held]
            weights = [fraction for _, fraction in held]
            print(f"Backtesting with tickers: {tickers} and weights: {weights}")

            prices = df_prices[tickers].to_numpy(dtype=float)
            initial_prices = prices[0]

            # Signed share counts are fixed at entry: positive = long,
            # negative = short.
            shares = initial_capital * np.asarray(weights, dtype=float) / initial_prices

            # Zero (or undefined) positions contribute nothing, so a NaN price
            # in an unheld ticker cannot contaminate the total.
            active = (shares > 0) | (shares < 0)

            # A long gains shares * (price - entry); a short has negative
            # shares, so the same expression gains when the price falls.
            profit_and_loss = (prices[:, active] - initial_prices[active]) @ shares[active]

            df_prices['Portfolio Value'] = initial_capital + profit_and_loss
            return df_prices

        # Step 5: Performance Evaluation
        def evaluate_performance(df_prices, initial_capital=10000, risk_free_rate=0.02):
            """Print a performance report for a backtest and plot it against the S&P 500.

            Args:
                df_prices: DataFrame with 'Portfolio Value' and 'S&P500'
                    columns. The columns 'Portfolio Daily Return' and
                    'S&P 500 Daily Return' are added to it in place.
                initial_capital: Capital the portfolio started with; used as
                    the base for total return and CAGR.
                risk_free_rate: Annual risk-free rate; divided by 252 to get
                    the per-row rate used in the Sharpe and Sortino ratios.

            Returns:
                Dict with the printed metrics: 'total_return',
                'max_drawdown' and 'win_rate' in percent, 'cagr' and 'alpha'
                as annual fractions, plus 'sharpe_ratio', 'sortino_ratio'
                and 'beta'.
            """
            periods_per_year = 252

            df_prices['Portfolio Daily Return'] = df_prices['Portfolio Value'].pct_change()
            df_prices['S&P 500 Daily Return'] = df_prices['S&P500'].pct_change()

            portfolio_value = df_prices['Portfolio Value']
            final_value = portfolio_value.iloc[-1]

            # Total Return
            total_return = (final_value - initial_capital) / initial_capital * 100

            # CAGR (every row is treated as one of 252 periods in a year)
            years = len(df_prices) / periods_per_year
            CAGR = ((final_value / initial_capital) ** (1 / years)) - 1

            # Max Drawdown: largest relative drop below the running peak
            rolling_max = portfolio_value.cummax()
            max_drawdown = ((portfolio_value - rolling_max) / rolling_max).min() * 100

            # The first pct_change entry is NaN and is excluded from the ratios.
            portfolio_returns = df_prices['Portfolio Daily Return'].dropna()
            sharpe_ratio, sortino_ratio, win_rate = 0, 0, 0

            if len(portfolio_returns) > 0:
                excess_return = portfolio_returns.mean() - risk_free_rate / periods_per_year

                # Sharpe Ratio (left at 0 when volatility is zero or undefined)
                volatility = portfolio_returns.std()
                if volatility > 0:
                    sharpe_ratio = excess_return / volatility * np.sqrt(periods_per_year)

                # Sortino Ratio: uses the standard deviation of the negative
                # returns only; NaN (fewer than two of them) fails the test.
                downside_deviation = portfolio_returns[portfolio_returns < 0].std()
                if downside_deviation > 0:
                    sortino_ratio = excess_return / downside_deviation * np.sqrt(periods_per_year)

                # Win Rate
                win_rate = (portfolio_returns > 0).sum() / len(portfolio_returns) * 100

            # Alpha and Beta from regressing portfolio returns on benchmark
            # returns. NaN rows are dropped jointly so both series always
            # refer to the same dates.
            alpha, beta = 0, 1
            paired_returns = df_prices[['S&P 500 Daily Return', 'Portfolio Daily Return']].dropna()
            benchmark = paired_returns['S&P 500 Daily Return'].values
            strategy = paired_returns['Portfolio Daily Return'].values

            if len(paired_returns) > 0 and np.std(benchmark) > 0:
                try:
                    fit = sm.OLS(strategy, sm.add_constant(benchmark)).fit()
                    alpha, beta = fit.params
                    alpha *= periods_per_year  # Annualize alpha
                except Exception as e:
                    # Report the failure instead of hiding it, then fall back
                    # to the neutral defaults.
                    print(f"Warning: alpha/beta regression failed: {e}")
                    alpha, beta = 0, 1

            # Print Performance Metrics
            print("\n📊 **FINAL PERFORMANCE REPORT** 📊")
            print(f"✅ Total Return: {total_return:.2f}%")
            print(f"✅ CAGR: {CAGR*100:.2f}% per year")
            print(f"✅ Max Drawdown: {max_drawdown:.2f}%")
            print(f"✅ Sharpe Ratio: {sharpe_ratio:.2f}")
            print(f"✅ Sortino Ratio: {sortino_ratio:.2f}")
            print(f"✅ Win Rate: {win_rate:.2f}%")
            print(f"✅ Alpha: {alpha*100:.2f}%")
            print(f"✅ Beta: {beta:.2f}")

            # Plot Portfolio vs S&P 500, rescaling the index so both curves
            # start at the same value.
            benchmark_scale = portfolio_value.iloc[0] / df_prices['S&P500'].iloc[0]
            plt.figure(figsize=(12, 6))
            plt.plot(df_prices.index, portfolio_value, label="Portfolio", color="blue")
            plt.plot(df_prices.index, df_prices['S&P500'] * benchmark_scale,
                     label="S&P 500 (Normalized)", color="red", linestyle="dashed")
            plt.title("Portfolio vs S&P 500 Performance")
            plt.xlabel("Date")
            plt.ylabel("Value ($)")
            plt.legend()
            plt.grid()
            plt.show()

            return {
                'total_return': total_return,
                'cagr': CAGR,
                'max_drawdown': max_drawdown,
                'sharpe_ratio': sharpe_ratio,
                'sortino_ratio': sortino_ratio,
                'win_rate': win_rate,
                'alpha': alpha,
                'beta': beta,
            }

        # Score the prepared windows with the trained model, turn the scores
        # into a portfolio, then backtest and evaluate it on a fixed 20-day
        # price table. Any failure along the way is reported, not raised.
        try:
            predictions = model.predict(X).flatten()
            portfolio = generate_signals_and_weights(predictions, ticker_encodings, tickers)
            print("\n📊 Portfolio Construction:")
            print(portfolio)

            # Hard-coded prices used for the backtest
            data = {
                "Date": pd.date_range(start="2023-06-14", periods=20, freq="D"),
                "AAPL": [
                    180, 182, 181, 185, 190, 192, 195, 194, 193, 192,
                    194, 195, 196, 198, 200, 202, 201, 200, 199, 198,
                ],
                "MSFT": [
                    330, 328, 326, 325, 322, 320, 315, 314, 313, 312,
                    310, 308, 307, 305, 304, 302, 301, 300, 298, 295,
                ],
                # 4400, 4410, ..., 4590
                "S&P500": list(range(4400, 4600, 10)),
            }
            df_prices = pd.DataFrame(data).set_index("Date")

            initial_capital = 10000
            df_prices = backtest_portfolio(portfolio, df_prices, initial_capital)
            evaluate_performance(df_prices, initial_capital)
        except Exception as e:
            print(f"Error during portfolio construction or backtesting: {e}")
    else:
        print("Model training failed. Cannot proceed with portfolio construction and backtesting.")
else:
    print("Data preparation failed. Cannot proceed with model training.")