In [1]:
import pandas as pd
import numpy as np

import datetime
import os, sys
import importlib

import utils
importlib.reload(utils)

from utils import plot_series, plot_series_with_names, plot_series_bar
from utils import plot_dataframe
from utils import get_universe_adjusted_series, scale_weights_to_one, scale_to_book_long_short
from utils import generate_portfolio, backtest_portfolio
from utils import match_implementations

import plotly.graph_objects as go

In [2]:
# This directory can be used if you're working on a Kaggle Notebook inside the competition
# Change the directory as per your requirements if you're working somewhere else
data_dir = "/kaggle/input/qrt-quant-quest-iit-bombay-2025/"

features = pd.read_parquet(os.path.join(data_dir, "features.parquet"))

universe = pd.read_parquet(os.path.join(data_dir, "universe.parquet"))
 
returns = pd.read_parquet(os.path.join(data_dir, "returns.parquet"))

In [4]:
# Original selected features
selected_features = [
    'on_balance_volume', 'macd', 'ichimoku',
    'stochastic_oscillator', 'chaikin_money_flow', 'volume',
    'commodity_channel_index','williams_r',
     'volatility_20', 'trend_5_20','relative_strength_index'
]

In [5]:
train_features = features.loc["2005":"2015"]
validation_features = features.loc["2016":"2018"]
test_features = features.loc["2019"]

train_universe = universe.loc["2005":"2015"]
validation_universe = universe.loc["2016":"2018"]
test_universe = universe.loc["2019"]

train_returns = returns.loc["2005":"2015"]
validation_returns = returns.loc["2016":"2018"]
test_returns = returns.loc["2019"]

# Filter features based on selected features

# Check if columns have a MultiIndex
if isinstance(train_features.columns, pd.MultiIndex):
    # Filter columns based on level 0 of MultiIndex
    train_features = train_features.loc[:, train_features.columns.get_level_values(0).isin(selected_features)]
    validation_features = validation_features.loc[:, validation_features.columns.get_level_values(0).isin(selected_features)]
    test_features = test_features.loc[:, test_features.columns.get_level_values(0).isin(selected_features)]
    
    # Remove unused levels from MultiIndex
    train_features.columns = train_features.columns.remove_unused_levels()
    validation_features.columns = validation_features.columns.remove_unused_levels()
    test_features.columns = test_features.columns.remove_unused_levels()
else:
    # If columns are not MultiIndexed, filter directly
    train_features = train_features[selected_features]
    validation_features = validation_features[selected_features]
    test_features = test_features[selected_features]

# Debugging: Print filtered column levels
if isinstance(train_features.columns, pd.MultiIndex):
    print("Filtered Train Features Columns (Level 0):", train_features.columns.get_level_values(0).unique())
else:
    print("Filtered Train Features Columns:", train_features.columns)


Filtered Train Features Columns (Level 0): Index(['macd', 'trend_5_20', 'ichimoku', 'volatility_20', 'on_balance_volume',
       'chaikin_money_flow', 'relative_strength_index',
       'stochastic_oscillator', 'williams_r', 'volume',
       'commodity_channel_index'],
      dtype='object')


In [6]:
def backtest_portfolio_new(portfolio: pd.DataFrame, returns: pd.DataFrame, universe: pd.DataFrame, plot_: bool, print_: bool):

    """
    Computes performance metrics from a given portfolio DataFrame.

    This function calculates the net Sharpe ratio, along with other portfolio metrics,
    ensuring that certain constraints are met. It performs checks for:
    - Shape alignment of input DataFrames.
    - portfolio weights only in stocks that are part of the universe.
    - Dollar neutrality.
    - Unit capital constraint.
    - Maximum weight constraint.

    Parameters:
    ----------
    portfolio : pd.DataFrame
        DataFrame representing portfolio over time.
    returns : pd.DataFrame
        DataFrame containing stock returns corresponding to the portfolio.
    universe : pd.DataFrame
        Boolean DataFrame indicating whether a stock is part of the investable universe.
    plot_: bool
        Boolean Flag which decides whether to plot the cumulative PnL
    print_: bool
        Boolean Flag which decides whether to print the metrics of backtest

    Raises:
    ------
    ValueError
        If the input DataFrames do not have matching shapes.
        If portfolio contain stocks that are not in the universe.
        If the portfolio violates the dollar neutrality constraint.
        If the unit capital constraint is violated.
        If the maximum weight constraint is exceeded.

    Returns
    -------
    tuple
        - Net Sharpe Ratio (rounded to 3 decimal places).
        - Pandas Series containing gross PnL over time.

    Additional Outputs:
    -------------------
    - Prints the gross and net Sharpe ratios.
    - Prints the turnover percentage.
    - Plots cumulative Gross and Net PnL.

    Notes:
    ------
    - The turnover is calculated as the average traded capital divided by the average book value.
    - The gross and net Sharpe ratios are annualized using a factor of √252.
    - The net PnL accounts for trading costs (assumed to be 0.01% per unit traded).
    """

    universe = universe.astype(bool)
    
    if not (portfolio.shape == returns.shape == universe.shape):
        raise ValueError("Shapes of portfolio, returns and universe are not algined")

    if ((portfolio.replace(0, np.nan))[~universe].notna().sum().sum() != 0):
        raise ValueError("Your portfolio are present for a stock not present in the universe")

    if ((portfolio.sum(1).abs() > 0.01).sum() > 0):
        raise ValueError("Dollar Neutral Constraint is violated")
    
    if (((portfolio.abs().sum(1) - 1) > 0.01).sum() > 0):
        raise ValueError("Unit Capital Constraint is violated")

    if ((portfolio.abs().max(1) > 0.1).sum() > 0):
        raise ValueError("Maximum Weight Constraint is violated")

    portfolio = portfolio.fillna(0)

    rets = returns.fillna(0)

    gross_pnl = (portfolio * rets).sum(axis=1)

    traded = portfolio.diff(1).abs().sum(axis=1).fillna(0)

    net_pnl = gross_pnl - traded * 1e-4

    gross_sharpe_ratio = (gross_pnl.mean() / gross_pnl.std()) * np.sqrt(252)

    net_sharpe_ratio = (net_pnl.mean() / net_pnl.std()) * np.sqrt(252)

    if print_:
        print("Gross Sharpe Ratio: ", round(gross_sharpe_ratio, 3))
        print("Net Sharpe Ratio: ", round(net_sharpe_ratio, 3))

    return round(net_sharpe_ratio, 3), gross_pnl

In [7]:
def normalize_signal(feature, universe_boolean):
    """
    Normalize a feature signal for a given universe.
    Parameters:
        feature (pd.DataFrame): Feature data for all stocks.
        universe_boolean (pd.DataFrame): Boolean DataFrame indicating tradable stocks.
    Returns:
        pd.DataFrame: Normalized signal.
    """
    signal = feature.shift(1)  # Avoid lookahead bias
    signal = signal.where(universe_boolean, np.nan)
    signal = signal.rank(axis=1, method="min", ascending=True)
    signal = signal.sub(signal.mean(axis=1), axis=0)
    signal = signal.div(signal.abs().sum(axis=1), axis=0)
    return signal
    
def generate_portfolio_vectorized(features, universe, weights):
    """
    Generate portfolio weights using a vectorized approach.
    Parameters:
        features (pd.DataFrame): DataFrame containing all features for stocks.
        universe (pd.DataFrame): Binary DataFrame indicating tradable stocks.
        weights (array-like): Weights for each feature.
    Returns:
        pd.DataFrame: Portfolio weights for each stock on each trading day.
    """
    universe_boolean = universe.astype(bool)
    # features =  features.loc[:'2019']
    # Combine signals into a portfolio
    portfolio_weights = pd.DataFrame(0, index=features.index, columns=features.columns.get_level_values(1).unique())
    
    for i, feature_name in enumerate(features.columns.levels[0]):
        feature_data = features[feature_name]
        normalized_signal = normalize_signal(feature_data, universe_boolean)
        portfolio_weights += normalized_signal * weights[i]
    
    # Enforce dollar neutrality and normalize weights to sum to 1 per day
    portfolio_weights = portfolio_weights.sub(portfolio_weights.mean(axis=1), axis=0)
    portfolio_weights = portfolio_weights.div(portfolio_weights.abs().sum(axis=1), axis=0)
    
    return portfolio_weights.fillna(0)

In [8]:
from bayes_opt import BayesianOptimization
import numpy as np
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

In [9]:
def optimize_signal_weights_bayesian(features_train, features_validation, universe_train, universe_validation, returns_train, returns_validation):

    def sharpe_ratio_objective(**weights):
        # Get all feature names from train_features
        all_features = sorted(features_train.columns.levels[0])
        
        # Assign 0 weight to features not in selected_features
        weight_array = np.array([weights.get(feature, 0) if feature in selected_features else 0 for feature in all_features])
        
        # Debug: Print the weight array
        # print(f"Weight Array: {weight_array}")
        
        # Generate portfolio weights for training set
        portfolio_weights_train = generate_portfolio_vectorized(features_train, universe_train, weight_array)
        
        # Backtest on training data
        sr_train, _ = backtest_portfolio_new(portfolio_weights_train, returns_train, universe_train, False, False)
        
        # Validate on validation set
        portfolio_weights_validation = generate_portfolio_vectorized(features_validation, universe_validation, weight_array)
        sr_validation, _ = backtest_portfolio_new(portfolio_weights_validation, validation_returns, validation_universe, False, False)
        
        print(f"Train Sharpe Ratio: {sr_train}, Validation Sharpe Ratio: {sr_validation}")
        
        # Combine training and validation Sharpe ratios
        alpha = 0.4  # Weight for training Sharpe ratio
        combined_sharpe_ratio = alpha * sr_train + (1 - alpha) * sr_validation
        
        # Add penalties to prevent overfitting
        penalty_l2 = np.sum(np.square(weight_array)) * combined_sharpe_ratio * 0.002
        penalty_l1 = np.sum(np.abs(weight_array)) * combined_sharpe_ratio * 0.009
        
        result = combined_sharpe_ratio * 10 - penalty_l2 - penalty_l1
        print(f"Debug: L2 penalty: {penalty_l2}")
        print(f"Debug: L1 penalty: {penalty_l1}")
        print(f"Debug: Objective function value (with penalty): {result}")
        print("-" * 50)
        return result

    # Create pbounds for selected features only
    pbounds = {feature: (-20, 20) for feature in selected_features}
    print(f"Parameter Bounds (pbounds): {pbounds}")
    
    optimizer = BayesianOptimization(
        f=sharpe_ratio_objective,
        pbounds=pbounds,
        random_state=42
    )
    
    optimizer.maximize(init_points=20, n_iter=300)
    
    optimal_weights = optimizer.max['params']
    
    return optimal_weights

# Run Bayesian Optimization using train/validation split
optimal_weights = optimize_signal_weights_bayesian(
    train_features,
    validation_features,
    train_universe,
    validation_universe,
    train_returns,
    validation_returns
)

# Evaluate on test set
portfolio_weights_test = generate_portfolio_vectorized(test_features, test_universe, np.array(list(optimal_weights.values())))
sr_test, _ = backtest_portfolio_new(portfolio_weights_test, test_returns, test_universe,False,False)

print("Optimal Weights:", optimal_weights)
print("Test Set Sharpe Ratio:", sr_test)

Parameter Bounds (pbounds): {'on_balance_volume': (-20, 20), 'macd': (-20, 20), 'ichimoku': (-20, 20), 'stochastic_oscillator': (-20, 20), 'chaikin_money_flow': (-20, 20), 'volume': (-20, 20), 'commodity_channel_index': (-20, 20), 'williams_r': (-20, 20), 'volatility_20': (-20, 20), 'trend_5_20': (-20, 20), 'relative_strength_index': (-20, 20)}
|   iter    |  target   | chaiki... | commod... | ichimoku  |   macd    | on_bal... | relati... | stocha... | trend_... | volati... |  volume   | willia... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
Train Sharpe Ratio: -0.558, Validation Sharpe Ratio: 0.06
Debug: L2 penalty: -0.6780137992204086
Debug: L1 penalty: -0.2150821800300543
Debug: Objective function value (with penalty): -0.9789040207495375
--------------------------------------------------
| [39m1        [39m | [39m-0.9789  [39m | [39m-5.018   [39m | [39m18.03    