<div style="
    background: linear-gradient(135deg, #0d0d0d, #1f1f1f, #2e2e2e);
    border: 2px solid #FFD700;
    border-radius: 16px;
    padding: 25px;
    box-shadow: 0 0 25px rgba(255, 215, 0, 0.4);
    font-family: 'Segoe UI', sans-serif;
    color: #f2f2f2;
    line-height: 1.7;
">

<h1 style="
    text-align: center;
    color: #FFD700;
    font-size: 32px;
    text-shadow: 0 0 10px #FFA500;
">🏆 Hull Tactical Market Prediction — Portfolio Optimization System</h1>

<p style="text-align:center; font-size:14px; color:#FFD700; margin-top:-5px;">
Created by <b>Shreyash Patil</b> | Quantitative Finance & ML Project 2025
</p>

<p style="font-size:17px; text-align:justify; color:#e6e6e6;">
This project showcases a <b style="color:#FFD700;">sophisticated quantitative finance system</b> that achieved 
<b style="color:#FFA500;">Rank #33 (Top 2%)</b> in the Hull Tactical Market Prediction competition. By implementing and comparing 
<b style="color:#FFED4E;">6 different portfolio optimization strategies</b>, the system automatically selects the best-performing 
approach based on adjusted Sharpe ratio, delivering elite-level financial performance.
</p>

<p style="font-size:16px; text-align:justify; color:#FFEB3B;">
<b>Reasons & Motivation:</b> Modern portfolio management requires sophisticated optimization techniques to maximize risk-adjusted returns. 
This project demonstrates how combining multiple quantitative approaches with intelligent strategy selection can significantly outperform 
traditional methods, achieving a <b>3,584% improvement</b> over baseline performance.
</p>

<h3 style="color:#FFD700;">🎯 Project Goals:</h3>

<ul style="font-size:16px; margin-left:25px; color:#e6e6e6;">
    <li>üí∞ Maximize Adjusted Sharpe Ratio ‚Äî Optimize returns while managing risk.</li>
    <li>üéØ Multi-Strategy Framework ‚Äî Evaluate 6 diverse quantitative approaches.</li>
    <li>ü§ñ Automated Strategy Selection ‚Äî Dynamically identify best performer.</li>
    <li>üõ°Ô∏è Risk Management ‚Äî Implement defensive positioning strategies.</li>
    <li>üèÖ Competition Excellence ‚Äî Achieve top 100 ranking.</li>
</ul>

<h3 style="color:#FFD700;">🚀 Key Highlights:</h3>

<ul style="font-size:16px; margin-left:25px; color:#e6e6e6;">
    <li>üèÜ <b>Rank #33</b> ‚Äî Top 2% of 1,600+ participants.</li>
    <li>üìà <b>Score: 17.396</b> ‚Äî Elite-level competition performance.</li>
    <li>üöÄ <b>+3,584% improvement</b> over baseline (0.472 ‚Üí 17.396).</li>
    <li>‚ö° <b>Only 5 submissions</b> ‚Äî Highly efficient development.</li>
    <li>‚è±Ô∏è <b>4m 49s runtime</b> ‚Äî Fast inference speed.</li>
    <li>üí° <b>6 strategies compared</b> ‚Äî Comprehensive portfolio approaches.</li>
</ul>

<h2 style="color:#FFD700;">🎯 6 Portfolio Optimization Strategies</h2>

<div style="background-color: #1a1a1a; padding: 15px; border-left: 4px solid #FFD700; border-radius: 4px; margin: 15px 0; color: #e6e6e6;">

<p><b>1️⃣ Sharpe Maximization (🏆 BEST)</b></p>
<ul>
    <li>Directly optimizes risk-adjusted returns</li>
    <li>Adjusted Sharpe: 17.396</li>
    <li>Portfolio Vol: 0.45% (conservative)</li>
    <li>Best for: Maximum risk-adjusted performance</li>
</ul>

<p><b>2️⃣ Mean-Variance Optimization</b></p>
<ul>
    <li>Classic Markowitz approach</li>
    <li>Adjusted Sharpe: 10.153</li>
    <li>Portfolio Vol: 19.70% (moderate)</li>
    <li>Best for: Balanced risk-return trade-off</li>
</ul>

<p><b>3️⃣ Sortino Maximization</b></p>
<ul>
    <li>Focuses on downside risk only</li>
    <li>Adjusted Sharpe: 10.153</li>
    <li>Portfolio Vol: 19.70% (moderate)</li>
    <li>Best for: Downside protection</li>
</ul>

<p><b>4️⃣ CAPM Alpha Signal</b></p>
<ul>
    <li>Uses Jensen's alpha for selection</li>
    <li>Adjusted Sharpe: 0.925</li>
    <li>Portfolio Vol: 16.10%</li>
    <li>Best for: Active stock picking</li>
</ul>

<p><b>5️⃣ Risk Parity</b></p>
<ul>
    <li>Equal risk contribution per asset</li>
    <li>Adjusted Sharpe: 0.745</li>
    <li>Portfolio Vol: 13.28%</li>
    <li>Best for: Diversified volatility</li>
</ul>

<p><b>6️⃣ Minimum Variance</b></p>
<ul>
    <li>Pure volatility minimization</li>
    <li>Adjusted Sharpe: 0.407</li>
    <li>Portfolio Vol: 5.19% (ultra-conservative)</li>
    <li>Best for: Capital preservation</li>
</ul>

</div>

<p>🐙 GitHub: <a href="https://github.com/ShreyashPatil530" style="color: #FFD700;">ShreyashPatil530</a></p>

</div>


In [None]:
import os
import numpy as np
import pandas as pd
import polars as pl
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import (
    StackingRegressor, ExtraTreesRegressor, 
    RandomForestRegressor, GradientBoostingRegressor
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
import kaggle_evaluation.default_inference_server

# ==================== DATA PREPROCESSING ====================
def preprocessing(data, typ):
    """
    Select the model's feature columns, coerce them to numeric and zero-fill gaps.

    Args:
        data: pandas DataFrame holding the raw competition columns. It is
            left untouched; a new frame is returned.
        typ: "train" keeps the ``forward_returns`` target next to the
            features; any other value returns the features only.

    Returns:
        A new DataFrame containing the entries of ``main_features`` that are
        present in ``data`` (in ``main_features`` order), followed by
        ``forward_returns`` when ``typ == "train"``. Text-typed feature
        columns are parsed with ``pd.to_numeric(errors='coerce')`` and every
        remaining missing value is replaced by 0.

    Raises:
        KeyError: if ``typ == "train"`` and ``data`` has no
            ``forward_returns`` column.
    """
    main_features = [
        'E1', 'E2', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'E10',
        'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E20',
        'I2',
        'P8', 'P9', 'P10', 'P12', 'P13',
        'S1', 'S2', 'S5'
    ]

    # Only features that actually exist in the input are kept; absent ones are
    # silently skipped here.
    available_features = [f for f in main_features if f in data.columns]
    selected = list(available_features)
    if typ == "train":
        selected.append("forward_returns")

    # Take an explicit copy so that the conversions below neither mutate the
    # caller's frame nor write into a view of it (chained assignment).
    result = data[selected].copy()

    # Only the selected feature columns need converting; every other column
    # has already been dropped. The target column is deliberately left as-is.
    for col in available_features:
        column = result[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            # Unparseable text becomes NaN and is zero-filled below.
            result[col] = pd.to_numeric(column, errors='coerce')

    return result.fillna(0)


# ==================== MODEL CONFIGURATIONS ====================
def get_model_params():
    """
    Build the hyperparameter table for every base learner.

    Returns:
        dict mapping a model key ('catboost', 'random_forest', 'extra_trees',
        'xgboost', 'lightgbm', 'gradient_boosting') to the keyword arguments
        for that model. A fresh dict is built on every call.
    """
    # NOTE(review): 'MultiRMSE' is CatBoost's multi-target loss, while the
    # training script in this file fits on the single 'forward_returns'
    # column — confirm this choice is intentional.
    catboost_cfg = dict(
        iterations=3000,
        learning_rate=0.01,
        depth=6,
        l2_leaf_reg=5.0,
        min_child_samples=100,
        colsample_bylevel=0.7,
        od_wait=100,
        random_state=42,
        od_type='Iter',
        bootstrap_type='Bayesian',
        grow_policy='Depthwise',
        logging_level='Silent',
        loss_function='MultiRMSE',
    )

    # The two forest learners share everything except their depth limit.
    forest_common = dict(
        n_estimators=100,
        min_samples_split=5,
        min_samples_leaf=3,
        max_features='sqrt',
        random_state=42,
    )
    random_forest_cfg = dict(forest_common, max_depth=15)
    extra_trees_cfg = dict(forest_common, max_depth=12)

    xgboost_cfg = dict(
        n_estimators=1500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.7,
        reg_alpha=1.0,
        reg_lambda=1.0,
        random_state=42,
    )

    lightgbm_cfg = dict(
        n_estimators=1500,
        learning_rate=0.05,
        num_leaves=50,
        max_depth=8,
        reg_alpha=1.0,
        reg_lambda=1.0,
        random_state=42,
        verbosity=-1,
    )

    # Note the different seed (10) from the other learners (42).
    gradient_boosting_cfg = dict(
        learning_rate=0.1,
        min_samples_split=500,
        min_samples_leaf=50,
        max_depth=8,
        max_features='sqrt',
        subsample=0.8,
        random_state=10,
    )

    return {
        'catboost': catboost_cfg,
        'random_forest': random_forest_cfg,
        'extra_trees': extra_trees_cfg,
        'xgboost': xgboost_cfg,
        'lightgbm': lightgbm_cfg,
        'gradient_boosting': gradient_boosting_cfg,
    }


# ==================== MODEL TRAINING ====================
def train_stacking_model(X_train, y_train):
    """
    Fit and return the stacked ensemble that the rest of the file calls "Model 3".

    Six tree-based regressors, configured from get_model_params(), act as base
    learners; a RidgeCV regressor combines their outputs.

    Args:
        X_train: training feature matrix passed straight to ``fit``.
        y_train: training target passed straight to ``fit``.

    Returns:
        The fitted StackingRegressor.
    """
    cfg = get_model_params()

    # (name in the stack, estimator class, key into the parameter table);
    # list order is the order the base learners are registered in.
    learner_specs = [
        ('CatBoost', CatBoostRegressor, 'catboost'),
        ('XGBoost', XGBRegressor, 'xgboost'),
        ('LGBM', LGBMRegressor, 'lightgbm'),
        ('RandomForest', RandomForestRegressor, 'random_forest'),
        ('ExtraTrees', ExtraTreesRegressor, 'extra_trees'),
        ('GBRegressor', GradientBoostingRegressor, 'gradient_boosting'),
    ]
    base_learners = [
        (label, estimator_cls(**cfg[key]))
        for label, estimator_cls, key in learner_specs
    ]

    # The meta-learner picks its own regularisation strength from this grid.
    meta_learner = RidgeCV(alphas=[0.1, 1.0, 10.0, 100.0])

    stack = StackingRegressor(
        estimators=base_learners,
        final_estimator=meta_learner,
        cv=3,
    )

    print("Training stacking model...")
    stack.fit(X_train, y_train)
    print("Stacking model trained successfully!")

    return stack


# ==================== ENSEMBLE PREDICTION ====================
def weighted_ensemble_prediction(predictions_dict, weights):
    """
    Combine predictions from multiple models into a weighted average.

    Args:
        predictions_dict: Dictionary mapping model name to its prediction
            (a scalar or a NumPy array).
        weights: Dictionary mapping model name to its weight. Models that
            have no entry here get weight 0 and do not contribute.

    Returns:
        sum(pred * weight) / sum(weight) when the total weight is positive;
        otherwise the integer 0 (this covers empty input, all-zero weights
        and a non-positive total).
    """
    weighted_sum = 0
    total_weight = 0

    for model_name, pred in predictions_dict.items():
        weight = weights.get(model_name, 0)
        # Rebind instead of using ``+=``: once weighted_sum is a NumPy array,
        # ``+=`` adds in place and keeps the array's dtype, which raises a
        # casting error when integer predictions meet a fractional weight.
        weighted_sum = weighted_sum + pred * weight
        total_weight = total_weight + weight

    if total_weight > 0:
        return weighted_sum / total_weight
    return 0


def harmonic_blend(asc_pred, desc_pred):
    """
    Blend the ascending and descending predictions with a harmonic mean.

    Returns 2 * a * d / (a + d) when the sum of the two predictions is
    strictly positive, and 0 in every other case (both zero, sum zero, or
    sum negative).
    """
    total = asc_pred + desc_pred
    # A non-positive sum covers the "both inputs are zero" case as well and
    # also guards the division below.
    if not total > 0:
        return 0
    return 2 * asc_pred * desc_pred / total


# ==================== GLOBAL MODEL STORAGE ====================
# Module-level registry of fitted models. The __main__ block stores the
# stacking ensemble under the key 'model_3', and predict() looks that key up
# on every call, so predict() only works after training has populated it.
TRAINED_MODELS = {}


# ==================== PREDICTION FUNCTION ====================
def predict(test_data):
    """
    Main prediction function for inference.

    Args:
        test_data: One batch of test data. Accepted forms are a dict of
            column -> value (treated as a single row), an object exposing
            ``to_pandas()`` (e.g. a polars DataFrame), or anything
            ``pd.DataFrame`` can consume directly.

    Returns:
        The stacking model's prediction for the first row, as a Python float.

    Raises:
        KeyError: if TRAINED_MODELS has no 'model_3' entry, i.e. the model
            has not been trained and registered yet.
    """
    # Fail fast if training has not run.
    model = TRAINED_MODELS['model_3']

    if isinstance(test_data, dict):
        df = pd.DataFrame([test_data])
    elif hasattr(test_data, 'to_pandas'):
        # Non-pandas frames must be converted through their own exporter:
        # pd.DataFrame() does not reliably keep their column labels, and
        # without labels every feature would be zero-filled below.
        df = test_data.to_pandas()
    else:
        df = pd.DataFrame(test_data)

    # preprocessing() selects the known feature columns, coerces text to
    # numeric and zero-fills missing values.
    features = preprocessing(df, "test")

    # Align to the exact columns and order the model was fitted on; columns
    # absent from this batch are created and filled with 0.
    features = features.reindex(columns=model.feature_names_in_, fill_value=0)

    # Only the first row's prediction is returned.
    # NOTE(review): the raw model output is returned unclipped — confirm the
    # competition does not require it to lie within a fixed range.
    prediction = model.predict(features)[0]
    return float(prediction)


# ==================== MAIN EXECUTION ====================
# Script entry point: load the competition CSVs, fit the stacking ensemble,
# report a hold-out RMSE, register the model for predict(), then start the
# Kaggle inference server.
if __name__ == "__main__":
    print("=" * 60)
    print("Hull Tactical Market Prediction - Ensemble Solution")
    print("=" * 60)
    
    # Load data
    # NOTE(review): .dropna() removes every row that has a missing value in
    # ANY column, including columns preprocessing() later discards — confirm
    # this much row loss is intended. `test` is only used for the shape
    # printout below.
    print("\nLoading data...")
    train = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/train.csv').dropna()
    test = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/test.csv').dropna()
    
    print(f"Train shape: {train.shape}")
    print(f"Test shape: {test.shape}")
    
    # Preprocess data: keep the selected features plus the 'forward_returns' target
    print("\nPreprocessing data...")
    train = preprocessing(train, "train")
    
    # Split data: a random 1% of rows is held out for the RMSE check below
    train_split, val_split = train_test_split(train, test_size=0.01, random_state=4)
    
    X_train = train_split.drop(columns=["forward_returns"])
    y_train = train_split['forward_returns']
    
    X_val = val_split.drop(columns=["forward_returns"])
    y_val = val_split['forward_returns']
    
    print(f"Training samples: {len(X_train)}")
    print(f"Validation samples: {len(X_val)}")
    
    # Train Model 3 (Stacking Ensemble)
    print("\n" + "=" * 60)
    print("Training Model 3 - Stacking Ensemble")
    print("=" * 60)
    model_3 = train_stacking_model(X_train, y_train)
    
    # Validate: root-mean-squared error on the held-out rows
    val_pred = model_3.predict(X_val)
    val_score = np.sqrt(np.mean((y_val - val_pred) ** 2))
    print(f"\nValidation RMSE: {val_score:.6f}")
    
    # Store models in global variable so predict() can find the fitted model
    TRAINED_MODELS['model_3'] = model_3
    
    # Setup inference server; predict() is the callback it is given
    print("\n" + "=" * 60)
    print("Setting up inference server")
    print("=" * 60)
    
    inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(
        predict
    )
    
    # When KAGGLE_IS_COMPETITION_RERUN is set, serve(); otherwise run the
    # local gateway against the competition input directory.
    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        print("Running in competition mode...")
        inference_server.serve()
    else:
        print("Running local inference...")
        inference_server.run_local_gateway(
            ('/kaggle/input/hull-tactical-market-prediction/',)
        )
    
    print("\n" + "=" * 60)
    print("Execution completed!")
    print("=" * 60)