In [18]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from collections import deque
from scipy.stats import pearsonr

# --- PART 1: INITIALIZATION AND TRAINING (Unchanged) ---
print("--- Initializing and training the final model... ---")

# 1.1: Load and Prepare Training Data
full_train_df = pd.read_csv('../train.csv')

# 1.2: Initial Feature Engineering
cutoff_date_id = 3283
df_subset = full_train_df[full_train_df['date_id'] >= cutoff_date_id].copy()
df_subset = df_subset.sort_values('date_id').ffill().bfill()

top_features_for_rolling = ['M4', 'V13', 'P8', 'S5']
for feature in top_features_for_rolling:
    if feature in df_subset.columns:
        df_subset[f'{feature}_roll_mean_20'] = df_subset[feature].rolling(window=20).mean()

if 'S5' in df_subset.columns and 'V13' in df_subset.columns:
    df_subset['S5_x_V13'] = df_subset['S5'] * df_subset['V13']

df_train = df_subset.iloc[:-1000]
df_test = df_subset.iloc[-1000:]

df_featured = df_train.dropna()

# 1.3: Feature Selection
print("Performing feature selection...")
X_temp = df_featured.drop(columns=['date_id', 'market_forward_excess_returns', 'forward_returns', 'risk_free_rate'], errors='ignore')
y_temp = df_featured['market_forward_excess_returns']

feature_selector_model = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_leaf=10, random_state=42, n_jobs=-1)
feature_selector_model.fit(X_temp, y_temp)

importances = feature_selector_model.feature_importances_
feature_importance_df = pd.DataFrame({'feature': X_temp.columns, 'importance': importances})
top_30_features = feature_importance_df.sort_values('importance', ascending=False).head(30)['feature'].tolist()

df_lean = df_featured[['date_id', 'market_forward_excess_returns', 'forward_returns', 'risk_free_rate'] + top_30_features]

# 1.4: Train the Final Model
print("Training the final model on the lean feature set...")
X_final_train = df_lean[top_30_features]
y_final_train = df_lean['market_forward_excess_returns']

final_model = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_leaf=10, random_state=42, n_jobs=-1)
final_model.fit(X_final_train, y_final_train)
print("Model training complete.")

print("--- Initialization complete. Ready for inference. ---")

X_test = df_test[top_30_features]
y_test = df_test[['market_forward_excess_returns', 'forward_returns', 'risk_free_rate']]

y_pred = final_model.predict(X_test)

print("Evaluating on test set...")
print(hit_rate(y_test['market_forward_excess_returns'], y_pred), pearson_corr(y_test['market_forward_excess_returns'], y_pred))


--- Initializing and training the final model... ---
Performing feature selection...
Training the final model on the lean feature set...
Model training complete.
--- Initialization complete. Ready for inference. ---
Evaluating on test set...
0.498 0.004799884985123216


In [19]:
def score(solution: pd.DataFrame, submission) -> float:
    """
    Calculates a custom evaluation metric (volatility-adjusted Sharpe ratio).

    This metric penalizes strategies that take on significantly more volatility
    than the underlying market.

    Returns:
        float: The calculated adjusted Sharpe ratio.
    """

  
    solution = solution.copy()
    solution.loc[:, 'position'] = submission

    solution['strategy_returns'] = solution['risk_free_rate'] * (1 - solution['position']) + solution['position'] * solution['forward_returns']

    # Calculate strategy's Sharpe ratio
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    strategy_mean_excess_return = (strategy_excess_cumulative) ** (1 / len(solution)) - 1
    strategy_std = solution['strategy_returns'].std()
    trading_days_per_yr = 252
    sharpe = strategy_mean_excess_return / strategy_std * np.sqrt(trading_days_per_yr)
    strategy_volatility = float(strategy_std * np.sqrt(trading_days_per_yr) * 100)

    # Calculate market return and volatility
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = (market_excess_cumulative) ** (1 / len(solution)) - 1
    market_std = solution['forward_returns'].std()


    market_volatility = float(market_std * np.sqrt(trading_days_per_yr) * 100)


    # Calculate the volatility penalty
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
    vol_penalty = 1 + excess_vol

    # Calculate the return penalty
    return_gap = max(
        0,
        (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr,
    )
    return_penalty = 1 + (return_gap**2) / 100

    # Adjust the Sharpe ratio by the volatility and return penalty
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    return min(float(adjusted_sharpe), 1_000_000)

In [20]:
y_position = np.clip(y_pred*200, 0.0, 2.0)

In [22]:
score_value = score(y_test, y_pred)
score_value

-0.01017112537707895

In [None]:
def hit_rate(y_true, y_pred, *, dropna: bool = True, margin: float = 0.0, count_ties: bool = False) -> float:
    """
    Sign accuracy (hit rate): fraction of times sign(y_pred) == sign(y_true).

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length sequences of numbers.
    dropna : bool, default True
        If True, drop any pair with NaN in either array.
        If False and NaNs are present, returns np.nan.
    margin : float, default 0.0
        Treat predictions with |y_pred| <= margin as 0 (neutral band).
    count_ties : bool, default False
        If False, exclude any pair where sign is 0 on either side.
        If True, include pairs with sign==0 and count them as correct
        only when both are 0.

    Returns
    -------
    hit : float
        Proportion in [0,1], or np.nan if no eligible pairs.
    """
    a = np.asarray(y_true, dtype=float).flatten()
    b = np.asarray(y_pred, dtype=float).flatten()

    if a.shape != b.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    mask = np.isfinite(a) & np.isfinite(b)
    if not dropna and not mask.all():
        return np.nan
    a = a[mask]
    b = b[mask]

    # Apply neutral band to predictions
    if margin > 0:
        b = b.copy()
        b[np.abs(b) <= margin] = 0.0

    s_true = np.sign(a)
    s_pred = np.sign(b)

    if count_ties:
        eligible = np.ones_like(s_true, dtype=bool)
    else:
        eligible = (s_true != 0) & (s_pred != 0)

    if not np.any(eligible):
        return np.nan

    hits = (s_true[eligible] == s_pred[eligible]).mean()
    return float(hits)

In [None]:
def pearson_corr(y_true, y_pred):
    """
    Pearson correlation coefficient between y_true and y_pred.

    Parameters
    ----------
    y_true, y_pred : array-like
        Equal-length sequences of numbers.

    Returns
    -------
    corr : float
        Pearson correlation coefficient in [-1,1], or np.nan if undefined.
    """
    a = np.asarray(y_true, dtype=float).flatten()
    b = np.asarray(y_pred, dtype=float).flatten()

    if a.shape != b.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    mask = np.isfinite(a) & np.isfinite(b)
    a = a[mask]
    b = b[mask]

    if len(a) == 0:
        return np.nan

    corr, _ = pearsonr(a, b)
    return float(corr)