In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
from pathlib import Path
import datetime

# --- NEW IMPORTS ---
from dataclasses import dataclass, asdict, field 
from typing import List, Tuple
import lightgbm as lgb
# -------------------

import polars as pl 
import numpy as np
from sklearn.preprocessing import StandardScaler

import kaggle_evaluation.default_inference_server


# ============ PATHS ============
# Directory holding the competition's train.csv / test.csv files.
DATA_PATH: Path = Path('/kaggle/input/hull-tactical-market-prediction/')

# ============ RETURNS TO SIGNAL CONFIGS ============
# convert_ret_to_signal computes clip(ret * SIGNAL_MULTIPLIER + 1, MIN_SIGNAL, MAX_SIGNAL).
MIN_SIGNAL: float = 0.0
MAX_SIGNAL: float = 2.0
SIGNAL_MULTIPLIER: float = 400.0

# ============ MODEL CONFIGS (Used to initialize CONFIG object) ============
CV: int = 10
L1_RATIO: float = 0.5
# dataclasses.field() only has meaning inside a dataclass body; at module level it
# produces a Field descriptor instead of the array, so the grid is built directly.
ALPHAS: np.ndarray = np.logspace(-4, 2, 100)
MAX_ITER: int = 1000000

# --- CONFIGS for LGBM and Lag Features (New) ---
# Row offsets used when building "<col>_lag<k>" columns.
LAG_PERIODS: Tuple[int, ...] = (1, 5)
# Columns for which lagged copies are generated (when present in the frame).
LAG_CANDIDATES: Tuple[str, ...] = ("S2", "E2", "I2", "P9", "U1", "U2")
# Features used in original ElasticNet model structure
VARS_TO_KEEP_BASE: List[str] = [
    "S2", "E2", "E3", "P9", "S1", "S5", "I2", "P8",
    "P10", "P12", "P13"
]

@dataclass
class DatasetOutput:
    """Bundle returned by split_dataset: feature frames, targets and the scaler."""
    # Training features (split_dataset stores the scaled values here).
    X_train : pl.DataFrame 
    # Test features, transformed with the same scaler.
    X_test: pl.DataFrame
    # 'target' column taken from the training frame.
    y_train: pl.Series
    # 'target' column taken from the test frame.
    y_test: pl.Series
    # Scaler instance fitted on the training features.
    scaler: StandardScaler

@dataclass
class ElasticNetParameters:
    """Settings bundle for the ElasticNet configuration.

    Raises:
        ValueError: when ``l1_ratio`` is below 0 or above 1.
    """
    l1_ratio: float
    cv: int
    alphas: np.ndarray
    max_iter: int

    def __post_init__(self):
        # Validate the mixing ratio right after construction.
        ratio = self.l1_ratio
        if ratio > 1 or ratio < 0:
            raise ValueError("Wrong initializing value for ElasticNet l1_ratio")
        
@dataclass(frozen=True)
class RetToSignalParameters:
    """Immutable settings read by convert_ret_to_signal."""
    # Factor the raw return is multiplied by before the +1 offset is added.
    signal_multiplier: float 
    # Lower clip bound of the signal; defaults to MIN_SIGNAL.
    min_signal : float = MIN_SIGNAL
    # Upper clip bound of the signal; defaults to MAX_SIGNAL.
    max_signal : float = MAX_SIGNAL


# Signal-conversion settings; the clip bounds fall back to the dataclass defaults.
ret_signal_params = RetToSignalParameters(signal_multiplier=SIGNAL_MULTIPLIER)

# ElasticNet settings assembled from the module-level constants.
enet_params = ElasticNetParameters(
    l1_ratio=L1_RATIO, cv=CV, alphas=ALPHAS, max_iter=MAX_ITER,
)

# --- Feature Engineering Helper ---

def add_lag_features(df: pl.DataFrame) -> pl.DataFrame:
    """Append ``<col>_lag<k>`` columns for every lag candidate found in ``df``.

    One column is produced per (candidate, period) pair, iterating candidates
    in ``LAG_CANDIDATES`` order and periods in ``LAG_PERIODS`` order. When no
    candidate is present the input frame is returned unchanged.

    NOTE(review): each shift is evaluated ``.over('date_id')``, i.e. separately
    inside every ``date_id`` partition. If ``date_id`` is unique per row, all
    generated lag values are null -- confirm whether a plain row-order shift
    was intended.
    """
    existing = set(df.columns)
    lag_exprs = [
        pl.col(name).shift(period).over('date_id').alias(f"{name}_lag{period}")
        for name in LAG_CANDIDATES
        if name in existing
        for period in LAG_PERIODS
    ]

    if not lag_exprs:
        return df
    return df.with_columns(lag_exprs)
    
def load_trainset() -> pl.DataFrame:
    """Read ``train.csv`` from ``DATA_PATH`` and prepare it for modelling.

    The ``market_forward_excess_returns`` column is renamed to ``target``,
    every column except ``date_id`` is cast to Float64 with a non-strict cast,
    and the final ten rows are dropped.
    """
    raw = pl.read_csv(DATA_PATH / "train.csv")
    renamed = raw.rename({'market_forward_excess_returns': 'target'})
    as_float = renamed.with_columns(
        pl.exclude('date_id').cast(pl.Float64, strict=False)
    )
    # A negative count keeps everything except the last |n| rows.
    return as_float.head(-10)

def load_testset() -> pl.DataFrame:
    """Read ``test.csv`` from ``DATA_PATH`` and prepare it for modelling.

    ``lagged_forward_returns`` is renamed to ``target`` so the frame exposes a
    ``target`` column, and every column except ``date_id`` is cast to Float64
    with a non-strict cast.
    """
    raw = pl.read_csv(DATA_PATH / "test.csv")
    # The renamed column only serves as a 'target' placeholder.
    renamed = raw.rename({'lagged_forward_returns': 'target'})
    float_cast = pl.exclude('date_id').cast(pl.Float64, strict=False)
    return renamed.with_columns(float_cast)

def create_example_dataset(df: pl.DataFrame) -> pl.DataFrame:
    """Build the model-ready frame: derived columns, lags, selection, imputation.

    Steps:
      1. Add ``U1 = I2 - I1`` and ``U2 = M11 / mean(I2, I9, I7)``.
      2. Add lag columns through ``add_lag_features``.
      3. Keep ``date_id``, ``target`` (only if present) and the feature columns:
         ``VARS_TO_KEEP_BASE``, ``U1``, ``U2`` and every ``*_lag<k>`` column.
      4. Fill nulls in the features with the column's EWMA (``com=0.5``), then
         replace whatever is still null with ``0.0``.

    Returns the frame with columns ordered ``date_id``, [``target``], features.
    """
    # Derived inputs.
    u1 = (pl.col("I2") - pl.col("I1")).alias("U1")
    mean_of_i = (pl.col("I2") + pl.col("I9") + pl.col("I7")) / 3
    u2 = (pl.col("M11") / mean_of_i).alias("U2")
    enriched = add_lag_features(df.with_columns(u1, u2))

    # Resolve which feature columns actually exist after enrichment.
    present = enriched.columns
    lag_suffixes = tuple(f"_lag{period}" for period in LAG_PERIODS)
    lagged = [name for name in present if name.endswith(lag_suffixes)]
    wanted = [*VARS_TO_KEEP_BASE, "U1", "U2", *lagged]
    feature_cols = [name for name in wanted if name in present]

    # 'target' is optional so that frames without it can be processed as well.
    id_cols = ["date_id", "target"] if "target" in present else ["date_id"]
    keep = id_cols + feature_cols

    # First pass: EWMA-based fill; second pass: zero for anything still null.
    ewma_filled = enriched.select(keep).with_columns(
        [pl.col(name).fill_null(pl.col(name).ewm_mean(com=0.5)) for name in feature_cols]
    )
    return ewma_filled.with_columns(
        [pl.col(name).fill_null(0.0) for name in feature_cols]
    )
    
def join_train_test_dataframes(train: pl.DataFrame, test: pl.DataFrame) -> pl.DataFrame:
    """Stack ``train`` on top of ``test``, restricted to the columns they share.

    The shared columns keep the order in which they appear in ``train``.
    """
    test_cols = set(test.columns)
    shared: list[str] = [name for name in train.columns if name in test_cols]
    frames = [frame.select(shared) for frame in (train, test)]
    return pl.concat(frames, how="vertical")

def split_dataset(train: pl.DataFrame, test: pl.DataFrame, features: list[str]) -> DatasetOutput: 
    """Separate features from the ``target`` column and standardise the features.

    A ``StandardScaler`` is fitted on the training features only and then
    applied to the test features. Scaled arrays are wrapped back into polars
    frames that carry the ``features`` column names.
    """
    targets_train = train.get_column('target')
    targets_test = test.get_column('target')

    scaler = StandardScaler()
    # scikit-learn works on NumPy arrays, hence the round trip through to_numpy().
    scaled_train = scaler.fit_transform(train.select(features).to_numpy())
    scaled_test = scaler.transform(test.select(features).to_numpy())

    return DatasetOutput(
        X_train=pl.from_numpy(scaled_train, schema=features),
        X_test=pl.from_numpy(scaled_test, schema=features),
        y_train=targets_train,
        y_test=targets_test,
        scaler=scaler,
    )

def convert_ret_to_signal(
    ret_arr: np.ndarray,
    params: RetToSignalParameters
) -> np.ndarray:
    """Map predicted returns onto a bounded signal.

    Each value becomes ``ret * signal_multiplier + 1`` and is then clipped to
    the ``[min_signal, max_signal]`` range taken from ``params``.
    """
    scaled = ret_arr * params.signal_multiplier + 1
    lower, upper = params.min_signal, params.max_signal
    return np.clip(scaled, lower, upper)


# Load the raw frames and print a short preview of each.
train: pl.DataFrame = load_trainset()
test: pl.DataFrame = load_testset() 
print(train.tail(3)) 
print(test.head(3))

# --- Data Preparation ---
# Features are engineered once on the stacked frame (train rows first, then test rows).
df: pl.DataFrame = join_train_test_dataframes(train, test)
df = create_example_dataset(df=df) 
# Split back by date_id. The right-hand sides still read the RAW train/test frames,
# because each name is only rebound after its filter expression has been evaluated.
train: pl.DataFrame = df.filter(pl.col('date_id').is_in(train.get_column('date_id')))
test: pl.DataFrame = df.filter(pl.col('date_id').is_in(test.get_column('date_id')))

# Define final feature list (now includes lags): every column except the id and the target.
FEATURES: list[str] = [col for col in test.columns if col not in ['date_id', 'target']]

dataset: DatasetOutput = split_dataset(train=train, test=test, features=FEATURES) 

# Unpack the split result into module-level names (scaler is reused by predict).
X_train: pl.DataFrame = dataset.X_train
X_test: pl.DataFrame = dataset.X_test
y_train: pl.Series = dataset.y_train
y_test: pl.Series = dataset.y_test
scaler: StandardScaler = dataset.scaler 

# Convert to NumPy for LGBM training
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()


# --- MODEL TRAINING (LightGBM) ---
print("\n--- Starting LightGBM Model Training ---")

# Keyword arguments handed to the scikit-learn style LightGBM regressor.
lgbm_params = dict(
    objective='regression',
    metric='rmse',
    n_estimators=500,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    verbose=-1,
    n_jobs=-1,
    seed=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = lgb.LGBMRegressor(**lgbm_params).fit(X_train_np, y_train_np)

print(f"--- LightGBM Model Trained with {model.n_estimators_} Estimators ---")
# --------------------------------------------------------------------------

def predict(test_chunk: pl.DataFrame) -> float:
    """Turn one chunk of test rows into a single trading signal.

    The chunk goes through the same feature pipeline as the training data, is
    reduced to the ``FEATURES`` columns, scaled with the fitted ``scaler`` and
    fed to ``model``. Only the prediction for the first row is used; it is
    converted to a signal and returned as a plain Python float.
    """
    feature_matrix = create_example_dataset(test_chunk).select(FEATURES).to_numpy()
    scaled_matrix: np.ndarray = scaler.transform(feature_matrix)

    first_prediction = model.predict(scaled_matrix)[0]

    signal = convert_ret_to_signal(np.array([first_prediction]), ret_signal_params)
    return signal.item()


# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Interactive / local run: smoke-test predict() on the first test row.
    local_test_chunk = load_testset().head(1)
    signal = predict(local_test_chunk)
    print(f"\nLocal Test Prediction Signal: {signal:.4f}")
    # Alternative local run through the gateway:
    # inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    # Competition rerun: hand control to the server.
    inference_server.serve()