In [None]:
import polars as pl
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import os

# Input files are resolved relative to the directory the notebook runs from.
script_directory = os.getcwd()
data_directory = os.path.join(script_directory, '..', 'data')

# Size of the subset to work with.
n_partitions = 1  # For full data, n_partitions=10
n_symbols = 1  # There are I think 39 symbols

# Local test batch and its lagged responders.
test_path = os.path.join(data_directory, "test.parquet")
lags_path = os.path.join(data_directory, "lags.parquet")
features_test = pd.read_parquet(test_path)
responders_test = pd.read_parquet(lags_path)

# Training partitions, keyed by partition number.
df_partition = {}
for partition in range(n_partitions):
    partition_path = os.path.join(data_directory, f"train_{partition}.parquet")
    df_partition[partition] = pd.read_parquet(partition_path)


In [None]:
def clean_data(features, responders):
    """Drop every column that contains at least one NaN from both frames.

    Args:
        features: DataFrame of feature columns.
        responders: DataFrame of responder columns.

    Returns:
        Tuple ``(clean_features, clean_responders)``; each keeps only the
        fully populated columns and has its index reset to 0..n-1.
    """

    def _without_nan_columns(frame):
        # A column survives only if every one of its values is present.
        fully_populated = frame.notna().all()
        return frame.loc[:, fully_populated].reset_index(drop=True)

    n_feature_cols = len(features.columns)
    n_responder_cols = len(responders.columns)
    print(f"\nTotal feature columns before cleaning: {n_feature_cols}")
    print(f"Total responder columns before cleaning: {n_responder_cols}")

    clean_features = _without_nan_columns(features)
    clean_responders = _without_nan_columns(responders)

    n_clean_features = len(clean_features.columns)
    n_clean_responders = len(clean_responders.columns)
    print(f"\nNumber of clean features: {n_clean_features}")
    print(f"Number of clean responders: {n_clean_responders}")

    return clean_features, clean_responders


def create_timeseries_for_symbol(df, symbol_id):
    """Build feature, lagged-responder and target series for one symbol.

    Rows for ``symbol_id`` are ordered by ``(date_id, time_id)``. The first
    date is used only to seed the responder series: its last row is placed in
    front of the responders of all later rows, and the final later row is
    dropped, so the responder series is one row behind the feature/target rows.

    Args:
        df: DataFrame with ``symbol_id``, ``date_id``, ``time_id``,
            ``feature_*`` and ``responder_*`` columns.
        symbol_id: Symbol to extract.

    Returns:
        Tuple ``(feature_series, responder_series, target_series)``. Only
        ``target_series`` has its index reset; the other two keep the
        original row labels.
    """
    target_col = 'responder_6'
    feature_cols = [name for name in df.columns if name.startswith('feature_')]
    responder_cols = [
        name for name in df.columns
        if name.startswith('responder_') and name != target_col
    ]

    # Rows of this symbol in chronological order.
    symbol_data = (
        df[df['symbol_id'] == symbol_id]
        .copy()
        .sort_values(['date_id', 'time_id'])
    )

    # Responders of the last row of the first date seed the lagged series.
    first_date = symbol_data['date_id'].min()
    on_first_date = symbol_data['date_id'] == first_date
    first_date_last_time = symbol_data.loc[on_first_date, 'time_id'].max()
    seed_row = on_first_date & (symbol_data['time_id'] == first_date_last_time)
    first_date_last_responders = symbol_data.loc[seed_row, responder_cols]

    # Everything after the first date provides features and the target.
    later_rows = symbol_data[symbol_data['date_id'] > first_date]
    feature_series = later_rows[feature_cols].copy()
    target_series = later_rows[target_col].copy().reset_index(drop=True)

    # Responders: seed row first, then every later row except the last one.
    responder_data = later_rows[responder_cols].iloc[:-1]
    responder_series = pd.concat([first_date_last_responders, responder_data])

    print(f"\nFeature series shape: {feature_series.shape}")
    print(f"\nResponder series shape: {responder_series.shape}")
    print(f"\nTarget series shape: {target_series.shape}")

    return feature_series, responder_series, target_series


def prepare_regression_data(features, responders, target=None):
    """Join features and responders column-wise on their shared index labels.

    Args:
        features: DataFrame of feature columns.
        responders: DataFrame of responder columns.
        target: Optional Series of target values. When given, only index
            labels present in all three inputs are kept.

    Returns:
        ``(X, y)`` when ``target`` is given, otherwise just ``X``.
        (The previous ``return X, y if ... else X`` parsed as
        ``X, (y if ... else X)`` and so returned ``(X, X)`` without a target.)
    """
    # Only labels present in every input can be lined up row by row.
    common_indices = features.index.intersection(responders.index)
    if target is not None:
        common_indices = common_indices.intersection(target.index)

    X = pd.concat([features.loc[common_indices], responders.loc[common_indices]], axis=1)

    if target is None:
        print(f"X shape: {X.shape}")
        return X

    y = target.loc[common_indices]

    print("\nRegression data shapes:")
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")

    return X, y


def sample_training_data(X_train, y_train, sample_fraction=1/10, n_bins=10, random_state=None):
    """Down-sample training data, stratified by bins of the target.

    The target is split into ``n_bins`` quantile bins (equal-width bins if
    quantile binning raises), and up to ``sample_size // n_bins`` rows (at
    least one) are drawn without replacement from each bin that exists. If
    binning fails or yields nothing, a plain random sample is drawn instead.

    Args:
        X_train: Array indexed positionally with an integer array
            (presumably a 2-D NumPy array; confirm against callers).
        y_train: 1-D array of targets, positionally aligned with ``X_train``.
        sample_fraction: Fraction of rows to aim for.
        n_bins: Number of target bins used for stratification.
        random_state: Optional seed. ``None`` (default) draws from NumPy's
            global random state exactly as before; an integer makes the
            sample reproducible without touching the global state.

    Returns:
        Tuple ``(X_train_sampled, y_train_sampled)``.

    Raises:
        ValueError: If no samples could be produced.
    """
    # Either the global NumPy RNG (legacy behaviour) or a private seeded one.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    sample_size = int(len(X_train) * sample_fraction)

    sampling_succeeded = False
    X_train_sampled = None
    y_train_sampled = None

    y_train_df = pd.DataFrame(y_train, columns=['target'])

    try:
        # Quantile bins first: roughly equal population per bin.
        y_train_df['bins'] = pd.qcut(y_train_df['target'], q=n_bins, labels=False, duplicates='drop')
        sampling_succeeded = True
    except ValueError:
        try:
            # Equal-width bins as a second attempt.
            y_train_df['bins'] = pd.cut(y_train_df['target'], bins=n_bins, labels=False)
            sampling_succeeded = True
        except ValueError:
            print("Warning: Binning methods failed. Falling back to random sampling.")
            sampling_succeeded = False

    if sampling_succeeded:
        bin_sample_size = max(1, sample_size // n_bins)  # Ensure at least 1 sample per bin
        sampled_indices = []

        # duplicates='drop' can leave fewer bins than requested, so iterate
        # over the bins that actually occur.
        existing_bins = y_train_df['bins'].dropna().unique()

        for bin_idx in existing_bins:
            bin_indices = y_train_df[y_train_df['bins'] == bin_idx].index.values
            if len(bin_indices) > 0:
                n_samples = min(len(bin_indices), bin_sample_size)
                sampled_bin_indices = rng.choice(bin_indices, n_samples, replace=False)
                sampled_indices.extend(sampled_bin_indices)

        sampled_indices = np.array(sampled_indices)

        if len(sampled_indices) > 0:
            X_train_sampled = X_train[sampled_indices]
            y_train_sampled = y_train[sampled_indices]
        else:
            sampling_succeeded = False

    # Binning failed or produced nothing: fall back to uniform sampling.
    if not sampling_succeeded or X_train_sampled is None or len(X_train_sampled) == 0:
        print("Falling back to random sampling...")
        sampled_indices = rng.choice(
            len(X_train),
            size=min(sample_size, len(X_train)),
            replace=False
        )
        X_train_sampled = X_train[sampled_indices]
        y_train_sampled = y_train[sampled_indices]

    if len(X_train_sampled) == 0 or len(y_train_sampled) == 0:
        raise ValueError("Failed to create valid samples for tree-based models")

    print("\nSampled data shapes:")
    print(f"X shape: {X_train_sampled.shape} (samples, features+responders)")
    print(f"y shape: {y_train_sampled.shape}")

    return X_train_sampled, y_train_sampled


def reduce_dimensions_pca(X, n_components=None, variance_threshold=None):
    """Standardise ``X`` and project it onto its leading principal components.

    Args:
        X: 2-D array-like of shape ``(n_samples, n_features)``.
        n_components: Number of components to keep. Ignored when
            ``variance_threshold`` is given.
        variance_threshold: If truthy, keep the smallest number of components
            whose cumulative explained-variance ratio reaches this value.
            If the threshold is never reached (for example a value above 1,
            or round-off at exactly 1.0), all components are kept.

    With neither argument set, ``min(n_features, 10)`` components are used.

    Returns:
        Tuple ``(X_reduced, pca, scaler)``: the transformed data plus the
        fitted ``PCA`` and ``StandardScaler``, for reuse on new data.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    if n_components is None and variance_threshold is None:
        n_components = min(X.shape[1], 10)  # Default to 10 components

    if variance_threshold:
        temp_pca = PCA()
        temp_pca.fit(X_scaled)
        cumsum = np.cumsum(temp_pca.explained_variance_ratio_)
        reached = cumsum >= variance_threshold
        if reached.any():
            # argmax on a boolean array gives the first True position.
            n_components = int(np.argmax(reached)) + 1
        else:
            # argmax would return 0 here and silently select one component.
            n_components = len(cumsum)

    pca = PCA(n_components=n_components)
    X_reduced = pca.fit_transform(X_scaled)

    print(f"\nVariance explained: {np.sum(pca.explained_variance_ratio_):.3f}")

    return X_reduced, pca, scaler


def prepare_prediction_data(features_df, lags_df):
    """Assemble the model input from test features and lagged responders.

    Args:
        features_df: DataFrame holding ``feature_*`` columns; other columns
            are ignored, as are feature columns containing any NaN.
        lags_df: DataFrame holding ``responder_*_lag_1`` columns. It is not
            modified (the previous version renamed its columns in place).

    Returns:
        DataFrame with the NaN-free ``feature_*`` columns followed by the
        lagged responders, renamed to ``responder_*``, excluding
        ``responder_6``.
    """
    # Keep only NaN-free feature columns.
    clean_features = features_df.loc[:, ~features_df.isna().any()]
    feature_cols = [col for col in clean_features.columns if col.startswith('feature_')]
    clean_features = clean_features[feature_cols]

    # Strip the lag suffix on a renamed copy so the column names match the
    # training data without touching the caller's frame.
    renamed_lags = lags_df.rename(
        columns=lambda col: col.replace('_lag_1', '') if col.startswith('responder_') else col
    )
    lag_cols = [
        col for col in renamed_lags.columns
        if col.startswith('responder_') and not col.startswith('responder_6')
    ]
    responder_lags = renamed_lags[lag_cols]

    # NOTE(review): concat aligns on index labels, so rows are only paired
    # correctly when both frames share the same index; verify against callers.
    X = pd.concat([clean_features, responder_lags], axis=1)

    print("\nPrediction data preparation:")
    print(f"Number of clean features: {len(feature_cols)}")
    print(f"Number of lagged responders: {len(lag_cols)}")
    print(f"Final X shape: {X.shape}")

    return X


# Hyperparameters shared by every per-symbol model.
MODEL_PARAMS = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=3,
    subsample=0.8,
    random_state=42,
)

# Upper bound on the number of principal components kept per symbol.
N_PCA_COMPONENTS = 25

# Seed NumPy's global RNG so the sub-sampling below is reproducible.
np.random.seed(42)

# `model` always refers to the most recently created estimator; the fitted
# per-symbol estimators are stored in `models`.
model = XGBRegressor(**MODEL_PARAMS)

# Initialize lists to store all training data
all_X_reduced = []
all_y = []

# Per-symbol artefacts needed again at prediction time.
pca = {}
scaler = {}
columns_to_keep = {}
models = {}

for symbol_id in range(n_symbols):

    print(f'\nProcessing raw data for symbol {symbol_id} ...')

    # Collect this symbol's rows from every loaded partition.
    symbol_data = []

    for partition in range(n_partitions):

        df = df_partition[partition]
        symbol_partition = df[df['symbol_id'] == symbol_id]

        if not symbol_partition.empty:
            symbol_data.append(symbol_partition)

    if not symbol_data:
        # Without this guard `df_symbol` would be undefined, or left over
        # from the previous symbol.
        print(f'\nNo rows found for symbol {symbol_id}; skipping ...')
        continue

    df_symbol = pd.concat(symbol_data, ignore_index=True)

    print(f'\nDone processing raw data for symbol {symbol_id} ...')

    features, responders, target = create_timeseries_for_symbol(df_symbol, symbol_id)
    clean_features, clean_responders = clean_data(features, responders)
    X, y = prepare_regression_data(clean_features, clean_responders, target)
    columns_to_keep[symbol_id] = X.columns

    # Reduce dimensions; PCA cannot return more components than there are
    # rows or columns, so cap the request.
    X_reduced, pca[symbol_id], scaler[symbol_id] = reduce_dimensions_pca(
        X.values, n_components=min(N_PCA_COMPONENTS, *X.shape)
    )

    # The sampler indexes positionally, so hand it a NumPy array.
    if isinstance(y, pd.Series):
        y = y.to_numpy()

    X_sampled, y_sampled = sample_training_data(X_reduced, y)

    # Single train-test split on combined data for the symbol
    X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42)

    # Fit a fresh estimator per symbol. Refitting one shared instance made
    # every entry of `models` point at the last symbol's fit.
    print(f"\nTraining model on combined data for symbol {symbol_id}...")
    model = XGBRegressor(**MODEL_PARAMS)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=True)
    models[symbol_id] = model
    print(f"\nCompleted Training model on combined data for symbol {symbol_id}...")

In [None]:
# Most recent lags frame handed to predict(); reused when a call passes None.
lags_ : pl.DataFrame | None = None


def _to_pandas(frame):
    """Return ``frame`` as a pandas DataFrame, converting from polars if needed."""
    return frame.to_pandas() if isinstance(frame, pl.DataFrame) else frame


def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for every trained symbol present in ``test``.

    Args:
        test: Batch of rows to score (polars or pandas) with ``row_id``,
            ``symbol_id`` and ``feature_*`` columns.
        lags: Lagged responders (polars or pandas) with ``symbol_id`` and
            ``responder_*_lag_1`` columns, or ``None`` to reuse the frame
            from the most recent call that supplied one.

    Returns:
        pandas DataFrame with columns ``['row_id', 'responder_6']`` sorted
        by ``row_id``.

    Raises:
        ValueError: If no lags have been supplied in this or an earlier call.
    """
    global lags_

    if lags is not None:
        lags_ = lags

    if lags_ is None:
        raise ValueError('predict() was called without lags before any lags were provided')

    # The per-symbol filtering below uses pandas boolean indexing.
    test_pd = _to_pandas(test)
    lags_pd = _to_pandas(lags_)

    per_symbol_predictions = []

    for symbol_id in range(n_symbols):

        symbol_features = test_pd[test_pd['symbol_id'] == symbol_id].copy()
        if symbol_features.empty:
            # Nothing to score for this symbol in this batch.
            continue

        symbol_responders = lags_pd[lags_pd['symbol_id'] == symbol_id].copy()

        X_test = prepare_prediction_data(symbol_features, symbol_responders)

        # Apply this symbol's fitted scaler and PCA to the training columns.
        X_test_scaled = scaler[symbol_id].transform(X_test[columns_to_keep[symbol_id]].values)
        X_test_reduced = pca[symbol_id].transform(X_test_scaled)

        prediction = models[symbol_id].predict(X_test_reduced)

        row_ids = symbol_features['row_id'].to_numpy()
        if len(row_ids) != len(prediction):
            # NOTE(review): features and lags did not line up row for row.
            # Keep the earlier behaviour of labelling every prediction with
            # the first row_id; the upstream alignment should be confirmed.
            row_ids = row_ids[0]

        per_symbol_predictions.append(
            pd.DataFrame({'row_id': row_ids, 'responder_6': prediction})
        )

    # Build the result once rather than growing a frame inside the loop.
    if per_symbol_predictions:
        predictions = pd.concat(per_symbol_predictions, ignore_index=True)
    else:
        predictions = pd.DataFrame(columns=['row_id', 'responder_6'])

    predictions = predictions.sort_values('row_id').reset_index(drop=True)

    if isinstance(predictions, pl.DataFrame):
        assert predictions.columns == ['row_id', 'responder_6']

    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == ['row_id', 'responder_6']).all()

    else:
        raise TypeError('The predict function must return a DataFrame')

    return predictions


# Run predict() once on the locally loaded test and lags frames; as the last
# expression of the cell, the returned DataFrame is displayed.
predict(features_test, responders_test)