In [1]:
import pandas as pd

# Read the ten training partitions (ids 0-9) into a dict keyed by partition id.
df_partition = {
    partition: pd.read_parquet(f"train.parquet/partition_id={partition}/part-0.parquet")
    for partition in range(10)
}


In [None]:
import numpy as np
from data_engineering import create_timeseries_for_symbol, prepare_regression_data, clean_data, sample_training_data
from pca import reduce_dimensions_pca
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Number of symbols handled by this notebook: symbol ids 0 .. n-1.
n = 2

# One regressor shared by every symbol; it is fitted once on the pooled data.
model = XGBRegressor(
    random_state=42,
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=3,
    subsample=0.8,
)

# Accumulators for the sampled training rows contributed by each symbol.
all_X_reduced = []
all_y = []

# Per-symbol preprocessing artefacts, keyed by symbol_id, reused at prediction time.
pca = {}
scaler = {}
columns_to_keep = {}

# First pass: build the reduced, sampled training matrix of every symbol.
for symbol_id in range(n):

    print(f'\nProcessing raw data for symbol {symbol_id} ...')

    # Collect this symbol's rows from every loaded partition. Iterating over
    # the dict (instead of a hard-coded range(10)) keeps this loop in step
    # with however many partitions df_partition actually holds.
    symbol_data = []
    for df in df_partition.values():
        symbol_partition = df[df['symbol_id'] == symbol_id]
        if not symbol_partition.empty:
            symbol_data.append(symbol_partition)

    if not symbol_data:
        # Without this guard df_symbol would be undefined (first symbol) or
        # would still hold the previous symbol's rows, which would then be
        # processed and stored under the wrong symbol_id.
        print(f'\nNo rows found for symbol {symbol_id}; skipping ...')
        continue

    df_symbol = pd.concat(symbol_data, ignore_index=True)

    print(f'\nDone processing raw data for symbol {symbol_id} ...')

    # Project helpers from data_engineering; their exact semantics are not
    # visible in this notebook. They end with a feature frame X and target y.
    features, responders, target = create_timeseries_for_symbol(df_symbol, symbol_id)
    clean_features, clean_responders = clean_data(features, responders)
    X, y = prepare_regression_data(clean_features, clean_responders, target)

    # Remember the training columns so prediction can select the same ones.
    columns_to_keep[symbol_id] = X.columns

    # Reduce dimensions; the fitted PCA and scaler are kept per symbol so the
    # same transform can be re-applied when predicting.
    X_reduced, pca[symbol_id], scaler[symbol_id] = reduce_dimensions_pca(X.values, n_components=25)

    # Downstream code works on numpy arrays, so unwrap a pandas Series target.
    if isinstance(y, pd.Series):
        y = y.to_numpy()

    X_sampled, y_sampled = sample_training_data(X_reduced, y)

    # Store processed data
    all_X_reduced.append(X_sampled)
    all_y.append(y_sampled)

# Pool the per-symbol samples into one feature matrix and one target vector.
X_combined = np.vstack(all_X_reduced)
y_combined = np.concatenate(all_y)

# One 80/20 split over the pooled rows (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y_combined,
    test_size=0.2,
    random_state=42,
)

# Fit the shared model once, reporting metrics on both the train and held-out rows.
print("\nTraining model on combined data...")
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=True,
)
print("\nCompleted Training model on combined data...")

In [None]:
############## Testing ###############
from data_engineering import prepare_prediction_data
# Read the date_id=0 test features and the matching lagged responders.
features_test, responders_test = map(
    pd.read_parquet,
    (
        "test.parquet/date_id=0/part-0.parquet",
        "lags.parquet/date_id=0/part-0.parquet",
    ),
)

def predict(test, lags):
    """Predict ``responder_6`` for every trained symbol present in ``test``.

    Reads the notebook globals ``n``, ``scaler``, ``pca``, ``columns_to_keep``
    and ``model``, which must already be populated by the training code.

    Parameters
    ----------
    test : pandas.DataFrame
        Feature rows; must contain ``symbol_id`` and ``row_id`` columns.
    lags : pandas.DataFrame
        Lagged responder rows; must contain a ``symbol_id`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``row_id`` and ``responder_6``, sorted by ``row_id``. Symbols
        without test rows or without fitted preprocessing are skipped; if
        nothing could be predicted an empty frame with those columns is
        returned.
    """
    # Collect one small frame per symbol and concatenate once at the end,
    # rather than growing a DataFrame inside the loop.
    symbol_frames = []

    # Make predictions for each symbol
    for symbol_id in range(n):
        symbol_features = test[test['symbol_id'] == symbol_id].copy()
        if symbol_features.empty:
            # Symbol not in this batch; indexing its first row_id would raise.
            continue

        if symbol_id not in scaler or symbol_id not in pca or symbol_id not in columns_to_keep:
            print(f"\nNo fitted preprocessing for symbol {symbol_id}; skipping ...")
            continue

        row_ids = symbol_features['row_id'].to_numpy()

        symbol_responders = lags[lags['symbol_id'] == symbol_id].copy()

        X_test = prepare_prediction_data(symbol_features, symbol_responders)

        # Apply the scaler and PCA that were fitted for this symbol, on the
        # same columns that were used to fit them.
        X_test_scaled = scaler[symbol_id].transform(X_test[columns_to_keep[symbol_id]].values)
        X_test_reduced = pca[symbol_id].transform(X_test_scaled)

        # Make prediction
        prediction = model.predict(X_test_reduced)

        # When there is one prediction per test row, pair each with its own
        # row_id (assumes prepare_prediction_data keeps the row order - TODO
        # confirm). Otherwise fall back to the first row_id broadcast over the
        # predictions, as before.
        if len(prediction) == len(row_ids):
            ids = row_ids
        else:
            ids = row_ids[0]

        symbol_frames.append(pd.DataFrame({'row_id': ids, 'responder_6': prediction}))

    if symbol_frames:
        predictions = pd.concat(symbol_frames, ignore_index=True)
    else:
        predictions = pd.DataFrame(columns=['row_id', 'responder_6'])

    # Sort by row_id
    predictions = predictions.sort_values('row_id').reset_index(drop=True)
    print("\nPredictions DataFrame shape:", predictions.shape)
    print(predictions.head())

    return predictions

# Run inference on the loaded test features and lagged responders; as the
# cell's last expression, the returned DataFrame is also displayed.
predict(features_test, responders_test)