In [None]:
import pandas as pd
import numpy as np
import os
from data_engineering import create_timeseries_for_symbol, prepare_regression_data, clean_data, prepare_prediction_data, sample_training_data
from models import evaluate_best_model
from pca import reduce_dimensions_pca
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

if __name__ == "__main__":
    # Script flow:
    #   1. For each of the first `n` symbols, load its CSV, build features,
    #      reduce them with PCA and sample training rows.
    #   2. Pool all symbols, train a single XGBoost regressor and evaluate it
    #      on a held-out split.
    #   3. Predict `responder_6` for the test rows of each symbol and collect
    #      the results in `all_predictions`.
    output_dir = "symbol_data"

    # Number of symbols to process (symbol ids 0 .. n-1).
    n = 5

    # Load test data
    features_test = pd.read_parquet("test.parquet/date_id=0/part-0.parquet")
    responders_test = pd.read_parquet("lags.parquet/date_id=0/part-0.parquet")

    # Initialize the model once
    model = XGBRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        min_child_weight=3,
        subsample=0.8,
        random_state=42,
    )

    # Per-symbol training data, pooled after the loop.
    all_X_reduced = []
    all_y = []

    # symbol_id -> (feature columns, scaler, pca) returned while processing
    # that symbol. Prediction must reuse the transform that produced the
    # symbol's training features; previously only the objects left over from
    # the last loop iteration were used for every symbol.
    symbol_transformers = {}

    # First pass: collect and process all data
    for symbol_id in range(n):
        csv_path = os.path.join(output_dir, f"symbol_{symbol_id}.csv")
        df = pd.read_csv(csv_path)
        print(f"\nProcessing data for symbol {symbol_id} ...")

        features, responders, target = create_timeseries_for_symbol(df, symbol_id)
        clean_features, clean_responders = clean_data(features, responders)
        X, y = prepare_regression_data(clean_features, clean_responders, target)

        # Reduce dimensions.
        # NOTE(review): presumably the returned scaler/pca are fitted on this
        # symbol's data only - confirm against reduce_dimensions_pca.
        X_reduced, pca, scaler = reduce_dimensions_pca(X.values, n_components=25)
        symbol_transformers[symbol_id] = (X.columns, scaler, pca)

        # sample_training_data is fed numpy arrays, so unwrap a pandas Series.
        if isinstance(y, pd.Series):
            y = y.to_numpy()

        X_sampled, y_sampled = sample_training_data(X_reduced, y)

        # Store processed data
        all_X_reduced.append(X_sampled)
        all_y.append(y_sampled)

    # Combine all data
    X_combined = np.vstack(all_X_reduced)
    y_combined = np.concatenate(all_y)

    # Single train-test split on combined data
    X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y_combined, test_size=0.2, random_state=42
    )

    # Train model once on all data
    print("\nTraining model on combined data...")
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=True,
    )

    # Evaluate model on combined test set
    print("\nEvaluating model on combined test set...")
    evaluate_best_model(model, X_train, X_test, y_train, y_test)

    # Make predictions for each symbol. Frames are collected in a list and
    # concatenated once, instead of growing a DataFrame inside the loop.
    prediction_frames = []
    for symbol_id in range(n):
        symbol_features = features_test[features_test['symbol_id'] == symbol_id].copy()
        if symbol_features.empty:
            # Indexing `.values[0]` below would raise IndexError.
            print(f"\nNo test rows for symbol {symbol_id}; skipping prediction.")
            continue

        # NOTE(review): only the first row_id is kept and it is broadcast over
        # every predicted row - this assumes one test row per symbol; confirm.
        row_id = symbol_features['row_id'].values[0]
        symbol_responders = responders_test[responders_test['symbol_id'] == symbol_id].copy()

        # Named X_pred so the held-out X_test split above is not overwritten.
        X_pred = prepare_prediction_data(symbol_features, symbol_responders)

        # Apply the scaler and PCA that belong to this symbol.
        feature_columns, scaler, pca = symbol_transformers[symbol_id]
        X_pred_scaled = scaler.transform(X_pred[feature_columns].values)
        X_pred_reduced = pca.transform(X_pred_scaled)

        # Make prediction
        prediction = model.predict(X_pred_reduced)

        prediction_frames.append(pd.DataFrame({
            'row_id': row_id,
            'responder_6': prediction,
        }))

    if prediction_frames:
        all_predictions = pd.concat(prediction_frames, ignore_index=True)
    else:
        # Keep the expected schema even when no symbol had test rows.
        all_predictions = pd.DataFrame(columns=['row_id', 'responder_6'])

    # Sort by row_id
    all_predictions = all_predictions.sort_values('row_id').reset_index(drop=True)
    print("\nPredictions DataFrame shape:", all_predictions.shape)
    print(all_predictions.head())