# In [None]:
import pandas as pd
import os
from data_engineering import create_timeseries_for_symbol, prepare_regression_data, clean_data, prepare_prediction_data
from models import train_best_xgboost_model, evaluate_best_model
from pca import reduce_dimensions_pca

def combine_predictions(frames):
    """Merge per-symbol prediction frames into one frame ordered by ``row_id``.

    Args:
        frames: Sequence of DataFrames, each carrying ``row_id`` and
            ``responder_6`` columns.

    Returns:
        A single DataFrame sorted by ``row_id`` with a fresh 0..n-1 index.
        When ``frames`` is empty, an empty frame with the two expected
        columns is returned so callers can still inspect shape/head.
    """
    if not frames:
        return pd.DataFrame(columns=['row_id', 'responder_6'])
    # A single concat over the collected pieces avoids re-copying every
    # earlier row on each iteration and avoids concatenating onto an empty
    # frame (deprecated in recent pandas; result dtypes vary by version).
    combined = pd.concat(frames, ignore_index=True)
    return combined.sort_values('row_id').reset_index(drop=True)


if __name__ == "__main__":
    # Per symbol: fit PCA + XGBoost on that symbol's CSV history, report the
    # evaluation, then predict responder_6 for the symbol's test row.
    output_dir = "symbol_data"
    # Per-symbol prediction frames; merged once after the loop.
    prediction_frames = []

    # Load test data
    features_test = pd.read_parquet("test.parquet/date_id=0/part-0.parquet")
    responders_test = pd.read_parquet("lags.parquet/date_id=0/part-0.parquet")

    # Symbol ids 0..38, one CSV per symbol under output_dir.
    for symbol_id in range(39):
        csv_path = os.path.join(output_dir, f"symbol_{symbol_id}.csv")
        df = pd.read_csv(csv_path)
        print(f"\nTraining and Evaluating for symbol {symbol_id} ...")

        features, responders, target = create_timeseries_for_symbol(df, symbol_id)
        clean_features, clean_responders = clean_data(features, responders)
        X, y = prepare_regression_data(clean_features, clean_responders, target)

        # For specific number of components:
        X_reduced, pca, scaler = reduce_dimensions_pca(X.values, n_components=25)

        # Train and evaluate model
        model, X_train, X_test, y_train, y_test = train_best_xgboost_model(X_reduced, y)
        evaluate_best_model(model, X_train, X_test, y_train, y_test)

        symbol_features = features_test[features_test['symbol_id'] == symbol_id].copy()
        if symbol_features.empty:
            # Without this guard, `.values[0]` below raises IndexError for a
            # symbol that has no rows in the test file.
            print(f"No test rows for symbol {symbol_id}; skipping prediction.")
            continue
        row_id = symbol_features['row_id'].values[0]
        symbol_responders = responders_test[responders_test['symbol_id'] == symbol_id].copy()
        # NOTE: from here on X_test is the prediction input, no longer the
        # held-out split returned by train_best_xgboost_model above.
        X_test = prepare_prediction_data(symbol_features, symbol_responders)

        # In test: select the training columns in training order, then reuse
        # the scaler and PCA that were fitted on the training data.
        X_test_scaled = scaler.transform(X_test[X.columns].values)  # use training scaler
        X_test_reduced = pca.transform(X_test_scaled)  # use training PCA

        # Make prediction and wrap it in a one-row frame. Only the first
        # row_id is used, so DataFrame construction fails if `prediction`
        # does not hold exactly one value.
        prediction = model.predict(X_test_reduced)
        prediction_frames.append(pd.DataFrame({
            'row_id': [row_id],
            'responder_6': prediction
        }))

    # Merge all per-symbol predictions, sorted by row_id.
    all_predictions = combine_predictions(prediction_frames)

    print("\nPredictions DataFrame shape:", all_predictions.shape)
    print(all_predictions.head())