In [1]:
import pandas as pd
from methods import create_timeseries_for_symbol, prepare_regression_data, tune_xgboost, evaluate_model
from sklearn.model_selection import train_test_split

def main():
    """Tune and evaluate one XGBoost model per symbol of training partition 0.

    Reads the partition-0 training parquet file, then for every distinct
    ``symbol_id`` builds that symbol's regression data with the helpers
    imported from ``methods``, holds out 20% of the rows, tunes a model on
    the remaining rows and reports its performance through ``evaluate_model``.

    Returns:
        None. Progress is printed; whatever ``evaluate_model`` reports is its
        own responsibility (it is defined in ``methods``, not visible here).
    """
    # Load and prepare data (with all features and responders)
    df = pd.read_parquet("train.parquet/partition_id=0/part-0.parquet")
    list_of_symbols = df['symbol_id'].unique()

    print(f"\n Total number of symbols (financial instruments) is {len(list_of_symbols)}")

    for symbol_id in list_of_symbols:
        print(f"\nTraining and Evaluating for symbol {symbol_id} ...")
        features, responders, target = create_timeseries_for_symbol(df, symbol_id)
        X, y = prepare_regression_data(features, responders, target)

        # Hold out the test rows BEFORE tuning. Whatever tune_xgboost does
        # internally, it can then never see the rows used for the final score,
        # so the reported test metrics are not inflated by leakage.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        # NOTE(review): the split is shuffled; if row order is chronological
        # (the helper name suggests a time series) consider shuffle=False.

        # Tune model on the training rows only
        best_model = tune_xgboost(X_train, y_train)

        # Evaluate best model
        evaluate_model(best_model, X_train, X_test, y_train, y_test)

# if __name__ == "__main__":

#     main()

In [None]:
from methods import train_best_xgboost_model, evaluate_best_model

if __name__ == "__main__":
    # Load and prepare data (with all features and responders)
    df = pd.read_parquet("train.parquet/partition_id=0/part-0.parquet")
    list_of_symbols = df['symbol_id'].unique()

    print(f"\n Total number of symbols (financial instruments) is {len(list_of_symbols)}")

    # One model is trained per symbol. Every name assigned in the loop is
    # rebound on each pass, so after the loop `model` (which the prediction
    # cell looks up through globals()) refers to the LAST symbol's model only.
    for symbol_id in list_of_symbols:
        print(f"\nTraining and Evaluating for symbol {symbol_id} ...")
        features, responders, target = create_timeseries_for_symbol(df, symbol_id)
        X, y = prepare_regression_data(features, responders, target)

        # Train and evaluate model
        model, X_train, X_test, y_train, y_test = train_best_xgboost_model(X, y)
        evaluate_best_model(model, X_train, X_test, y_train, y_test)


In [None]:
def prepare_prediction_data(features_df, lags_df):
    """
    Assemble the model input matrix for prediction.

    Columns are taken from two sources and placed side by side:
    * from ``features_df``: every ``feature_*`` column that contains no
      missing value at all;
    * from ``lags_df``: every ``responder_*`` column whose name does not
      begin with ``responder_6``.

    A short summary of the selection is printed.

    Returns:
        DataFrame with the selected feature columns followed by the selected
        lagged-responder columns (rows are aligned on the index by concat).
    """
    # Drop every column holding at least one NaN, then keep the feature_* ones
    nan_free_frame = features_df.dropna(axis=1)
    feature_cols = []
    for name in nan_free_frame.columns:
        if name.startswith('feature_'):
            feature_cols.append(name)
    selected_features = nan_free_frame[feature_cols]

    # Lagged responders: anything responder_* except the responder_6 family
    lag_cols = []
    for name in lags_df.columns:
        if not name.startswith('responder_'):
            continue
        if name.startswith('responder_6'):
            continue
        lag_cols.append(name)
    selected_lags = lags_df[lag_cols]

    # Features first, lagged responders second
    X = pd.concat([selected_features, selected_lags], axis=1)

    print("\nPrediction data preparation:")
    print(f"Number of clean features: {len(feature_cols)}")
    print(f"Number of lagged responders: {len(lag_cols)}")
    print(f"Final X shape: {X.shape}")

    return X

def make_predictions(model, features_df, lags_df):
    """
    Predict responder_6 with an already trained model.

    The input matrix is built by ``prepare_prediction_data``; the model's
    ``predict`` output is paired with ``symbol_id`` from ``features_df``.
    When present, ``responder_6_lag_1`` (from ``lags_df``) is attached as
    ``actual_lag`` and ``weight`` (from ``features_df``) as ``weight``.

    The resulting frame is printed and returned.
    """
    # Build the design matrix and score it
    design_matrix = prepare_prediction_data(features_df, lags_df)
    predictions = model.predict(design_matrix)

    # Mandatory output columns
    results = pd.DataFrame({
        'symbol_id': features_df['symbol_id'],
        'predicted_responder_6': predictions
    })

    # Optional columns, copied only if the source frame carries them:
    # (source frame, source column, output column)
    optional_columns = (
        (lags_df, 'responder_6_lag_1', 'actual_lag'),
        (features_df, 'weight', 'weight'),
    )
    for source_frame, source_name, output_name in optional_columns:
        if source_name in source_frame.columns:
            results[output_name] = source_frame[source_name]

    print("\nPrediction Results:")
    print(results)

    return results

# Example usage: score the date_id=0 test partition with the model in scope
if __name__ == "__main__":
    # Read the test features and the lagged responders for the same date_id
    features = pd.read_parquet("test.parquet/date_id=0/part-0.parquet")
    responders = pd.read_parquet("lags.parquet/date_id=0/part-0.parquet")

    # Prediction needs a fitted model bound to the module-level name `model`;
    # without one, only a hint is printed.
    if 'model' not in globals():
        print("\nPlease ensure a trained model exists before running predictions")
    else:
        predictions = make_predictions(model, features, responders)