In [None]:
import pandas as pd
import os
from data_engineering import create_timeseries_for_symbol, prepare_regression_data, clean_data, prepare_prediction_data
from models import train_best_xgboost_model, evaluate_best_model
from pca import reduce_dimensions_pca

if __name__ == "__main__":
    # ---- Input: one CSV per symbol, stored under `symbol_data/` ----------
    symbol_id = 0
    output_dir = "symbol_data"
    csv_path = os.path.join(
        output_dir,
        f"symbol_{symbol_id}.csv",
    )
    df = pd.read_csv(csv_path)

    print(f"\nTraining and Evaluating for symbol {symbol_id} ...")

    # ---- Feature engineering ---------------------------------------------
    # The project helper returns three objects for the chosen symbol; the
    # first two are cleaned before the regression matrices are assembled.
    (
        features,
        responders,
        target,
    ) = create_timeseries_for_symbol(df, symbol_id)
    (
        clean_features,
        clean_responders,
    ) = clean_data(features, responders)
    X, y = prepare_regression_data(
        clean_features,
        clean_responders,
        target,
    )

    # ---- Dimensionality reduction ----------------------------------------
    # Fixed at 25 components. The fitted `pca` and `scaler` objects are kept
    # at module level because the prediction code further down in this file
    # applies them to unseen data.
    # NOTE(review): presumably the helper scales before projecting -- confirm
    # in `pca.reduce_dimensions_pca`.
    X_reduced, pca, scaler = reduce_dimensions_pca(
        X.values,
        n_components=25,
    )

    # ---- Model fitting and reporting -------------------------------------
    # The helper hands back the fitted model together with the train/test
    # partitions, which are passed straight on to the evaluation helper.
    (
        model,
        X_train,
        X_test,
        y_train,
        y_test,
    ) = train_best_xgboost_model(X_reduced, y)
    evaluate_best_model(model, X_train, X_test, y_train, y_test)


In [None]:
if __name__ == "__main__":
    # Inference step. Depends on names bound by the training code earlier in
    # this file: `symbol_id`, `X`, `scaler`, `pca` and `model`.
    # NOTE(review): `features`, `responders` and `X_test` are rebound here,
    # replacing the training-time objects of the same names.

    # ---- Load the first date partition of the test and lag datasets ------
    features = pd.read_parquet("test.parquet/date_id=0/part-0.parquet")
    responders = pd.read_parquet("lags.parquet/date_id=0/part-0.parquet")

    # ---- Keep only the rows for the symbol the model was trained on ------
    # `.copy()` yields independent frames rather than slices of the originals.
    symbol_features = features.loc[
        features['symbol_id'] == symbol_id
    ].copy()
    symbol_responders = responders.loc[
        responders['symbol_id'] == symbol_id
    ].copy()

    X_test = prepare_prediction_data(symbol_features, symbol_responders)

    # ---- Apply the transforms fitted during training ---------------------
    # Columns are selected via `X.columns` so the array handed to the scaler
    # has the same column order as the training matrix.
    X_test_scaled = scaler.transform(
        X_test[X.columns].values
    )
    X_test_reduced = pca.transform(X_test_scaled)

    # ---- Predict with the model fitted during training -------------------
    predictions = model.predict(X_test_reduced)