In [1]:
# Training dataset
import pandas as pd
import os
from data_engineering import create_timeseries_for_symbol, prepare_regression_data, clean_data
from pca import reduce_dimensions_pca

if __name__ == "__main__":

    # Locate and load the per-symbol CSV export.
    symbol_id = 0
    output_dir = "symbol_data"
    csv_name = f"symbol_{symbol_id}.csv"
    csv_path = os.path.join(output_dir, csv_name)
    df = pd.read_csv(csv_path)

    # Split the raw frame into feature / responder / target series for this
    # symbol, then run the cleaning step on the features and responders.
    symbol_series = create_timeseries_for_symbol(df, symbol_id)
    features, responders, target = symbol_series
    cleaned = clean_data(features, responders)
    clean_features, clean_responders = cleaned

    # Assemble the regression inputs (X) and target (y).
    X, y = prepare_regression_data(
        clean_features,
        clean_responders,
        target=target,
    )

    # Reduce X to a fixed number of principal components. The returned
    # `pca` and `scaler` objects are reused later to transform the test data.
    n_components = 25
    design_matrix = X.values
    X_reduced, pca, scaler = reduce_dimensions_pca(
        design_matrix,
        n_components=n_components,
    )



Feature series shape: (1467617, 79)

Responder series shape: (1467617, 8)

Target series shape: (1467617,)

Total feature columns before cleaning: 79
Total responder columns before cleaning: 8

Feature columns dropped due to NaN values:
- feature_00
- feature_01
- feature_02
- feature_03
- feature_04
- feature_08
- feature_15
- feature_17
- feature_21
- feature_26
- feature_27
- feature_31
- feature_32
- feature_33
- feature_39
- feature_40
- feature_41
- feature_42
- feature_43
- feature_44
- feature_45
- feature_46
- feature_50
- feature_51
- feature_52
- feature_53
- feature_54
- feature_55
- feature_58
- feature_62
- feature_63
- feature_64
- feature_65
- feature_66
- feature_73
- feature_74
- feature_75
- feature_76
- feature_77
- feature_78

Number of clean features: 39
Number of clean responders: 8

Regression data shapes:
X shape: (1467617, 47)
y shape: (1467617,)

Variance explained: 0.924


In [2]:
# Test dataset
from data_engineering import prepare_prediction_data

if __name__ == "__main__":
    # This cell depends on names defined by the training cell: `symbol_id`,
    # `X` (the training regression frame), `scaler`, and `pca`. Run the
    # training cell first on a fresh kernel.
    features = pd.read_parquet("test.parquet/date_id=0/part-0.parquet")
    responders = pd.read_parquet("lags.parquet/date_id=0/part-0.parquet")

    # Keep only the rows belonging to the symbol used for training.
    symbol_features = features[features['symbol_id'] == symbol_id].copy()

    symbol_responders = responders[responders['symbol_id'] == symbol_id].copy()

    X_test = prepare_prediction_data(symbol_features, symbol_responders)

    # X_test comes from a separate preparation call, so its column set is not
    # guaranteed to contain every training column. Report the missing ones
    # explicitly rather than relying on the generic KeyError raised by the
    # column selection below.
    missing_columns = [col for col in X.columns if col not in X_test.columns]
    if missing_columns:
        raise KeyError(
            f"X_test is missing {len(missing_columns)} column(s) required by "
            f"the training scaler/PCA: {missing_columns}"
        )

    # In test:
    # Select the training columns, in training order, before applying the
    # transforms that were fitted on the training data.
    X_test_scaled = scaler.transform(X_test[X.columns].values)  # use training scaler
    X_test_reduced = pca.transform(X_test_scaled)  # use training PCA


Prediction data preparation:
Number of clean features: 64
Number of lagged responders: 8
Final X shape: (1, 72)
