In [None]:
import setup_path
import pandas as pd
import os
from src.data_engineering import create_timeseries_for_symbol, prepare_regression_data, clean_data, prepare_prediction_data
from src.pca import reduce_dimensions_pca
from src.models import train_and_evaluate_multiple_models

# Resolve the data folder relative to the current working directory
# (the notebook is expected to be started one level below the repo root).
script_directory = os.getcwd()
data_directory = os.path.join(script_directory, '..', 'data')

n_partitions = 1  # For full data, n_partitions=10
n_symbols = 1  # Believed to be 39 symbols in the full data (unverified)

# Load every requested training partition once, keyed by partition index,
# so the per-symbol loop can filter in memory instead of re-reading files.
df_partition = {}
for partition in range(n_partitions):
    partition_file = os.path.join(data_directory, f"train_{partition}.parquet")
    df_partition[partition] = pd.read_parquet(partition_file)

# Train and evaluate models one symbol at a time.
# NOTE: X, pca and scaler are overwritten on every iteration, so after the
# loop they describe the LAST symbol that was actually processed. Any later
# cell that reuses them must be working on that same symbol.
for symbol_id in range(n_symbols):

    print(f'\nProcessing symbol {symbol_id} ...')

    # Collect this symbol's rows from every loaded partition, ignoring
    # partitions in which the symbol does not appear.
    symbol_data = []

    for partition in range(n_partitions):

        df = df_partition[partition]
        symbol_partition = df[df['symbol_id'] == symbol_id]

        if not symbol_partition.empty:
            symbol_data.append(symbol_partition)

    if not symbol_data:
        # Nothing to train on. Without this guard the code below would either
        # fail on an undefined df_symbol (first symbol) or silently reuse the
        # previous symbol's frame and fit a model on the wrong data.
        print(f'No data found for symbol {symbol_id}, skipping.')
        continue

    df_symbol = pd.concat(symbol_data, ignore_index=True)

    features, responders, target = create_timeseries_for_symbol(df_symbol, symbol_id)

    clean_features, clean_responders = clean_data(features, responders)

    X, y = prepare_regression_data(clean_features, clean_responders, target=target)

    # Reduce to a fixed number of components; the returned pca and scaler
    # objects are kept so the same transform can be applied at test time.
    X_reduced, pca, scaler = reduce_dimensions_pca(X.values, n_components=25)

    train_and_evaluate_multiple_models(X_reduced, y)


In [None]:
# Load the test-time inputs: the feature rows and the lagged responders.
test_path = os.path.join(data_directory, "test.parquet")
lags_path = os.path.join(data_directory, "lags.parquet")
features = pd.read_parquet(test_path)
responders = pd.read_parquet(lags_path)

# Restrict both frames to a single symbol; copies are taken so downstream
# processing does not touch the full frames.
symbol_id = 0
feature_mask = features['symbol_id'] == symbol_id
responder_mask = responders['symbol_id'] == symbol_id
symbol_features = features[feature_mask].copy()
symbol_responders = responders[responder_mask].copy()

X_test = prepare_prediction_data(symbol_features, symbol_responders)

# Apply the transforms produced by the training cell: select the training
# columns (in training order), scale with the training scaler, then project
# with the training PCA.
X_test_matrix = X_test[X.columns].values
X_test_scaled = scaler.transform(X_test_matrix)  # use training scaler
X_test_reduced = pca.transform(X_test_scaled)  # use training PCA