In [None]:
import setup_path
import pandas as pd
import os
from src.data_engineering import create_timeseries_for_symbol, prepare_regression_data, clean_data, prepare_prediction_data
from src.pca import reduce_dimensions_pca
from src.models import train_and_evaluate_multiple_models

# Input locations are resolved relative to the current working directory.
script_directory = os.getcwd()
data_directory = os.path.join(script_directory, os.pardir, "data")

# Directory holding the per-symbol CSV files (relative path).
output_dir = "symbol_data"

# Load the two parquet inputs from the data directory, in this order:
# test.parquet -> features_test, lags.parquet -> responders_test.
features_test, responders_test = (
    pd.read_parquet(os.path.join(data_directory, parquet_name))
    for parquet_name in ("test.parquet", "lags.parquet")
)

# Fitted preprocessing for every symbol, keyed by symbol_id. Each entry keeps
# the training column order together with the scaler and PCA objects returned
# for that symbol. Without this, every iteration overwrites `X`, `scaler` and
# `pca`, so only the last symbol's transform would remain available for a
# later prediction step.
fitted_transforms = {}

for symbol_id in range(39):

    print(f'\nProcessing symbol {symbol_id} ...')
    # One CSV per symbol: <output_dir>/symbol_<id>.csv
    csv_path = os.path.join(output_dir, f"symbol_{symbol_id}.csv")
    df = pd.read_csv(csv_path)

    # Build the raw series for this symbol, then clean features and
    # responders together.
    features, responders, target = create_timeseries_for_symbol(df, symbol_id)
    clean_features, clean_responders = clean_data(features, responders)

    X, y = prepare_regression_data(clean_features, clean_responders, target=target)

    # Reduce to a fixed number of components (25); the helper also returns the
    # PCA and scaler objects it used.
    X_reduced, pca, scaler = reduce_dimensions_pca(X.values, n_components=25)

    # Remember this symbol's preprocessing so it is not lost when the next
    # iteration rebinds the loop variables.
    fitted_transforms[symbol_id] = {
        "columns": list(X.columns),
        "scaler": scaler,
        "pca": pca,
    }

    train_and_evaluate_multiple_models(X_reduced, y)


In [None]:
# Build prediction inputs from the held-out frames loaded earlier in the
# notebook (features_test / responders_test). The training `features` and
# `responders` variables are per-symbol frames left over from the last loop
# iteration, so filtering those would re-use training data, not test data.
# NOTE(review): assumes both parquet frames carry a 'symbol_id' column - confirm.
#
# `symbol_id`, `X`, `scaler` and `pca` are whatever the training loop assigned
# last, so this cell predicts for that symbol only.
symbol_features = features_test[features_test['symbol_id'] == symbol_id].copy()
symbol_responders = responders_test[responders_test['symbol_id'] == symbol_id].copy()

X_test = prepare_prediction_data(symbol_features, symbol_responders)

# Apply the transforms fitted on the training data; never refit them here.
# Selecting X.columns puts the test columns in the training column order.
X_test_scaled = scaler.transform(X_test[X.columns].values)  # use training scaler
X_test_reduced = pca.transform(X_test_scaled)  # use training PCA