In [6]:
# --- Imports and global notebook setup --------------------------------------
import sys
import os
# Append "../../scripts" (resolved against the current working directory) to
# the import path. NOTE(review): `model_utils` and `preprocessing`, imported
# further down, are presumably loaded from that directory — confirm.
sys.path.append(os.path.abspath("../../scripts"))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# NOTE(review): several names imported below (e.g. GridSearchCV,
# mutual_info_classif, the individual metric functions,
# variance_inflation_factor, optimize_threshold, feature_engineering,
# analyze_features) are not referenced in the cells visible here; they may be
# used further down the notebook.
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, make_scorer, precision_score, recall_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from model_utils import rolling_test, optimize_threshold
from preprocessing import feature_engineering, analyze_features
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
# Use the "ggplot" matplotlib style for every figure drawn after this point.
plt.style.use("ggplot")
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any convergence or
# data-conversion warnings the estimators below may emit — consider narrowing
# it to specific categories.
warnings.filterwarnings('ignore')


In [7]:
# Load the feature table (X) and the target table (y). Both files are read
# with identical options: the 'date' column becomes the index and is parsed
# as datetimes. X is read first, then y.
X, y = (
    pd.read_csv(csv_path, index_col='date', parse_dates=True)
    for csv_path in ('../../data/X.csv', '../../data/y.csv')
)

In [8]:
# Hold out the last `val_size` rows as a validation window; every earlier row
# is training data. Slicing is purely positional, so X and y are cut at the
# same row offsets.
# NOTE(review): this assumes X and y are row-aligned and sorted by date —
# confirm against how the CSV files are produced.
val_size = 100
X_train, X_val = X.iloc[:-val_size], X.iloc[-val_size:]
y_train, y_val = y.iloc[:-val_size], y.iloc[-val_size:]

In [9]:
# Gradient-boosting pipeline:
#   1. "power"  - Yeo-Johnson power transform of the features
#   2. "scaler" - quantile mapping of each feature onto a normal distribution
#   3. "gb"     - gradient-boosting classifier (seeded for reproducibility)
# The final step is named "gb"; the stacking cell looks it up by that name.
pipeline_gb = Pipeline(steps=[
    ("power", PowerTransformer(method="yeo-johnson")),
    ("scaler", QuantileTransformer(random_state=42, output_distribution="normal")),
    ("gb", GradientBoostingClassifier(random_state=42)),
])

In [10]:
# Neural-network pipeline: same two preprocessing steps as the
# gradient-boosting pipeline, followed by an MLP classifier. The final step
# is named "model"; the stacking cell looks it up by that name.
pipeline_mlp = Pipeline(steps=[
    ("power", PowerTransformer(method="yeo-johnson")),
    ("scaler", QuantileTransformer(random_state=42, output_distribution="normal")),
    ("model", MLPClassifier(
        # Architecture: two hidden layers.
        hidden_layer_sizes=(64, 32),
        # Optimisation settings.
        batch_size=16,
        learning_rate_init=0.0005,
        max_iter=500,
        # Regularisation: L2 penalty plus early stopping after 20 epochs
        # without improvement.
        alpha=0.005,
        early_stopping=True,
        n_iter_no_change=20,
        # Reproducibility.
        random_state=42,
    )),
])

In [11]:
# One time-ordered cross-validation splitter (4 splits) per model. They are
# two separate objects with identical settings.
tscv_gb, tscv_mlp = TimeSeriesSplit(n_splits=4), TimeSeriesSplit(n_splits=4)

In [12]:
# Per-model aliases of the shared training split. These are the same objects
# as X_train / y_train, not copies.
X_train_gb = X_train_mlp = X_train
y_train_gb = y_train_mlp = y_train

# Metrics evaluated on every fold. Each key is reported back by
# cross_validate under "test_<key>".
scoring = {metric: metric for metric in ("accuracy", "f1", "roc_auc")}

# Cross-validate each pipeline on the training split with its own splitter.
gb_scores = cross_validate(
    pipeline_gb, X_train_gb, y_train_gb,
    scoring=scoring, cv=tscv_gb,
)
mlp_scores = cross_validate(
    pipeline_mlp, X_train_mlp, y_train_mlp,
    scoring=scoring, cv=tscv_mlp,
)

In [13]:
# Print the fold-averaged cross-validation metrics for both models. The
# second header starts with a newline so a blank line separates the sections.
for header, cv_result in (
    ("Gradient Boosting Classifier Scores:", gb_scores),
    ("\nMLP Classifier Scores:", mlp_scores),
):
    print(header)
    for label, key in (
        ("Accuracy", "test_accuracy"),
        ("F1 Score", "test_f1"),
        ("ROC AUC", "test_roc_auc"),
    ):
        print(f"{label}: {np.mean(cv_result[key]):.4f}")

Gradient Boosting Classifier Scores:
Accuracy: 0.5066
F1 Score: 0.5196
ROC AUC: 0.5109

MLP Classifier Scores:
Accuracy: 0.5362
F1 Score: 0.6275
ROC AUC: 0.5629


In [14]:
# Run the project's rolling test on each single-model pipeline, gradient
# boosting first and then the MLP. `rolling_test` comes from model_utils and
# its implementation is not visible here; the captured output shows it prints
# mean accuracy, F1 and ROC AUC at a 0.5 threshold.
# NOTE(review): it is given the full X / y, not the X_train / y_train split —
# confirm that is intended.
results_gb, results_mlp = [
    rolling_test(pipe, X, y) for pipe in (pipeline_gb, pipeline_mlp)
]

>>> Rolling Test (threshold 0.5):
Mean Accuracy: 0.5825
Mean F1 Score: 0.6001
Mean ROC AUC Score: 0.5929
>>> Rolling Test (threshold 0.5):
Mean Accuracy: 0.5475
Mean F1 Score: 0.6128
Mean ROC AUC Score: 0.6006


In [15]:
# Stacked ensemble: the same two preprocessing steps as the single-model
# pipelines, then a StackingClassifier. Its base learners are the final
# estimators of pipeline_gb ("gb") and pipeline_mlp ("model"), combined by a
# logistic-regression meta-learner.
# NOTE(review): `cv=4` is a plain integer fold count, not the TimeSeriesSplit
# used for cross-validation earlier — confirm this is acceptable for
# date-ordered data.
pipeline_stack = Pipeline(steps=[
    ("power", PowerTransformer(method="yeo-johnson")),
    ("scaler", QuantileTransformer(random_state=42, output_distribution="normal")),
    ("stack", StackingClassifier(
        estimators=[
            ("gb", pipeline_gb.named_steps["gb"]),
            ("mlp", pipeline_mlp.named_steps["model"]),
        ],
        final_estimator=LogisticRegression(random_state=42),
        cv=4,
        # The meta-learner sees only the base learners' predictions, not the
        # transformed features.
        passthrough=False,
    )),
])

In [16]:
# Run the project's rolling test on the stacked-ensemble pipeline.
# `rolling_test` comes from model_utils and its implementation is not visible
# here; the captured output shows it prints mean accuracy, F1 and ROC AUC at
# a 0.5 threshold.
# NOTE(review): it is given the full X / y, not the X_train / y_train split —
# confirm that is intended.
results_stack = rolling_test(pipeline_stack, X, y)

>>> Rolling Test (threshold 0.5):
Mean Accuracy: 0.5025
Mean F1 Score: 0.6443
Mean ROC AUC Score: 0.4866
