## Imports

In [20]:
import numpy as np 

# import LogisitcRegression and RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit , KFold , train_test_split , cross_val_score
from sklearn.metrics import roc_auc_score

# import gridsearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# import MakeClassification for creating synthetic data
from sklearn.datasets import make_classification

# Topics

## Test-Train Test Vs Cross-Validation Test 

In [21]:
# first : train_test_split
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

#roc score is a metric for binary classifier , from 0 to 1 
# it plots the rate of true positive to false positive along a variable threshold
# higher rocauc score means accurate performance
rocauc = roc_auc_score(y_test, y_pred)
print(f'roc_auc_score : {rocauc}')

roc_auc_score : 0.8849999999999999


In [23]:
# second : cross_val_score
clf = LogisticRegression()
scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
print(f'cross_val_score : {scores.mean()}')

cross_val_score : 0.9287000000000001


### Another problem with test-train 

In [24]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
rocauc = roc_auc_score(y_test, y_pred)
print(f'roc_auc_score : {rocauc}')

roc_auc_score : 0.915


In [25]:
rf_clf = RandomForestClassifier(n_estimators=500)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
rocauc = roc_auc_score(y_test, y_pred)
print(f'roc_auc_score : {rocauc}')

roc_auc_score : 0.9199999999999998


In [26]:
rf_clf = RandomForestClassifier(n_estimators=1000)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
rocauc = roc_auc_score(y_test, y_pred)
print(f'roc_auc_score : {rocauc}')

roc_auc_score : 0.9199999999999998


The problem with train test is : 
1. The Test data is not representative of the data we will see in the future.
2. The score is biased , it is not robust as we saw the difference in accuracy 
3. we hypertune params on the cost of the train set , leakage

Another suggestion that I didn't try here is : 
using the validation test as hyper-tuning and then testing with train-test split, this is a good idea but it is not the best. leading us back to problem no.1 and 2 

## Introducing the Nested Cross Validaiton (Robustness)

![nested_cross_validation](./images/nested_cross_val.png)

In [27]:
# Hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
}

# Inner CV for hyperparameter tuning
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)

# Outer CV for model evaluation
outer_cv = KFold(n_splits=5)

# Nested CV
nested_score = cross_val_score(grid_search, X, y, cv=outer_cv)

print("Nested CV Score: ", nested_score.mean())

Nested CV Score:  0.899


## Second Mistake : Don't Train Test Split with Time Series Data

A defining feature of time series data is that they're autocorrelated — i.e., the time series is linearly related to a lagged version of itself. (This is a fancy way of saying that observations made close together tend to be similar.)

This is a problem because, if your training data set contains records which occur later than your testing data set, you're allowing your model to "peak" at useful information which wouldn't be available in production. We don't want our model to learn using information from the future; we want it to learn the trend using information from the past.

## Bayesian Optimization Search 
!pip install scikit-optimize

In [1]:
from skopt import BayesSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
 
# Load data
data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=25)
 
# Initialize model
model = DecisionTreeClassifier()
 
# Define hyperparameter space for Bayesian Optimization
param_space = {
 'criterion': ['gini', 'entropy'],
 'max_depth': [None] + list(range(10, 31)),
 'min_samples_split': (2, 10),
 'min_samples_leaf': (1, 10)
}


# Bayesian Optimization
opt = BayesSearchCV(model, param_space, n_iter=32, cv=5, scoring='accuracy')
opt.fit(X_train, y_train)
best_params_bayes = opt.best_params_
best_score_bayes = opt.best_score_
 
print(f'Best Parameters (Bayesian Optimization): {best_params_bayes}')
print(f'Best Cross-Validation Score (Bayesian Optimization): {best_score_bayes:.2f}')


best_model = DecisionTreeClassifier(**best_params_bayes)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred)
 
print(f'Final Model Accuracy: {final_accuracy:.2f}')

Best Parameters (Bayesian Optimization): OrderedDict({'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2})
Best Cross-Validation Score (Bayesian Optimization): 0.97
Final Model Accuracy: 0.97
