## Imports

In [16]:
# import LogisitcRegression and RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.metrics import roc_auc_score

# K Fold 
from sklearn.model_selection import KFold

# import gridsearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# import MakeClassification for creating synthetic data
from sklearn.datasets import make_classification

# Topics

## Test-Train Test Vs Cross-Validation Test 

In [10]:
# first : train_test_split
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

#roc score is a metric for binary classifier , from 0 to 1 
# it plots the rate of true positive to false positive along a variable threshold
# higher rocauc score means accurate performance
rocauc = roc_auc_score(y_test, y_pred)
print(f'roc_auc_score : {rocauc}')

roc_auc_score : 0.8868957893679027


In [12]:
# second : cross_val_score
clf = LogisticRegression()
scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')
print(f'cross_val_score : {scores.mean()}')

cross_val_score : 0.9287000000000001


### Another problem with test-train 

In [13]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
rocauc = roc_auc_score(y_test, y_pred)
print(f'roc_auc_score : {rocauc}')

roc_auc_score : 0.9109637222389709


In [14]:
rf_clf = RandomForestClassifier(n_estimators=500)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
rocauc = roc_auc_score(y_test, y_pred)
print(f'roc_auc_score : {rocauc}')

roc_auc_score : 0.9062908250427092


In [15]:
rf_clf = RandomForestClassifier(n_estimators=1000)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
rocauc = roc_auc_score(y_test, y_pred)
print(f'roc_auc_score : {rocauc}')

roc_auc_score : 0.9062908250427092


The problem with train test is : 
1. The Test data is not representative of the data we will see in the future.
2. The score is biased , it is not robust as we saw the difference in accuracy 
3. we hypertune params on the cost of the train set , leakage

Another suggestion that I didn't try here is : 
using the validation test as hyper-tuning and then testing with train-test split, this is a good idea but it is not the best. leading us back to problem no.1 and 2 

## Introducing the Nested Cross Validaiton (Robustness)

![nested_cross_validation](./nested_cross_val.png)

In [18]:
# Hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
}

# Inner CV for hyperparameter tuning
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)

# Outer CV for model evaluation
outer_cv = KFold(n_splits=5)

# Nested CV
nested_score = cross_val_score(grid_search, X, y, cv=outer_cv)

print("Nested CV Score: ", nested_score.mean())

Nested CV Score:  0.897
