---
# Model Hyperparameter Optimization with TPOT Regressor
---

In [None]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm       
import copy

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
                     
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import train_test_split

from tpot import TPOTRegressor
from tpot.builtins import StackingEstimator

# Regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# One seed shared by every stochastic step in the notebook.
RANDOM_STATE = 42

# Seed both global generators (NumPy's legacy one and the stdlib's).
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

---
# Read the Dataset

In [2]:
# Read the cleaned airline-delay table, using the first CSV column as the row index.
org_df = pd.read_csv('AirlineDelay_CleanDataset.csv', index_col=0)

# Work on a deep copy so org_df stays exactly as it was read from disk.
df = org_df.copy(deep=True)

# Preview the first five rows.
display(df.head(n=5))

Unnamed: 0,Month,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,TailNum,ActualElapsedTime,Origin,Dest,TaxiIn,TaxiOut,Delay
0,1,4,1829.0,1755,1959.0,1925,WN,N464WN,9.486833,IND,BWI,1.732051,3.162278,34.0
1,1,4,1937.0,1830,2037.0,1940,WN,N763SW,15.491933,IND,LAS,1.732051,2.645751,57.0
2,1,4,1644.0,1510,1845.0,1725,WN,N334SW,11.0,IND,MCO,2.44949,2.828427,80.0
3,1,4,1452.0,1425,1640.0,1625,WN,N286WN,15.099669,IND,PHX,2.645751,2.828427,15.0
4,1,4,1323.0,1255,1526.0,1510,WN,N674AA,11.090537,IND,TPA,2.0,3.0,16.0


---
# Encoding Categorical features

In [3]:
# Replace every object-dtype column (e.g. UniqueCarrier, TailNum, Origin, Dest
# in the preview above) with the codes produced by LabelEncoder.
# A fresh encoder is fitted per column, and df is overwritten column by column.
cat_cols = df.select_dtypes('object').columns.tolist()

for col in cat_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col].values)

---
# Dividing the dataset: 
### X - Features/Attributes and y - Class/Target column

In [4]:
# Whole Dataset
# Disabled alternative to the sampling cell below: uncomment these two lines to
# model on every row. If enabled, skip the sampling cell, which would otherwise
# overwrite X and y with the 10% sample.

# X = df.drop(columns=['Delay'])
# y = df['Delay']

In [5]:
# Testing sample of the Dataset
# Draw a stratified 10% sample of the rows; the rest of the notebook works on
# this sample (X, y) instead of the full table.

A = df.drop(columns=['Delay'])   # every column except the target
b = df['Delay']                  # regression target

# Bucket the target so the sample keeps the same delay distribution: a first
# edge at 15, hourly edges from 60 to 360, and a last edge at 2462
# (presumably the largest delay in the data -- TODO confirm).
bins = [15, 60, 120, 180, 240, 300, 360, 2462]
y_binned = np.digitize(b, bins=bins, right=True)

# train_test_split returns [A_rest, A_sample, b_rest, b_sample]; [1::2] keeps
# only the two 10% "test" parts. The 90% remainder is intentionally NOT bound
# to X_train / y_train: those names belong to the real train/test split further
# down, and binding them here would let a skipped or out-of-order cell train
# the model on the wrong rows without any error.
X, y = train_test_split(
    A, b, test_size=0.10, stratify=y_binned, random_state=RANDOM_STATE
)[1::2]

---
# Train/Test Stratified Split

In [6]:
# Stratification buckets for the sampled target, divided by hour:
# a first edge at 15, hourly edges from 60 to 360, and a final edge at 2462.
bins = [15, *range(60, 361, 60), 2462]

# right=True makes every bucket include its upper edge.
y_binned = np.digitize(y, bins, right=True)

In [7]:
# Hold out 20% of the sampled rows for testing, stratified on the binned target
# so both parts share the same delay distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y_binned,
    random_state=RANDOM_STATE,
)

---
# Train and evaluate the model with TPOT Regressor 

In [8]:
# Search space handed to TPOT through config_dict: the operators and the
# hyperparameter values listed for each of them.
tpot_config = {
    'sklearn.ensemble.RandomForestRegressor': {
        'criterion': ['absolute_error', 'squared_error'],
        'max_depth': [1, 5, None],
        'min_samples_leaf': [1, 5],
        'min_samples_split': [2, 5],
        'n_estimators': [50, 100],
        'random_state': [RANDOM_STATE],
    },
    'sklearn.neighbors.KNeighborsRegressor': {
        'n_neighbors': [2, 5, 10],
    },
    # Empty dict: no hyperparameters are searched for the scaler.
    'sklearn.preprocessing.MinMaxScaler': {},
}

model = TPOTRegressor(
    generations=10,
    population_size=30,
    verbosity=2,
    cv=3,
    scoring='r2',
    n_jobs=-2,
    config_dict=tpot_config,
    # Seed the evolutionary search itself. Without it, reruns can select a
    # different "best pipeline" even though every estimator in the search
    # space is already seeded with RANDOM_STATE.
    random_state=RANDOM_STATE,
)

# Run the search on the training split; the target is passed as a 1-D array.
model.fit(X_train, y_train.to_numpy().flatten())

Optimization Progress:   0%|          | 0/330 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8290184529474457

Generation 2 - Current best internal CV score: 0.8351573442708856

Generation 3 - Current best internal CV score: 0.9562901212764746

Generation 4 - Current best internal CV score: 0.9562901212764746

Generation 5 - Current best internal CV score: 0.9562901212764746

Generation 6 - Current best internal CV score: 0.9562901212764746

Generation 7 - Current best internal CV score: 0.9562901212764746

Generation 8 - Current best internal CV score: 0.9562901212764746

Generation 9 - Current best internal CV score: 0.9562901212764746

Generation 10 - Current best internal CV score: 0.9562901212764746

Best pipeline: RandomForestRegressor(input_matrix, criterion=squared_error, max_depth=None, min_samples_leaf=5, min_samples_split=5, n_estimators=50, random_state=42)


In [10]:
# Evaluate the fitted TPOT model on the held-out test split.
# NOTE(review): score() is expected to use the scorer given at construction
# (scoring='r2') -- confirm against the TPOT API docs.
test_r2 = model.score(X_test, y_test.to_numpy().flatten())
print('R2 Score: ', test_r2)

R2 Score:  0.9687702996506035
