---
# Model Hyperparameter Optimization with Random Search
# Version IV
### * Delay columns subtraction, without target class transformation
---

In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm       
import copy

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
                     
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline, make_union

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import train_test_split

from tpot import TPOTRegressor
from tpot.builtins import StackingEstimator

# Regression models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, ElasticNet, Ridge, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

# Regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Single seed shared by every stochastic step in the notebook.
RANDOM_STATE = 42

# Seed both global generators so a fresh "Restart & Run All" starts from the
# same random state.
random.seed(a=RANDOM_STATE)         # Python standard-library RNG
np.random.seed(seed=RANDOM_STATE)   # NumPy legacy global RNG

---
# Read the Dataset

In [2]:
# Path of the cleaned dataset, relative to the notebook's working directory.
DATASET_PATH = 'AirlineDelay_CleanDataset_DelaySubstraction.csv'

# The first CSV column is used as the row index.
org_df = pd.read_csv(DATASET_PATH, index_col=0)

# Work on a copy so the frame as loaded stays available in `org_df`.
df = org_df.copy()

display(df)

Unnamed: 0,Month,DayOfWeek,UniqueCarrier,TailNum,ActualElapsedTime,Origin,Dest,TaxiIn,TaxiOut,Delay,DepTDelay,ArrTDelay
0,1,4,WN,N464WN,9.486833,IND,BWI,1.732051,3.162278,34.0,74.0,34.0
1,1,4,WN,N763SW,15.491933,IND,LAS,1.732051,2.645751,57.0,107.0,97.0
2,1,4,WN,N334SW,11.000000,IND,MCO,2.449490,2.828427,80.0,134.0,120.0
3,1,4,WN,N286WN,15.099669,IND,PHX,2.645751,2.828427,15.0,27.0,15.0
4,1,4,WN,N674AA,11.090537,IND,TPA,2.000000,3.000000,16.0,68.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1247479,12,6,DL,N907DE,10.535654,ATL,PBI,2.828427,4.582576,64.0,91.0,104.0
1247480,12,6,DL,N905DE,6.557439,HSV,ATL,3.000000,2.645751,17.0,32.0,17.0
1247481,12,6,DL,N938DL,12.124356,MSP,ATL,3.000000,4.242641,25.0,30.0,65.0
1247482,12,6,DL,N3743H,11.269428,RIC,ATL,3.872983,5.830952,75.0,57.0,155.0


In [3]:
# Every object-dtype column is treated as categorical and replaced, in place,
# by integer codes. A fresh encoder is fitted per column and then discarded.
cat_cols = df.select_dtypes('object').columns.tolist()

for col_name in cat_cols:
    encoder = LabelEncoder()
    df[col_name] = encoder.fit_transform(df[col_name].values)

---
# Dividing the dataset: 
### X: features/attributes, y: class/target column

In [4]:
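# Whole dataset (kept for reference, currently disabled).
# NOTE: the following cell assigns X and y from a 10% sample, so enabling the
# two lines below only has an effect if that cell is not run afterwards.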

# X = df.drop(columns=['Delay'])                            
# y = df['Delay']

In [5]:
# Working sample of the dataset: the hyperparameter search runs on a 10%
# stratified subset (X, y) instead of on every row.

# A holds the feature columns, b the 'Delay' target.
A = df.drop('Delay', axis=1)
b = df.loc[:, 'Delay']

# Right-inclusive bin edges (right=True) that turn the continuous target into
# strata, so the sample keeps the same mix of Delay ranges as the full data.
bins = [15, 60, 120, 180, 240, 300, 360, 2462]
y_binned = np.digitize(b, bins, right=True)

# The "test" outputs (test_size=0.10) are the ones bound to X and y; the
# remaining 90% is returned as X_train / y_train.
X_train, X, y_train, y = train_test_split(
    A,
    b,
    test_size=0.10,
    stratify=y_binned,
    random_state=RANDOM_STATE,
)

---
# Train and evaluate the model with Randomized Search 

In [6]:
def model_loop(X, y, reg, grid_params, bins=None, n_splits=3, n_iter=5):
    """Tune ``reg`` with a randomized hyperparameter search and return the fitted search.

    The numeric columns of ``X`` are min-max scaled inside a pipeline, the
    categorical columns listed in ``cat_cols`` are dropped
    (``remainder='drop'``), and every candidate is scored with R^2 under
    stratified K-fold cross-validation. Because ``y`` is continuous, the
    strata are obtained by binning it with ``np.digitize``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; columns are selected by name.
    y : array-like
        Continuous target aligned with the rows of ``X``.
    reg : estimator
        Regressor used as the ``'reg'`` step of the pipeline.
    grid_params : dict
        Parameter distributions passed to ``RandomizedSearchCV``; keys address
        pipeline steps (for example ``'reg__alpha'``).
    bins : sequence of numbers, optional
        Increasing, right-inclusive bin edges used to stratify ``y``.
        Defaults to ``[15, 60, 120, 180, 240, 300, 360, 2462]``.
    n_splits : int, default=3
        Number of stratified folds.
    n_iter : int, default=5
        Number of parameter settings requested from the randomized search.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object (``refit=True``, so it also holds the best
        pipeline refitted on all of ``X`` and ``y``).
    """
    if bins is None:
        bins = [15, 60, 120, 180, 240, 300, 360, 2462]

    # Discretize the continuous target so the folds can be stratified on it.
    y_binned = np.digitize(y, bins=bins, right=True)

    # Materialize the (train, test) index pairs: a list can be iterated more
    # than once, whereas the generator returned by .split() is single-use.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    cv_splits = list(splitter.split(X, y_binned))

    cat_cols = ['UniqueCarrier', 'TailNum', 'Origin', 'Dest']
    # Keep the numeric columns in the order they appear in X. A set difference
    # would yield an order that depends on string hashing, which can change
    # between kernel sessions and makes order-sensitive estimators
    # non-reproducible even with a fixed random_state.
    num_cols = [col for col in X.columns if col not in cat_cols]

    col_trans = ColumnTransformer(
        [
            ('mms', MinMaxScaler(), num_cols)
            # NOTE(review): ('ord', OrdinalEncoder(), cat_cols) was disabled here
            # because the search did not work with it. Presumably a validation
            # fold contains categories unseen in its training fold;
            # OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            # may avoid that -- TODO confirm before re-enabling.
        ],
        remainder='drop'  # categorical columns are therefore not used by the model
    )

    pipeline = Pipeline(
        [
            ('col_trans', col_trans),
            ('reg', reg)
        ]
    )

    model = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=grid_params,
        cv=cv_splits,
        scoring='r2',
        n_iter=n_iter,
        refit=True,
        n_jobs=-2,  # all CPUs but one
        random_state=RANDOM_STATE,
        verbose=2,
    )
    model.fit(X, y)
    return model

In [7]:
# One entry per candidate model:
#   reg       - unfitted estimator placed in the pipeline's 'reg' step
#   name      - label used when printing progress and results
#   grid_dict - search space; the 'reg__' prefix routes each parameter to the
#               'reg' pipeline step
param_dicts = [
    dict(
        reg=RandomForestRegressor(),
        name="RandomForest",
        grid_dict={
            'reg__criterion': ['absolute_error', 'squared_error'],
            'reg__max_depth': [1, 5, None],
            'reg__min_samples_leaf': [1, 5],
            'reg__min_samples_split': [2, 5],
            'reg__n_estimators': [50, 100],
            'reg__random_state': [RANDOM_STATE],
        },
    ),
    dict(
        reg=Lasso(),
        name="Lasso",
        grid_dict={'reg__alpha': [0.1, 1, 10]},
    ),
    dict(
        reg=ElasticNet(),
        name="ElasticNet",
        grid_dict={'reg__alpha': [0.1, 1, 10]},
    ),
    dict(
        reg=KNeighborsRegressor(),
        name="KNN",
        grid_dict={'reg__n_neighbors': [2, 5, 10]},
    ),
]

In [8]:
# Run one randomized search per configured model. The two lists stay
# index-aligned: best_models[i] is the fitted search for best_model_names[i].
best_models, best_model_names = [], []

for spec in tqdm.tqdm(param_dicts):
    model_name = spec['name']
    print(model_name)
    fitted_search = model_loop(X, y, spec['reg'], spec['grid_dict'])
    best_models.append(fitted_search)
    best_model_names.append(model_name)

  0%|          | 0/4 [00:00<?, ?it/s]

RandomForest
Fitting 3 folds for each of 5 candidates, totalling 15 fits




Lasso
Fitting 3 folds for each of 3 candidates, totalling 9 fits




ElasticNet
Fitting 3 folds for each of 3 candidates, totalling 9 fits




KNN
Fitting 3 folds for each of 3 candidates, totalling 9 fits


100%|██████████| 4/4 [01:50<00:00, 27.68s/it]


In [9]:
# Report, for each fitted search, the best mean cross-validated R2 score and
# the parameter combination that achieved it.
for fitted_search, model_name in zip(best_models, best_model_names):
    best_score = fitted_search.best_score_
    best_params = fitted_search.best_params_
    print('Model Name:     ', model_name)
    print('R2 Score:       ', best_score)
    print('Best parameters:', best_params)
    print('-----------------------------------------------------\n')

Model Name:      RandomForest
R2 Score:        0.9868333540014914
Best parameters: {'reg__random_state': 42, 'reg__n_estimators': 100, 'reg__min_samples_split': 5, 'reg__min_samples_leaf': 1, 'reg__max_depth': None, 'reg__criterion': 'squared_error'}
-----------------------------------------------------

Model Name:      Lasso
R2 Score:        0.07073459580265747
Best parameters: {'reg__alpha': 0.1}
-----------------------------------------------------

Model Name:      ElasticNet
R2 Score:        0.016488518044770695
Best parameters: {'reg__alpha': 0.1}
-----------------------------------------------------

Model Name:      KNN
R2 Score:        0.7699049458771846
Best parameters: {'reg__n_neighbors': 2}
-----------------------------------------------------

