---
# Model Hyperparameter Optimization with Random Search
---

In [None]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm       
import copy

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
                     
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import train_test_split

# Regression models
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor

# Regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

RANDOM_STATE=42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

---
# Read the Dataset

In [2]:
org_df = pd.read_csv('AirlineDelay_CleanDataset.csv', index_col=0) 
df = org_df.copy()

display(df)

Unnamed: 0,Month,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,TailNum,ActualElapsedTime,Origin,Dest,TaxiIn,TaxiOut,Delay
0,1,4,1829.0,1755,1959.0,1925,WN,N464WN,9.486833,IND,BWI,1.732051,3.162278,34.0
1,1,4,1937.0,1830,2037.0,1940,WN,N763SW,15.491933,IND,LAS,1.732051,2.645751,57.0
2,1,4,1644.0,1510,1845.0,1725,WN,N334SW,11.000000,IND,MCO,2.449490,2.828427,80.0
3,1,4,1452.0,1425,1640.0,1625,WN,N286WN,15.099669,IND,PHX,2.645751,2.828427,15.0
4,1,4,1323.0,1255,1526.0,1510,WN,N674AA,11.090537,IND,TPA,2.000000,3.000000,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1247479,12,6,921.0,830,1112.0,1008,DL,N907DE,10.535654,ATL,PBI,2.828427,4.582576,64.0
1247480,12,6,1552.0,1520,1735.0,1718,DL,N905DE,6.557439,HSV,ATL,3.000000,2.645751,17.0
1247481,12,6,1250.0,1220,1617.0,1552,DL,N938DL,12.124356,MSP,ATL,3.000000,4.242641,25.0
1247482,12,6,657.0,600,904.0,749,DL,N3743H,11.269428,RIC,ATL,3.872983,5.830952,75.0


In [3]:
cat_cols = list(df.select_dtypes('object').columns)

for c in cat_cols:
    df[c] = LabelEncoder().fit_transform(df[c].values)

---
# Dividing the dataset: 
### X-Class/Target column and y-Features/Attributes 

In [4]:
# Whole Dataset

X = df.drop(columns=['Delay'])                            
y = df['Delay']

In [5]:
# Testing sample of the Dataset

# A = df.drop(columns=['Delay'])                            
# b = df['Delay']

# # Dividing the bins by hour
# #original_bins = [15, 60, 120, 180, 240, 300, 360, 2462] 

# # # Because original class has log transformation, bins should be transformed as well
# #bins = [np.log(x) for x in original_bins]

# bins = [15, 60, 120, 180, 240, 300, 360, 2462] 
# y_binned = np.digitize(b, bins=bins, right=True)

# X_train, X, y_train, y = train_test_split(A, b, test_size=0.10, stratify=y_binned, random_state=RANDOM_STATE)

---
# Train and evaluate the model with Randomized Search 

In [6]:
def model_loop(X, y, reg, grid_params):
        
    bins = [15, 60, 120, 180, 240, 300, 360, 2462]
    y_binned = np.digitize(y, bins=bins, right=True)

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE).split(X, y_binned)
    
    cat_cols = ['UniqueCarrier', 'TailNum', 'Origin', 'Dest']
    num_cols = list(set(X.columns) - set(cat_cols))
    
    
    col_trans = ColumnTransformer(
        [
            ('mms', MinMaxScaler(), num_cols)
            #('ord', OrdinalEncoder(), cat_cols) # THERE IS A PROBLEM WITH THIS LINE AND THE CODE IS NOT WORKING
        ],
        remainder='drop'
    )
    
    
    pipeline = Pipeline(
        [
            ('col_trans', col_trans),
            ('reg', reg)
        ]
    )
    
    model = RandomizedSearchCV(estimator=pipeline,
                         param_distributions = grid_params,
                         cv = skf,
                         scoring='r2',
                         n_iter=5,
                         refit=True,
                         n_jobs=-1,
                         random_state=RANDOM_STATE,
                         verbose=2) 
    model.fit(X,y)
    return model

In [7]:
param_dicts = [
    {
        'reg': RandomForestRegressor(),
        'name': "RandomForest",
        'grid_dict': {
            'reg__criterion': ['absolute_error', 'squared_error'],
            'reg__max_depth': [1, 5, None],
            'reg__min_samples_leaf': [1,5],
            'reg__min_samples_split': [2,5,],
            'reg__n_estimators': [50, 100],
            'reg__random_state': [RANDOM_STATE]
        }
    },
    {
        'reg': Lasso(),
        'name': "Lasso",
        'grid_dict': {
            'reg__alpha': [0.1,1,10],
        }

    },
    {
        'reg': ElasticNet(),
        'name': "ElasticNet",
        'grid_dict': {
            'reg__alpha': [0.1,1,10],
        }
    }
]

In [8]:
best_models=[]
best_model_names=[]

for el in tqdm.tqdm(param_dicts):
    print(el['name'])
    best_models.append(model_loop(X, y, el['reg'], el['grid_dict']))
    best_model_names.append(el['name'])


  0%|          | 0/3 [00:00<?, ?it/s]

RandomForest
Fitting 3 folds for each of 5 candidates, totalling 15 fits



 33%|███▎      | 1/3 [32:28<1:04:56, 1948.44s/it]

Lasso




Fitting 3 folds for each of 3 candidates, totalling 9 fits



 67%|██████▋   | 2/3 [32:34<13:26, 806.01s/it]   

ElasticNet




Fitting 3 folds for each of 3 candidates, totalling 9 fits


100%|██████████| 3/3 [32:38<00:00, 652.93s/it]


In [9]:
for el, name in zip(best_models, best_model_names):
    print('Model Name:     ', name)
    print('R2 Score:       ', el.best_score_)
    print('Best parameters:', el.best_params_)
    print('-----------------------------------------------------\n')

Model Name:      RandomForest
R2 Score:        0.9886236466466879
Best parameters: {'reg__random_state': 42, 'reg__n_estimators': 100, 'reg__min_samples_split': 5, 'reg__min_samples_leaf': 1, 'reg__max_depth': None, 'reg__criterion': 'squared_error'}
-----------------------------------------------------

Model Name:      Lasso
R2 Score:        0.08105733334187619
Best parameters: {'reg__alpha': 0.1}
-----------------------------------------------------

Model Name:      ElasticNet
R2 Score:        0.022675756532522223
Best parameters: {'reg__alpha': 0.1}
-----------------------------------------------------

