In [66]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)
import seaborn as sns
from scipy import stats
from scipy.stats import norm

import xgboost as xgb

from sklearn import preprocessing
from sklearn import metrics, model_selection
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
# import dataloader

from sklearn.metrics import precision_recall_fscore_support,classification_report,accuracy_score,roc_auc_score
import random

from sklearn.model_selection import train_test_split,KFold

In [25]:
def shuffle_data(X_, y_):
    N   = X_.shape[0]
    N_f = X_.shape[1]
    data = []

    for i in range(N):
        feat = X_[i, :]
        label = float(y_[i])
        data.append((feat, label))

    random.shuffle(data)

    #rebuild X and y
    X =  np.zeros((N, N_f))
    y = -np.ones((N,)) #negative value to audit ingress code

    for i in range(N):
        X_1, y_1 = data[i]
        X[i, :] = X_1
        y[i]    = y_1

    return X, y

In [27]:
train = pd.read_csv('flight_delays_train.csv')
test = pd.read_csv('flight_delays_test.csv')

In [28]:
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [29]:
X = train[['Distance', 'DepTime']]
Y = train['dep_delayed_15min'].map({'Y': 1, 'N': 0})
# Y = pd.DataFrame(Y,columns=['label'])

In [30]:
X_Train, X_valid, y_Train, y_valid = train_test_split(X, Y, test_size=0.20, random_state=42)

In [56]:
xgb_model = xgb.XGBClassifier(seed=17)

xgb_model.fit(X_Train, y_Train)
xgb_valid_pred = xgb_model.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, xgb_valid_pred)

0.7062682474290187

In [57]:
xgb_model = xgb.XGBClassifier(max_depth = 8, seed=17)

xgb_model.fit(X_Train, y_Train)
xgb_valid_pred = xgb_model.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, xgb_valid_pred)

0.712943503123907

In [63]:
xgb_model = xgb.XGBClassifier(max_depth =20, seed=17, n_estimators = 500)

xgb_model.fit(X_Train, y_Train)
xgb_valid_pred = xgb_model.predict_proba(X_valid)[:, 1]

roc_auc_score(y_valid, xgb_valid_pred)

0.6912267823258795

In [67]:
# Parameter Tuning
model = xgb.XGBClassifier()
param_dist = {"max_depth": [3, 6, 9, 12], # default 6
              "min_child_weight" : [1], # default 1
              "n_estimators": [100, 150], # default 100
              "learning_rate": [0.05, 0.1, 0.15], # default 0.1
             } 

grid_search = model_selection.GridSearchCV(model, param_grid=param_dist, cv = 3, verbose=10, n_jobs=-1, scoring='roc_auc')

In [69]:
grid_search.fit(X_Train, y_Train)

print("{}".format(grid_search.best_estimator_))
print("{}".format(grid_search.best_score_))
print("{}".format(grid_search.best_params_))
# print("{}".format(grid_search.scorer_ ))
# print("{}".format(grid_search.cv_results_))

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] n_estimators=100, learning_rate=0.05, max_depth=3, min_child_weight=1 
[CV] n_estimators=100, learning_rate=0.05, max_depth=3, min_child_weight=1 
[CV] n_estimators=100, learning_rate=0.05, max_depth=3, min_child_weight=1 
[CV] n_estimators=150, learning_rate=0.05, max_depth=3, min_child_weight=1 
[CV] n_estimators=150, learning_rate=0.05, max_depth=3, min_child_weight=1 
[CV] n_estimators=150, learning_rate=0.05, max_depth=3, min_child_weight=1 
[CV] n_estimators=100, learning_rate=0.05, max_depth=6, min_child_weight=1 
[CV] n_estimators=100, learning_rate=0.05, max_depth=6, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.05, max_depth=3, min_child_weight=1, score=0.693446207056, total=   1.3s
[CV] n_estimators=100, learning_rate=0.05, max_depth=6, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.05, max_depth=3, min_child_weight=1, score=0.692188394069, total=   1.3s
[CV] n_estimators=150, learni

[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.6s


[CV]  n_estimators=150, learning_rate=0.05, max_depth=3, min_child_weight=1, score=0.693371555685, total=   1.9s
[CV] n_estimators=150, learning_rate=0.05, max_depth=6, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.05, max_depth=3, min_child_weight=1, score=0.694168797718, total=   2.1s
[CV] n_estimators=100, learning_rate=0.05, max_depth=9, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.05, max_depth=3, min_child_weight=1, score=0.705175977593, total=   2.2s
[CV] n_estimators=100, learning_rate=0.05, max_depth=9, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.05, max_depth=6, min_child_weight=1, score=0.695366552664, total=   2.5s
[CV] n_estimators=100, learning_rate=0.05, max_depth=9, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.05, max_depth=6, min_child_weight=1, score=0.693745154036, total=   2.6s
[CV] n_estimators=150, learning_rate=0.05, max_depth=9, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.05, max_depth=6

[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.7s


[CV]  n_estimators=150, learning_rate=0.05, max_depth=6, min_child_weight=1, score=0.694706747469, total=   3.8s
[CV] n_estimators=150, learning_rate=0.05, max_depth=9, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.05, max_depth=6, min_child_weight=1, score=0.695910330641, total=   3.8s
[CV] n_estimators=100, learning_rate=0.05, max_depth=12, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.05, max_depth=6, min_child_weight=1, score=0.70717317924, total=   3.7s
[CV] n_estimators=100, learning_rate=0.05, max_depth=12, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.05, max_depth=9, min_child_weight=1, score=0.692915765697, total=   4.1s
[CV] n_estimators=100, learning_rate=0.05, max_depth=12, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.05, max_depth=9, min_child_weight=1, score=0.695612298874, total=   4.1s
[CV] n_estimators=150, learning_rate=0.05, max_depth=12, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.05, max_dept

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   10.3s


[CV]  n_estimators=150, learning_rate=0.05, max_depth=9, min_child_weight=1, score=0.695690187642, total=   5.8s
[CV] n_estimators=100, learning_rate=0.1, max_depth=3, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.1, max_depth=3, min_child_weight=1, score=0.693864345797, total=   1.4s
[CV] n_estimators=100, learning_rate=0.1, max_depth=3, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.05, max_depth=9, min_child_weight=1, score=0.703270716054, total=   5.9s
[CV] n_estimators=100, learning_rate=0.1, max_depth=3, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.05, max_depth=12, min_child_weight=1, score=0.690968132156, total=   6.1s
[CV] n_estimators=150, learning_rate=0.1, max_depth=3, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.05, max_depth=12, min_child_weight=1, score=0.693553496488, total=   6.1s
[CV] n_estimators=150, learning_rate=0.1, max_depth=3, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.1, max_depth=3, min

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   16.5s


[CV]  n_estimators=150, learning_rate=0.1, max_depth=3, min_child_weight=1, score=0.706301838591, total=   2.0s
[CV] n_estimators=150, learning_rate=0.1, max_depth=6, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.1, max_depth=3, min_child_weight=1, score=0.694556793606, total=   2.3s
[CV] n_estimators=150, learning_rate=0.1, max_depth=6, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.1, max_depth=6, min_child_weight=1, score=0.6954102598, total=   2.6s
[CV] n_estimators=150, learning_rate=0.1, max_depth=6, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.1, max_depth=6, min_child_weight=1, score=0.695566606498, total=   2.5s
[CV] n_estimators=100, learning_rate=0.1, max_depth=9, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.05, max_depth=12, min_child_weight=1, score=0.691075958878, total=   8.7s
[CV] n_estimators=100, learning_rate=0.1, max_depth=9, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.05, max_depth=12, min_chi

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.8s


[CV]  n_estimators=150, learning_rate=0.1, max_depth=6, min_child_weight=1, score=0.695326661193, total=   3.8s
[CV] n_estimators=100, learning_rate=0.1, max_depth=12, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.1, max_depth=6, min_child_weight=1, score=0.707316385931, total=   3.6s
[CV] n_estimators=100, learning_rate=0.1, max_depth=12, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.1, max_depth=9, min_child_weight=1, score=0.693506392589, total=   3.7s
[CV] n_estimators=100, learning_rate=0.1, max_depth=12, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.1, max_depth=9, min_child_weight=1, score=0.69412475819, total=   4.0s
[CV] n_estimators=150, learning_rate=0.1, max_depth=12, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.1, max_depth=9, min_child_weight=1, score=0.703883933984, total=   3.7s
[CV] n_estimators=150, learning_rate=0.1, max_depth=12, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.1, max_depth=9, min_c

[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   29.0s


[CV]  n_estimators=100, learning_rate=0.1, max_depth=12, min_child_weight=1, score=0.697206066364, total=   5.0s
[CV] n_estimators=150, learning_rate=0.15, max_depth=3, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.15, max_depth=3, min_child_weight=1, score=0.705673200153, total=   1.3s
[CV] n_estimators=100, learning_rate=0.15, max_depth=6, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.15, max_depth=3, min_child_weight=1, score=0.694639058096, total=   1.5s
[CV] n_estimators=100, learning_rate=0.15, max_depth=6, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.15, max_depth=3, min_child_weight=1, score=0.694957525078, total=   2.0s
[CV] n_estimators=100, learning_rate=0.15, max_depth=6, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.15, max_depth=3, min_child_weight=1, score=0.695149519569, total=   2.1s
[CV] n_estimators=150, learning_rate=0.15, max_depth=6, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.15, max_depth=3

[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   34.1s


[CV]  n_estimators=150, learning_rate=0.1, max_depth=12, min_child_weight=1, score=0.696656916862, total=   7.4s
[CV] n_estimators=150, learning_rate=0.15, max_depth=9, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.15, max_depth=6, min_child_weight=1, score=0.694374306407, total=   3.6s
[CV] n_estimators=150, learning_rate=0.15, max_depth=9, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.15, max_depth=6, min_child_weight=1, score=0.693815019887, total=   3.6s
[CV] n_estimators=100, learning_rate=0.15, max_depth=12, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.15, max_depth=9, min_child_weight=1, score=0.693881916986, total=   3.5s
[CV] n_estimators=100, learning_rate=0.15, max_depth=12, min_child_weight=1 
[CV]  n_estimators=150, learning_rate=0.15, max_depth=6, min_child_weight=1, score=0.705155110267, total=   3.6s
[CV] n_estimators=100, learning_rate=0.15, max_depth=12, min_child_weight=1 
[CV]  n_estimators=100, learning_rate=0.15, max_dept

[Parallel(n_jobs=-1)]: Done  65 out of  72 | elapsed:   41.8s remaining:    4.5s


[CV]  n_estimators=100, learning_rate=0.15, max_depth=12, min_child_weight=1, score=0.689431903024, total=   5.3s
[CV]  n_estimators=150, learning_rate=0.15, max_depth=9, min_child_weight=1, score=0.699892067378, total=   5.7s
[CV]  n_estimators=100, learning_rate=0.15, max_depth=12, min_child_weight=1, score=0.687922797907, total=   4.8s
[CV]  n_estimators=100, learning_rate=0.15, max_depth=12, min_child_weight=1, score=0.696327507314, total=   5.4s
[CV]  n_estimators=150, learning_rate=0.15, max_depth=12, min_child_weight=1, score=0.68735831607, total=   6.5s
[CV]  n_estimators=150, learning_rate=0.15, max_depth=12, min_child_weight=1, score=0.686155971397, total=   6.2s
[CV]  n_estimators=150, learning_rate=0.15, max_depth=12, min_child_weight=1, score=0.693970647151, total=   4.5s


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   45.9s finished


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=150,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
0.699763418586
{'n_estimators': 150, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 1}


In [70]:
for i, param in enumerate(grid_search.cv_results_['params']):
    print("{} / ROC_AUC SCORE: {:.2f}".format(param, grid_search.cv_results_['mean_test_score'][i]))

{'n_estimators': 100, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 1} / ROC_AUC SCORE: 0.70
{'n_estimators': 150, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 1} / ROC_AUC SCORE: 0.70
{'n_estimators': 100, 'learning_rate': 0.05, 'max_depth': 6, 'min_child_weight': 1} / ROC_AUC SCORE: 0.70
{'n_estimators': 150, 'learning_rate': 0.05, 'max_depth': 6, 'min_child_weight': 1} / ROC_AUC SCORE: 0.70
{'n_estimators': 100, 'learning_rate': 0.05, 'max_depth': 9, 'min_child_weight': 1} / ROC_AUC SCORE: 0.70
{'n_estimators': 150, 'learning_rate': 0.05, 'max_depth': 9, 'min_child_weight': 1} / ROC_AUC SCORE: 0.70
{'n_estimators': 100, 'learning_rate': 0.05, 'max_depth': 12, 'min_child_weight': 1} / ROC_AUC SCORE: 0.69
{'n_estimators': 150, 'learning_rate': 0.05, 'max_depth': 12, 'min_child_weight': 1} / ROC_AUC SCORE: 0.69
{'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1} / ROC_AUC SCORE: 0.70
{'n_estimators': 150, 'learning_rate': 0.1, '

## RandomForest

In [71]:
from sklearn import metrics, model_selection
from sklearn import ensemble

In [73]:
rf = ensemble.RandomForestClassifier()
param_dist = {
#               "max_depth": [-1], # default -1
#               "learning_rate" : [0.05,0.1, 0.15], # default 0.1
#               "num_leaves": [31, 63], # default 31
              "n_estimators": [50, 100, 150] # default 100
             }
grid_search = model_selection.GridSearchCV(rf, n_jobs=-1, param_grid=param_dist, cv = 3, scoring="roc_auc", verbose=10)
grid_search.fit(X_Train,y_Train)

print("{}".format(grid_search.best_estimator_))
print("{}".format(grid_search.best_score_))
print("{}".format(grid_search.best_params_))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=50 .................................................
[CV] n_estimators=50 .................................................
[CV] n_estimators=50 .................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=150 ................................................
[CV] n_estimators=150 ................................................
[CV] ............ n_estimators=50, score=0.660539609703, total=   5.6s
[CV] ............ n_estimators=50, score=0.662303758872, total=   5.6s
[CV] n_estimators=150 ................................................
[CV] ............ n_estimators=50, score=0.669051371185, total=   5.7s


[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    6.7s remaining:   23.3s
[Parallel(n_jobs=-1)]: Done   3 out of   9 | elapsed:    6.7s remaining:   13.5s


[CV] ........... n_estimators=100, score=0.667161930739, total=   9.8s


[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:   11.4s remaining:   14.3s


[CV] ........... n_estimators=100, score=0.663468780137, total=  11.1s


[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:   12.8s remaining:   10.2s


[CV] ........... n_estimators=100, score=0.671685682074, total=  11.2s


[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:   13.0s remaining:    6.5s


[CV] ........... n_estimators=150, score=0.664848712657, total=  14.9s


[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:   17.1s remaining:    4.9s


[CV] ........... n_estimators=150, score=0.668191086054, total=  15.4s
[CV] ............ n_estimators=150, score=0.67157179985, total=  10.5s


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   18.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   18.8s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.668203824088
{'n_estimators': 150}


In [74]:
for i, param in enumerate(grid_search.cv_results_['params']):
    print("{} / ROC_AUC SCORE: {:.2f}".format(param, grid_search.cv_results_['mean_test_score'][i]))

{'n_estimators': 50} / ROC_AUC SCORE: 0.66
{'n_estimators': 100} / ROC_AUC SCORE: 0.67
{'n_estimators': 150} / ROC_AUC SCORE: 0.67
