# Imports and Extra Data Cleaning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from datetime import datetime
from datetime import timedelta
%load_ext line_profiler
sns.set(style="darkgrid")
import requests
import pprint
import re
%matplotlib inline

In [2]:
time1 = datetime.now()
filename = '../cleaned_flight_data_updated.csv'
random.seed(234)
p = 0.05  # p% of lines
df = pd.read_csv(
         filename,
         header=0, 
         parse_dates = ['FL_DATE','dep_datetime','arr_datetime'],
         skiprows=lambda i: i>0 and random.random() > p
)
time2 = datetime.now()
print('Time cost: ', time2-time1)

  interactivity=interactivity, compiler=compiler, result=result)


Time cost:  0:00:35.276494


In [3]:
# if DEP_DELAY is NaN, drop because useless for our model
df = df.dropna(subset=['DEP_DELAY'])

## Visibility/Wind Speed Cleaning and Imputing NaN values

In [4]:
# visibility column cleaning
time1 = datetime.now()

import math
def spacetonan(x):
    if isinstance(x, str) and x != '':
        return float(x)
    elif x == '':
        return np.nan
    return x

# remove 'V', 'Vs', 's', etc.
df['HOURLYVISIBILITY_origin'] = df['HOURLYVISIBILITY_origin'].str.replace('[^\d(.)]', "")
df['HOURLYVISIBILITY_dest'] = df['HOURLYVISIBILITY_dest'].str.replace('[^\d(.)]', "")
df['HOURLYWindSpeed_origin'] = df['HOURLYWindSpeed_origin'].str.replace('[^\d(.)]', "")
df['HOURLYWindSpeed_dest'] = df['HOURLYWindSpeed_dest'].str.replace('[^\d(.)]', "")

# for the entries with space, fill with nan
df['HOURLYVISIBILITY_origin'] = list(map(spacetonan, df['HOURLYVISIBILITY_origin']))
df['HOURLYVISIBILITY_dest'] = list(map(spacetonan, df['HOURLYVISIBILITY_dest']))
df['HOURLYWindSpeed_origin'] = list(map(spacetonan, df['HOURLYWindSpeed_origin']))
df['HOURLYWindSpeed_dest'] = list(map(spacetonan, df['HOURLYWindSpeed_dest']))

# impute the nan values to mean (~9.4 mi visibility)
np.set_printoptions(threshold=np.nan)
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
df['HOURLYVISIBILITY_origin_imp'] = imp_mean.fit_transform(df[['HOURLYVISIBILITY_origin']]).ravel()
df['HOURLYVISIBILITY_dest_imp'] = imp_mean.fit_transform(df[['HOURLYVISIBILITY_dest']]).ravel()
df['HOURLYWindSpeed_origin_imp'] = imp_mean.fit_transform(df[['HOURLYWindSpeed_origin']]).ravel()
df['HOURLYWindSpeed_dest_imp'] = imp_mean.fit_transform(df[['HOURLYWindSpeed_dest']]).ravel()

time2 = datetime.now()
print('Time cost: ', time2-time1)

Time cost:  0:00:03.383933


## Parsing "present weather type" and adding column with most salient weather code

In [5]:
# given an entry in the HOURLYPRSENTWEATHERTYPE, give AW xx value
# otherwise return 0
def regex_weather_cond(h_pres_w):
    pattern = re.compile(r'\d{2}')
    if isinstance(h_pres_w, str):
        if re.search(r'\|.*?(([A-Za-z]{2,}:(\d{2})\s)*)\|', h_pres_w):
            string = re.search(r'\|.*?(([A-Za-z]{2,}:(\d{2})\s)*)\|', h_pres_w).group(0)
            match = pattern.findall(string)
        else:
            return 0
    else:
        return 0
    return most_severe_cond(match)

# hard code severe weather condition
# based mostly on avg delay min per weather type
# smoke from Oct 2017 CA forestfire is high cancelled but not delay
def most_severe_cond(lst):
    # freezing rain
    if '64' in lst:
        return 64
    elif '65' in lst:
        return 65
    elif '66' in lst:
        return 66
    # ice pellets
    elif '74' in lst:
        return 74
    elif '75' in lst:
        return 75
    elif '76' in lst:
        return 76
    # thunderstorm with hail
    elif '93' in lst:
        return 93
    # snow
    elif '71' in lst:
        return 71
    elif '72' in lst:
        return 72
    elif '73' in lst:
        return 73
    # fog
    elif '30' in lst:
        return 30
    elif '31' in lst:
        return 31
    elif '32' in lst:
        return 32
    elif '33' in lst:
        return 33
    elif '34' in lst:
        return 34
    # safety return 0 if somehow pass empty list
    elif len(lst) == 0:
        return 0
    # return max element as int from list of strings
    # after those preceding cases, most severe is higher code
    # also not too many cases of combination of drastically different codes 
    else:
        return max(list(map(int, lst)))
    

# hard code severe weather condition
# based mostly on avg delay min per weather type
# smoke from Oct 2017 CA forestfire is high cancelled but not delay
def ordinal_ordering(x):
    # freezing rain
    if x == 64:
        return 115
    elif x == 65:
        return 116
    elif x == 66:
        return 117
    # ice pellets
    elif x == 74:
        return 112
    elif x == 75:
        return 113
    elif x == 76:
        return 114
    # thunderstorm with hail
    elif x == 93:
        return 113
    # snow
    elif x == 71:
        return 110
    elif x == 72:
        return 111
    elif x == 73:
        return 112
    # fog
    elif x == 30:
        return 105
    elif x == 31:
        return 106
    elif x == 32:
        return 107
    elif x == 33:
        return 108
    elif x == 34:
        return 109
    else:
        return x

In [6]:
# make the present weather type columns into new col
# with most severe weather code selected as integer
time1 = datetime.now()
df['origin_sev_code'] = df['HOURLYPRSENTWEATHERTYPE_origin'].apply(
    lambda x: regex_weather_cond(x))
df['dest_sev_code'] = df['HOURLYPRSENTWEATHERTYPE_dest'].apply(
    lambda x: regex_weather_cond(x))

df['origin_enc_code'] = df['origin_sev_code'].apply(
    lambda x: ordinal_ordering(x))
df['dest_enc_code'] = df['dest_sev_code'].apply(
    lambda x: ordinal_ordering(x))
time2 = datetime.now()
print('Time cost: ', time2-time1)

Time cost:  0:00:01.428189


In [7]:
print(df.origin_sev_code.unique())
print(df.dest_sev_code.unique())
print(df.origin_enc_code.unique())
print(df.dest_enc_code.unique())

[ 0 91 61 71  5 62 51 63 31 35 72 30 90 92 73 64 67 95 93 33 54 52 75 74
 81  4 68 65 18 82 99 85 96 89]
[ 0 61 31 92 71 95 51  5 30 62 91 90 63 72 54 35 64 67 73 75 68 74 33 89
  4 18 65 52 93 99 81 55 86]
[  0  91  61 110   5  62  51  63 106  35 111 105  90  92 112 115  67  95
 113 108  54  52  81   4  68 116  18  82  99  85  96  89]
[  0  61 106  92 110  95  51   5 105  62  91  90  63 111  54  35 115  67
 112 113  68 108  89   4  18 116  52  99  81  55  86]


# Binary Classification

Using "DEP_DEL15":
- 1 means delay >= 15
- 0 means delay < 15 (on-time / early)

Goal is to predict this classification. We will use **F1-score** as the primary evaluation metric, since we are concerned with both classes of classificacation: on-time/early vs. delayed. Since the classes are unbalanced, metrics such as accuracy will give a baseline model where we classify all flights as on-time very well (which is misleading since that would not be a useful model). 

Let's start with a **random forest** weather model, where we predict the DEP_DEL15 using the features:
- weather code - origin
- visibilty - origin
- wind speed at origin
- precipitation at origin
- (possibly temperature)

After weather, we will use flight/airport data:
- number of seats
- departure airport (*need encoding/merging - use passenger count*)
- carrier (*need ordinal encoding*)
- departure time (DEP_TIME)
- duration of flight

Later on if we have time:
- cloud cover (need complicated regex)
- time dependence/delay
- other things

http://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html#sphx-glr-auto-examples-ensemble-plot-feature-transformation-py

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier

https://roamanalytics.com/2016/10/28/are-categorical-variables-getting-lost-in-your-random-forests/

http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/drf.html

https://en.wikipedia.org/wiki/Receiver_operating_characteristic

http://scikit-learn.org/stable/modules/model_evaluation.html

http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [8]:
# import all these lol
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [9]:
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [10]:
from math import log10

In [11]:
df['log_origin_passenger_count'] = df['origin_passenger_count'].apply(lambda x: log10(x))

In [13]:
# X_column_list = ['DEP_TIME', 'number_of_seats', 'log_origin_passenger_count']
X_column_list = ['DEP_TIME', 'number_of_seats', 'origin_enc_code']

X = df[X_column_list]
y = df['DEP_DEL15']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [76]:
np.unique(y_pred, return_counts=True)

(array([0., 1.]), array([4884, 1718], dtype=int64))

# predict all zeros baseline model

In [33]:
y_pred_0 = X_test.apply(lambda x: 0, axis=1)

In [34]:
metrics.balanced_accuracy_score(y_test, y_pred_0)

0.5

# severe weather-delay baseline model

In [342]:
time1 = datetime.now()
clf = svm.SVC(kernel='rbf', gamma='auto')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(metrics.f1_score(y_test, y_pred))
print(clf.score(X_test, y_test))
print(metrics.balanced_accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
time2 = datetime.now()
print('Time cost: ', time2-time1)

0.07042253521126761
0.8217019360648357
0.50877762852249
              precision    recall  f1-score   support

         0.0       0.84      0.98      0.90      1853
         1.0       0.26      0.04      0.07       368

   micro avg       0.82      0.82      0.82      2221
   macro avg       0.55      0.51      0.49      2221
weighted avg       0.74      0.82      0.76      2221

[[1810   43]
 [ 353   15]]
Time cost:  0:00:05.210361


In [15]:
from imblearn.over_sampling import SMOTE

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
sm = SMOTE(random_state=12, ratio = 1.0)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

In [344]:
time1 = datetime.now()
svc_clf = svm.SVC(kernel='rbf', gamma='auto')
svc_clf.fit(X_train_res, y_train_res)
y_pred = svc_clf.predict(X_test)
print(metrics.f1_score(y_test, y_pred))
print(svc_clf.score(X_test, y_test))
print(metrics.balanced_accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
time2 = datetime.now()
print('Time cost: ', time2-time1)

0.259904912836767
0.71968787515006
0.5503492857887603
              precision    recall  f1-score   support

         0.0       0.85      0.80      0.83      2780
         1.0       0.23      0.30      0.26       552

   micro avg       0.72      0.72      0.72      3332
   macro avg       0.54      0.55      0.54      3332
weighted avg       0.75      0.72      0.73      3332

[[2234  546]
 [ 388  164]]
Time cost:  0:00:20.979911


In [346]:
X_test = X_test.reset_index(drop=True)

for i in range(X_test.shape[0]):
    if X_test.loc[i, 'origin_enc_code'] > 100:
        y_pred[i] = 1
print(metrics.f1_score(y_test, y_pred))
print(svc_clf.score(X_test, y_test))
print(metrics.balanced_accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

0.2564885496183206
0.71968787515006
0.5460588051298092
              precision    recall  f1-score   support

         0.0       0.85      0.79      0.82      2780
         1.0       0.22      0.30      0.26       552

   micro avg       0.71      0.71      0.71      3332
   macro avg       0.54      0.55      0.54      3332
weighted avg       0.75      0.71      0.73      3332

[[2190  590]
 [ 384  168]]


# SGD Classifier

In [77]:
from sklearn.linear_model import SGDClassifier

In [195]:
df['log_origin_passenger_count'] = df['origin_passenger_count'].apply(lambda x: log10(x))

In [327]:
# X_column_list = ['DEP_TIME', 'number_of_seats']
# X_column_list = ['DEP_TIME', 'number_of_seats', 'log_origin_passenger_count']
X_column_list = ['DEP_TIME', 'number_of_seats', 'log_origin_passenger_count', 'origin_enc_code']

X = df[X_column_list]
y = df['DEP_DEL15']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [328]:
from imblearn.over_sampling import SMOTE

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
sm = SMOTE(random_state=0, ratio = 1)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

In [329]:
time1 = datetime.now()
scores = []
for rs in range(1,100):
    sgd_clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=5, random_state=rs, shuffle=True)
    # 
    sgd_clf.fit(X_train_res, y_train_res)
    y_pred = sgd_clf.predict(X_test)
    scores.append(metrics.balanced_accuracy_score(y_test, y_pred))
scores = np.array(scores)
i = np.argmax(scores)
print('best [rs, score] = [%g, %g], mean score = '%(i+1, scores[i]), scores.mean())
time2 = datetime.now()
print('Time cost: ', time2-time1)

best [rs, score] = [92, 0.615126], mean score =  0.5252418550190736
Time cost:  0:00:07.378238


In [332]:
sgd_clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=5, shuffle=True, random_state=92)
# 
sgd_clf.fit(X_train_res, y_train_res)
y_pred = sgd_clf.predict(X_test)
print(sgd_clf.score(X_test, y_test))
print(metrics.balanced_accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

0.5556022956040985
0.6151262694584592
              precision    recall  f1-score   support

         0.0       0.89      0.52      0.66     27429
         1.0       0.24      0.71      0.36      5852

   micro avg       0.56      0.56      0.56     33281
   macro avg       0.57      0.62      0.51     33281
weighted avg       0.78      0.56      0.61     33281

[[14354 13075]
 [ 1715  4137]]


In [186]:
np.unique(y_train_res, return_counts=True)

(array([0., 1.]), array([63939, 63939], dtype=int64))

In [331]:
print(sgd_clf.score(X_test, y_test))
print(metrics.balanced_accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

0.2002343679576936
0.5034430252248601
              precision    recall  f1-score   support

         0.0       0.85      0.04      0.07     27429
         1.0       0.18      0.97      0.30      5852

   micro avg       0.20      0.20      0.20     33281
   macro avg       0.51      0.50      0.18     33281
weighted avg       0.73      0.20      0.11     33281

[[  981 26448]
 [  169  5683]]


In [218]:
from sklearn.model_selection import GridSearchCV

In [221]:
losses = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
penalties = ['none', 'l2', 'l1', 'elasticnet']
nfolds = 5
param_grid = {'loss': losses, 'penalty': penalties}
grid_search = GridSearchCV(SGDClassifier(max_iter=5), param_grid, cv=nfolds, scoring='balanced_accuracy')
grid_search.fit(X, y)
print(grid_search.best_score_)
print(grid_search.best_params_)

0.5602179051433808
{'loss': 'modified_huber', 'penalty': 'elasticnet'}


In [236]:
sgd_clf = SGDClassifier(loss="modified_huber", penalty="elasticnet", max_iter=5, shuffle=True)
# 
sgd_clf.fit(X_train_res, y_train_res)
y_pred = sgd_clf.predict(X_test)
print(sgd_clf.score(X_test, y_test))
print(metrics.balanced_accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

0.8236558496883455
0.5029162268947719
              precision    recall  f1-score   support

         0.0       0.82      1.00      0.90    275556
         1.0       0.48      0.01      0.01     58949

   micro avg       0.82      0.82      0.82    334505
   macro avg       0.65      0.50      0.46    334505
weighted avg       0.76      0.82      0.75    334505

[[275069    487]
 [ 58501    448]]


# Random Forest

In [353]:
df['log_origin_passenger_count'] = df['origin_passenger_count'].apply(lambda x: log10(x))

In [354]:
# X_column_list = ['DEP_TIME']
# X_column_list = ['DEP_TIME', 'number_of_seats']
X_column_list = ['DEP_TIME', 'number_of_seats', 'origin_passenger_count', 'HOURLYVISIBILITY_origin_imp', 'HOURLYWindSpeed_origin_imp', 'origin_enc_code']
# X_column_list = ['DEP_TIME', 'number_of_seats', 'origin_enc_code']

X = df[X_column_list]
y = df['DEP_DEL15']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [355]:
from imblearn.over_sampling import SMOTE

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
sm = SMOTE(random_state=0, ratio = 1)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

In [356]:
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rf_clf.fit(X_train_res, y_train_res)
y_pred = rf_clf.predict(X_test)
print(rf_clf.feature_importances_)
print(rf_clf.score(X_test, y_test))
print(metrics.balanced_accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

[0.46032291 0.06244019 0.01238502 0.26505699 0.08096094 0.11883395]
0.5254091867087188
0.6222507895510884
              precision    recall  f1-score   support

         0.0       0.91      0.47      0.62    275556
         1.0       0.24      0.77      0.36     58949

   micro avg       0.53      0.53      0.53    334505
   macro avg       0.57      0.62      0.49    334505
weighted avg       0.79      0.53      0.58    334505

[[130255 145301]
 [ 13452  45497]]


In [262]:
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rf_clf.fit(X_train_res, y_train_res)
y_pred = rf_clf.predict(X_test)
print(rf_clf.feature_importances_)
print(rf_clf.score(X_test, y_test))
print(metrics.balanced_accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

[0.53285028 0.1830278  0.28412192]
0.5435533788047234
0.6082197597989086
              precision    recall  f1-score   support

         0.0       0.89      0.51      0.65     27429
         1.0       0.24      0.71      0.35      5852

   micro avg       0.54      0.54      0.54     33281
   macro avg       0.56      0.61      0.50     33281
weighted avg       0.78      0.54      0.60     33281

[[13947 13482]
 [ 1709  4143]]


In [277]:
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rf_clf.fit(X_train_res, y_train_res)
y_pred = rf_clf.predict(X_test)
print(rf_clf.feature_importances_)
print(rf_clf.score(X_test, y_test))
print(metrics.balanced_accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.matthews_corrcoef(y_test, y_pred))

[1.]
0.7931853009224482
0.5670357379782767
              precision    recall  f1-score   support

         0.0       0.85      0.92      0.88     27429
         1.0       0.36      0.22      0.27      5852

   micro avg       0.79      0.79      0.79     33281
   macro avg       0.60      0.57      0.58     33281
weighted avg       0.76      0.79      0.77     33281

[[25121  2308]
 [ 4575  1277]]
0.16462658872388428


In [289]:
time1 = datetime.now()
scores = []
for r in np.linspace(0.3,1,10):
    sm = SMOTE(random_state=0, ratio = r)
    X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    rf_clf.fit(X_train_res, y_train_res)
    y_pred = rf_clf.predict(X_test)
    scores.append(metrics.matthews_corrcoef(y_test, y_pred))
scores = np.array(scores)
i = np.argmax(scores)
print('best [rs, score] = [%g, %g], mean score = '%(i+1, scores[i]), scores.mean())
time2 = datetime.now()
print('Time cost: ', time2-time1)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


best [rs, score] = [10, 0.194164], mean score =  0.08665047249009492
Time cost:  0:00:16.578904


In [316]:
sm = SMOTE(random_state=0, ratio = r)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0, min_samples_leaf=1)
rf_clf.fit(X_train_res, y_train_res)
y_pred = rf_clf.predict(X_test)
print(rf_clf.feature_importances_)
print(rf_clf.score(X_test, y_test))
print(metrics.balanced_accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.matthews_corrcoef(y_test, y_pred))

[0.80592737 0.19407263]
0.5409392746612182
0.6268646569941204
              precision    recall  f1-score   support

         0.0       0.91      0.49      0.64     27429
         1.0       0.24      0.76      0.37      5852

   micro avg       0.54      0.54      0.54     33281
   macro avg       0.57      0.63      0.50     33281
weighted avg       0.79      0.54      0.59     33281

[[13559 13870]
 [ 1408  4444]]
0.19416404318603186


In [294]:
min_samples_leaf = list(range(50,110,10))
nfolds = 5
param_grid = {'min_samples_leaf': min_samples_leaf}
grid_search = GridSearchCV(RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0), param_grid, cv=nfolds, scoring='balanced_accuracy')
grid_search.fit(X, y)
print(grid_search.best_score_)
print(grid_search.best_params_)

0.5
{'min_samples_leaf': 50}


In [None]:
from imblearn.pipeline import Pipeline

In [314]:
time1 = datetime.now()
smote = SMOTE(random_state=0, ratio = 1)
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

model = Pipeline([
        ('sampling', smote),
        ('classification', rf_clf)
    ])
nfolds = 5
min_samples_leaf = list(range(10,110,10))
params = {'classification__min_samples_leaf': min_samples_leaf}
grid = GridSearchCV(model, param_grid=params, cv=nfolds, scoring='balanced_accuracy')
grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)
time2 = datetime.now()
print('Time cost: ', time2-time1)

0.6186327362663987
{'classification__min_samples_leaf': 10}
Time cost:  0:02:40.593291


# Onehot encoder for carriers

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [368]:
df['OP_CARRIER'].unique().reshape(-1,1).tolist()

[['B6'],
 ['DL'],
 ['WN'],
 ['AA'],
 ['UA'],
 ['NK'],
 ['HA'],
 ['AS'],
 ['OO'],
 ['EV'],
 ['VX'],
 ['F9']]

In [369]:
onehot_enc = OneHotEncoder(handle_unknown='ignore')
carrier_list = df['OP_CARRIER'].unique().reshape(-1,1).tolist()
onehot_enc.fit(carrier_list)


OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True)

In [377]:
df['OP_CARRIER'].head().values

array(['B6', 'DL', 'WN', 'WN', 'B6'], dtype=object)

In [382]:
print(onehot_enc.transform(df['OP_CARRIER'].head().values.reshape(-1,1)).toarray())

[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [353]:
df['log_origin_passenger_count'] = df['origin_passenger_count'].apply(lambda x: log10(x))

In [12]:
# X_column_list = ['DEP_TIME']
# X_column_list = ['DEP_TIME', 'number_of_seats', 'OP_CARRIER']
X_column_list = ['DEP_TIME', 'number_of_seats', 'OP_CARRIER', 'origin_passenger_count', 'HOURLYVISIBILITY_origin_imp', 'HOURLYWindSpeed_origin_imp', 'origin_enc_code']
# X_column_list = ['DEP_TIME', 'number_of_seats', 'origin_enc_code']

X = pd.get_dummies(df[X_column_list])
y = df['DEP_DEL15']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
from imblearn.over_sampling import SMOTE

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
sm = SMOTE(random_state=0, ratio = 1)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

In [None]:
time1 = datetime.now()
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
rf_clf.fit(X_train_res, y_train_res)
y_pred = rf_clf.predict(X_test)
print(rf_clf.feature_importances_)
print(rf_clf.score(X_test, y_test))
print(metrics.balanced_accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.matthews_corrcoef(y_test, y_pred))
time2 = datetime.now()
print('Time cost: ', time2-time1)

# pipeline randomsearchCV

In [14]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
time1 = datetime.now()
smote = SMOTE(random_state=0, ratio = 1)
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
model = Pipeline([
        ('sampling', smote),
        ('classification', rf_clf)
    ])
nfolds = 3
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

params_grid = {'classification__n_estimators': n_estimators,
               'classification__max_features': max_features,
               'classification__max_depth': max_depth,
               'classification__min_samples_split': min_samples_split,
               'classification__min_samples_leaf': min_samples_leaf,
               'classification__bootstrap': bootstrap}

randomized = RandomizedSearchCV(model, param_distributions=params_grid, n_iter = 100, cv=nfolds, scoring='balanced_accuracy')
randomized.fit(X, y)
print(randomized.best_score_)
print(randomized.best_params_)
time2 = datetime.now()
print('Time cost: ', time2-time1)

In [None]:
from dask.diagnostics import ProgressBar

In [None]:
with ProgressBar():