## Importing Libraries

In [63]:
from sklearn.model_selection import train_test_split
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.externals.six import StringIO  
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from imblearn.over_sampling import ADASYN 
from imblearn.under_sampling import NearMiss


### Loading Dataset

In [64]:
df = pd.read_csv("../Project.csv")
del df['Unnamed: 0']
df = df.drop(['ArrDelayMinutes','DepDel15','ArrTime','CRSDepTime','DepTime','ArrTime','date','FlightDate','airport'],1)
lb = LabelEncoder()
df['Origin'] = lb.fit_transform(df['Origin'])
df['Dest'] = lb.fit_transform(df['Dest'])

## Logistic Regression

### Assigning Target and Dependent Variables

In [45]:
X = np.array(df.drop("ArrDel15", axis = 1))
y = np.array(df["ArrDel15"])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=0)


### Fitting the Dataset

In [46]:
logreg = LogisticRegression()
logreg.fit(X_train,y_train)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Data Prediction and Validation

In [47]:
y_pred=logreg.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("Confusion matrix")
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix


Accuracy: 0.9085252871461669
Precision: 0.954737061769616
Recall: 0.5933750956512717
Confusion matrix


array([[287179,   2169],
       [ 31352,  45751]])

## Decision Tree Classifier

In [48]:
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.15, random_state=26)


### Fitting the Dataset

In [51]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)


### Prediction and Validation

In [52]:
y_pred = clf.predict(x_test)
print("Test Score: "+ str(clf.score(x_test,y_test))) 
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))


Test Score: 0.8705419192396976
precision score : 0.6872568093385214
recall score : 0.7075979968917285
f1 score : 0.697279085200878
              precision    recall  f1-score   support

         0.0       0.92      0.91      0.92    216928
         1.0       0.69      0.71      0.70     57910

    accuracy                           0.87    274838
   macro avg       0.80      0.81      0.81    274838
weighted avg       0.87      0.87      0.87    274838



## ExtraTree Classifier

In [53]:
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.22)


In [54]:
clf = ExtraTreesClassifier()
clf.fit(x_train, y_train) 




ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [55]:
y_pred = clf.predict(x_test)
print("Test Score: "+ str(clf.score(x_test,y_test))) 
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))
print("Confusion matrix")
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix


Test Score: 0.8912864429317086
precision score : 0.8358272836518675
recall score : 0.6000023634236555
f1 score : 0.6985485313338379
              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93    318473
         1.0       0.84      0.60      0.70     84623

    accuracy                           0.89    403096
   macro avg       0.87      0.78      0.82    403096
weighted avg       0.89      0.89      0.88    403096

Confusion matrix


array([[308500,   9973],
       [ 33849,  50774]])

# Sampling

## Gradient Boost Oversampling(ADASYN/SMOTE) and UnderSampling(Near miss)


### SMOTE

In [65]:
sm = SMOTE(random_state=42)
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)
x_train, y_train = sm.fit_sample(x_train, y_train)


In [None]:
clf = GradientBoostingClassifier()
clf.fit(x_train, y_train) 


In [None]:
y_pred = clf.predict(x_test)
print("Test Score: "+ str(clf.score(x_test,y_test))) 
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))
print("Confusion matrix")
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix


### NearMiss


In [58]:
nm = NearMiss()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)
x_train, y_train = nm.fit_resample(x_train, y_train)


In [59]:
clf = GradientBoostingClassifier()
clf.fit(x_train, y_train) 


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [60]:
y_pred=logreg.predict(X_test)
print("Test Score: "+ str(clf.score(x_test,y_test))) 
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))
print("Confusion matrix")
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix


Test Score: 0.8028385786912848
precision score : 0.21331385642737896
recall score : 0.13304006038993155
f1 score : 0.16387450502589096
              precision    recall  f1-score   support

         0.0       0.79      0.87      0.83    289617
         1.0       0.21      0.13      0.16     76834

    accuracy                           0.72    366451
   macro avg       0.50      0.50      0.50    366451
weighted avg       0.67      0.72      0.69    366451

Confusion matrix


array([[251919,  37698],
       [ 66612,  10222]])

### ADASYN

In [None]:
sm = ADASYN()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)
x_train, y_train = sm.fit_sample(x_train, y_train)


In [None]:
clf = GradientBoostingClassifier()
clf.fit(x_train, y_train) 


In [None]:
y_pred=logreg.predict(X_test)
print("Test Score: "+ str(clf.score(x_test,y_test))) 
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))
print("Confusion matrix")
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix


### Extra Tree Classifier ( Over and under Sampling)

### ADASYN

In [32]:
sm = ADASYN()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)
x_train, y_train = sm.fit_sample(x_train, y_train)


In [33]:
clf = ExtraTreesClassifier()
clf.fit(x_train, y_train) 




ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [34]:
y_pred = clf.predict(x_test)
print("Test Score: "+ str(clf.score(x_test,y_test))) 
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))
print("Confusion matrix")
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix


Test Score: 0.8915216495520547
precision score : 0.7931476094105743
recall score : 0.6529010594268162
f1 score : 0.7162233548921346
              precision    recall  f1-score   support

         0.0       0.91      0.95      0.93    289617
         1.0       0.79      0.65      0.72     76834

    accuracy                           0.89    366451
   macro avg       0.85      0.80      0.82    366451
weighted avg       0.89      0.89      0.89    366451



### Near Miss

In [35]:
nm = NearMiss()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)
x_train, y_train = nm.fit_resample(x_train, y_train)


In [36]:
clf = ExtraTreesClassifier()
clf.fit(x_train, y_train) 




ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [37]:
y_pred = clf.predict(x_test)
print("Test Score: "+ str(clf.score(x_test,y_test))) 
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))
print("Confusion matrix")
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix


Test Score: 0.571156853167272
precision score : 0.3055773420479303
recall score : 0.8214722648827342
f1 score : 0.4454521073878554
              precision    recall  f1-score   support

         0.0       0.91      0.50      0.65    289617
         1.0       0.31      0.82      0.45     76834

    accuracy                           0.57    366451
   macro avg       0.61      0.66      0.55    366451
weighted avg       0.79      0.57      0.61    366451



### SMOTE

In [38]:
sm = SMOTE(random_state=42)
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)
x_train, y_train = sm.fit_sample(x_train, y_train)


In [39]:
clf = ExtraTreesClassifier()
clf.fit(x_train, y_train) 




ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [40]:
y_pred = clf.predict(x_test)
print("Test Score: "+ str(clf.score(x_test,y_test))) 
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))
print("Confusion matrix")
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix


Test Score: 0.8920701539905744
precision score : 0.7950117900267452
recall score : 0.653825129499961
f1 score : 0.7175392614070546
              precision    recall  f1-score   support

         0.0       0.91      0.96      0.93    289617
         1.0       0.80      0.65      0.72     76834

    accuracy                           0.89    366451
   macro avg       0.85      0.80      0.83    366451
weighted avg       0.89      0.89      0.89    366451

