# Classification

In [1]:
# import the libraries

%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# sklearn :: evaluation metrics
from sklearn.metrics import cohen_kappa_score

# convert scientific notation to decimals
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.set_style('whitegrid')

________________________
# Load Data

In [22]:
df_flights = pd.read_csv('data/flightsmerged.csv', low_memory=False)

In [4]:
df_flights.head()

Unnamed: 0,MONTH,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AC,DESTINATION_AC,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,SCHEDULED_TIME,DISTANCE,...,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE,AIRLINE_CODE,AIRLINE_NAME,ORIGIN_AIRPORT,DESTINATION_AIRPORT
0,1,98,N407AS,ANC,SEA,00:05:00,23:54:00,-11.0,205.0,1448,...,,,,,,2015-01-01,AS,Alaska Airlines Inc.,Ted Stevens Anchorage International Airport,Seattle-Tacoma International Airport
1,1,2336,N3KUAA,LAX,PBI,00:10:00,00:02:00,-8.0,280.0,2330,...,,,,,,2015-01-01,AA,American Airlines Inc.,Los Angeles International Airport,Palm Beach International Airport
2,1,840,N171US,SFO,CLT,00:20:00,00:18:00,-2.0,286.0,2296,...,,,,,,2015-01-01,US,US Airways Inc.,San Francisco International Airport,Charlotte Douglas International Airport
3,1,258,N3HYAA,LAX,MIA,00:20:00,00:15:00,-5.0,285.0,2342,...,,,,,,2015-01-01,AA,American Airlines Inc.,Los Angeles International Airport,Miami International Airport
4,1,135,N527AS,SEA,ANC,00:25:00,00:24:00,-1.0,235.0,1448,...,,,,,,2015-01-01,AS,Alaska Airlines Inc.,Seattle-Tacoma International Airport,Ted Stevens Anchorage International Airport


____

# Feature Engineering

## Add Columns

In [23]:
# Add column for flight Class (Early, On_Time, Delayed, Cancelled)

# Default value:
df_flights['CLASS'] = 'On_Time'

In [24]:
# 1- Cancelled flights

cancelled_flights = list(df_flights[df_flights['CANCELLED'] == 1].index)    
df_flights.at[cancelled_flights, 'CLASS'] = 'Cancelled'
    
print("# Cancelled flights = ", len(df_flights[df_flights['CLASS'] == 'Cancelled']))

# Cancelled flights =  89270


In [25]:
# 2- Delayed flights
    
delayed_flights = list(df_flights[(df_flights['DEPARTURE_DELAY'] > 0) | (df_flights['ARRIVAL_DELAY'] > 0)].index)    
df_flights.at[delayed_flights, 'CLASS'] = 'Delayed'

print("# Delayed flights = ", len(df_flights[df_flights['CLASS'] == 'Delayed']))

# Delayed flights =  2582805


In [26]:
# 3- Early flights

early_flights = list(df_flights[(df_flights['DEPARTURE_DELAY'] < 0) & (df_flights['ARRIVAL_DELAY'] < 0)].index)
df_flights.at[early_flights, 'CLASS'] = 'Early'
print("# Early flights = ", len(early_flights))

# Remove the early flights
#df = df[~df.index.isin(early_flights)]
#print("Was: ", df.shape, " Now: ", df.shape)


# Early flights =  2708248


In [27]:
# 4- On-time flights

print("On-time flights = ",len(df_flights[df_flights['CLASS'] == 'On_Time']))

On-time flights =  319120


In [28]:
df_flights['CLASS'].value_counts()

Early        2708248
Delayed      2582805
On_Time       319120
Cancelled      87344
Name: CLASS, dtype: int64

## Missing Values

In [29]:
df_flights.isnull().sum()

MONTH                        0
FLIGHT_NUMBER                0
TAIL_NUMBER              14721
ORIGIN_AC                    0
DESTINATION_AC               0
SCHEDULED_DEPARTURE          0
DEPARTURE_TIME           86153
DEPARTURE_DELAY          86153
SCHEDULED_TIME               4
DISTANCE                     0
SCHEDULED_ARRIVAL            0
ARRIVAL_TIME             91472
ARRIVAL_DELAY           103522
DIVERTED                     0
CANCELLED                    0
CANCELLATION_REASON          0
AIR_SYSTEM_DELAY       4754090
SECURITY_DELAY         4754090
AIRLINE_DELAY          4754090
LATE_AIRCRAFT_DELAY    4754090
WEATHER_DELAY          4754090
DATE                         0
AIRLINE_CODE                 0
AIRLINE_NAME                 0
ORIGIN_AIRPORT          480211
DESTINATION_AIRPORT     480211
CLASS                        0
dtype: int64

In [30]:
msv_columns = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY' ,'AIRLINE_DELAY' ,'LATE_AIRCRAFT_DELAY' ,'WEATHER_DELAY']
df_flights[msv_columns] = df_flights[msv_columns].fillna(0.0)

In [31]:
df_flights.isnull().sum()

MONTH                       0
FLIGHT_NUMBER               0
TAIL_NUMBER             14721
ORIGIN_AC                   0
DESTINATION_AC              0
SCHEDULED_DEPARTURE         0
DEPARTURE_TIME          86153
DEPARTURE_DELAY         86153
SCHEDULED_TIME              4
DISTANCE                    0
SCHEDULED_ARRIVAL           0
ARRIVAL_TIME            91472
ARRIVAL_DELAY          103522
DIVERTED                    0
CANCELLED                   0
CANCELLATION_REASON         0
AIR_SYSTEM_DELAY            0
SECURITY_DELAY              0
AIRLINE_DELAY               0
LATE_AIRCRAFT_DELAY         0
WEATHER_DELAY               0
DATE                        0
AIRLINE_CODE                0
AIRLINE_NAME                0
ORIGIN_AIRPORT         480211
DESTINATION_AIRPORT    480211
CLASS                       0
dtype: int64

## Categorical Columns

In [32]:
# Check to see what data types we have. Since we are encoding the categorical variables, we are going to include only the object columns.
df_flights.dtypes

MONTH                    int64
FLIGHT_NUMBER            int64
TAIL_NUMBER             object
ORIGIN_AC               object
DESTINATION_AC          object
SCHEDULED_DEPARTURE     object
DEPARTURE_TIME          object
DEPARTURE_DELAY        float64
SCHEDULED_TIME         float64
DISTANCE                 int64
SCHEDULED_ARRIVAL       object
ARRIVAL_TIME            object
ARRIVAL_DELAY          float64
DIVERTED                 int64
CANCELLED                int64
CANCELLATION_REASON     object
AIR_SYSTEM_DELAY       float64
SECURITY_DELAY         float64
AIRLINE_DELAY          float64
LATE_AIRCRAFT_DELAY    float64
WEATHER_DELAY          float64
DATE                    object
AIRLINE_CODE            object
AIRLINE_NAME            object
ORIGIN_AIRPORT          object
DESTINATION_AIRPORT     object
CLASS                   object
dtype: object

## Label encoding

In [47]:
# apply_dummies

categorical = 'AIRLINE_CODE'
df_dummies = pd.get_dummies(df_flights[categorical])
df_dummies.columns = [str(categorical)+'_'+str(c) for c in df_dummies.columns]
df_flights2 = pd.concat([df_flights, df_dummies], axis=1)


In [49]:
print(df_flights2.columns)

Index(['MONTH', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'ORIGIN_AC', 'DESTINATION_AC',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
       'SCHEDULED_TIME', 'DISTANCE', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DATE', 'AIRLINE_CODE',
       'AIRLINE_NAME', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'CLASS',
       'AIRLINE_CODE_AA', 'AIRLINE_CODE_AS', 'AIRLINE_CODE_B6',
       'AIRLINE_CODE_DL', 'AIRLINE_CODE_EV', 'AIRLINE_CODE_F9',
       'AIRLINE_CODE_HA', 'AIRLINE_CODE_MQ', 'AIRLINE_CODE_NK',
       'AIRLINE_CODE_OO', 'AIRLINE_CODE_UA', 'AIRLINE_CODE_US',
       'AIRLINE_CODE_VX', 'AIRLINE_CODE_WN'],
      dtype='object')


In [51]:
corr_matrix = df_flights2.corr()
corr_matrix["DEPARTURE_DELAY"].sort_values(ascending=False)

DEPARTURE_DELAY        1.00
ARRIVAL_DELAY          0.85
LATE_AIRCRAFT_DELAY    0.68
AIRLINE_DELAY          0.58
AIR_SYSTEM_DELAY       0.25
WEATHER_DELAY          0.20
AIRLINE_CODE_WN        0.06
AIRLINE_CODE_UA        0.05
SECURITY_DELAY         0.04
SCHEDULED_TIME         0.03
DISTANCE               0.03
AIRLINE_CODE_NK        0.03
CANCELLED              0.02
DIVERTED               0.02
AIRLINE_CODE_B6        0.01
AIRLINE_CODE_F9        0.00
AIRLINE_CODE_VX        0.00
AIRLINE_CODE_MQ        0.00
AIRLINE_CODE_AA       -0.01
AIRLINE_CODE_US       -0.02
FLIGHT_NUMBER         -0.02
AIRLINE_CODE_EV       -0.02
AIRLINE_CODE_OO       -0.03
AIRLINE_CODE_DL       -0.03
MONTH                 -0.03
AIRLINE_CODE_HA       -0.03
AIRLINE_CODE_AS       -0.05
Name: DEPARTURE_DELAY, dtype: float64

__________________
# Training The Models

In [52]:
# selecting the columns
X_columns = ['ORIGIN_AC','DESTINATION_AC', 'AIRLINE_CODE']
y_column = ['DEPARTURE_DELAY']

In [54]:
# splitting the data

threshold = 0.8
X = df_flights2[X_columns]
y = df_flights2[y_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

X_train (4558013, 3)
y_train (4558013, 1)
X_test (1139504, 3)
y_test (1139504, 1)


_________
# Testing The Models

In [55]:
models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('KNeighborsClassifier9', KNeighborsClassifier(n_neighbors=9)),
    ('DecisionTreeClassifier', DecisionTreeClassifier())
]
results = []
for m in models:
    print('MODEL', m[0])
    model = m[1]
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    results.append([m[0], precision, recall])
    
    # print top 5feature importance
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
        
    print('')

# sort the results and print as a table
df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by='precision', ascending=False)
df_results

MODEL Naive Bayes


ValueError: could not convert string to float: 'CLT'

____________
# Model Evaluation