In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the raw records from the CSV file that sits next to this notebook.
df = pd.read_csv(filepath_or_buffer='2008.csv')

In [3]:
# Per-column count of missing cells, as a two-column frame
# ('variable' = column name, 'missing values' = number of nulls).
missing_df = (
    df.isnull()
    .sum(axis=0)
    .rename_axis('variable')
    .reset_index(name='missing values')
)

# Percentage of rows in which the column is populated.
missing_df['filling factor (%)'] = (
    (df.shape[0] - missing_df['missing values']) / df.shape[0] * 100
)

# Display only: least-populated columns first (missing_df itself stays unsorted).
missing_df.sort_values(by='filling factor (%)').reset_index(drop=True)

Unnamed: 0,variable,missing values,filling factor (%)
0,CancellationCode,6872294,1.960618
1,LateAircraftDelay,5484993,21.7517
2,NASDelay,5484993,21.7517
3,WeatherDelay,5484993,21.7517
4,CarrierDelay,5484993,21.7517
5,SecurityDelay,5484993,21.7517
6,AirTime,154699,97.793081
7,ActualElapsedTime,154699,97.793081
8,ArrDelay,154699,97.793081
9,ArrTime,151649,97.836592


In [3]:
# Remove columns that will not be used further. The first six are the
# least-populated columns in the missing-value table (filling factor <= ~22%).
# NOTE: re-running this cell on the already-reduced frame raises KeyError,
# because the labels are gone after the first run.
df = df.drop(
    columns=[
        'CancellationCode', 'LateAircraftDelay', 'NASDelay',
        'WeatherDelay', 'CarrierDelay', 'SecurityDelay',
        'ArrTime', 'ActualElapsedTime', 'Cancelled', 'Year',
        'TaxiIn', 'TaxiOut', 'Diverted', 'DepDelay', 'DepTime',
    ]
)
df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,CRSElapsedTime,AirTime,ArrDelay,Origin,Dest,Distance
0,1,3,4,1955,2225,WN,335,N712SW,150.0,116.0,-14.0,IAD,TPA,810
1,1,3,4,735,1000,WN,3231,N772SW,145.0,113.0,2.0,IAD,TPA,810
2,1,3,4,620,750,WN,448,N428WN,90.0,76.0,14.0,IND,BWI,515
3,1,3,4,930,1100,WN,1746,N612SW,90.0,78.0,-6.0,IND,BWI,515
4,1,3,4,1755,1925,WN,3920,N464WN,90.0,77.0,34.0,IND,BWI,515


In [4]:
# Discard every row that still has a missing value in any remaining column.
df = df.dropna(axis=0, how='any')
df.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,CRSElapsedTime,AirTime,ArrDelay,Origin,Dest,Distance
0,1,3,4,1955,2225,WN,335,N712SW,150.0,116.0,-14.0,IAD,TPA,810
1,1,3,4,735,1000,WN,3231,N772SW,145.0,113.0,2.0,IAD,TPA,810
2,1,3,4,620,750,WN,448,N428WN,90.0,76.0,14.0,IND,BWI,515
3,1,3,4,930,1100,WN,1746,N612SW,90.0,78.0,-6.0,IND,BWI,515
4,1,3,4,1755,1925,WN,3920,N464WN,90.0,77.0,34.0,IND,BWI,515


In [5]:
# Binary target: 1 when ArrDelay is at least 30, otherwise 0.
df['Late'] = np.where(df['ArrDelay'].ge(30), 1, 0)

# ArrDelay is what the target was derived from, so it is removed from the frame.
df = df.drop(columns=['ArrDelay'])
df.describe()

Unnamed: 0,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,FlightNum,CRSElapsedTime,AirTime,Distance,Late
count,6855024.0,6855024.0,6855024.0,6855024.0,6855024.0,6855024.0,6855024.0,6855024.0,6855024.0,6855024.0
mean,6.388525,15.73826,3.925802,1325.037,1493.715,2212.505,129.0778,104.0186,728.7441,0.1352895
std,3.398833,8.801085,1.988459,464.313,482.6643,1956.435,69.53273,67.43981,563.2449,0.3420326
min,1.0,1.0,1.0,0.0,0.0,1.0,-141.0,0.0,11.0,0.0
25%,3.0,8.0,2.0,925.0,1115.0,618.0,80.0,55.0,326.0,0.0
50%,6.0,16.0,4.0,1317.0,1515.0,1563.0,111.0,86.0,581.0,0.0
75%,9.0,23.0,6.0,1715.0,1905.0,3494.0,159.0,132.0,954.0,0.0
max,12.0,31.0,7.0,2359.0,2359.0,9741.0,660.0,1350.0,4962.0,1.0


In [7]:
# Show the (rows, columns) dimensions of the current frame.
df.shape

(6855024, 14)

In [6]:
from sklearn.preprocessing import LabelEncoder

# One encoder object per text column; each stays bound to its le_* name
# after this cell has run.
le_UniqueCarrier, le_Origin, le_Dest = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Add an integer-coded '<column>_n' copy of each text column.
for encoder, column in (
    (le_UniqueCarrier, 'UniqueCarrier'),
    (le_Origin, 'Origin'),
    (le_Dest, 'Dest'),
):
    df[column + '_n'] = encoder.fit_transform(df[column])

# Drop the original text columns; TailNum is removed without being encoded.
df = df.drop(columns=['UniqueCarrier', 'Origin', 'Dest', 'TailNum'])

In [9]:
# Print the index range, per-column dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6855024 entries, 0 to 7009727
Data columns (total 13 columns):
Month              int64
DayofMonth         int64
DayOfWeek          int64
CRSDepTime         int64
CRSArrTime         int64
FlightNum          int64
CRSElapsedTime     float64
AirTime            float64
Distance           int64
Late               int64
UniqueCarrier_n    int64
Origin_n           int64
Dest_n             int64
dtypes: float64(2), int64(11)
memory usage: 732.2 MB


In [7]:
# Feature matrix: every column except the target.
# `columns=` replaces the positional axis argument (`df.drop('Late', 1)`),
# which pandas deprecated in 1.3 and rejects with a TypeError from 2.0 on.
X = df.drop(columns='Late')

# Target vector: the 0/1 Late flag.
y = df['Late']


In [8]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 10 features with the highest ANOVA F-score against the target.
# NOTE(review): the selector is fitted on all rows, i.e. before the
# train/test split performed further down, so the rows later held out for
# testing also influence which features are kept.
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X, y)
X_new.shape

(6855024, 10)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn import svm

# test_size=0.999 sends 99.9% of the rows to the test partition, leaving
# roughly 0.1% for training; random_state=0 fixes the shuffle so the same
# rows are picked on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.999,
    random_state=0,
)


In [10]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

# Seed the forest so the fold scores are reproducible across runs; 0 matches
# the random_state already used for the train/test split.
rfc = ensemble.RandomForestClassifier(random_state=0)

# 10-fold cross-validation on the training subsample. With no `scoring`
# argument the classifier's default score (accuracy) is reported.
# NOTE(review): the mean of Late in the describe() output is about 0.135, so
# always predicting 0 would already reach roughly 0.865 accuracy on the full
# data. Judge the fold scores against that baseline.
cross_val_score(rfc, X_train, y_train, cv=10)



array([0.84693878, 0.85131195, 0.84693878, 0.85714286, 0.85568513,
       0.85276968, 0.85276968, 0.85985401, 0.85233918, 0.85380117])