In [13]:
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime as dt

In [62]:
# Load the raw flight records from the CSV file into a DataFrame.
FlightData = pd.read_csv(filepath_or_buffer='data/FlightData.csv')

In [63]:
def SetUp_Xy(FlightData, n_samples=1000, random_state=None,
             keep_sea_inbound=True, keep_sea_outbound=False):
    """Build the feature matrix X and the target vector y from flight records.

    The input frame is never modified; all filtering works on derived frames,
    so the cell calling this function can be re-run safely.

    Parameters
    ----------
    FlightData : pandas.DataFrame
        Must contain the columns OP_UNIQUE_CARRIER, ORIGIN, DEST, FL_DATE,
        CRS_ARR_TIME, DISTANCE and Delay.
    n_samples : int or None, default 1000
        Number of rows to sample after the carrier filter. The value is capped
        at the number of rows available; None skips sampling entirely.
    random_state : int or None, default None
        Seed forwarded to DataFrame.sample; pass an int for a repeatable sample.
    keep_sea_inbound : bool, default True
        When False, rows with DEST == 'SEA' are removed.
    keep_sea_outbound : bool, default False
        When False (and keep_sea_inbound is True), rows with ORIGIN == 'SEA'
        are removed.

    Returns
    -------
    (X, y) : tuple
        X holds one-hot carrier columns, FlightDate (proleptic Gregorian
        ordinal of FL_DATE), CRS_ARR_TIME and DISTANCE. y is the Delay column.
        Both share the index of the retained rows.
    """
    print(FlightData.shape)
    # drop Alaska Airlines flights to shrink dataset; a boolean filter returns
    # a new frame, leaving the caller's DataFrame untouched
    flights = FlightData[FlightData.OP_UNIQUE_CARRIER != 'AS']
    print(flights.shape)

    # take a smaller sample of the data, never asking for more rows than exist
    # (DataFrame.sample raises ValueError in that case)
    if n_samples is not None:
        flights = flights.sample(n=min(n_samples, len(flights)),
                                 random_state=random_state)

    # Only one direction is removed per call: inbound takes precedence,
    # otherwise outbound is considered.
    if not keep_sea_inbound:
        print('dropping records arriving in SEA')
        flights = flights[flights.DEST != 'SEA']
    elif not keep_sea_outbound:
        print('dropping flights departing SEA')
        flights = flights[flights.ORIGIN != 'SEA']
    print(flights.shape)

    # create one-hot columns for Airline Carrier
    X = pd.get_dummies(flights[['OP_UNIQUE_CARRIER']])

    # encode FL_DATE (flight date) as an integer ordinal so it is numeric
    X['FlightDate'] = pd.to_datetime(flights['FL_DATE']).apply(lambda x: x.toordinal())

    # add the remaining numeric feature columns
    X = pd.concat([X, flights[['CRS_ARR_TIME', 'DISTANCE']]],
                  axis=1, sort=False)

    # set up target
    y = flights['Delay']

    return X, y

In [64]:
# Derive the feature matrix and the target vector from the loaded records.
X, y = SetUp_Xy(FlightData)

(306810, 16)
(184876, 16)
dropping flights departing SEA
(568, 16)


In [45]:
# check for NaN
# NaN never compares equal to anything (not even itself), so `value == np.nan`
# is always False and can never flag a missing value. isna() detects missing
# entries correctly and avoids a Python-level loop over every cell.
rows_with_nan = FlightData[FlightData.isna().any(axis=1)]
for index, row in rows_with_nan.iterrows():
    print(row)

In [46]:
# Show the per-column mean of the feature matrix.
print(X.mean(axis=0))

CRS_ARR_TIME    1489.924188
dtype: float64


In [65]:
# Rescale each continuous feature: subtract the column mean, then divide by
# the column standard deviation. (ARR_TIME is not part of X, so it is skipped.)
for col in ['FlightDate', 'DISTANCE', 'CRS_ARR_TIME']:
    X[col] = (X[col] - X[col].mean()) / X[col].std()

## KNN

In [67]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Fit a 3-nearest-neighbours classifier on the training portion
# (fit() returns the estimator itself, so the call can be chained).
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predict labels for the held-out rows and show them.
y_pred = knn.predict(X_test)
print(y_pred)

[1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0
 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 1 1]


In [68]:
# Predict on the held-out rows again and report the share of labels that match.
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

0.701754385965
