In [1]:
#Initial commands

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



Before next step, make sure lines in CSV file are randomly shuffled.  Use code from http://stackoverflow.com/questions/4618298/randomly-mix-lines-of-3-million-line-file.  But make sure to keep the first line intact so you don't forget the column names!

Code needs to modified in the following way: <br>
data = lines[1:] <br>
newlines = [lines[0]] + data <br>
write(newlines)

In [2]:
#Import data in batches instead of all at once

OnTimeDataFrames = []
for n in range(15):
    #100,001 rows total: 100,000 of data, plus first row for column names!  Easy to mess up...
    #Note we aren't using lat/long data in the model at the present time, but can change this later
    OnTimeDataFrames.append(pd.read_csv('Sample_2011_2016_carr_mark.csv', nrows=100001,
                                        skiprows=range(1, 1 + 100000*n) if n > 0 else None))

In [3]:
#Check shapes of all data frames - all except the last should have 100,000 rows, all should have same no. of columns
for X in OnTimeDataFrames: print X.shape

(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(90434, 68)


In [4]:
#Check column names of second data frame
OnTimeDataFrames[1].columns

Index([u'Unnamed: 0', u'Year', u'Quarter', u'Month', u'DayofMonth',
       u'DayOfWeek', u'FlightDate', u'UniqueCarrier', u'AirlineID', u'Carrier',
       u'TailNum', u'FlightNum', u'OriginAirportID', u'OriginAirportSeqID',
       u'OriginCityMarketID', u'Origin', u'OriginCityName', u'OriginState',
       u'OriginStateFips', u'OriginStateName', u'OriginWac', u'DestAirportID',
       u'DestAirportSeqID', u'DestCityMarketID', u'Dest', u'DestCityName',
       u'DestState', u'DestStateFips', u'DestStateName', u'DestWac',
       u'CRSDepTime', u'DepTime', u'DepDelay', u'DepDelayMinutes', u'DepDel15',
       u'DepartureDelayGroups', u'DepTimeBlk', u'TaxiOut', u'WheelsOff',
       u'WheelsOn', u'TaxiIn', u'CRSArrTime', u'ArrTime', u'ArrDelay',
       u'ArrDelayMinutes', u'ArrDel15', u'ArrivalDelayGroups', u'ArrTimeBlk',
       u'Cancelled', u'CancellationCode', u'CRSElapsedTime',
       u'ActualElapsedTime', u'AirTime', u'Flights', u'Distance',
       u'DistanceGroup', u'CarrierDelay', u'Weat

In [5]:
#Check that all data frames have same column names
assert np.all(X.columns == OnTimeDataFrames[0].columns for X in OnTimeDataFrames)

In [6]:
#Correct ExpressJet Airways issue
for X in OnTimeDataFrames:
    X["Carrier Name"] = X["Carrier Name"].replace("ExpressJet Airlines Inc. (1)",
                                                               "ExpressJet Airlines Inc.")

In [7]:
#Get rid of rows with null values for ArrDelay
OnTimeDataNoNull = [X[np.isfinite(X.ArrDelay)] for X in OnTimeDataFrames]
for X in OnTimeDataNoNull: print X.shape

(98231, 68)
(98163, 68)
(98252, 68)
(98174, 68)
(98197, 68)
(98184, 68)
(98200, 68)
(98205, 68)
(98198, 68)
(98190, 68)
(98234, 68)
(98315, 68)
(98177, 68)
(98145, 68)
(88801, 68)


In [8]:
#Compute interactions for airlines * airports with a certain %
def interactions_aa(X, pct):
    #dictionary of features with percentage "1"s
    PercentageDict = {}
    for feature in X.columns:
        PercentageDict[feature] = X[feature].mean(axis=1)
    #calculate interactions
    X1 = X.copy()
    for feat_A in [name for name in X.columns if name[0] == 'C']:
        for feat_B in [name for name in X.columns if name[0] in list('DO') and PercentageDict[name] > pct]:
            X1[feat_A + "*" + feat_B] = (X1[feat_A].to_dense() * X1[feat_B].to_dense()).to_sparse()
    return X1

In [30]:
#Create dummy variables
def DummyRegressors(data):
    #Create dummies for airline, origin and destination airports, month, and time of day
    #Drop first column in each data frame to avoid singular matrix
    #Also include ArrDelay variable so we don't have to grab this separately from the old data frame
    CarrierDummies = pd.get_dummies(data["Carrier Name"], prefix="C", sparse=True, drop_first=True)
    OriginDummies = pd.get_dummies(data.Origin, prefix="O", sparse=True, drop_first=True)
    DestDummies = pd.get_dummies(data.Dest, prefix="D", sparse=True, drop_first=True)
    MonthDummies = pd.get_dummies(data.Month, prefix="M", sparse=True, drop_first=True)
    TimeDummies = pd.get_dummies(data.DepTimeBlk, prefix="T", sparse=True, drop_first=True)
    ArrDelay = data.ArrDelay
    ArrDel15 = data.ArrDel15
    
    return pd.concat([CarrierDummies, OriginDummies, DestDummies, MonthDummies, TimeDummies,
                      ArrDelay, ArrDel15], axis=1)

In [31]:
DummyTest = DummyRegressors(OnTimeDataNoNull[0])

In [32]:
DummyTest.columns[pd.isnull(DummyTest).any()].tolist()

[]

In [33]:
#Create list of data frames with dummies instead of raw data
OnTimeDummies = [DummyRegressors(X) for X in OnTimeDataNoNull]

In [34]:
for X in OnTimeDummies: print X.shape

(98231, 702)
(98163, 706)
(98252, 703)
(98174, 703)
(98197, 705)
(98184, 705)
(98200, 696)
(98205, 707)
(98198, 707)
(98190, 704)
(98234, 699)
(98315, 698)
(98177, 707)
(98145, 708)
(88801, 701)


Problem: not all airports and airlines are in all the data frames!  This makes sense and is a drawback of reading in the data in separate frames rather than splitting it up afterward.  We need to fix it by making sure if a column name appears in one dummy frame, it appears in all of them.

In [35]:
#create set of dummies that appear in any data frame, so each only appears once
#Try it the less efficient way, but this seems to work
DummyList = []
for X in OnTimeDummies:
    for name in X.columns:
        DummyList.append(name)
DummySet = set(DummyList)
len(DummySet)

718

In [36]:
len(DummySet)

718

In [37]:
#Now add dummies for all variables in set to OnTimeDummies frames
for X in OnTimeDummies:
    SparseZeroes = pd.SparseSeries([0 for n in range(len(X))], index=X.index)
    print len(SparseZeroes), len(X)
    for name in DummySet:
        if name not in X.columns:
            X[name] = SparseZeroes.copy()
            if pd.isnull(X[name]).any(): print name

98231 98231
98163 98163
98252 98252
98174 98174
98197 98197
98184 98184
98200 98200
98205 98205
98198 98198
98190 98190
98234 98234
98315 98315
98177 98177
98145 98145
88801 88801


In [38]:
#Now check shapes of dummy frames again
for X in OnTimeDummies: print X.shape

(98231, 718)
(98163, 718)
(98252, 718)
(98174, 718)
(98197, 718)
(98184, 718)
(98200, 718)
(98205, 718)
(98198, 718)
(98190, 718)
(98234, 718)
(98315, 718)
(98177, 718)
(98145, 718)
(88801, 718)


In [39]:
#create list of data frames with interactions included
#airport threshold: 1.5% of flights
OnTimeDummies_Inter = [interactions_aa(X, .015) for X in OnTimeDummies]

In [40]:
for X in OnTimeDummies_Inter: print X.shape

(98231, 1398)
(98163, 1398)
(98252, 1398)
(98174, 1398)
(98197, 1398)
(98184, 1415)
(98200, 1415)
(98205, 1398)
(98198, 1398)
(98190, 1415)
(98234, 1398)
(98315, 1398)
(98177, 1398)
(98145, 1398)
(88801, 1415)


In [41]:
#Once again the number of dummies is not precisely the same between batches.  We need to do the DummySet procedure again.
InterList = []
for X in OnTimeDummies_Inter:
    for name in X.columns:
        InterList.append(name)
InterSet = set(InterList)
len(InterSet)

1432

In [42]:
#Add interactions to all frames
for X in OnTimeDummies_Inter:
    SparseZeroes = pd.SparseSeries([0 for n in range(len(X))], index=X.index)
    for name in InterSet:
        if name not in X.columns:
            X[name] = SparseZeroes.copy()

In [43]:
#Now check shapes again
for X in OnTimeDummies_Inter: print X.shape

(98231, 1432)
(98163, 1432)
(98252, 1432)
(98174, 1432)
(98197, 1432)
(98184, 1432)
(98200, 1432)
(98205, 1432)
(98198, 1432)
(98190, 1432)
(98234, 1432)
(98315, 1432)
(98177, 1432)
(98145, 1432)
(88801, 1432)


In [44]:
#Divide each interaction data set into training and test data sets
from sklearn.model_selection import train_test_split

In [62]:
OnTime_XTrainFrames, OnTime_XTestFrames, OnTime_YTrainFrames, OnTime_YTestFrames = [], [], [], []
for D in OnTimeDummies_Inter:
    X_train, X_test, Y_train, Y_test = train_test_split(D.drop(["ArrDelay", "ArrDel15"], axis=1), D.ArrDelay)
    OnTime_XTrainFrames.append(X_train)
    OnTime_XTestFrames.append(X_test)
    OnTime_YTrainFrames.append(Y_train)
    OnTime_YTestFrames.append(Y_test)

In [46]:
#import modeling formulas
from sklearn.linear_model import SGDRegressor, SGDClassifier

In [47]:
"""1. linear regression model using SGDRegressor, penalty = L1"""
linear = SGDRegressor(penalty='l1')

In [48]:
#Use SGDRegressor partial_fit function to train model on training data
for n in range(len(OnTime_XTrainFrames)):
    print n
    linear.partial_fit(OnTime_XTrainFrames[n], OnTime_YTrainFrames[n])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [49]:
#Aggregate test data into one frame
OnTime_XTest = pd.concat(OnTime_XTestFrames, axis=1)

KeyboardInterrupt: 

In [50]:
#OK then, test separately in batches to start out
linear.score(OnTime_XTestFrames[1], OnTime_YTestFrames[1])

0.015949835940547441

In [51]:
linear.score(OnTime_XTestFrames[0], OnTime_YTestFrames[0])

0.019897060414090451

In [52]:
for n in range(len(OnTime_XTestFrames)):
    print n, linear.score(OnTime_XTestFrames[n], OnTime_YTestFrames[n])

0 0.0198970604141
1 0.0159498359405
2 0.0184043245129
3 0.0158138414667
4 0.0172777173439
5 0.013277972406
6 0.000530028998183
7 0.0185260261079
8 0.0190934579301
9 0.0149732645629
10 0.0130594861394
11 0.00907258241209
12 0.0180887691068
13 0.0125697171668
14 0.0290951015303


In [61]:
"""2. Logistic regression using SGDClassifier"""
#Training and testing sets for logistic regression
OnTime_X2TrainFrames, OnTime_X2TestFrames, OnTime_Y2TrainFrames, OnTime_Y2TestFrames = [], [], [], []
for D in OnTimeDummies_Inter:
    X2_train, X2_test, Y2_train, Y2_test = train_test_split(D.drop(["ArrDelay", "ArrDel15"], axis=1), D.ArrDel15)
    OnTime_X2TrainFrames.append(X2_train)
    OnTime_X2TestFrames.append(X2_test)
    OnTime_Y2TrainFrames.append(Y2_train)
    OnTime_Y2TestFrames.append(Y2_test)

In [63]:
logistic = SGDClassifier(penalty='l1')

In [64]:
for n in range(len(OnTime_X2TrainFrames)):
    print n
    logistic.partial_fit(OnTime_X2TrainFrames[n], OnTime_Y2TrainFrames[n])

0


ValueError: classes must be passed on the first call to partial_fit.

In [None]:
for n in range(len(OnTime_X2TestFrames)):
    print n, logistic.score(OnTime_X2TestFrames[n], OnTime_Y2TestFrames[n])

In [26]:
"""Troubleshooting section"""

'Troubleshooting section'

In [65]:
len(OnTime_X2TestFrames)

15

In [70]:
assert 'ArrDel15' in OnTimeDummies_Inter[0].columns

In [73]:
OnTime_Y2TrainFrames[0]

77149    1.0
21963    0.0
66500    1.0
74508    1.0
91746    1.0
3913     0.0
68835    0.0
8560     0.0
57478    1.0
22020    0.0
72718    0.0
37120    0.0
79858    0.0
48332    0.0
85077    1.0
8038     0.0
63570    1.0
54874    0.0
75667    0.0
87074    0.0
63797    0.0
29908    0.0
24787    0.0
61158    0.0
69268    0.0
38383    0.0
75015    1.0
49974    1.0
42611    0.0
20727    0.0
        ... 
79699    0.0
6451     0.0
90886    0.0
24348    0.0
76338    1.0
95024    0.0
78531    0.0
54099    0.0
60797    1.0
10706    0.0
74023    0.0
61549    0.0
47256    0.0
28082    1.0
29369    0.0
20322    0.0
77904    0.0
56795    1.0
5971     0.0
14366    0.0
75874    0.0
84562    1.0
57630    0.0
35764    0.0
61455    0.0
76466    1.0
9026     0.0
73561    0.0
88325    0.0
76342    1.0
Name: ArrDel15, dtype: float64
BlockIndex
Block locations: array([0])
Block lengths: array([73673])