In [2]:
#Initial commands

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



Before next step, make sure lines in CSV file are randomly shuffled.  Use code from http://stackoverflow.com/questions/4618298/randomly-mix-lines-of-3-million-line-file.  But make sure to keep the first line intact so you don't forget the column names!

Code needs to modified in the following way: <br>
data = lines[1:] <br>
newlines = [lines[0]] + data <br>
write(newlines)

In [4]:
#Import data in batches instead of all at once

OnTimeDataFrames = []
for n in range(15):
    #100,001 rows total: 100,000 of data, plus first row for column names!  Easy to mess up...
    #Note we aren't using lat/long data in the model at the present time, but can change this later
    OnTimeDataFrames.append(pd.read_csv('Sample_2011_2016_carr_mark.csv', nrows=100001,
                                        skiprows=range(1, 1 + 100000*n) if n > 0 else None))

In [5]:
#Check shapes of all data frames - all except the last should have 100,000 rows, all should have same no. of columns
for X in OnTimeDataFrames: print X.shape

(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(100001, 68)
(90434, 68)


In [6]:
#Check column names of second data frame
OnTimeDataFrames[1].columns

Index([u'Unnamed: 0', u'Year', u'Quarter', u'Month', u'DayofMonth',
       u'DayOfWeek', u'FlightDate', u'UniqueCarrier', u'AirlineID', u'Carrier',
       u'TailNum', u'FlightNum', u'OriginAirportID', u'OriginAirportSeqID',
       u'OriginCityMarketID', u'Origin', u'OriginCityName', u'OriginState',
       u'OriginStateFips', u'OriginStateName', u'OriginWac', u'DestAirportID',
       u'DestAirportSeqID', u'DestCityMarketID', u'Dest', u'DestCityName',
       u'DestState', u'DestStateFips', u'DestStateName', u'DestWac',
       u'CRSDepTime', u'DepTime', u'DepDelay', u'DepDelayMinutes', u'DepDel15',
       u'DepartureDelayGroups', u'DepTimeBlk', u'TaxiOut', u'WheelsOff',
       u'WheelsOn', u'TaxiIn', u'CRSArrTime', u'ArrTime', u'ArrDelay',
       u'ArrDelayMinutes', u'ArrDel15', u'ArrivalDelayGroups', u'ArrTimeBlk',
       u'Cancelled', u'CancellationCode', u'CRSElapsedTime',
       u'ActualElapsedTime', u'AirTime', u'Flights', u'Distance',
       u'DistanceGroup', u'CarrierDelay', u'Weat

In [7]:
#Check that all data frames have same column names
assert np.all(X.columns == OnTimeDataFrames[0].columns for X in OnTimeDataFrames)

In [8]:
#Correct ExpressJet Airways issue
for X in OnTimeDataFrames:
    X["Carrier Name"] = X["Carrier Name"].replace("ExpressJet Airlines Inc. (1)",
                                                               "ExpressJet Airlines Inc.")

In [9]:
#Get rid of rows with null values for ArrDelay
OnTimeDataNoNull = [X[np.isfinite(X.ArrDelay)] for X in OnTimeDataFrames]
for X in OnTimeDataNoNull: print X.shape

(98231, 68)
(98163, 68)
(98252, 68)
(98174, 68)
(98197, 68)
(98184, 68)
(98200, 68)
(98205, 68)
(98198, 68)
(98190, 68)
(98234, 68)
(98315, 68)
(98177, 68)
(98145, 68)
(88801, 68)


In [10]:
#Compute interactions for airlines * airports with a certain %
def interactions_aa(X, pct):
    #dictionary of features with percentage "1"s
    PercentageDict = {}
    for feature in X.columns:
        PercentageDict[feature] = X[feature].mean(axis=1)
    #calculate interactions
    X1 = X.copy()
    for feat_A in [name for name in X.columns if name[0] == 'C']:
        for feat_B in [name for name in X.columns if name[0] in list('DO') and PercentageDict[name] > pct]:
            X1[feat_A + "*" + feat_B] = (X1[feat_A].to_dense() * X1[feat_B].to_dense()).to_sparse()
    return X1

In [11]:
#Create dummy variables
def DummyRegressors(data):
    #Create dummies for airline, origin and destination airports, month, and time of day
    #Drop first column in each data frame to avoid singular matrix
    #Also include ArrDelay variable so we don't have to grab this separately from the old data frame
    CarrierDummies = pd.get_dummies(data["Carrier Name"], prefix="C", sparse=True, drop_first=True)
    OriginDummies = pd.get_dummies(data.Origin, prefix="O", sparse=True, drop_first=True)
    DestDummies = pd.get_dummies(data.Dest, prefix="D", sparse=True, drop_first=True)
    MonthDummies = pd.get_dummies(data.Month, prefix="M", sparse=True, drop_first=True)
    TimeDummies = pd.get_dummies(data.DepTimeBlk, prefix="T", sparse=True, drop_first=True)
    ArrDelay = data.ArrDelay
    
    return pd.concat([CarrierDummies, OriginDummies, DestDummies, MonthDummies, TimeDummies, ArrDelay], axis=1)

In [12]:
#Create list of data frames with dummies instead of raw data
OnTimeDummies = [DummyRegressors(X) for X in OnTimeDataNoNull]

In [13]:
for X in OnTimeDummies: print X.shape

(98231, 701)
(98163, 705)
(98252, 702)
(98174, 702)
(98197, 704)
(98184, 704)
(98200, 695)
(98205, 706)
(98198, 706)
(98190, 703)
(98234, 698)
(98315, 697)
(98177, 706)
(98145, 707)
(88801, 700)


Problem: not all airports and airlines are in all the data frames!  This makes sense and is a drawback of reading in the data in separate frames rather than splitting it up afterward.  We need to fix it by making sure if a column name appears in one dummy frame, it appears in all of them.

In [16]:
#create set of dummies that appear in any data frame, so each only appears once
#Try it the less efficient way, but this seems to work
DummyList = []
for X in OnTimeDummies:
    for name in X.columns:
        DummyList.append(name)
DummySet = set(DummyList)
len(DummySet)

717

In [17]:
len(DummySet)

717

In [18]:
#Now add dummies for all variables in set to OnTimeDummies frames
for X in OnTimeDummies:
    SparseZeroes = pd.SparseSeries([0 for n in range(len(X))])
    for name in DummySet:
        if name not in X.columns:
            X[name] = SparseZeroes.copy()

In [19]:
#Now check shapes of dummy frames again
for X in OnTimeDummies: print X.shape

(98231, 717)
(98163, 717)
(98252, 717)
(98174, 717)
(98197, 717)
(98184, 717)
(98200, 717)
(98205, 717)
(98198, 717)
(98190, 717)
(98234, 717)
(98315, 717)
(98177, 717)
(98145, 717)
(88801, 717)


In [20]:
#create list of data frames with interactions included
#airport threshold: 1.5% of flights
OnTimeDummies_Inter = [interactions_aa(X, .015) for X in OnTimeDummies]

In [21]:
for X in OnTimeDummies_Inter: print X.shape

(98231, 1397)
(98163, 1397)
(98252, 1397)
(98174, 1397)
(98197, 1397)
(98184, 1414)
(98200, 1414)
(98205, 1397)
(98198, 1397)
(98190, 1414)
(98234, 1397)
(98315, 1397)
(98177, 1397)
(98145, 1397)
(88801, 1414)


In [23]:
#Once again the number of dummies is not precisely the same between batches.  We need to do the DummySet procedure again.
InterList = []
for X in OnTimeDummies_Inter:
    for name in X.columns:
        InterList.append(name)
InterSet = set(InterList)
len(InterSet)

1431

In [24]:
#Add interactions to all frames
for X in OnTimeDummies_Inter:
    SparseZeroes = pd.SparseSeries([0 for n in range(len(X))])
    for name in InterSet:
        if name not in X.columns:
            X[name] = SparseZeroes.copy()

In [25]:
#Now check shapes again
for X in OnTimeDummies_Inter: print X.shape

(98231, 1431)
(98163, 1431)
(98252, 1431)
(98174, 1431)
(98197, 1431)
(98184, 1431)
(98200, 1431)
(98205, 1431)
(98198, 1431)
(98190, 1431)
(98234, 1431)
(98315, 1431)
(98177, 1431)
(98145, 1431)
(88801, 1431)


In [26]:
#Divide each interaction data set into training and test data sets
from sklearn.model_selection import train_test_split

In [27]:
OnTime_XTrainFrames, OnTime_XTestFrames, OnTime_YTrainFrames, OnTime_YTestFrames = [], [], [], []
for D in OnTimeDummies_Inter:
    X_train, X_test, Y_train, Y_test = train_test_split(D.drop("ArrDelay", axis=1), D.ArrDelay)
    OnTime_XTrainFrames.append(X_train)
    OnTime_XTestFrames.append(X_test)
    OnTime_YTrainFrames.append(Y_train)
    OnTime_YTestFrames.append(Y_test)

In [28]:
#import modeling formulas
from sklearn.linear_model import SGDRegressor, SGDClassifier

In [29]:
"""1. linear regression model using SGDRegressor, penalty = L1"""
linear = SGDRegressor(penalty='l1')

In [52]:
#Use SGDRegressor partial_fit function to train model on training data
for n in range(len(OnTime_XTrainFrames)):
    print n
    linear.partial_fit(OnTime_XTrainFrames[n], OnTime_YTrainFrames[n])

0


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [38]:
len(OnTime_YTrainFrames[0])

73673

In [41]:
assert all(np.isfinite(OnTime_YTrainFrames[0]))

In [42]:
assert all(np.isfinite(OnTime_XTrainFrames[0]))

In [43]:
OnTime_YTrainFrames[0].sort_values()

39928     -79.0
38709     -69.0
62641     -69.0
74466     -65.0
7449      -63.0
21596     -61.0
46069     -59.0
56949     -59.0
57195     -59.0
70687     -58.0
27533     -58.0
13691     -57.0
14042     -57.0
17133     -56.0
11618     -56.0
61942     -56.0
86407     -56.0
3908      -55.0
80027     -55.0
52488     -55.0
69780     -55.0
85826     -55.0
79866     -55.0
67301     -54.0
12729     -54.0
58330     -54.0
70099     -54.0
2098      -54.0
1583      -54.0
5577      -54.0
          ...  
73118     464.0
32803     466.0
47062     470.0
31777     473.0
79821     484.0
79056     500.0
46066     504.0
59862     512.0
62374     525.0
81511     534.0
75647     552.0
60303     572.0
75824     573.0
80648     610.0
40347     615.0
80600     647.0
71826     647.0
72440     664.0
3095      683.0
56270     717.0
77853     723.0
55382     755.0
38499     835.0
96890     856.0
2634      961.0
97997     978.0
53436    1006.0
6733     1018.0
42400    1141.0
26070    1272.0
Name: ArrDelay, dtype: f

In [45]:
OnTime_YTrainFrames[1].sort_values()

75324     -63.0
7008      -63.0
62173     -62.0
87626     -61.0
15431     -60.0
96339     -59.0
56481     -59.0
56182     -58.0
79692     -57.0
36407     -57.0
13620     -57.0
11362     -56.0
72224     -56.0
87182     -55.0
98603     -55.0
69238     -55.0
49856     -54.0
33405     -54.0
7019      -54.0
37186     -54.0
71736     -53.0
31424     -53.0
42865     -53.0
25175     -53.0
55349     -53.0
37575     -53.0
24281     -53.0
84165     -52.0
73818     -52.0
63301     -52.0
          ...  
7934      440.0
27330     441.0
18507     442.0
41920     443.0
96840     449.0
72640     464.0
51731     477.0
91276     485.0
28314     490.0
7169      490.0
75161     498.0
50931     527.0
73583     533.0
15416     543.0
8730      569.0
1519      576.0
78291     578.0
88298     596.0
96383     596.0
51572     603.0
26098     613.0
66881     617.0
59680     628.0
29981     635.0
28553     820.0
85385     899.0
90779     910.0
68979     919.0
19033    1084.0
36142    1530.0
Name: ArrDelay, dtype: f

In [47]:
OnTime_XTrainFrames[0].head()

Unnamed: 0,C_Alaska Airlines Inc.,C_American Airlines Inc.,C_Continental Air Lines Inc.,C_Delta Air Lines Inc.,C_Endeavor Air Inc.,C_Envoy Air,C_ExpressJet Airlines Inc.,C_Frontier Airlines Inc.,C_Hawaiian Airlines Inc.,C_JetBlue Airways,...,C_Hawaiian Airlines Inc.*O_MDW,C_United Air Lines Inc.*O_MDW,C_Frontier Airlines Inc.*D_MDW,C_American Airlines Inc.*D_MDW,C_Frontier Airlines Inc.*O_MDW,C_American Airlines Inc.*O_MDW,C_Virgin America*O_MDW,C_SkyWest Airlines Inc.*O_MDW,C_Southwest Airlines Co.*D_MDW,C_US Airways Inc.*D_MDW
33285,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44745,0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
987,0,0,0,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58105,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34684,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
type(OnTime_XTrainFrames[0])

pandas.sparse.frame.SparseDataFrame