In [1]:
#Initial commands

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



In [2]:
#Import data in batches instead of all at once

OnTimeDataFrames = []
for n in range(15):
    #100,001 rows total: 100,000 of data, plus first row for column names!  Easy to mess up...
    OnTimeDataFrames.append(pd.read_csv('Sample_2011_2016_car-mark-lat-long', nrows=100001,
                                        skiprows=range(1, 1 + 100000*n) if n > 0 else None))

In [3]:
#Check shapes of all data frames - all except the last should have 100,000 rows, all should have same no. of columns
for X in OnTimeDataFrames: print X.shape

(100001, 79)
(100001, 79)
(100001, 79)
(100001, 79)
(100001, 79)
(100001, 79)
(100001, 79)
(100001, 79)
(100001, 79)
(100001, 79)
(100001, 79)
(100001, 79)
(100001, 79)
(100001, 79)
(90434, 79)


In [4]:
#Check column names of second data frame
OnTimeDataFrames[1].columns

Index([u'Unnamed: 0', u'Unnamed: 0.1', u'Year', u'Quarter', u'Month',
       u'DayofMonth', u'DayOfWeek', u'FlightDate', u'UniqueCarrier',
       u'AirlineID', u'Carrier', u'TailNum', u'FlightNum', u'OriginAirportID',
       u'OriginAirportSeqID', u'OriginCityMarketID', u'Origin',
       u'OriginCityName', u'OriginState', u'OriginStateFips',
       u'OriginStateName', u'OriginWac', u'DestAirportID', u'DestAirportSeqID',
       u'DestCityMarketID', u'Dest', u'DestCityName', u'DestState',
       u'DestStateFips', u'DestStateName', u'DestWac', u'CRSDepTime',
       u'DepTime', u'DepDelay', u'DepDelayMinutes', u'DepDel15',
       u'DepartureDelayGroups', u'DepTimeBlk', u'TaxiOut', u'WheelsOff',
       u'WheelsOn', u'TaxiIn', u'CRSArrTime', u'ArrTime', u'ArrDelay',
       u'ArrDelayMinutes', u'ArrDel15', u'ArrivalDelayGroups', u'ArrTimeBlk',
       u'Cancelled', u'CancellationCode', u'CRSElapsedTime',
       u'ActualElapsedTime', u'AirTime', u'Flights', u'Distance',
       u'DistanceGroup',

In [5]:
#Check that all data frames have same column names
assert np.all(X.columns == OnTimeDataFrames[0].columns for X in OnTimeDataFrames)

In [6]:
#Correct ExpressJet Airways issue
for X in OnTimeDataFrames:
    X["Carrier Name"] = X["Carrier Name"].replace("ExpressJet Airlines Inc. (1)",
                                                               "ExpressJet Airlines Inc.")

In [7]:
#Get rid of rows with null values for ArrDelay
OnTimeDataNoNull = [X[np.isfinite(X.ArrDelay)] for X in OnTimeDataFrames]
for X in OnTimeDataNoNull: print X.shape

(97702, 79)
(96767, 79)
(98632, 79)
(98305, 79)
(98376, 79)
(98125, 79)
(98547, 79)
(98774, 79)
(98359, 79)
(98305, 79)
(98463, 79)
(99112, 79)
(98756, 79)
(96987, 79)
(88456, 79)


In [8]:
#Compute interactions for airlines * airports with a certain %
def interactions_aa(X, pct):
    #dictionary of features with percentage "1"s
    PercentageDict = {}
    for feature in X.columns:
        PercentageDict[feature] = X[feature].mean(axis=1)
    #calculate interactions
    X1 = X.copy()
    for feat_A in [name for name in X.columns if name[0] == 'C']:
        for feat_B in [name for name in X.columns if name[0] in list('DO') and PercentageDict[name] > pct]:
            X1[feat_A + "*" + feat_B] = (X1[feat_A].to_dense() * X1[feat_B].to_dense()).to_sparse()
    return X1

In [9]:
#Create dummy variables
def DummyRegressors(data):
    #Create dummies for airline, origin and destination airports, month, and time of day
    #Drop first column in each data frame to avoid singular matrix
    #Also include ArrDelay variable so we don't have to grab this separately from the old data frame
    CarrierDummies = pd.get_dummies(data["Carrier Name"], prefix="C", sparse=True, drop_first=True)
    OriginDummies = pd.get_dummies(data.Origin, prefix="O", sparse=True, drop_first=True)
    DestDummies = pd.get_dummies(data.Dest, prefix="D", sparse=True, drop_first=True)
    MonthDummies = pd.get_dummies(data.Month, prefix="M", sparse=True, drop_first=True)
    TimeDummies = pd.get_dummies(data.DepTimeBlk, prefix="T", sparse=True, drop_first=True)
    ArrDelay = data.ArrDelay
    
    return pd.concat([CarrierDummies, OriginDummies, DestDummies, MonthDummies, TimeDummies, ArrDelay], axis=1)

In [10]:
#Create list of data frames with dummies instead of raw data
OnTimeDummies = [DummyRegressors(X) for X in OnTimeDataNoNull]

In [11]:
for X in OnTimeDummies: print X.shape

(97702, 172)
(96767, 205)
(98632, 172)
(98305, 186)
(98376, 168)
(98125, 166)
(98547, 224)
(98774, 222)
(98359, 202)
(98305, 220)
(98463, 170)
(99112, 182)
(98756, 216)
(96987, 235)
(88456, 317)


Problem: not all airports and airlines are in all the data frames!  This makes sense and is a drawback of reading in the data in separate frames rather than splitting it up afterward.  We need to fix it by making sure if a column name appears in one dummy frame, it appears in all of them.

In [12]:
#create set of dummies that appear in any data frame, so each only appears once
DummySet = set([name for name in X.columns for X in OnTimeDummies])
DummySet

{'ArrDelay',
 'C_Alaska Airlines Inc.',
 'C_American Airlines Inc.',
 'C_Continental Air Lines Inc.',
 'C_Delta Air Lines Inc.',
 'C_Endeavor Air Inc.',
 'C_Envoy Air',
 'C_ExpressJet Airlines Inc.',
 'C_Frontier Airlines Inc.',
 'C_Hawaiian Airlines Inc.',
 'C_JetBlue Airways',
 'C_Mesa Airlines Inc.',
 'C_SkyWest Airlines Inc.',
 'C_Southwest Airlines Co.',
 'C_Spirit Air Lines',
 'C_US Airways Inc.',
 'C_United Air Lines Inc.',
 'D_ABY',
 'D_ACK',
 'D_ACT',
 'D_ACV',
 'D_ADK',
 'D_ADQ',
 'D_AGS',
 'D_AKN',
 'D_APN',
 'D_AZA',
 'D_BET',
 'D_BFL',
 'D_BGM',
 'D_BGR',
 'D_BJI',
 'D_BLI',
 'D_BPT',
 'D_BQK',
 'D_BQN',
 'D_BRD',
 'D_BRO',
 'D_BRW',
 'D_BTM',
 'D_BTR',
 'D_CDC',
 'D_CDV',
 'D_CEC',
 'D_CIC',
 'D_CIU',
 'D_CLD',
 'D_CLL',
 'D_CNY',
 'D_CPR',
 'D_CRP',
 'D_CSG',
 'D_CYS',
 'D_DAB',
 'D_DHN',
 'D_DIK',
 'D_DLG',
 'D_DRO',
 'D_DRT',
 'D_DVL',
 'D_ECP',
 'D_EKO',
 'D_ESC',
 'D_EUG',
 'D_EWN',
 'D_EYW',
 'D_FAT',
 'D_FAY',
 'D_FLG',
 'D_FSM',
 'D_GCC',
 'D_GCK',
 'D_GFK',
 'D_G

In [13]:
len(DummySet) # This seems wrong - only taking the dummies from the last frame

317

In [14]:
#Try it the less efficient but more correct way
DummyList = []
for X in OnTimeDummies:
    for name in X.columns:
        DummyList.append(name)
DummySet = set(DummyList)
len(DummySet)

706

In [15]:
#Now add dummies for all variables in set to OnTimeDummies frames
for X in OnTimeDummies:
    for name in DummySet:
        if name not in X.columns:
            X[name] = 0
#This code raised Memory Error - not sure why

706
706
706
706
706
706
706
706
706
706
706
706
706
706
706


In [None]:
#Now check shapes of dummy frames again
for X in OnTimeDummies: print X.shape

In [66]:
#create list of data frames with interactions included
#airport threshold: 1.5% of flights
OnTimeDummies_Inter = [interactions_aa(X, .015) for X in OnTimeDummies]

KeyboardInterrupt: 

In [None]:
for X in OnTimeDummies_Inter: print X.shape

In [29]:
#Divide each interaction data set into training and test data sets
from sklearn.model_selection import train_test_split

In [37]:
OnTime_XTrainFrames, OnTime_XTestFrames, OnTime_YTrainFrames, OnTime_YTestFrames = [], [], [], []
for D in OnTimeDummies_Inter:
    X_train, X_test, Y_train, Y_test = train_test_split(D.drop("ArrDelay", axis=1), D.ArrDelay)
    OnTime_XTrainFrames.append(X_train)
    OnTime_XTestFrames.append(X_test)
    OnTime_YTrainFrames.append(Y_train)
    OnTime_YTestFrames.append(Y_test)

In [38]:
#import modeling formulas
from sklearn.linear_model import SGDRegressor, SGDClassifier

In [39]:
"""1. linear regression model using SGDRegressor, penalty = L1"""
linear = SGDRegressor(penalty='l1')

In [40]:
#Use SGDRegressor partial_fit function to train model on training data
for n in range(len(OnTime_XTrainFrames)):
    linear.partial_fit(OnTime_XTrainFrames[n], OnTime_YTrainFrames[n])

ValueError: Number of features 612 does not match previous data 441.