In [None]:
#The follwoing notebook contains code that takes in FlightRows10000 and prepares for early data anyslsis
#Work done includes removing mostly empty columns, dummies, and fit/transforming.

In [2]:
import pandas as pd
import numpy as np

In [3]:
#Short Feature Engineering, remove columns that would not be avaiable until after flight is over. Shortcut by peaking at columns
#avaiable in flights_test
heading=pd.read_csv('TestRows10forHeading.txt', delimiter='\t')
colList=heading.columns

In [4]:
dfRead = pd.read_csv('FlightRows10000.txt', delimiter='\t')
dfRead.head(5)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2019-10-08,WN,WN,WN,2014,WN,N7858A,2014,12892,LAX,...,373,,,,,,,,,
1,2018-07-23,DL,DL_CODESHARE,DL,5388,9E,N8969A,5388,10821,BWI,...,255,0.0,7.0,3.0,0.0,46.0,,,,
2,2019-02-28,AS,AS_CODESHARE,AS,3354,OO,N193SY,3354,14831,SJC,...,1450,,,,,,,,,
3,2018-06-25,WN,WN,WN,464,WN,N7702A,464,11292,DEN,...,472,,,,,,,,,
4,2019-05-01,AS,AS_CODESHARE,AS,2677,QX,N637QX,2677,14679,SAN,...,1080,,,,,,,,,


In [5]:
#Get target variable, do not standard scale, we need to preseve 0 position and use sign interpret delay
y=dfRead['arr_delay']

In [6]:
#get set of predictor col, X, by cross referencing heading colList
df=dfRead[colList]

In [7]:
#check column data type, observe int64, and object
df.dtypes

fl_date               object
mkt_unique_carrier    object
branded_code_share    object
mkt_carrier           object
mkt_carrier_fl_num     int64
op_unique_carrier     object
tail_num              object
op_carrier_fl_num      int64
origin_airport_id      int64
origin                object
origin_city_name      object
dest_airport_id        int64
dest                  object
dest_city_name        object
crs_dep_time           int64
crs_arr_time           int64
dup                   object
crs_elapsed_time       int64
flights                int64
distance               int64
dtype: object

In [8]:
#Check Columns for any empty
df.isnull().sum(axis=0)

fl_date               0
mkt_unique_carrier    0
branded_code_share    0
mkt_carrier           0
mkt_carrier_fl_num    0
op_unique_carrier     0
tail_num              0
op_carrier_fl_num     0
origin_airport_id     0
origin                0
origin_city_name      0
dest_airport_id       0
dest                  0
dest_city_name        0
crs_dep_time          0
crs_arr_time          0
dup                   0
crs_elapsed_time      0
flights               0
distance              0
dtype: int64

In [9]:
#Unique value count within numerical col
df.select_dtypes(exclude='object').nunique(axis=0, dropna=True)

#Flight useless, should remove
#mkt_carrier_fl_num 7226 in full database
#op_carrier_fl_num 7252 in full database
#origin_airport_id 376 in full database
#dest_airport_id 376 in full database

mkt_carrier_fl_num    4840
op_carrier_fl_num     4841
origin_airport_id      305
dest_airport_id        319
crs_dep_time          1068
crs_arr_time          1191
crs_elapsed_time       386
flights                  1
distance              1286
dtype: int64

In [10]:
#Make dfNumericValues in preperation for fit transform.
dfNumericValues=df[['crs_dep_time','crs_arr_time','crs_elapsed_time','distance']]
dfNumericValues.head()

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,distance
0,1840,2010,90,373
1,1526,1636,70,255
2,945,1520,215,1450
3,1350,1615,85,472
4,1420,1720,180,1080


In [11]:
#convert crs times, written in 2400 format, into hours by simply moving decimal place over 2 times, ignore errors

dfNumericValues.loc[:,'crs_dep_time']=dfRead.loc[:,'crs_dep_time']//100
dfNumericValues.loc[:,'crs_arr_time']=dfRead.loc[:,'crs_arr_time']//100
dfNumericValues

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,distance
0,18,20,90,373
1,15,16,70,255
2,9,15,215,1450
3,13,16,85,472
4,14,17,180,1080
...,...,...,...,...
9995,10,12,85,337
9996,12,13,86,357
9997,12,14,98,468
9998,20,22,133,748


In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X=scaler.fit_transform(dfNumericValues)
X.shape
#Don't scale y, negative signs important and fit transform will not preserve that property

(10000, 4)

In [13]:
#Convert X back to dataframe, add back col name, for pd.concat later
colList=dfNumericValues.columns
colList= [col+'FT' for col in colList]

X=pd.DataFrame(X, columns=colList)
X

Unnamed: 0,crs_dep_timeFT,crs_arr_timeFT,crs_elapsed_timeFT,distanceFT
0,1.015688,1.049576,-0.669158,-0.673926
1,0.400231,0.274710,-0.950475,-0.876863
2,-0.830681,0.080993,1.089078,1.178308
3,-0.010073,0.274710,-0.739487,-0.503665
4,0.195079,0.468426,0.596772,0.541979
...,...,...,...,...
9995,-0.625529,-0.500157,-0.739487,-0.735839
9996,-0.215225,-0.306440,-0.725421,-0.701443
9997,-0.215225,-0.112724,-0.556630,-0.510544
9998,1.425992,1.437009,-0.064325,-0.028998


In [14]:
#Convert numerical col that are actually categorical into objects
df=df.astype({
    'mkt_carrier_fl_num':object,
    'op_carrier_fl_num':object,
    'origin_airport_id':object,
    'dest_airport_id':object
})

In [15]:
#Number of Unique count in each objects, deciding which are worth grabbing for dummies
(
    df
    .select_dtypes(include='object')
    .nunique(axis=0, dropna=True)
)

fl_date                730
mkt_unique_carrier      11
branded_code_share      16
mkt_carrier             11
mkt_carrier_fl_num    4840
op_unique_carrier       28
tail_num              4664
op_carrier_fl_num     4841
origin_airport_id      305
origin                 305
origin_city_name       300
dest_airport_id        319
dest                   319
dest_city_name         312
dup                      1
dtype: int64

In [16]:
#Tail num seems useless, but actually in all 16mil, there are 6487 unique values. Consider keeping? But our model will need more data
#Dup has a single value and has no predictive properties. All others seem good
df['tail_num'], df['dup']

(0       N7858A
 1       N8969A
 2       N193SY
 3       N7702A
 4       N637QX
          ...  
 9995    N754SW
 9996    N868CA
 9997    N14993
 9998    N708SK
 9999    N448SW
 Name: tail_num, Length: 10000, dtype: object,
 0       N
 1       N
 2       N
 3       N
 4       N
        ..
 9995    N
 9996    N
 9997    N
 9998    N
 9999    N
 Name: dup, Length: 10000, dtype: object)

In [17]:
dfObjects=(df
    .select_dtypes(include='object')
    .drop(['dup'],1)
)
dfObjects

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name
0,2019-10-08,WN,WN,WN,2014,WN,N7858A,2014,12892,LAX,"Los Angeles, CA",14893,SMF,"Sacramento, CA"
1,2018-07-23,DL,DL_CODESHARE,DL,5388,9E,N8969A,5388,10821,BWI,"Baltimore, MD",14492,RDU,"Raleigh/Durham, NC"
2,2019-02-28,AS,AS_CODESHARE,AS,3354,OO,N193SY,3354,14831,SJC,"San Jose, CA",11259,DAL,"Dallas, TX"
3,2018-06-25,WN,WN,WN,464,WN,N7702A,464,11292,DEN,"Denver, CO",13871,OMA,"Omaha, NE"
4,2019-05-01,AS,AS_CODESHARE,AS,2677,QX,N637QX,2677,14679,SAN,"San Diego, CA",14004,PAE,"Everett, WA"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2018-05-21,WN,WN,WN,1697,WN,N754SW,1697,13796,OAK,"Oakland, CA",12892,LAX,"Los Angeles, CA"
9996,2019-04-07,DL,DL_CODESHARE,DL,4005,OO,N868CA,4005,10397,ATL,"Atlanta, GA",14574,ROA,"Roanoke, VA"
9997,2019-01-28,UA,UA_CODESHARE,UA,4016,EV,N14993,4016,12266,IAH,"Houston, TX",13244,MEM,"Memphis, TN"
9998,2018-11-14,AA,AA_CODESHARE,AA,2996,OO,N708SK,2996,12892,LAX,"Los Angeles, CA",11603,EUG,"Eugene, OR"


In [18]:
# dateTimeConvert=pd.to_datetime(dfObjects['fl_date'], infer_datetime_format=True)
# dateTimeConvert=pd.DatetimeIndex(dateTimeConvert).month
# dateTimeConvert

#one line code of the above
dfObjects['month']=pd.DatetimeIndex(pd.to_datetime(df['fl_date'], infer_datetime_format=True)).month
dfObjects.drop(columns='fl_date', inplace=True, errors='ignore')

In [19]:
dfConcat= pd.concat([dfObjects,X], axis=1)
dfConcat

Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,month,crs_dep_timeFT,crs_arr_timeFT,crs_elapsed_timeFT,distanceFT
0,WN,WN,WN,2014,WN,N7858A,2014,12892,LAX,"Los Angeles, CA",14893,SMF,"Sacramento, CA",10,1.015688,1.049576,-0.669158,-0.673926
1,DL,DL_CODESHARE,DL,5388,9E,N8969A,5388,10821,BWI,"Baltimore, MD",14492,RDU,"Raleigh/Durham, NC",7,0.400231,0.274710,-0.950475,-0.876863
2,AS,AS_CODESHARE,AS,3354,OO,N193SY,3354,14831,SJC,"San Jose, CA",11259,DAL,"Dallas, TX",2,-0.830681,0.080993,1.089078,1.178308
3,WN,WN,WN,464,WN,N7702A,464,11292,DEN,"Denver, CO",13871,OMA,"Omaha, NE",6,-0.010073,0.274710,-0.739487,-0.503665
4,AS,AS_CODESHARE,AS,2677,QX,N637QX,2677,14679,SAN,"San Diego, CA",14004,PAE,"Everett, WA",5,0.195079,0.468426,0.596772,0.541979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,WN,WN,WN,1697,WN,N754SW,1697,13796,OAK,"Oakland, CA",12892,LAX,"Los Angeles, CA",5,-0.625529,-0.500157,-0.739487,-0.735839
9996,DL,DL_CODESHARE,DL,4005,OO,N868CA,4005,10397,ATL,"Atlanta, GA",14574,ROA,"Roanoke, VA",4,-0.215225,-0.306440,-0.725421,-0.701443
9997,UA,UA_CODESHARE,UA,4016,EV,N14993,4016,12266,IAH,"Houston, TX",13244,MEM,"Memphis, TN",1,-0.215225,-0.112724,-0.556630,-0.510544
9998,AA,AA_CODESHARE,AA,2996,OO,N708SK,2996,12892,LAX,"Los Angeles, CA",11603,EUG,"Eugene, OR",11,1.425992,1.437009,-0.064325,-0.028998


In [20]:
dfConcat.nunique(axis=0, dropna=True)

mkt_unique_carrier      11
branded_code_share      16
mkt_carrier             11
mkt_carrier_fl_num    4840
op_unique_carrier       28
tail_num              4664
op_carrier_fl_num     4841
origin_airport_id      305
origin                 305
origin_city_name       300
dest_airport_id        319
dest                   319
dest_city_name         312
month                   12
crs_dep_timeFT          24
crs_arr_timeFT          24
crs_elapsed_timeFT     386
distanceFT            1286
dtype: int64

In [21]:
#Last Chance to drop all highly correlated cols before calling dummies
#drop columns because highly correlated with another column
dfConcat.drop(columns=['mkt_unique_carrier','branded_code_share','mkt_carrier','mkt_carrier_fl_num','origin_airport_id','origin_city_name','dest_airport_id','dest_city_name'], inplace=True, errors='ignore')
#drop columns because of sample size issue. reintroduce when working with larget set, maybe
#tail_num and op_carrier_fl_num correlated. The sql join returns 6713 entries total. Can pick just one, or combine.
dfConcat.drop(columns=['tail_num','op_carrier_fl_num'], inplace=True, errors='ignore')
dfConcat

Unnamed: 0,op_unique_carrier,origin,dest,month,crs_dep_timeFT,crs_arr_timeFT,crs_elapsed_timeFT,distanceFT
0,WN,LAX,SMF,10,1.015688,1.049576,-0.669158,-0.673926
1,9E,BWI,RDU,7,0.400231,0.274710,-0.950475,-0.876863
2,OO,SJC,DAL,2,-0.830681,0.080993,1.089078,1.178308
3,WN,DEN,OMA,6,-0.010073,0.274710,-0.739487,-0.503665
4,QX,SAN,PAE,5,0.195079,0.468426,0.596772,0.541979
...,...,...,...,...,...,...,...,...
9995,WN,OAK,LAX,5,-0.625529,-0.500157,-0.739487,-0.735839
9996,OO,ATL,ROA,4,-0.215225,-0.306440,-0.725421,-0.701443
9997,EV,IAH,MEM,1,-0.215225,-0.112724,-0.556630,-0.510544
9998,OO,LAX,EUG,11,1.425992,1.437009,-0.064325,-0.028998


In [22]:
# dfComplete=pd.get_dummies(dfConcat)
# dfComplete
# #ok... 17000 columns. Let's get it down

In [23]:
# dfComplete.corr()

In [24]:
# # Create correlation matrix
# corr_matrix = dfConcat.corr().abs()

# # Select upper triangle of correlation matrix
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# # Find index of feature columns with correlation greater than 0.95
# to_drop = [column for column in upper.columns if any(upper[column] > 0.50)]
# to_drop

In [25]:
pd.get_dummies(dfConcat)

Unnamed: 0,month,crs_dep_timeFT,crs_arr_timeFT,crs_elapsed_timeFT,distanceFT,op_unique_carrier_9E,op_unique_carrier_9K,op_unique_carrier_AA,op_unique_carrier_AS,op_unique_carrier_AX,...,dest_TYR,dest_TYS,dest_USA,dest_VEL,dest_VPS,dest_WRG,dest_WYS,dest_XNA,dest_YKM,dest_YUM
0,10,1.015688,1.049576,-0.669158,-0.673926,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0.400231,0.274710,-0.950475,-0.876863,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,-0.830681,0.080993,1.089078,1.178308,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,-0.010073,0.274710,-0.739487,-0.503665,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0.195079,0.468426,0.596772,0.541979,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,5,-0.625529,-0.500157,-0.739487,-0.735839,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,4,-0.215225,-0.306440,-0.725421,-0.701443,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,1,-0.215225,-0.112724,-0.556630,-0.510544,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,11,1.425992,1.437009,-0.064325,-0.028998,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# #Output X File
# pd.get_dummies(dfConcat).to_csv('X.csv')


In [27]:
dfRead['arr_delay'].values.shape

(10000,)

In [28]:
#Prepare y for fit transform. Grab signs of y now, as we will lose after FT
y_sign=((y>0)*1)
y=scaler.fit_transform(dfRead['arr_delay'].values.reshape(-1, 1))

In [29]:
dfy=pd.DataFrame(y)
dfy['y_sign']=y_sign
dfy

Unnamed: 0,0,y_sign
0,-0.422335,0
1,1.016008,1
2,0.043607,1
3,-0.280526,0
4,-0.077943,1
...,...,...
9995,0.408257,1
9996,0.489291,1
9997,-0.017168,1
9998,-0.118460,0


In [30]:
# #col y is transformed y values, col 2 is the original sign.
# dfy.to_csv('y.csv')

In [48]:
FinalDF=pd.get_dummies(dfConcat)
FinalDF['yFT']=y
FinalDF['y_sign']=y_sign
FinalDF.dropna(inplace=True)

In [50]:
FinalDF.iloc[:,:-2].to_csv('X.csv')
FinalDF.iloc[:,-2:].to_csv('y.csv')

In [32]:
# pd.get_dummies(dfConcat).to_csv('X.csv')

In [33]:
#The below are highly highly correlated columns. And some initial exploration
#But they can be dropped at next step. Left behind as examples

In [34]:
dfObjects.groupby(['origin_city_name','origin']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,dest,dest_city_name,month
origin_city_name,origin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"Abilene, TX",ABI,2,2,2,2,2,2,2,2,2,2,2,2
"Aguadilla, PR",BQN,3,3,3,3,3,3,3,3,3,3,3,3
"Akron, OH",CAK,9,9,9,9,9,9,9,9,9,9,9,9
"Albany, NY",ALB,18,18,18,18,18,18,18,18,18,18,18,18
"Albuquerque, NM",ABQ,40,40,40,40,40,40,40,40,40,40,40,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Wichita, KS",ICT,12,12,12,12,12,12,12,12,12,12,12,12
"Williston, ND",ISN,1,1,1,1,1,1,1,1,1,1,1,1
"Wilmington, NC",ILM,9,9,9,9,9,9,9,9,9,9,9,9
"Worcester, MA",ORH,2,2,2,2,2,2,2,2,2,2,2,2


In [35]:
# Terrible code to find why origin city has 300 vs origin has 305, for curiosity
dfObjects.groupby(['origin_city_name','origin']).count().reset_index().groupby('origin_city_name').count().sort_values('origin')


Unnamed: 0_level_0,origin,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,dest,dest_city_name,month
origin_city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"Abilene, TX",1,1,1,1,1,1,1,1,1,1,1,1,1
"Norfolk, VA",1,1,1,1,1,1,1,1,1,1,1,1,1
"Niagara Falls, NY",1,1,1,1,1,1,1,1,1,1,1,1,1
"Newport News/Williamsburg, VA",1,1,1,1,1,1,1,1,1,1,1,1,1
"Newburgh/Poughkeepsie, NY",1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Chicago, IL",2,2,2,2,2,2,2,2,2,2,2,2,2
"New York, NY",2,2,2,2,2,2,2,2,2,2,2,2,2
"Houston, TX",2,2,2,2,2,2,2,2,2,2,2,2,2
"Phoenix, AZ",2,2,2,2,2,2,2,2,2,2,2,2,2


In [36]:
dfObjects.loc[dfObjects['origin_city_name']=='Chicago, IL']['origin'].unique()

array(['ORD', 'MDW'], dtype=object)

In [37]:
dfObjects.loc[dfObjects['origin_city_name']=='New York, NY']['origin'].unique()

array(['LGA', 'JFK'], dtype=object)

In [38]:
dfObjects.loc[dfObjects['origin_city_name']=='Houston, TX']['origin'].unique()

array(['HOU', 'IAH'], dtype=object)

In [39]:
dfObjects.loc[dfObjects['origin_city_name']=='Phoenix, AZ']['origin'].unique()

array(['PHX', 'AZA'], dtype=object)

In [40]:
dfObjects.loc[dfObjects['origin_city_name']=='Washington, DC']['origin'].unique()

array(['DCA', 'IAD'], dtype=object)

In [41]:
#Keep Code Share, maybe it matters
dfObjects.groupby(['mkt_unique_carrier','branded_code_share']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,month
mkt_unique_carrier,branded_code_share,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AA,AA,1158,1158,1158,1158,1158,1158,1158,1158,1158,1158,1158,1158
AA,AA_CODESHARE,1439,1439,1439,1439,1439,1439,1439,1439,1439,1439,1439,1439
AS,AS,333,333,333,333,333,333,333,333,333,333,333,333
AS,AS_CODESHARE,194,194,194,194,194,194,194,194,194,194,194,194
B6,B6,370,370,370,370,370,370,370,370,370,370,370,370
DL,DL,1217,1217,1217,1217,1217,1217,1217,1217,1217,1217,1217,1217
DL,DL_CODESHARE,986,986,986,986,986,986,986,986,986,986,986,986
F9,F9,158,158,158,158,158,158,158,158,158,158,158,158
G4,G4,124,124,124,124,124,124,124,124,124,124,124,124
HA,HA,98,98,98,98,98,98,98,98,98,98,98,98


In [42]:
dfObjects.drop(['origin_city_name','dest_city_name','mkt_unique_carrier'], inplace=True,errors='ignore', axis=1)
dfObjects

Unnamed: 0,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,dest_airport_id,dest,month
0,WN,WN,2014,WN,N7858A,2014,12892,LAX,14893,SMF,10
1,DL_CODESHARE,DL,5388,9E,N8969A,5388,10821,BWI,14492,RDU,7
2,AS_CODESHARE,AS,3354,OO,N193SY,3354,14831,SJC,11259,DAL,2
3,WN,WN,464,WN,N7702A,464,11292,DEN,13871,OMA,6
4,AS_CODESHARE,AS,2677,QX,N637QX,2677,14679,SAN,14004,PAE,5
...,...,...,...,...,...,...,...,...,...,...,...
9995,WN,WN,1697,WN,N754SW,1697,13796,OAK,12892,LAX,5
9996,DL_CODESHARE,DL,4005,OO,N868CA,4005,10397,ATL,14574,ROA,4
9997,UA_CODESHARE,UA,4016,EV,N14993,4016,12266,IAH,13244,MEM,1
9998,AA_CODESHARE,AA,2996,OO,N708SK,2996,12892,LAX,11603,EUG,11


In [43]:
pd.DataFrame(y).isnull()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
9995,False
9996,False
9997,False
9998,False
