In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import collections

In [2]:
df = pd.read_csv('data/train_data_raw.csv')

In [3]:
df.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'first_dep_time', 'total_add_gtime',
       'longest_add_gtime'],
      dtype='object')

In [4]:
df.shape

(300000, 41)

__Dataframe detail:__
- Each month (Jan - Dec) has randomly 25000 flights. 
- The data only take from 2019

### Data Cleaning

__Check the ralation between cancelled and diverted with arr_delay__

In [5]:
df[(df['cancelled'] == 1) | (df['diverted'] == 1)]['arr_delay'].isna().sum()

6554

In [6]:
# If the flight is cancelled or diverted 
# => we dont have the value for arr_delay
# => we remove the rows that the flights had beed cancelled or diverted

In [7]:
df = df[df['cancelled'] != 1]
df = df[df['diverted'] != 1]

In [8]:
df.shape

(293446, 41)

#### Table flights_test
This table consists of subset of columns from table flights. It represents flights from January 2020 which will be used for evaluation. Therefore, we are missing some features that we are not suppossed to know before the flight lands.

##### Variables:

__fl_date__: Flight Date (yyyy-mm-dd)  
__mkt_unique_carrier__: Unique Marketing Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.  
__branded_code_share__: Reporting Carrier Operated or Branded Code Share Partners  
__mkt_carrier__: Code assigned by IATA and commonly used to identify a carrier. As the same code may have been assigned to different carriers over time, the code is not always unique. For analysis, use the Unique Carrier Code.  
__mkt_carrier_fl_num__: Flight Number  
__op_unique_carrier__: Unique Scheduled Operating Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users,for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.  
__tail_num__: Tail Number  
__op_carrier_fl_num__: Flight Number  
__origin_airport_id__: Origin Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.  
__origin__: Origin Airport  
__origin_city_name__: Origin Airport, City Name  
__dest_airport_id__: Destination Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.  
__dest__: Destination Airport  
__dest_city_name__: Destination Airport, City Name  
__crs_dep_time__: CRS Departure Time (local time: hhmm)  
__crs_arr_time__: CRS Arrival Time (local time: hhmm)  
__dup__: Duplicate flag marked Y if the flight is swapped based on Form-3A data  
__crs_elapsed_time__: CRS Elapsed Time of Flight, in Minutes  
__flights__: Number of Flights  
__distance__: Distance between airports (miles)  

#### _Discussion_
- _We only have 19 features (columns) in the test data, so we try to use the same feature for our sample data for futher exploration_
- _We add 1 more feature_ __arr_delay__ _is our target_
- I will use 'carrier_delay','weather_delay','dep_delay' to convert categorical feature into ordinal features based on weight

In [9]:
df_flight = df[['fl_date', 'mkt_unique_carrier', 'branded_code_share', 
        'mkt_carrier', 'mkt_carrier_fl_num',
        'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_airport_id',
        'origin', 'origin_city_name', 'dest_airport_id', 'dest', 'dest_city_name',
        'crs_dep_time', 'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance',
        'arr_delay', 'carrier_delay','weather_delay','dep_delay', 'taxi_out', 'taxi_in']]
# 'carrier_delay','weather_delay','dep_delay' is used to convert categorical feature into ordinal features based on weight

In [10]:
df_flight_test = pd.read_csv('data/flights_test.csv')

In [11]:
df_flight.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293446 entries, 0 to 299999
Data columns (total 26 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   fl_date             293446 non-null  object 
 1   mkt_unique_carrier  293446 non-null  object 
 2   branded_code_share  293446 non-null  object 
 3   mkt_carrier         293446 non-null  object 
 4   mkt_carrier_fl_num  293446 non-null  int64  
 5   op_unique_carrier   293446 non-null  object 
 6   tail_num            293446 non-null  object 
 7   op_carrier_fl_num   293446 non-null  int64  
 8   origin_airport_id   293446 non-null  int64  
 9   origin              293446 non-null  object 
 10  origin_city_name    293446 non-null  object 
 11  dest_airport_id     293446 non-null  int64  
 12  dest                293446 non-null  object 
 13  dest_city_name      293446 non-null  object 
 14  crs_dep_time        293446 non-null  int64  
 15  crs_arr_time        293446 non-nul

In [12]:
df_flight.isna().sum()

fl_date                    0
mkt_unique_carrier         0
branded_code_share         0
mkt_carrier                0
mkt_carrier_fl_num         0
op_unique_carrier          0
tail_num                   0
op_carrier_fl_num          0
origin_airport_id          0
origin                     0
origin_city_name           0
dest_airport_id            0
dest                       0
dest_city_name             0
crs_dep_time               0
crs_arr_time               0
dup                        0
crs_elapsed_time           0
flights                    0
distance                   0
arr_delay                  0
carrier_delay         236653
weather_delay         236653
dep_delay                  0
taxi_out                   0
taxi_in                    0
dtype: int64

In [13]:
# transform the feature fl_date into datetime_type
df_flight['fl_date'] = pd.to_datetime(df_flight['fl_date'])
df_flight['fl_date'].dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_flight['fl_date'] = pd.to_datetime(df_flight['fl_date'])


dtype('<M8[ns]')

In [14]:
df_flight_test['fl_date'] = pd.to_datetime(df_flight_test['fl_date'])
df_flight_test['fl_date'].dtypes

dtype('<M8[ns]')

__Check the relation between ['mkt_unique_carrier', 'mkt_carrier', 'branded_code_share']__

In [15]:
df_flight[['mkt_unique_carrier', 'mkt_carrier', 'branded_code_share']]

Unnamed: 0,mkt_unique_carrier,mkt_carrier,branded_code_share
0,DL,DL,DL
1,WN,WN,WN
2,AA,AA,AA
3,UA,UA,UA_CODESHARE
4,DL,DL,DL
...,...,...,...
299995,DL,DL,DL
299996,AA,AA,AA
299997,AA,AA,AA
299998,UA,UA,UA_CODESHARE


In [16]:
df[df['mkt_unique_carrier'] != df['mkt_carrier']]

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime


In [17]:
df_flight.groupby(['mkt_unique_carrier','mkt_carrier']).arr_delay.count().sort_values(ascending=False)
# duplicate columns, use mkt_unique_carrier

mkt_unique_carrier  mkt_carrier
AA                  AA             76281
DL                  DL             64773
UA                  UA             56590
WN                  WN             48898
AS                  AS             16160
B6                  B6             10944
NK                  NK              7689
F9                  F9              4820
G4                  G4              3829
HA                  HA              3462
Name: arr_delay, dtype: int64

In [18]:
df_flight = df_flight.drop(columns=['mkt_carrier'])

In [19]:
df_flight_test = df_flight_test.drop(columns=['mkt_carrier'])

In [20]:
df_flight[df_flight['mkt_unique_carrier'] != df_flight['branded_code_share']]\
[['mkt_unique_carrier', 'branded_code_share']]

Unnamed: 0,mkt_unique_carrier,branded_code_share
3,UA,UA_CODESHARE
6,AA,AA_CODESHARE
8,AA,AA_CODESHARE
10,UA,UA_CODESHARE
11,AA,AA_CODESHARE
...,...,...
299987,AA,AA_CODESHARE
299989,UA,UA_CODESHARE
299993,UA,UA_CODESHARE
299994,UA,UA_CODESHARE


__Discussion:__
- We can see that there is no difference between 'mkt_unique_carrier' and 'mkt_carrier'
- The differences between 'mkt_unique_carrier' and 'branded_code_share' is only the suffix _CODESHARE  
=> So we can use only 1 feature among these 3 features:  (**'mkt_unique_carrier'**)

In [21]:
df_flight = df_flight.drop(columns=['branded_code_share'])

In [22]:
df_flight_test = df_flight_test.drop(columns=['branded_code_share'])

In [23]:
df_flight.shape

(293446, 24)

In [24]:
df_flight_test.shape

(150623, 19)

In [25]:
df_flight_test.isna().sum()

Unnamed: 0              0
fl_date                 0
mkt_unique_carrier      0
mkt_carrier_fl_num      0
op_unique_carrier       0
tail_num              124
op_carrier_fl_num       0
origin_airport_id       0
origin                  0
origin_city_name        0
dest_airport_id         0
dest                    0
dest_city_name          0
crs_dep_time            0
crs_arr_time            0
dup                     0
crs_elapsed_time        0
flights                 0
distance                0
dtype: int64

### Origin airport 
__*a) Origin airport with high incidence of weather_delay*__

In [26]:
bad_weather_airport = df_flight.groupby(['origin']).weather_delay.mean().round(0).sort_values(ascending=False)
bad_weather_airport

origin
PIR    217.0
ATY    166.0
PLN    161.0
RHI    121.0
BGM    114.0
       ...  
HGR      NaN
HYA      NaN
SMX      NaN
STC      NaN
WYS      NaN
Name: weather_delay, Length: 373, dtype: float64

In [27]:
bad_weather_airport= bad_weather_airport.reset_index()
bad_weather_airport

Unnamed: 0,origin,weather_delay
0,PIR,217.0
1,ATY,166.0
2,PLN,161.0
3,RHI,121.0
4,BGM,114.0
...,...,...
368,HGR,
369,HYA,
370,SMX,
371,STC,


In [28]:
bad_weather_airport.fillna(0,inplace= True)

In [29]:
# automaticly device the delay time to 10 bin
pd.cut(bad_weather_airport.weather_delay, 10, precision= 0).unique()

[(195.0, 217.0], (152.0, 174.0], (108.0, 130.0], (65.0, 87.0], (22.0, 43.0], (-0.0, 22.0]]
Categories (10, interval[float64, right]): [(-0.0, 22.0] < (22.0, 43.0] < (43.0, 65.0] < (65.0, 87.0] ... (130.0, 152.0] < (152.0, 174.0] < (174.0, 195.0] < (195.0, 217.0]]

In [30]:
bins = [-np.inf,1,22,43,65,87,108,130,152,195,np.inf]
group_names = [1,2,3,4,5,6,7,8,9,10]
bad_weather_airport['bin']= pd.cut(bad_weather_airport.weather_delay, 
                                   bins, right = False, labels = group_names)

In [31]:
# check
bad_weather_airport.head()

Unnamed: 0,origin,weather_delay,bin
0,PIR,217.0,10
1,ATY,166.0,9
2,PLN,161.0,9
3,RHI,121.0,7
4,BGM,114.0,7


In [32]:
bad_weather_airport.columns= ['origin','origin_weather_delay','origin_weather_delay_bin']

In [33]:
bad_weather_airport.head()

Unnamed: 0,origin,origin_weather_delay,origin_weather_delay_bin
0,PIR,217.0,10
1,ATY,166.0,9
2,PLN,161.0,9
3,RHI,121.0,7
4,BGM,114.0,7


__*b) Origin airport with high incidence of departure_delay*__

In [34]:
departure_delay_airport= df_flight.groupby(['origin']).dep_delay.mean().round(0).sort_values(ascending=False)
departure_delay_airport= departure_delay_airport.reset_index()

In [35]:
departure_delay_airport.head()

Unnamed: 0,origin,dep_delay
0,CYS,58.0
1,OGD,55.0
2,BGM,55.0
3,SHD,55.0
4,JMS,54.0


In [36]:
pd.cut(departure_delay_airport.dep_delay, 10, precision= 0).unique()

[(50.0, 58.0], (42.0, 50.0], (34.0, 42.0], (26.0, 34.0], (18.0, 26.0], (10.0, 18.0], (2.0, 10.0], (-6.0, 2.0], (-14.0, -6.0], (-22.0, -14.0]]
Categories (10, interval[float64, right]): [(-22.0, -14.0] < (-14.0, -6.0] < (-6.0, 2.0] < (2.0, 10.0] ... (26.0, 34.0] < (34.0, 42.0] < (42.0, 50.0] < (50.0, 58.0]]

In [37]:
bins = [-np.inf,-14,-6,2,10,18,26,34,42,50,np.inf]
group_names = [1,2,3,4,5,6,7,8,9,10]
departure_delay_airport['bin']= pd.cut(departure_delay_airport.dep_delay, bins, right = False, labels = group_names)

In [38]:
departure_delay_airport.head()

Unnamed: 0,origin,dep_delay,bin
0,CYS,58.0,10
1,OGD,55.0,10
2,BGM,55.0,10
3,SHD,55.0,10
4,JMS,54.0,10


In [39]:
departure_delay_airport.columns=['origin','origin_dep_delay','origin_dep_delay_bin']

In [40]:
departure_delay_airport.head()

Unnamed: 0,origin,origin_dep_delay,origin_dep_delay_bin
0,CYS,58.0,10
1,OGD,55.0,10
2,BGM,55.0,10
3,SHD,55.0,10
4,JMS,54.0,10


__*c) Origin airport with high incidence of arr_delay*__

In [41]:
bad_delay_airport = df_flight.groupby(['origin']).arr_delay.mean().round(0).sort_values(ascending=False)
bad_delay_airport

origin
CYS    57.0
BGM    54.0
DIK    53.0
OGD    52.0
JMS    51.0
       ... 
INL   -13.0
BKG   -14.0
DLG   -14.0
STC   -20.0
GST   -22.0
Name: arr_delay, Length: 373, dtype: float64

In [42]:
bad_delay_airport = bad_delay_airport.reset_index()
bad_delay_airport

Unnamed: 0,origin,arr_delay
0,CYS,57.0
1,BGM,54.0
2,DIK,53.0
3,OGD,52.0
4,JMS,51.0
...,...,...
368,INL,-13.0
369,BKG,-14.0
370,DLG,-14.0
371,STC,-20.0


In [43]:
bad_delay_airport.columns= ['origin','origin_arr_delay']

In [44]:
bad_delay_airport

Unnamed: 0,origin,origin_arr_delay
0,CYS,57.0
1,BGM,54.0
2,DIK,53.0
3,OGD,52.0
4,JMS,51.0
...,...,...
368,INL,-13.0
369,BKG,-14.0
370,DLG,-14.0
371,STC,-20.0


__*d) Origin airport with high incidence of taxi_out*__

In [45]:
taxi_out_airport = df_flight.groupby(['origin']).taxi_out.mean().round(0).sort_values(ascending=False)
taxi_out_airport

origin
PQI    29.0
XWA    27.0
LGA    27.0
MMH    27.0
JFK    26.0
       ... 
DUT     5.0
LNY     5.0
DLG     5.0
MKK     4.0
JHM     3.0
Name: taxi_out, Length: 373, dtype: float64

In [46]:
taxi_out_airport = taxi_out_airport.reset_index()
taxi_out_airport

Unnamed: 0,origin,taxi_out
0,PQI,29.0
1,XWA,27.0
2,LGA,27.0
3,MMH,27.0
4,JFK,26.0
...,...,...
368,DUT,5.0
369,LNY,5.0
370,DLG,5.0
371,MKK,4.0


In [47]:
taxi_out_airport.columns = ['origin', 'origin_taxi_out']
taxi_out_airport

Unnamed: 0,origin,origin_taxi_out
0,PQI,29.0
1,XWA,27.0
2,LGA,27.0
3,MMH,27.0
4,JFK,26.0
...,...,...
368,DUT,5.0
369,LNY,5.0
370,DLG,5.0
371,MKK,4.0


__*e) Origin airport with high incidence of taxi_in*__

In [48]:
taxi_in_airport = df_flight.groupby(['origin']).taxi_in.mean().round(0).sort_values(ascending=False)
taxi_in_airport

origin
BKG    17.0
UIN    17.0
EAU    17.0
CMI    16.0
VEL    16.0
       ... 
OME     4.0
DLG     4.0
ADK     3.0
GST     3.0
SPN     3.0
Name: taxi_in, Length: 373, dtype: float64

In [49]:
taxi_in_airport = taxi_in_airport.reset_index()
taxi_in_airport

Unnamed: 0,origin,taxi_in
0,BKG,17.0
1,UIN,17.0
2,EAU,17.0
3,CMI,16.0
4,VEL,16.0
...,...,...
368,OME,4.0
369,DLG,4.0
370,ADK,3.0
371,GST,3.0


In [50]:
taxi_in_airport.columns = ['origin', 'origin_taxi_in']
taxi_in_airport

Unnamed: 0,origin,origin_taxi_in
0,BKG,17.0
1,UIN,17.0
2,EAU,17.0
3,CMI,16.0
4,VEL,16.0
...,...,...
368,OME,4.0
369,DLG,4.0
370,ADK,3.0
371,GST,3.0


In [51]:
df_flight[['origin', 'origin_airport_id', 'origin_city_name']]

Unnamed: 0,origin,origin_airport_id,origin_city_name
0,BOS,10721,"Boston, MA"
1,TPA,15304,"Tampa, FL"
2,PHL,14100,"Philadelphia, PA"
3,MHT,13296,"Manchester, NH"
4,SLC,14869,"Salt Lake City, UT"
...,...,...,...
299995,LAX,12892,"Los Angeles, CA"
299996,PHX,14107,"Phoenix, AZ"
299997,CLT,11057,"Charlotte, NC"
299998,CWA,11203,"Mosinee, WI"


In [52]:
df_flight['origin'].nunique()

373

In [53]:
df_flight['origin_airport_id'].nunique()

373

In [54]:
df_flight['origin_city_name'].nunique()

365

- We can see that the 'origin' and 'origin_airport_id' have the same meaning
- One city can have more than 1 airport
=> we can only use the feature **origin** among 3 features 

- We can also use this filter for the destination

In [55]:
df_flight = df_flight.drop(columns=['origin_airport_id', 'origin_city_name'])

In [56]:
df_flight_test = df_flight_test.drop(columns=['origin_airport_id', 'origin_city_name'])

In [57]:
df_flight.shape

(293446, 22)

In [58]:
df_flight.columns

Index(['fl_date', 'mkt_unique_carrier', 'mkt_carrier_fl_num',
       'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
       'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance',
       'arr_delay', 'carrier_delay', 'weather_delay', 'dep_delay', 'taxi_out',
       'taxi_in'],
      dtype='object')

In [59]:
df_flight = df_flight.drop(columns=['dest_airport_id', 'dest_city_name'])

In [60]:
df_flight_test.shape

(150623, 17)

In [61]:
df_flight_test = df_flight_test.drop(columns=['dest_airport_id', 'dest_city_name'])

In [62]:
# The dup column has only data 'N'
df_flight['dup'].unique()

array(['N'], dtype=object)

In [63]:
# remove the dup column
df_flight = df_flight.drop(columns=['dup'])

In [64]:
df_flight_test = df_flight_test.drop(columns=['dup'])

### mkt carrier
__a) *mkt carrier with high incidence of carrier_delay*__

In [65]:
delayed_mkt_carrier= df_flight.groupby(['mkt_unique_carrier']).carrier_delay.mean().round(0).sort_values(ascending=False)
delayed_mkt_carrier.head()

mkt_unique_carrier
B6    28.0
DL    25.0
G4    24.0
UA    22.0
AA    21.0
Name: carrier_delay, dtype: float64

In [66]:
delayed_mkt_carrier= delayed_mkt_carrier.reset_index()

In [67]:
delayed_mkt_carrier

Unnamed: 0,mkt_unique_carrier,carrier_delay
0,B6,28.0
1,DL,25.0
2,G4,24.0
3,UA,22.0
4,AA,21.0
5,HA,21.0
6,F9,18.0
7,WN,17.0
8,NK,16.0
9,AS,14.0


In [68]:
delayed_mkt_carrier.columns= ['mkt_unique_carrier','mkt_carrier_delay']

In [69]:
delayed_mkt_carrier.head()

Unnamed: 0,mkt_unique_carrier,mkt_carrier_delay
0,B6,28.0
1,DL,25.0
2,G4,24.0
3,UA,22.0
4,AA,21.0


__*b) mkt_unique_carrier with high incidence of departure delay*__

In [70]:
departure_delay_mkt_carrier= df_flight.groupby(['mkt_unique_carrier']).dep_delay.mean().round(0).sort_values(ascending=False)
departure_delay_mkt_carrier= departure_delay_mkt_carrier.reset_index()

In [71]:
departure_delay_mkt_carrier

Unnamed: 0,mkt_unique_carrier,dep_delay
0,B6,18.0
1,F9,15.0
2,UA,15.0
3,NK,11.0
4,AA,10.0
5,DL,10.0
6,G4,10.0
7,WN,10.0
8,AS,4.0
9,HA,2.0


In [72]:
departure_delay_mkt_carrier.columns=['mkt_unique_carrier','mkt_carrier_dep_delay']

In [73]:
departure_delay_mkt_carrier.head()

Unnamed: 0,mkt_unique_carrier,mkt_carrier_dep_delay
0,B6,18.0
1,F9,15.0
2,UA,15.0
3,NK,11.0
4,AA,10.0


In [74]:
# very high number of difference between mkt_unique_carrier and op_unique_carrier
df_flight[df_flight['mkt_unique_carrier'] != df_flight['op_unique_carrier']].count()

fl_date               110661
mkt_unique_carrier    110661
mkt_carrier_fl_num    110661
op_unique_carrier     110661
tail_num              110661
op_carrier_fl_num     110661
origin                110661
dest                  110661
crs_dep_time          110661
crs_arr_time          110661
crs_elapsed_time      110661
flights               110661
distance              110661
arr_delay             110661
carrier_delay          22289
weather_delay          22289
dep_delay             110661
taxi_out              110661
taxi_in               110661
dtype: int64

__c) *mkt carrier with high incidence of arr_delay*__

In [75]:
delayed_mkt_arr= df_flight.groupby(['mkt_unique_carrier']).arr_delay.mean().round(0).sort_values(ascending=False)
delayed_mkt_arr.head()

mkt_unique_carrier
B6    12.0
UA    11.0
F9     9.0
G4     9.0
AA     6.0
Name: arr_delay, dtype: float64

In [76]:
delayed_mkt_arr= delayed_mkt_arr.reset_index()

In [77]:
delayed_mkt_arr

Unnamed: 0,mkt_unique_carrier,arr_delay
0,B6,12.0
1,UA,11.0
2,F9,9.0
3,G4,9.0
4,AA,6.0
5,NK,5.0
6,DL,3.0
7,WN,2.0
8,AS,1.0
9,HA,1.0


In [78]:
delayed_mkt_arr.columns= ['mkt_unique_carrier','mkt_arr_delay']

### op carrier
__*a) op carrier with high incidence of carrier_delay*__

In [79]:
delayed_op_carrier=df_flight.groupby(['op_unique_carrier']).carrier_delay.mean().round(0).sort_values(ascending=False)
delayed_op_carrier= delayed_op_carrier.reset_index()

In [80]:
delayed_op_carrier.head()

Unnamed: 0,op_unique_carrier,carrier_delay
0,OO,30.0
1,B6,28.0
2,CP,28.0
3,YV,27.0
4,EV,27.0


In [81]:
delayed_op_carrier.columns= ['op_unique_carrier','op_carrier_delay']

In [82]:
delayed_op_carrier.head()

Unnamed: 0,op_unique_carrier,op_carrier_delay
0,OO,30.0
1,B6,28.0
2,CP,28.0
3,YV,27.0
4,EV,27.0


__*b) op_unique_carrier with high incidence of departure delay*__

In [83]:
departure_delay_op_carrier=df_flight.groupby(['op_unique_carrier']).dep_delay.mean().round(0).sort_values(ascending=False)
departure_delay_op_carrier= departure_delay_op_carrier.reset_index()

In [84]:
departure_delay_op_carrier.head()

Unnamed: 0,op_unique_carrier,dep_delay
0,C5,26.0
1,KS,22.0
2,EV,20.0
3,B6,18.0
4,AX,17.0


In [85]:
departure_delay_op_carrier.columns= ['op_unique_carrier','op_carrier_dep_delay']

In [86]:
departure_delay_op_carrier.head()

Unnamed: 0,op_unique_carrier,op_carrier_dep_delay
0,C5,26.0
1,KS,22.0
2,EV,20.0
3,B6,18.0
4,AX,17.0


__*c) op carrier with high incidence of arr_delay*__

In [87]:
delayed_op_arr=df_flight.groupby(['op_unique_carrier']).arr_delay.mean().round(0).sort_values(ascending=False)
delayed_op_arr= delayed_op_arr.reset_index()

In [88]:
delayed_op_arr.head()

Unnamed: 0,op_unique_carrier,arr_delay
0,C5,24.0
1,KS,21.0
2,EV,19.0
3,AX,16.0
4,B6,12.0


In [89]:
delayed_op_arr.columns= ['op_unique_carrier','op_arr_delay']

In [90]:
delayed_op_arr.head()

Unnamed: 0,op_unique_carrier,op_arr_delay
0,C5,24.0
1,KS,21.0
2,EV,19.0
3,AX,16.0
4,B6,12.0


__Relation between 'mkt_carrier_fl_num' and 'op_carrier_fl_num'__

In [91]:
df_flight[df_flight['mkt_carrier_fl_num'] != df_flight['op_carrier_fl_num']][['mkt_carrier_fl_num', 'op_carrier_fl_num']].count()

mkt_carrier_fl_num    30
op_carrier_fl_num     30
dtype: int64

- We have only 30 row that has the difference between 'mkt_carrier_fl_num' and 'op_carrier_fl_num'
=> we can assume that 'mkt_carrier_fl_num' and 'op_carrier_fl_num' is the same thing
=> keep the feature **'op_carrier_fl_num'**

In [92]:
df_flight = df_flight.drop(columns=['mkt_carrier_fl_num'])

In [93]:
df_flight_test = df_flight_test.drop(columns=['mkt_carrier_fl_num'])

In [94]:
df_flight.shape

(293446, 18)

In [95]:
df_flight.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293446 entries, 0 to 299999
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   fl_date             293446 non-null  datetime64[ns]
 1   mkt_unique_carrier  293446 non-null  object        
 2   op_unique_carrier   293446 non-null  object        
 3   tail_num            293446 non-null  object        
 4   op_carrier_fl_num   293446 non-null  int64         
 5   origin              293446 non-null  object        
 6   dest                293446 non-null  object        
 7   crs_dep_time        293446 non-null  int64         
 8   crs_arr_time        293446 non-null  int64         
 9   crs_elapsed_time    293446 non-null  float64       
 10  flights             293446 non-null  float64       
 11  distance            293446 non-null  float64       
 12  arr_delay           293446 non-null  float64       
 13  carrier_delay       56793 non

In [96]:
# Flights has only 1 value = 1
df_flight['flights'].unique()

array([1.])

In [97]:
# Drop the flights column
df_flight = df_flight.drop(columns=['flights'])

In [98]:
df_flight_test = df_flight_test.drop(columns=['flights'])

In [99]:
df_flight.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293446 entries, 0 to 299999
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   fl_date             293446 non-null  datetime64[ns]
 1   mkt_unique_carrier  293446 non-null  object        
 2   op_unique_carrier   293446 non-null  object        
 3   tail_num            293446 non-null  object        
 4   op_carrier_fl_num   293446 non-null  int64         
 5   origin              293446 non-null  object        
 6   dest                293446 non-null  object        
 7   crs_dep_time        293446 non-null  int64         
 8   crs_arr_time        293446 non-null  int64         
 9   crs_elapsed_time    293446 non-null  float64       
 10  distance            293446 non-null  float64       
 11  arr_delay           293446 non-null  float64       
 12  carrier_delay       56793 non-null   float64       
 13  weather_delay       56793 non

In [100]:
# We dont use the carrier_delay, weather_delay and dep_delay anymore
df_flight = df_flight.drop(columns=['carrier_delay','weather_delay', 
                                    'dep_delay', 'taxi_out', 'taxi_in'])

In [101]:
df_flight.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293446 entries, 0 to 299999
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   fl_date             293446 non-null  datetime64[ns]
 1   mkt_unique_carrier  293446 non-null  object        
 2   op_unique_carrier   293446 non-null  object        
 3   tail_num            293446 non-null  object        
 4   op_carrier_fl_num   293446 non-null  int64         
 5   origin              293446 non-null  object        
 6   dest                293446 non-null  object        
 7   crs_dep_time        293446 non-null  int64         
 8   crs_arr_time        293446 non-null  int64         
 9   crs_elapsed_time    293446 non-null  float64       
 10  distance            293446 non-null  float64       
 11  arr_delay           293446 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(3), object(5)
memory usage: 29.1+ MB


In [102]:
df_flight_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150623 entries, 0 to 150622
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   Unnamed: 0          150623 non-null  int64         
 1   fl_date             150623 non-null  datetime64[ns]
 2   mkt_unique_carrier  150623 non-null  object        
 3   op_unique_carrier   150623 non-null  object        
 4   tail_num            150499 non-null  object        
 5   op_carrier_fl_num   150623 non-null  int64         
 6   origin              150623 non-null  object        
 7   dest                150623 non-null  object        
 8   crs_dep_time        150623 non-null  int64         
 9   crs_arr_time        150623 non-null  int64         
 10  crs_elapsed_time    150623 non-null  int64         
 11  distance            150623 non-null  int64         
dtypes: datetime64[ns](1), int64(6), object(5)
memory usage: 13.8+ MB


In [103]:
df_flight_test = df_flight_test.iloc[:,1:]

In [104]:
df_flight_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150623 entries, 0 to 150622
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   fl_date             150623 non-null  datetime64[ns]
 1   mkt_unique_carrier  150623 non-null  object        
 2   op_unique_carrier   150623 non-null  object        
 3   tail_num            150499 non-null  object        
 4   op_carrier_fl_num   150623 non-null  int64         
 5   origin              150623 non-null  object        
 6   dest                150623 non-null  object        
 7   crs_dep_time        150623 non-null  int64         
 8   crs_arr_time        150623 non-null  int64         
 9   crs_elapsed_time    150623 non-null  int64         
 10  distance            150623 non-null  int64         
dtypes: datetime64[ns](1), int64(5), object(5)
memory usage: 12.6+ MB


### Feature Engineering

__1. create new features of day of week and month for fl_date__

In [105]:
# extract the weekday and month from fl_date
df_flight['fl_month'] = df_flight['fl_date'].dt.month
df_flight['fl_wday'] = df_flight['fl_date'].dt.weekday

In [106]:
df_flight_test['fl_month'] = df_flight_test['fl_date'].dt.month
df_flight_test['fl_wday'] = df_flight_test['fl_date'].dt.weekday

In [107]:
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,fl_month,fl_wday
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,1124.0,-11.0,1,0
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,1788.0,12.0,1,6
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,666.0,-15.0,1,6
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,209.0,-17.0,1,0
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,1931.0,-13.0,1,3


__2. mkt_unique_carrier >> feature: mkt_carrier_delay, mkt_carrier_dep_delay, mkt_arr_delay (Ordinal encoder)__

In [108]:
#carrier_delay, carrier with high incidence of carrier delay
df_flight = df_flight.merge(delayed_mkt_carrier,how='left')

In [109]:
df_flight_test = df_flight_test.merge(delayed_mkt_carrier,how='left')

In [110]:
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,fl_month,fl_wday,mkt_carrier_delay
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,1124.0,-11.0,1,0,25.0
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,1788.0,12.0,1,6,17.0
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,666.0,-15.0,1,6,21.0
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,209.0,-17.0,1,0,22.0
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,1931.0,-13.0,1,3,25.0


In [111]:
#dep_delay, carrier with high incidence of dep_delay
df_flight= df_flight.merge(departure_delay_mkt_carrier,how='left')

In [112]:
df_flight_test= df_flight_test.merge(departure_delay_mkt_carrier,how='left')

In [113]:
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,fl_month,fl_wday,mkt_carrier_delay,mkt_carrier_dep_delay
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,1124.0,-11.0,1,0,25.0,10.0
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,1788.0,12.0,1,6,17.0,10.0
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,666.0,-15.0,1,6,21.0,10.0
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,209.0,-17.0,1,0,22.0,15.0
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,1931.0,-13.0,1,3,25.0,10.0


In [114]:
df_flight = df_flight.merge(delayed_mkt_arr,how='left')

In [115]:
df_flight_test = df_flight_test.merge(delayed_mkt_arr,how='left')

In [116]:
df_flight[['mkt_carrier_delay','mkt_carrier_dep_delay', 'mkt_arr_delay']].isna().sum()

mkt_carrier_delay        0
mkt_carrier_dep_delay    0
mkt_arr_delay            0
dtype: int64

__4. op_unique_carrier >> feature: op_carrier_delay, op_carrier_dep_delay (Ordinal encoder)__

In [117]:
#op_carrier_delay, op_carrier with high incidence of carrier delay
df_flight = df_flight.merge(delayed_op_carrier,how='left')

In [118]:
df_flight_test = df_flight_test.merge(delayed_op_carrier,how='left')

In [119]:
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,fl_month,fl_wday,mkt_carrier_delay,mkt_carrier_dep_delay,mkt_arr_delay,op_carrier_delay
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,1124.0,-11.0,1,0,25.0,10.0,3.0,24.0
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,1788.0,12.0,1,6,17.0,10.0,2.0,17.0
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,666.0,-15.0,1,6,21.0,10.0,6.0,24.0
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,209.0,-17.0,1,0,22.0,15.0,11.0,14.0
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,1931.0,-13.0,1,3,25.0,10.0,3.0,24.0


In [120]:
#op_carrier_delay, op_carrier with high incidence of carrier delay
df_flight = df_flight.merge(delayed_op_carrier,how='left')

In [121]:
df_flight_test = df_flight_test.merge(delayed_op_carrier,how='left')

In [122]:
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,fl_month,fl_wday,mkt_carrier_delay,mkt_carrier_dep_delay,mkt_arr_delay,op_carrier_delay
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,1124.0,-11.0,1,0,25.0,10.0,3.0,24.0
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,1788.0,12.0,1,6,17.0,10.0,2.0,17.0
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,666.0,-15.0,1,6,21.0,10.0,6.0,24.0
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,209.0,-17.0,1,0,22.0,15.0,11.0,14.0
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,1931.0,-13.0,1,3,25.0,10.0,3.0,24.0


In [123]:
#op_carrier_delay, op_carrier with high incidence of departure delay
df_flight = df_flight.merge(departure_delay_op_carrier,how='left')

In [124]:
df_flight_test = df_flight_test.merge(departure_delay_op_carrier,how='left')

In [125]:
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,fl_month,fl_wday,mkt_carrier_delay,mkt_carrier_dep_delay,mkt_arr_delay,op_carrier_delay,op_carrier_dep_delay
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,1124.0,-11.0,1,0,25.0,10.0,3.0,24.0,8.0
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,1788.0,12.0,1,6,17.0,10.0,2.0,17.0,10.0
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,666.0,-15.0,1,6,21.0,10.0,6.0,24.0,12.0
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,209.0,-17.0,1,0,22.0,15.0,11.0,14.0,8.0
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,1931.0,-13.0,1,3,25.0,10.0,3.0,24.0,8.0


In [126]:
df_flight[['op_carrier_delay','op_carrier_dep_delay']].isna().sum()

op_carrier_delay        0
op_carrier_dep_delay    0
dtype: int64

**5. origin >> Feature: origin_weather_delay, origin_dep_delay**

In [127]:
df_flight= df_flight.merge(bad_delay_airport,how='left')
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,fl_month,fl_wday,mkt_carrier_delay,mkt_carrier_dep_delay,mkt_arr_delay,op_carrier_delay,op_carrier_dep_delay,origin_arr_delay
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,1124.0,-11.0,1,0,25.0,10.0,3.0,24.0,8.0,7.0
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,1788.0,12.0,1,6,17.0,10.0,2.0,17.0,10.0,4.0
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,666.0,-15.0,1,6,21.0,10.0,6.0,24.0,12.0,4.0
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,209.0,-17.0,1,0,22.0,15.0,11.0,14.0,8.0,-3.0
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,1931.0,-13.0,1,3,25.0,10.0,3.0,24.0,8.0,0.0


In [128]:
df_flight_test= df_flight_test.merge(bad_delay_airport,how='left')

In [129]:
df_flight= df_flight.merge(taxi_out_airport,how='left')
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,...,arr_delay,fl_month,fl_wday,mkt_carrier_delay,mkt_carrier_dep_delay,mkt_arr_delay,op_carrier_delay,op_carrier_dep_delay,origin_arr_delay,origin_taxi_out
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,...,-11.0,1,0,25.0,10.0,3.0,24.0,8.0,7.0,20.0
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,...,12.0,1,6,17.0,10.0,2.0,17.0,10.0,4.0,14.0
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,...,-15.0,1,6,21.0,10.0,6.0,24.0,12.0,4.0,22.0
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,...,-17.0,1,0,22.0,15.0,11.0,14.0,8.0,-3.0,15.0
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,...,-13.0,1,3,25.0,10.0,3.0,24.0,8.0,0.0,18.0


In [130]:
df_flight_test= df_flight_test.merge(taxi_out_airport,how='left')

In [131]:
df_flight= df_flight.merge(taxi_in_airport,how='left')
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,...,fl_month,fl_wday,mkt_carrier_delay,mkt_carrier_dep_delay,mkt_arr_delay,op_carrier_delay,op_carrier_dep_delay,origin_arr_delay,origin_taxi_out,origin_taxi_in
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,...,1,0,25.0,10.0,3.0,24.0,8.0,7.0,20.0,8.0
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,...,1,6,17.0,10.0,2.0,17.0,10.0,4.0,14.0,8.0
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,...,1,6,21.0,10.0,6.0,24.0,12.0,4.0,22.0,7.0
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,...,1,0,22.0,15.0,11.0,14.0,8.0,-3.0,15.0,8.0
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,...,1,3,25.0,10.0,3.0,24.0,8.0,0.0,18.0,7.0


In [132]:
df_flight_test = df_flight_test.merge(taxi_in_airport,how='left')

In [133]:
# Ordinal encoder by weather_delay (origin airport with high incidence of weather delay)
df_flight= df_flight.merge(bad_weather_airport,how='left')
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,...,mkt_carrier_delay,mkt_carrier_dep_delay,mkt_arr_delay,op_carrier_delay,op_carrier_dep_delay,origin_arr_delay,origin_taxi_out,origin_taxi_in,origin_weather_delay,origin_weather_delay_bin
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,...,25.0,10.0,3.0,24.0,8.0,7.0,20.0,8.0,4.0,2
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,...,17.0,10.0,2.0,17.0,10.0,4.0,14.0,8.0,2.0,2
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,...,21.0,10.0,6.0,24.0,12.0,4.0,22.0,7.0,2.0,2
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,...,22.0,15.0,11.0,14.0,8.0,-3.0,15.0,8.0,3.0,2
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,...,25.0,10.0,3.0,24.0,8.0,0.0,18.0,7.0,3.0,2


In [134]:
df_flight_test= df_flight_test.merge(bad_weather_airport,how='left')

In [135]:
# Ordinal encoder by departure_delay (origin airport with high incidence of departure delay)
df_flight= df_flight.merge(departure_delay_airport,how='left')
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,...,mkt_arr_delay,op_carrier_delay,op_carrier_dep_delay,origin_arr_delay,origin_taxi_out,origin_taxi_in,origin_weather_delay,origin_weather_delay_bin,origin_dep_delay,origin_dep_delay_bin
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,...,3.0,24.0,8.0,7.0,20.0,8.0,4.0,2,13.0,5
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,...,2.0,17.0,10.0,4.0,14.0,8.0,2.0,2,9.0,4
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,...,6.0,24.0,12.0,4.0,22.0,7.0,2.0,2,8.0,4
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,...,11.0,14.0,8.0,-3.0,15.0,8.0,3.0,2,3.0,4
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,...,3.0,24.0,8.0,0.0,18.0,7.0,3.0,2,7.0,4


In [136]:
df_flight_test= df_flight_test.merge(departure_delay_airport,how='left')

**6. traffic by airport by hour**

In [137]:
# round up dep_time to hour
df_flight['crs_dep_hour']= np.floor(df_flight.crs_dep_time/100)
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,...,op_carrier_delay,op_carrier_dep_delay,origin_arr_delay,origin_taxi_out,origin_taxi_in,origin_weather_delay,origin_weather_delay_bin,origin_dep_delay,origin_dep_delay_bin,crs_dep_hour
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,...,24.0,8.0,7.0,20.0,8.0,4.0,2,13.0,5,16.0
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,...,17.0,10.0,4.0,14.0,8.0,2.0,2,9.0,4,16.0
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,...,24.0,12.0,4.0,22.0,7.0,2.0,2,8.0,4,12.0
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,...,14.0,8.0,-3.0,15.0,8.0,3.0,2,3.0,4,11.0
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,...,24.0,8.0,0.0,18.0,7.0,3.0,2,7.0,4,17.0


In [138]:
df_flight_test['crs_dep_hour']= np.floor(df_flight_test.crs_dep_time/100)

In [139]:
# at the departure time, how many flights are schedule to departure within the same timeslot (hrs) at the origin airport
df_flight.groupby(['origin','fl_date','crs_dep_hour'])['mkt_unique_carrier'].count().sort_values(ascending=False)

origin  fl_date     crs_dep_hour
ORD     2019-09-30  8.0             11
        2019-02-05  11.0            11
ATL     2019-09-22  9.0             10
CLT     2019-11-22  16.0            10
DEN     2019-09-30  19.0            10
                                    ..
HOU     2019-08-19  6.0              1
                    11.0             1
        2019-08-20  6.0              1
                    9.0              1
YUM     2019-12-29  21.0             1
Name: mkt_unique_carrier, Length: 205222, dtype: int64

In [140]:
df_flight['departure_traffic']=df_flight.groupby(['origin','fl_date','crs_dep_hour'])['mkt_unique_carrier'].transform('count')

In [141]:
df_flight_test['departure_traffic']=df_flight_test.groupby(['origin','fl_date','crs_dep_hour'])['mkt_unique_carrier'].transform('count')

In [142]:
df_flight.head()

Unnamed: 0,fl_date,mkt_unique_carrier,op_unique_carrier,tail_num,op_carrier_fl_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,...,op_carrier_dep_delay,origin_arr_delay,origin_taxi_out,origin_taxi_in,origin_weather_delay,origin_weather_delay_bin,origin_dep_delay,origin_dep_delay_bin,crs_dep_hour,departure_traffic
0,2019-01-07,DL,DL,N378DA,2185,BOS,MSP,1629,1858,209.0,...,8.0,7.0,20.0,8.0,4.0,2,13.0,5,16.0,4
1,2019-01-27,WN,WN,N8719Q,5073,TPA,PHX,1635,1915,280.0,...,10.0,4.0,14.0,8.0,2.0,2,9.0,4,16.0,1
2,2019-01-27,AA,AA,N702UW,1684,PHL,ATL,1235,1453,138.0,...,12.0,4.0,22.0,7.0,2.0,2,8.0,4,12.0,3
3,2019-01-28,UA,YX,N642RW,3592,MHT,EWR,1125,1259,94.0,...,8.0,-3.0,15.0,8.0,3.0,2,3.0,4,11.0,1
4,2019-01-10,DL,DL,N593NW,1486,SLC,MCO,1730,2340,250.0,...,8.0,0.0,18.0,7.0,3.0,2,7.0,4,17.0,1


## Modeling

In [143]:
from sklearn.model_selection import train_test_split

In [144]:
X = df_flight.drop(columns=['arr_delay'])

In [145]:
Y = df_flight['arr_delay']

In [146]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 293446 entries, 0 to 293445
Data columns (total 27 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   fl_date                   293446 non-null  datetime64[ns]
 1   mkt_unique_carrier        293446 non-null  object        
 2   op_unique_carrier         293446 non-null  object        
 3   tail_num                  293446 non-null  object        
 4   op_carrier_fl_num         293446 non-null  int64         
 5   origin                    293446 non-null  object        
 6   dest                      293446 non-null  object        
 7   crs_dep_time              293446 non-null  int64         
 8   crs_arr_time              293446 non-null  int64         
 9   crs_elapsed_time          293446 non-null  float64       
 10  distance                  293446 non-null  float64       
 11  fl_month                  293446 non-null  int64         
 12  fl

In [147]:
X = X.drop(columns=['fl_date', 'mkt_unique_carrier', 'op_unique_carrier',
                               'tail_num', 'op_carrier_fl_num', 'origin', 'dest',
                               'origin_weather_delay_bin','origin_dep_delay_bin','departure_traffic'])

In [148]:
# day = pd.get_dummies(X['fl_wday'], drop_first=True)
# month = pd.get_dummies(X['fl_month'], prefix='M', drop_first=True)

In [149]:
# X = X.merge(day, left_index=True, right_index=True)
# X = X.merge(month, left_index=True, right_index=True)

In [150]:
# X = X.drop(columns=['fl_wday', 'fl_month'])

In [151]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,
                                                   test_size=0.2,
                                                   random_state=42)

### Linear Regression

In [152]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234756 entries, 78882 to 121958
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   crs_dep_time           234756 non-null  int64  
 1   crs_arr_time           234756 non-null  int64  
 2   crs_elapsed_time       234756 non-null  float64
 3   distance               234756 non-null  float64
 4   fl_month               234756 non-null  int64  
 5   fl_wday                234756 non-null  int64  
 6   mkt_carrier_delay      234756 non-null  float64
 7   mkt_carrier_dep_delay  234756 non-null  float64
 8   mkt_arr_delay          234756 non-null  float64
 9   op_carrier_delay       234756 non-null  float64
 10  op_carrier_dep_delay   234756 non-null  float64
 11  origin_arr_delay       234756 non-null  float64
 12  origin_taxi_out        234756 non-null  float64
 13  origin_taxi_in         234756 non-null  float64
 14  origin_weather_delay   234756 no

In [153]:
from sklearn.linear_model import LinearRegression

In [154]:
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

LinearRegression()

In [155]:
model_lr.score(X_train, y_train)

0.017634945489523268

In [156]:
model_lr.score(X_test, y_test)

0.018639721497034856

### Decision Tree

In [157]:
from sklearn.tree import DecisionTreeRegressor

In [158]:
model_tree = DecisionTreeRegressor(min_samples_leaf=5)
model_tree.fit(X_train, y_train)

DecisionTreeRegressor(min_samples_leaf=5)

In [159]:
model_tree.score(X_train, y_train)

0.3472608671773464

In [160]:
model_tree.score(X_test, y_test)

-0.2921643803850491

### Random Forest

In [161]:
X_train.shape

(234756, 17)

In [162]:
from sklearn.ensemble import RandomForestRegressor

__Model 1__

In [163]:
model = RandomForestRegressor(n_estimators=300, max_features=10, min_samples_leaf=5,)
model.fit(X_train, y_train)

RandomForestRegressor(max_features=10, min_samples_leaf=5, n_estimators=300)

In [164]:
model.score(X_train,y_train)

0.3472931305433614

In [165]:
model.score(X_test,y_test)

0.009952024440650908

In [166]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234756 entries, 78882 to 121958
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   crs_dep_time           234756 non-null  int64  
 1   crs_arr_time           234756 non-null  int64  
 2   crs_elapsed_time       234756 non-null  float64
 3   distance               234756 non-null  float64
 4   fl_month               234756 non-null  int64  
 5   fl_wday                234756 non-null  int64  
 6   mkt_carrier_delay      234756 non-null  float64
 7   mkt_carrier_dep_delay  234756 non-null  float64
 8   mkt_arr_delay          234756 non-null  float64
 9   op_carrier_delay       234756 non-null  float64
 10  op_carrier_dep_delay   234756 non-null  float64
 11  origin_arr_delay       234756 non-null  float64
 12  origin_taxi_out        234756 non-null  float64
 13  origin_taxi_in         234756 non-null  float64
 14  origin_weather_delay   234756 no

__Model 2__

In [167]:
model_2 = RandomForestRegressor(n_estimators=500, max_features=10, min_samples_leaf=5,)
model_2.fit(X_train, y_train)

RandomForestRegressor(max_features=10, min_samples_leaf=5, n_estimators=500)

In [168]:
model_2.score(X_train,y_train)

0.34790563322056467

In [169]:
model_2.score(X_test,y_test)

0.010950314470383415

__Model 3__

In [170]:
model_3 = RandomForestRegressor(n_estimators=500, min_samples_leaf=10,)
model_3.fit(X_train, y_train)

RandomForestRegressor(min_samples_leaf=10, n_estimators=500)

In [171]:
model_3.score(X_train,y_train)

0.24311202711505575

In [172]:
model_3.score(X_test,y_test)

0.02342401866227839

### Test set

In [173]:
X_test_2020 = df_flight_test.drop(columns=['fl_date', 'mkt_unique_carrier', 'op_unique_carrier',
                               'tail_num', 'op_carrier_fl_num', 'origin', 'dest',
                               'origin_weather_delay_bin','origin_dep_delay_bin','departure_traffic'])

In [177]:
X_test_2020.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150623 entries, 0 to 150622
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   crs_dep_time           150623 non-null  int64  
 1   crs_arr_time           150623 non-null  int64  
 2   crs_elapsed_time       150623 non-null  int64  
 3   distance               150623 non-null  int64  
 4   fl_month               150623 non-null  int64  
 5   fl_wday                150623 non-null  int64  
 6   mkt_carrier_delay      150623 non-null  float64
 7   mkt_carrier_dep_delay  150623 non-null  float64
 8   mkt_arr_delay          150623 non-null  float64
 9   op_carrier_delay       150623 non-null  float64
 10  op_carrier_dep_delay   150623 non-null  float64
 11  origin_arr_delay       150623 non-null  float64
 12  origin_taxi_out        150623 non-null  float64
 13  origin_taxi_in         150623 non-null  float64
 14  origin_weather_delay   150623 non-nu

In [178]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234756 entries, 78882 to 121958
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   crs_dep_time           234756 non-null  int64  
 1   crs_arr_time           234756 non-null  int64  
 2   crs_elapsed_time       234756 non-null  float64
 3   distance               234756 non-null  float64
 4   fl_month               234756 non-null  int64  
 5   fl_wday                234756 non-null  int64  
 6   mkt_carrier_delay      234756 non-null  float64
 7   mkt_carrier_dep_delay  234756 non-null  float64
 8   mkt_arr_delay          234756 non-null  float64
 9   op_carrier_delay       234756 non-null  float64
 10  op_carrier_dep_delay   234756 non-null  float64
 11  origin_arr_delay       234756 non-null  float64
 12  origin_taxi_out        234756 non-null  float64
 13  origin_taxi_in         234756 non-null  float64
 14  origin_weather_delay   234756 no

In [179]:
y_predict_2020 = model_3.predict(X_test_2020)

In [180]:
y_predict_2020.shape

(150623,)

In [181]:
result = pd.read_csv('data/flights_test.csv',
                    usecols=['fl_date', 'mkt_carrier', 'mkt_carrier_fl_num',
                            'origin', 'dest'])

In [182]:
result['predicted_delay'] = y_predict_2020

In [183]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150623 entries, 0 to 150622
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   fl_date             150623 non-null  object 
 1   mkt_carrier         150623 non-null  object 
 2   mkt_carrier_fl_num  150623 non-null  int64  
 3   origin              150623 non-null  object 
 4   dest                150623 non-null  object 
 5   predicted_delay     150623 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 6.9+ MB


In [185]:
result.to_csv('data/result_submission', index=False)