In [12]:
import pandas as pd 
import numpy as np
import datetime

pd.set_option('display.max_columns', None)

#load data
df = pd.read_csv('../../data/127000_rand.csv')

## Functions

In [39]:
def load_data():
    """
    loads sample data extracted from db
    """
    return pd.read_csv('../../data/db_extracts/large_rand.csv') #change this to the path of file

######Inital Processing######
def remove_rows(df):
    """
    remove outlier rows and certain null values
        - arr_delay outliers above 350
        - arr_delay where null
    """

    df = df.loc[(df['arr_delay'].notnull()) | (df['arr_delay'] <= 350)]
    df = df.loc[df['arr_delay'] <= 350]
    #df = df.loc[df['taxi_out'] <= 75]
    #df = df.loc[df['taxi_in'] <= 75]

    return df

def create_datetime(df):
    """
    creates columns for month, year, hour onto the dataframe
    """
    df['month'] = pd.DatetimeIndex(df['fl_date']).month
    df['year'] = pd.DatetimeIndex(df['fl_date']).year
    df['day'] = pd.DatetimeIndex(df['fl_date']).dayofweek
    df['dep_hour'] = df['crs_dep_time'].round(-2)/100
    df['arr_hour']= df['crs_arr_time'].round(-2)/100

    return df

def create_is_late(df):
    """
    creates a column 0/1 to indicate if the flight was late or not
    """
    df['is_late'] = df['arr_delay'].apply(lambda x: 1 if x > 0 else 0)

    return df

def create_speed(df):
    """
    create column calculating expected speed of flight. miles/minute
    """
    df['speed'] = (df['distance'] / df['crs_elapsed_time']).round(2)

    return df

def create_route_index(df):
    """
    create key column that combines carrier_origin_dest
    """
    df['route_key'] = df['mkt_carrier'] + "_" + df['origin'] + "_" + df['dest']
    return df
 

######Table Creations######

    ## Airports Table ##
def create_table_airports(df=df):
    """
    input flights table and create a unique airports table with stats that will be used for features later
    """
    df_airports = df.copy()
    df_airports = df[['origin_airport_id', 'origin', 'origin_city_name']]
    df_airports = df_airports.drop_duplicates(subset=['origin_airport_id'])
    df_airports.rename(columns={'origin_airport_id': 'airport_id', 'origin':'airport_code'}, inplace=True)
    split = df_airports['origin_city_name'].str.split(",", n=1, expand=True)
    df_airports['city'] = split[0]
    df_airports['state'] = split[1]
    df_airports.drop('origin_city_name', axis=1, inplace=True)
    
    return df_airports

def create_dep_delay(df_flights, df_airports):
    """
    creates a column showing mean departure delay for each airport
    we used mean because previously have 0d out any negatives and mean results in 0 99% of the time
    """
    airports_temp = pd.DataFrame(df_flights.groupby('origin_airport_id').agg('mean')['dep_delay'])
    airports_temp['dep_delay'] = airports_temp['dep_delay'].round(2)
    airports_temp.reset_index(inplace=True)
    airports_temp.rename(columns={'origin_airport_id':'airport_id'}, inplace=True)

    df_airports = df_airports.merge(airports_temp, on='airport_id')
    df_airports.rename(columns={'dep_delay':'mean_d_delay'}, inplace=True)


    return df_airports

def create_arr_delay(df_flights, df_airports):
    """
    creates a column showing mean arrival delay for each airport
    we used mean because previously have 0d out any negatives and medium results in 0 99% of the time
    """
    airports_temp = pd.DataFrame(df_flights.groupby('dest_airport_id').agg('mean')['arr_delay'])
    airports_temp['arr_delay'] = airports_temp['arr_delay'].round(2)
    airports_temp.reset_index(inplace=True)
    airports_temp.rename(columns={'dest_airport_id':'airport_id'}, inplace=True)

    df_airports = df_airports.merge(airports_temp, on='airport_id')
    df_airports.rename(columns={'arr_delay':'mean_arr_delay'}, inplace=True)

    return df_airports

def create_flight_counts(df_flights, df_airports):
    """
    add columns counting amount of flights in and out of the airports
    this will then be used to calculate a column about % delayed 
    """
    airports_temp = pd.DataFrame(df_flights.groupby('origin_airport_id').count()['flights'])
    airports_temp.reset_index(inplace=True)
    airports_temp.rename(columns={'origin_airport_id':'airport_id', 'flights':'dep_flight_count'}, inplace=True)
    df_airports = df_airports.merge(airports_temp, on='airport_id')
    
    airports_temp = pd.DataFrame(df_flights.groupby('dest_airport_id').count()['flights'])
    airports_temp.reset_index(inplace=True)
    airports_temp.rename(columns={'dest_airport_id':'airport_id', 'flights':'arr_flight_count'}, inplace=True)
    df_airports = df_airports.merge(airports_temp, on='airport_id')    
    
    return df_airports

def create_delay_counts(df_flights, df_airports):
    """
    create columns counting number of flights lates for depart and arrival airports
    """
    airports_temp = pd.DataFrame(df_flights.groupby(by='origin_airport_id').agg('sum')['is_late'])
    airports_temp.reset_index(inplace=True)
    airports_temp.rename(columns={'origin_airport_id':'airport_id', 'is_late':'dep_late_count'}, inplace=True)
    df_airports = df_airports.merge(airports_temp, on='airport_id')

    airports_temp = pd.DataFrame(df_flights.groupby(by='dest_airport_id').agg('sum')['is_late'])
    airports_temp.reset_index(inplace=True)
    airports_temp.rename(columns={'dest_airport_id':'airport_id', 'is_late':'arr_late_count'}, inplace=True)
    df_airports = df_airports.merge(airports_temp, on='airport_id')

    return df_airports

def create_late_per(df_airports):
    """
    creates columns showing percent late for arrival and departing airports
    """
    df_airports['dep_late_perc'] = (df_airports['dep_late_count'] / df_airports['dep_flight_count']).round(3)
    df_airports['arr_late_perc'] = (df_airports['arr_late_count'] / df_airports['arr_flight_count']).round(3)

    return df_airports

def create_taxi(df_flights, df_airports):
    """
    add columns shows mean taxi time
    this will then be used to calculate a column about % delayed 
    """
    airports_temp = pd.DataFrame(df_flights.groupby('origin_airport_id').agg('mean')['taxi_out'])
    airports_temp.reset_index(inplace=True)
    airports_temp.rename(columns={'origin_airport_id':'airport_id', 'taxi_out':'dep_taxi'}, inplace=True)
    df_airports = df_airports.merge(airports_temp, on='airport_id')
    
    airports_temp = pd.DataFrame(df_flights.groupby('dest_airport_id').agg('mean')['taxi_in'])
    airports_temp.reset_index(inplace=True)
    airports_temp.rename(columns={'dest_airport_id':'airport_id', 'taxi_in':'arr_taxi'}, inplace=True)
    df_airports = df_airports.merge(airports_temp, on='airport_id')    
    
    return df_airports

    ## Tail Number Table ##
def create_tail_table(df_flights):
    """
    creates tabel with tail number stats from flights table
    """
    df_tail = df_flights[['tail_num', 'fl_date', 'is_late', 'arr_delay', 'arr_time', 'distance', 'carrier_delay', 
                          'late_aircraft_delay', 'weather_delay']].groupby(by='tail_num').agg({'fl_date':'count', 
                                                                                               'is_late':'sum', 
                                                                                               'arr_delay':['median', 'mean', 'std'],
                                                                                               'arr_time':['median', 'mean'],
                                                                                               'distance':'mean',
                                                                                               'carrier_delay':'count',
                                                                                               'late_aircraft_delay':'count',
                                                                                               'weather_delay':'count'})
    df_tail['late_perc'] = (df_tail['is_late']['sum'] / df_tail['fl_date']['count']).round(3)
    
    #flatterning the multi layered headers for easier use later
    flat_cols = list(df_tail.columns.map('_'.join))
    df_tail.columns = flat_cols

    return df_tail

    ## Carrier Table ##
def create_carrier_table(df_flights):
    """
    creates tabel with carrier stats from flights table
    """
    df_carrier = df_flights[['mkt_carrier', 'fl_date', 'is_late', 
                             'dep_delay', 'arr_delay', 'carrier_delay']].groupby(by='mkt_carrier').agg({'fl_date':'count',
                                                                                           'is_late':'sum',
                                                                                           'dep_delay': 'mean',
                                                                                           'arr_delay': 'mean',
                                                                                           'carrier_delay': 'mean'})
    df_carrier['late_perc'] = (df_carrier['is_late'] / df_carrier['fl_date']).round(3)
    df_carrier['carrier_delay'] = df_carrier['carrier_delay'].round(2)
    df_carrier['dep_delay'] = df_carrier['dep_delay'].round(2)
    df_carrier['arr_delay'] = df_carrier['arr_delay'].round(2)

    return df_carrier

    ## Hourly Table ##
def create_hourly_table(df_flights):
    """
    creates tabel with hourly stats from flights table
    """
    df_hours = df_flights[['dep_hour', 'fl_date', 'is_late', 
                             'dep_delay', 'arr_delay']].groupby(by='dep_hour').agg({'fl_date':'count',
                                                                                           'is_late':'sum',
                                                                                           'dep_delay': 'mean',
                                                                                           'arr_delay': 'mean'})
    df_hours['late_perc'] = (df_hours['is_late'] / df_hours['fl_date']).round(3)
    df_hours['dep_delay'] = df_hours['dep_delay'].round(2)
    df_hours['arr_delay'] = df_hours['arr_delay'].round(2)

    return df_hours

    ## Daily Table ##
def create_day_table(df_flights):
    """
    creates tabel with day stats from flights table
    """
    df_days = df_flights[['day', 'fl_date', 'is_late', 
                             'dep_delay', 'arr_delay']].groupby(by='day').agg({'fl_date':'count',
                                                                                           'is_late':'sum',
                                                                                           'dep_delay': 'mean',
                                                                                           'arr_delay': 'mean'})
    df_days['late_perc'] = (df_days['is_late'] / df_days['fl_date']).round(3)
    df_days['dep_delay'] = df_days['dep_delay'].round(2)
    df_days['arr_delay'] = df_days['arr_delay'].round(2)
    
    #bucket carrier category as orginal

    return df_days

    ## Route Table ##
def create_route_table(df_flights):
    """
    creates a table with stats about each route for each airline
    """
    routes = df[['route_key', 'dep_delay', 'arr_delay', 'flights']].groupby('route_key').agg({'dep_delay':'mean',
                                                                                              'arr_delay':'mean',
                                                                                              'flights':'count'})
    routes['route_var'] = routes['arr_delay']-routes['dep_delay']
    routes.rename(columns={'dep_delay':'route_dep_delay', 
                           'arr_delay':'route_arr_delay',
                           'flights': 'route_count'}, inplace=True)
    routes.reset_index(inplace=True)
    routes.fillna(0, inplace=True)
    

    return routes

###### Exporting Created Tables ######
def save_tables():
    airports.to_csv('../../data/Exported_Tables/stats_airports.csv',  index=False)
    tail.to_csv('../../data/Exported_Tables/stats_tail.csv')
    carrier.to_csv('../../data/Exported_Tables/stats_carrier.csv')
    hourly.to_csv('../../data/Exported_Tables/stats_hourly.csv')
    daily.to_csv('../../data/Exported_Tables/stats_daily.csv')
    routes.to_csv('../../data/Exported_Tables/stats_routes.csv', index=False)


###### Loading Created Tables ######
def load_f_tables():
    airports = pd.read_csv('../../data/Exported_Tables/stats_airports.csv')
    tail = pd.read_csv('../../data/Exported_Tables/stats_tail.csv',index_col='tail_num')
    carrier = pd.read_csv('../../data/Exported_Tables/stats_carrier.csv', index_col='mkt_carrier')
    hourly = pd.read_csv('../../data/Exported_Tables/stats_hourly.csv', index_col='dep_hour')
    daily = pd.read_csv('../../data/Exported_Tables/stats_daily.csv', index_col='day')
    routes = pd.read_csv('../../data/Exported_Tables/stats_routes.csv')

    return airports, tail, carrier, hourly, daily, routes

###### Mearging Features Onto Origional Table ######

def get_features_airport(df_flights, df_airports):
    """
    merges engineered features from the airport stats table onto the main dataframe as additional columns
    """
    airport_dep = df_airports[['airport_id', 'mean_d_delay', 'dep_late_perc', 'dep_taxi']].rename(columns={'mean_d_delay':'air_mean_d_delay', 'dep_late_perc':'air_dep_late_perc', 'dep_taxi':'air_dep_taxi'})
    airport_arr = df_airports[['airport_id', 'mean_arr_delay', 'arr_late_perc', 'arr_taxi']].rename(columns={'mean_arr_delay':'air_mean_arr_delay', 'arr_late_perc':'air_arr_late_perc', 'arr_taxi':'air_arr_taxi'})

    #merge each onto the main dataframe and rename

    df_temp = df_flights.merge(airport_dep, left_on='origin_airport_id', right_on='airport_id')
    df_temp = df_temp.merge(airport_arr, left_on='dest_airport_id', right_on='airport_id')
    df_temp.drop(['airport_id_x', 'airport_id_y'], axis = 1, inplace=True)

    return df_temp

def get_features_hourly(df_flights, df_hourly):
    """
    merges engineered features from the hourly stats table onto the main dataframe as additional columns
    """
    hourly_filter = df_hourly.reset_index()[['dep_hour', 'fl_date', 
                                             'arr_delay', 'late_perc']].rename(columns={'fl_date':'hour_count', 
                                                                                        'arr_delay':'hour_arr_delay',
                                                                                        'late_perc':'hour_late_perc'})
    df_temp = df_flights.merge(hourly_filter, left_on='dep_hour', right_on='dep_hour')
    #df_temp.drop(['airport_id_x', 'airport_id_y'], axis = 1, inplace=True)

    return df_temp

def get_features_tail(df_flights, df_tail):
    """
    merges engineered features from the Tail Num stats table onto the main dataframe as additional columns
    """
    tail_filter = df_tail.reset_index()[['tail_num', 'fl_date_count', 'arr_delay_mean', 'arr_delay_std', 'late_perc_']]
    tail_filter.rename(columns={'fl_date_count':'tail_count', 'arr_delay_mean':'tail_arr_delay','arr_delay_std':'tail_arr_delay_std' , 'late_perc_':'tail_late_perc'}, inplace=True)
    tail_filter['tail_arr_delay'] = tail_filter['tail_arr_delay'].round(2)
    tail_filter['tail_arr_delay_std'] = tail_filter['tail_arr_delay_std'].round(2)
    tail_filter.drop('tail_arr_delay_std', axis=1, inplace=True)

    df_output = df_flights.merge(tail_filter, left_on='tail_num', right_on='tail_num')

    return df_output

def get_features_carrier(df_flights, df_carrier):
    """
    merges engineered features from carries
    """
    carrier_filter = df_carrier.reset_index()[['mkt_carrier', 'fl_date', 'arr_delay', 
                                               'carrier_delay', 'late_perc']].rename(columns={'fl_date':'carrier_count', 
                                                                                              'arr_delay':'carrier_arr_delay',
                                                                                              'carrier_delay':'carrier_carrier_delay',
                                                                                              'late_perc':'carrier_late_perc'})
    temp = df_flights.merge(carrier_filter, left_on='mkt_carrier', right_on='mkt_carrier')

    return temp

def get_features_day(df_flights, df_daily):
    daily_filter = df_daily.reset_index()[['day', 'fl_date', 'dep_delay', 'arr_delay', 
                                            'late_perc']].rename(columns={'fl_date':'day_count', 
                                                                          'dep_delay':'day_dep_delay',
                                                                          'arr_delay':'day_arr_delay',
                                                                          'late_perc':'day_late_perc'})
    
    output = df_flights.merge(daily_filter, left_on='day', right_on='day')

    return output 

def get_features_routes(df_flights, df_routes):
    """
    merges wanted features from the routes table
    """
    output = df_flights.merge(df_routes, on='route_key')
    return output


###### Drop Uneeded Columns ######
def drop_it_og(df):
    """
    drop unneeded columns to be used in the model
    """
    output = df.copy().drop(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier','mkt_carrier_fl_num', 
                'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_airport_id','origin', 'origin_city_name', 'dest_airport_id',
                'dest', 'dest_city_name', 'crs_dep_time', 'dep_time', 'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 
                'crs_arr_time', 'arr_time', 'cancelled', 'cancellation_code', 'diverted', 'dup', 'actual_elapsed_time', 'air_time', 'flights',
                'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay', 'first_dep_time', 'total_add_gtime', 
                'longest_add_gtime', 'no_name', 'year', 'route_key', 'is_late'], axis=1)
    return output

def save_final_features(df):
    """
    export csv final features table to specificed location
    """
    df.to_csv('../../data/Exported_Tables/final_features.csv')
    return 'Sucessfully exported final features'



###### Transform Jan Test Table ######
def transform_test(data):
    """
    takes the jan test table format and transformes it to a usable state for prediction purposes
    """
    data['dep_hour'] = df['crs_dep_time'].round(-2)/100
    data['arr_hour']= df['crs_arr_time'].round(-2)/100
    data['speed'] = (df['distance'] / df['crs_elapsed_time']).round(2)
    data['route_key'] = df['mkt_carrier'] + "_" + df['origin'] + "_" + df['dest']

    data = get_features_airport(data, airports)
    data = get_features_hourly(data, hourly)
    data = get_features_tail(data, tail)
    data = get_features_carrier(data, carrier)
    data = get_features_day(data, daily)
    data = get_features_routes(data, routes)
    data.drop(['fl_date', 'mkt_unique_carrier','branded_code_share','mkt_carrier','mkt_carrier_fl_num',
               'op_unique_carrier','tail_num','op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
               'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'crs_arr_time', 'dup', 
               'flights', 'route_key'], axis=1, inplace=True)

    return data

## Process

In [40]:
#### LOAD & CLEAN ####
df = (
            load_data()
            .pipe(remove_rows)
            .pipe(create_datetime)
            .pipe(create_is_late)
            .pipe(create_speed)
            .pipe(create_route_index)
            )
print('---- Basic Proprocessing Complete! ----')

#### TABLE CREATION ####
# create airport table
airports = create_table_airports()
airports = create_dep_delay(df, airports)
airports = create_arr_delay(df, airports)
airports = create_flight_counts(df, airports)
airports = create_delay_counts(df, airports)
airports = create_late_per(airports)
airports = create_taxi(df, airports)
print('---- Airport Table Complete! ----')

# create tail numbers table
tail = create_tail_table(df)
print('---- Tail Table Complete!    ----')
# create carrier table
carrier = create_carrier_table(df)
print('---- Carrier Table Complete! ----')
# create hourly table
hourly = create_hourly_table(df)
print('---- hourly Table Complete!  ----')
# create daily table
daily = create_day_table(df)
print('---- Daily Table Complete!   ----')
# create routes table
routes = create_route_table(df)

# Export all tables for later use #
save_tables()
print('---- Export Tables Complete! ----')

#### Load Created Tables #####
airports, tail, carrier, hourly, daily, routes = load_f_tables()
print('---- Loading Complete!       ----')

#### MERGING TO DF ##### 
df_merged = df.copy()
df_merged = get_features_airport(df_merged, airports)
df_merged = get_features_hourly(df_merged, hourly)
df_merged = get_features_tail(df_merged, tail)
df_merged = get_features_carrier(df_merged, carrier)
df_merged = get_features_day(df_merged, daily)
df_merged = get_features_routes(df_merged, routes)
print('---- Merge Complete!         ----')

#### DROP UNEEEDED ####
df_dropped = drop_it_og(df_merged)
print('---- Final Drop Complete!    ----')

#### DROP UNEEEDED ####
save_final_features(df_dropped)
print('---- Saving Complete!        ----')

  This is separate from the ipykernel package so we can avoid doing imports until


---- Basic Proprocessing Complete! ----
---- Airport Table Complete! ----
---- Tail Table Complete!    ----
---- Carrier Table Complete! ----
---- hourly Table Complete!  ----
---- Daily Table Complete!   ----
---- Export Tables Complete! ----
---- Loading Complete!       ----
---- Merge Complete!         ----
---- Final Drop Complete!    ----
---- Saving Complete!        ----


# Transforming Test Table

In [42]:
df_test = pd.read_csv('../../data/db_extracts/flights_test.csv')
df_test_transformed = df_test.copy()
df_test_transformed = transform_test(df_test)


df_test_transformed.to_csv('../../data/Exported_Tables/X_test_real.csv')

In [43]:
df_test_transformed

Unnamed: 0,crs_elapsed_time,distance,month,day,dep_hour,arr_hour,speed,air_mean_d_delay,air_dep_late_perc,air_dep_taxi,air_mean_arr_delay,air_arr_late_perc,air_arr_taxi,hour_count,hour_arr_delay,hour_late_perc,tail_count,tail_arr_delay,tail_late_perc,carrier_count,carrier_arr_delay,carrier_carrier_delay,carrier_late_perc,day_count,day_dep_delay,day_arr_delay,day_late_perc,route_dep_delay,route_arr_delay,route_count,route_var
0,95,363,1,3,15.0,16.0,3.56,5.92,0.301,11.496901,8.83,0.401,8.549240,36672,7.09,0.388,185,2.82,0.357,106319,3.49,16.51,0.364,92233,10.05,5.64,0.372,-3.0,1.088235,34,4.088235
1,121,620,1,3,15.0,16.0,3.33,10.18,0.371,22.060101,2.43,0.376,5.096078,36672,7.09,0.388,120,4.82,0.400,120847,6.55,15.72,0.362,92233,10.05,5.64,0.372,-3.0,1.088235,34,4.088235
2,175,978,1,3,15.0,16.0,3.56,5.46,0.320,13.858680,2.39,0.393,9.964247,36672,7.09,0.388,99,8.18,0.495,33451,0.59,12.34,0.352,92233,10.05,5.64,0.372,-3.0,1.088235,34,4.088235
3,119,551,1,3,15.0,16.0,3.56,8.51,0.314,16.719732,4.43,0.350,5.575926,36672,7.09,0.388,123,1.66,0.317,137949,0.64,18.38,0.291,92233,10.05,5.64,0.372,-3.0,1.088235,34,4.088235
4,161,1005,1,1,15.0,16.0,3.33,12.08,0.419,23.038196,5.48,0.361,9.337009,36672,7.09,0.388,82,10.11,0.366,120847,6.55,15.72,0.362,89649,8.44,3.33,0.334,-3.0,1.088235,34,4.088235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619999,40,100,1,0,11.0,13.0,6.20,2.20,0.347,10.589962,-0.13,0.335,7.510488,36694,1.72,0.320,215,0.54,0.353,7311,0.49,22.26,0.333,93581,10.04,5.10,0.358,-4.0,-3.000000,1,1.000000
620000,42,100,1,0,11.0,13.0,4.41,1.70,0.310,15.632421,-1.06,0.301,6.768895,36694,1.72,0.320,240,-0.31,0.304,7311,0.49,22.26,0.333,93581,10.04,5.10,0.358,73.0,44.000000,1,-29.000000
620001,256,1576,1,0,9.0,12.0,5.81,8.66,0.347,22.615398,8.25,0.429,6.057171,33772,-0.39,0.297,124,7.80,0.419,9978,9.62,17.97,0.422,93581,10.04,5.10,0.358,-5.0,-22.000000,1,-17.000000
620002,89,349,1,0,20.0,23.0,7.28,10.95,0.382,18.060786,4.32,0.371,5.385531,34086,9.00,0.417,135,6.44,0.467,9978,9.62,17.97,0.422,93581,10.04,5.10,0.358,-8.0,-22.000000,1,-14.000000


In [32]:
def transform_test(data):
    data['dep_hour'] = df['crs_dep_time'].round(-2)/100
    data['arr_hour']= df['crs_arr_time'].round(-2)/100
    data['speed'] = (df['distance'] / df['crs_elapsed_time']).round(2)
    data['route_key'] = df['mkt_carrier'] + "_" + df['origin'] + "_" + df['dest']

    data = get_features_airport(data, airports)
    data = get_features_hourly(data, hourly)
    data = get_features_tail(data, tail)
    data = get_features_carrier(data, carrier)
    data = get_features_day(data, daily)
    data = get_features_routes(data, routes)
    data.drop(['fl_date', 'mkt_unique_carrier','branded_code_share','mkt_carrier','mkt_carrier_fl_num',
               'op_unique_carrier','tail_num','op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
               'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'crs_arr_time', 'dup', 
               'flights', 'route_key'], axis=1, inplace=True)

    return data


# Scrap

In [3]:
df

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-06-07,DL,DL,DL,1826,DL,N805DN,1826,10397,ATL,"Atlanta, GA",11697,FLL,"Fort Lauderdale, FL",1310,1322.0,12.0,8.0,1330.0,1826.0,8.0,1501,1834.0,,0,,1,N,111,,,1,581,,,,,,,,,
1,2018-06-07,DL,DL,DL,2232,DL,N912DE,2232,11042,CLE,"Cleveland, OH",10397,ATL,"Atlanta, GA",1041,1041.0,0.0,10.0,1051.0,1210.0,9.0,1223,1219.0,-4.0,0,,0,N,102,98.0,79.0,1,554,,,,,,,,,
2,2018-06-07,DL,DL,DL,2292,DL,N335NB,2292,12953,LGA,"New York, NY",13487,MSP,"Minneapolis, MN",1825,1824.0,-1.0,22.0,1846.0,2012.0,10.0,2034,2022.0,-12.0,0,,0,N,189,178.0,146.0,1,1020,,,,,,,,,
3,2018-06-07,DL,DL,DL,2415,DL,N988AT,2415,10397,ATL,"Atlanta, GA",11618,EWR,"Newark, NJ",2232,2224.0,-8.0,21.0,2245.0,29.0,3.0,41,32.0,-9.0,0,,0,N,129,128.0,104.0,1,746,,,,,,,,,
4,2018-06-07,DL,DL,DL,2444,DL,N391DA,2444,14771,SFO,"San Francisco, CA",12892,LAX,"Los Angeles, CA",1100,1234.0,94.0,19.0,1253.0,1346.0,11.0,1243,1357.0,74.0,0,,0,N,103,83.0,53.0,1,337,0.0,0.0,5.0,0.0,69.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127475,2018-06-07,UA,UA_CODESHARE,UA,3855,ZW,N448AW,3855,12339,IND,"Indianapolis, IN",13930,ORD,"Chicago, IL",1655,1642.0,-13.0,12.0,1654.0,1640.0,14.0,1705,1654.0,-11.0,0,,0,N,70,72.0,46.0,1,177,,,,,,,,,
127476,2018-06-07,DL,DL,DL,1107,DL,N776DE,1107,13204,MCO,"Orlando, FL",11193,CVG,"Cincinnati, OH",1112,1138.0,26.0,11.0,1149.0,1333.0,5.0,1314,1338.0,24.0,0,,0,N,122,120.0,104.0,1,757,0.0,0.0,0.0,0.0,24.0,,,,
127477,2018-06-07,DL,DL,DL,1269,DL,N3771K,1269,11433,DTW,"Detroit, MI",12892,LAX,"Los Angeles, CA",1750,1748.0,-2.0,13.0,1801.0,1906.0,5.0,1938,1911.0,-27.0,0,,0,N,288,263.0,245.0,1,1979,,,,,,,,,
127478,2018-06-07,DL,DL,DL,1347,DL,N6701,1347,12892,LAX,"Los Angeles, CA",13487,MSP,"Minneapolis, MN",1315,1329.0,14.0,17.0,1346.0,1851.0,4.0,1854,1855.0,1.0,0,,0,N,219,206.0,185.0,1,1535,,,,,,,,,


In [41]:
df_dropped

Unnamed: 0,arr_delay,crs_elapsed_time,distance,month,day,dep_hour,arr_hour,speed,air_mean_d_delay,air_dep_late_perc,air_dep_taxi,air_mean_arr_delay,air_arr_late_perc,air_arr_taxi,hour_count,hour_arr_delay,hour_late_perc,tail_count,tail_arr_delay,tail_late_perc,carrier_count,carrier_arr_delay,carrier_carrier_delay,carrier_late_perc,day_count,day_dep_delay,day_arr_delay,day_late_perc,route_dep_delay,route_arr_delay,route_count,route_var
0,67.0,45,160,5,6,15.0,16.0,3.56,-0.29,0.288,13.861981,1.50,0.391,3.843750,36672,7.09,0.388,135,-5.31,0.222,33451,0.59,12.34,0.352,88708,9.05,3.77,0.344,-3.0,1.088235,34,4.088235
1,-3.0,49,160,11,6,15.0,16.0,3.27,-0.29,0.288,13.861981,1.50,0.391,3.843750,36672,7.09,0.388,132,-5.30,0.235,33451,0.59,12.34,0.352,88708,9.05,3.77,0.344,-3.0,1.088235,34,4.088235
2,1.0,47,160,4,6,15.0,16.0,3.40,-0.29,0.288,13.861981,1.50,0.391,3.843750,36672,7.09,0.388,148,-4.11,0.257,33451,0.59,12.34,0.352,88708,9.05,3.77,0.344,-3.0,1.088235,34,4.088235
3,-6.0,50,160,1,6,15.0,16.0,3.20,-0.29,0.288,13.861981,1.50,0.391,3.843750,36672,7.09,0.388,94,0.32,0.298,33451,0.59,12.34,0.352,88708,9.05,3.77,0.344,-3.0,1.088235,34,4.088235
4,6.0,45,160,3,5,15.0,16.0,3.56,-0.29,0.288,13.861981,1.50,0.391,3.843750,36672,7.09,0.388,135,-5.31,0.222,33451,0.59,12.34,0.352,76006,7.10,1.02,0.308,-3.0,1.088235,34,4.088235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623415,19.0,211,1514,6,2,16.0,21.0,7.18,7.37,0.318,13.910368,9.54,0.449,6.225316,36801,8.39,0.405,73,5.27,0.438,7938,6.25,20.01,0.379,90814,8.01,3.26,0.342,29.0,19.000000,1,-10.000000
623416,-14.0,161,1090,11,2,8.0,11.0,6.77,6.36,0.279,10.659574,3.55,0.274,4.197115,42868,-0.81,0.294,93,11.76,0.398,7938,6.25,20.01,0.379,90814,8.01,3.26,0.342,-15.0,-14.000000,1,1.000000
623417,18.0,217,1550,6,2,6.0,11.0,7.14,8.15,0.324,18.227439,4.75,0.355,6.479798,44842,-3.98,0.237,99,-2.83,0.343,7938,6.25,20.01,0.379,90814,8.01,3.26,0.342,-1.0,18.000000,1,19.000000
623418,48.0,158,1117,7,2,9.0,12.0,7.07,7.88,0.321,13.797566,9.54,0.449,6.225316,33772,-0.39,0.297,88,9.84,0.477,7938,6.25,20.01,0.379,90814,8.01,3.26,0.342,44.0,48.000000,1,4.000000


In [54]:
routes.fillna(0).isnull().sum()

route_key          0
route_dep_delay    0
route_arr_delay    0
route_count        0
route_var          0
dtype: int64

In [10]:
df
df['mkt_carrier'] + "_" + df['origin'] + "_" + df['dest']

0         AS_ANC_CDV
1         AS_ATL_SEA
2         AS_PDX_ANC
3         AS_EWR_SFO
4         AS_SEA_IAH
             ...    
637436    UA_IAH_BNA
637437    UA_MCI_IAH
637438    UA_ORD_OKC
637439    WN_ABQ_PHX
637440    AS_JNU_YAK
Length: 623434, dtype: object

In [25]:
routes = df[['route_key', 'dep_delay', 'arr_delay']].groupby('route_key').agg('mean')
routes.head(10)

Unnamed: 0_level_0,dep_delay,arr_delay
route_key,Unnamed: 1_level_1,Unnamed: 2_level_1
AA_ABE_CLT,1.623529,-2.035294
AA_ABE_ORD,6.392857,0.285714
AA_ABE_PHL,-3.8,-2.146667
AA_ABI_DFW,6.02924,3.631579
AA_ABQ_DFW,4.253731,0.144279
AA_ABQ_LAX,-1.764045,-3.898876
AA_ABQ_ORD,8.962264,3.45283
AA_ABQ_PHX,7.639344,2.254098
AA_ACK_CLT,-5.0,-19.0
AA_ACK_DCA,6.0,3.307692


In [38]:
routes = create_route_table(df)
routes

Unnamed: 0,route_key,route_dep_delay,route_arr_delay,route_count,route_var
0,AA_ABE_CLT,1.623529,-2.035294,85,-3.658824
1,AA_ABE_ORD,6.392857,0.285714,28,-6.107143
2,AA_ABE_PHL,-3.800000,-2.146667,75,1.653333
3,AA_ABI_DFW,6.029240,3.631579,171,-2.397661
4,AA_ABQ_DFW,4.253731,0.144279,201,-4.109453
...,...,...,...,...,...
9855,WN_TUS_LAX,7.358974,3.217949,78,-4.141026
9856,WN_TUS_MDW,0.294118,-8.323529,34,-8.617647
9857,WN_TUS_OAK,8.000000,-6.000000,1,-14.000000
9858,WN_TUS_SAN,3.066667,-3.283333,60,-6.350000


In [41]:
get_features_routes(df, routes)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,month,year,day,dep_hour,arr_hour,is_late,speed,route_key,route_dep_delay,route_arr_delay,route_count,route_var
0,2019-05-19,AS,AS,AS,66,AS,N619AS,66,10299,ANC,"Anchorage, AK",10926,CDV,"Cordova, AK",1525,1624.0,59.0,14.0,1638.0,1713.0,4.0,1610,1717.0,67.0,0,,0,N,45,53.0,35.0,1,160,59.0,0.0,8.0,0.0,0.0,1532.0,17.0,17.0,,5,2019,6,15.0,16.0,1,3.56,AS_ANC_CDV,-3.0,1.088235,34,4.088235
1,2019-04-10,AS,AS,AS,66,AS,N607AS,66,10299,ANC,"Anchorage, AK",10926,CDV,"Cordova, AK",1525,1518.0,-7.0,15.0,1533.0,1608.0,4.0,1610,1612.0,2.0,0,,0,N,45,54.0,35.0,1,160,,,,,,,,,,4,2019,2,15.0,16.0,1,3.56,AS_ANC_CDV,-3.0,1.088235,34,4.088235
2,2019-04-17,AS,AS,AS,66,AS,N593AS,66,10299,ANC,"Anchorage, AK",10926,CDV,"Cordova, AK",1525,1519.0,-6.0,20.0,1539.0,1615.0,4.0,1610,1619.0,9.0,0,,0,N,45,60.0,36.0,1,160,,,,,,,,,,4,2019,2,15.0,16.0,1,3.56,AS_ANC_CDV,-3.0,1.088235,34,4.088235
3,2019-03-23,AS,AS,AS,66,AS,N619AS,66,10299,ANC,"Anchorage, AK",10926,CDV,"Cordova, AK",1525,1519.0,-6.0,17.0,1536.0,1613.0,3.0,1610,1616.0,6.0,0,,0,N,45,57.0,37.0,1,160,,,,,,,,,,3,2019,5,15.0,16.0,1,3.56,AS_ANC_CDV,-3.0,1.088235,34,4.088235
4,2019-07-25,AS,AS,AS,66,AS,N612AS,66,10299,ANC,"Anchorage, AK",10926,CDV,"Cordova, AK",1510,1514.0,4.0,23.0,1537.0,1607.0,4.0,1600,1611.0,11.0,0,,0,N,50,57.0,30.0,1,160,,,,,,,,,,7,2019,3,15.0,16.0,1,3.20,AS_ANC_CDV,-3.0,1.088235,34,4.088235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623429,2019-05-04,WN,WN,WN,2513,WN,N8631A,2513,12953,LGA,"New York, NY",14027,PBI,"West Palm Beach/Palm Beach, FL",1215,1209.0,-6.0,26.0,1235.0,1514.0,4.0,1520,1518.0,-2.0,0,,0,N,185,189.0,159.0,1,1035,,,,,,,,,,5,2019,5,12.0,15.0,0,5.59,WN_LGA_PBI,-6.0,-2.000000,1,4.000000
623430,2019-05-12,AA,AA_CODESHARE,AA,4619,YX,N107HQ,4619,11278,DCA,"Washington, DC",13360,MLB,"Melbourne, FL",845,840.0,-5.0,23.0,903.0,1101.0,7.0,1110,1108.0,-2.0,0,,0,N,145,148.0,118.0,1,769,,,,,,,,,,5,2019,6,8.0,11.0,0,5.30,AA_DCA_MLB,-5.0,-2.000000,1,3.000000
623431,2019-05-16,G4,G4,G4,1619,G4,247NV,1619,12223,HTS,"Ashland, WV",14082,PGD,"Punta Gorda, FL",848,852.0,4.0,6.0,858.0,1039.0,5.0,1048,1044.0,-4.0,0,,0,N,120,112.0,101.0,1,790,,,,,,,,,,5,2019,3,8.0,10.0,0,6.58,G4_HTS_PGD,4.0,-4.000000,1,-8.000000
623432,2019-05-18,NK,NK,NK,397,NK,N534NK,397,13198,MCI,"Kansas City, MO",13577,MYR,"Myrtle Beach, SC",940,1027.0,47.0,13.0,1040.0,1341.0,7.0,1305,1348.0,43.0,0,,0,N,145,141.0,121.0,1,959,0.0,0.0,0.0,0.0,43.0,,,,,5,2019,5,9.0,13.0,1,6.61,NK_MCI_MYR,47.0,43.000000,1,-4.000000


In [40]:
df

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,month,year,day,dep_hour,arr_hour,is_late,speed,route_key
0,2019-05-19,AS,AS,AS,66,AS,N619AS,66,10299,ANC,"Anchorage, AK",10926,CDV,"Cordova, AK",1525,1624.0,59.0,14.0,1638.0,1713.0,4.0,1610,1717.0,67.0,0,,0,N,45,53.0,35.0,1,160,59.0,0.0,8.0,0.0,0.0,1532.0,17.0,17.0,,5,2019,6,15.0,16.0,1,3.56,AS_ANC_CDV
1,2019-05-19,AS,AS,AS,85,AS,N265AK,85,10397,ATL,"Atlanta, GA",14747,SEA,"Seattle, WA",625,622.0,-3.0,15.0,637.0,813.0,8.0,845,821.0,-24.0,0,,0,N,320,299.0,276.0,1,2182,,,,,,,,,,5,2019,6,6.0,8.0,0,6.82,AS_ATL_SEA
2,2019-05-19,AS,AS,AS,143,AS,N590AS,143,14057,PDX,"Portland, OR",10299,ANC,"Anchorage, AK",2040,2032.0,-8.0,16.0,2048.0,2249.0,4.0,2320,2253.0,-27.0,0,,0,N,220,201.0,181.0,1,1542,,,,,,,,,,5,2019,6,20.0,23.0,0,7.01,AS_PDX_ANC
3,2019-05-19,AS,AS,AS,317,AS,N442AS,317,11618,EWR,"Newark, NJ",14771,SFO,"San Francisco, CA",1940,2148.0,128.0,119.0,2347.0,224.0,7.0,2305,231.0,206.0,0,,0,N,385,463.0,337.0,1,2565,0.0,0.0,78.0,0.0,128.0,,,,,5,2019,6,19.0,23.0,1,6.66,AS_EWR_SFO
4,2019-05-19,AS,AS,AS,350,AS,N495AS,350,14747,SEA,"Seattle, WA",12266,IAH,"Houston, TX",1200,1201.0,1.0,28.0,1229.0,1827.0,11.0,1820,1838.0,18.0,0,,0,N,260,277.0,238.0,1,1874,0.0,0.0,18.0,0.0,0.0,,,,,5,2019,6,12.0,18.0,1,7.21,AS_SEA_IAH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637436,2019-05-19,UA,UA_CODESHARE,UA,3601,YX,N651RW,3601,12266,IAH,"Houston, TX",10693,BNA,"Nashville, TN",1620,1620.0,0.0,24.0,1644.0,1811.0,31.0,1817,1842.0,25.0,0,,0,N,117,142.0,87.0,1,657,0.0,0.0,25.0,0.0,0.0,,,,,5,2019,6,16.0,18.0,1,5.62,UA_IAH_BNA
637437,2019-05-19,UA,UA_CODESHARE,UA,3611,YX,N729YX,3611,13198,MCI,"Kansas City, MO",12266,IAH,"Houston, TX",1830,1907.0,37.0,15.0,1922.0,2102.0,7.0,2037,2109.0,32.0,0,,0,N,127,122.0,100.0,1,643,0.0,0.0,0.0,0.0,32.0,,,,,5,2019,6,18.0,20.0,1,5.06,UA_MCI_IAH
637438,2019-05-19,UA,UA_CODESHARE,UA,3615,YX,N650RW,3615,13930,ORD,"Chicago, IL",13851,OKC,"Oklahoma City, OK",1246,1319.0,33.0,28.0,1347.0,1549.0,7.0,1507,1556.0,49.0,0,,0,N,141,157.0,122.0,1,693,33.0,0.0,16.0,0.0,0.0,,,,,5,2019,6,12.0,15.0,1,4.91,UA_ORD_OKC
637439,2019-05-19,WN,WN,WN,2980,WN,N296WN,2980,10140,ABQ,"Albuquerque, NM",14107,PHX,"Phoenix, AZ",1420,1450.0,30.0,9.0,1459.0,1457.0,6.0,1435,1503.0,28.0,0,,0,N,75,73.0,58.0,1,328,10.0,0.0,0.0,0.0,18.0,,,,,5,2019,6,14.0,14.0,1,4.37,WN_ABQ_PHX


In [22]:
df.describe()

Unnamed: 0,mkt_carrier_fl_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,diverted,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,month,year,day,dep_hour,arr_hour,is_late,speed
count,623434.0,623434.0,623434.0,623434.0,623434.0,623434.0,623262.0,623190.0,623190.0,623189.0,623189.0,623434.0,623433.0,623434.0,623434.0,623434.0,623434.0,623433.0,623189.0,623434.0,623434.0,118674.0,118674.0,118674.0,118674.0,118674.0,3720.0,3720.0,3720.0,0.0,623434.0,623434.0,623434.0,623434.0,623434.0,623434.0,623434.0
mean,2724.716876,2724.651408,12677.266299,12678.546318,1328.498824,1333.911673,9.005641,17.512728,1358.559699,1459.691915,7.669418,1483.037754,1464.069841,3.998816,0.0,0.0,138.166197,133.16135,108.01793,1.0,771.014975,16.713914,2.835432,15.679281,0.092767,25.538054,1326.218548,35.402419,35.064516,,6.585493,2018.50739,2.935397,13.150194,14.694444,0.348547,4.996167
std,1863.357531,1863.383216,1523.241185,1523.318306,489.908779,503.698231,35.11053,10.023355,505.05352,532.220984,6.073373,516.9314,536.490019,37.989566,0.0,0.0,71.989516,71.797057,69.85365,0.0,588.822895,36.687724,17.685611,30.14976,2.765911,44.276667,492.109796,29.18567,28.808398,,3.399466,0.499946,1.996044,4.905533,5.180856,0.476511,1.511757
min,1.0,1.0,10135.0,10135.0,1.0,1.0,-141.0,0.0,1.0,1.0,0.0,1.0,1.0,-230.0,0.0,0.0,-35.0,16.0,8.0,1.0,16.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,1.0,,1.0,2018.0,0.0,0.0,0.0,0.0,-18.6
25%,1115.0,1115.0,11292.0,11292.0,915.0,917.0,-6.0,11.0,933.0,1044.0,4.0,1100.0,1048.0,-15.0,0.0,0.0,87.0,82.0,58.0,1.0,341.0,0.0,0.0,0.0,0.0,0.0,858.0,16.0,16.0,,4.0,2018.0,1.0,9.0,11.0,0.0,3.99
50%,2301.0,2301.0,12889.0,12889.0,1320.0,1326.0,-2.0,15.0,1341.0,1459.0,6.0,1510.0,1503.0,-6.0,0.0,0.0,119.0,115.0,89.0,1.0,604.0,0.0,0.0,3.0,0.0,3.0,1339.0,27.0,27.0,,7.0,2019.0,3.0,13.0,15.0,0.0,5.12
75%,4309.0,4308.0,14057.0,14057.0,1735.0,1744.0,6.0,20.0,1759.0,1909.0,9.0,1915.0,1914.0,8.0,0.0,0.0,168.0,163.0,137.0,1.0,1008.0,17.0,0.0,20.0,0.0,33.0,1732.0,43.0,43.0,,10.0,2019.0,5.0,17.0,19.0,1.0,6.08
max,9390.0,9390.0,16869.0,16869.0,2359.0,2400.0,383.0,178.0,2400.0,2400.0,212.0,2400.0,2400.0,350.0,0.0,0.0,1542.0,725.0,696.0,1.0,5095.0,350.0,350.0,349.0,291.0,350.0,2357.0,216.0,216.0,,12.0,2019.0,6.0,24.0,24.0,1.0,318.0
