In [23]:
# Import Required Libraries
import pandas as pd
import pickle
import numpy as np

### Import Data Sample

In [24]:
file = open('flight_2019_sample','rb')
Flight_sample = pickle.load(file)
file.close()

In [25]:
# Remove any columns with More than 30% NaN values
Flight_sample = Flight_sample.loc[:, Flight_sample.isnull().mean() < .3]

### Determine correlated Feature Data

In [26]:
# Find any records with multiple flights, if none - then remove column from dataset as all values are 1
print(Flight_sample.loc[Flight_sample['flights'] == 0.0])
print(Flight_sample.loc[Flight_sample['flights'] < 1.0])
Flight_sample.drop(['flights','year'], axis=1, inplace=True)

Empty DataFrame
Columns: [mkt_unique_carrier, branded_code_share, mkt_carrier_fl_num, op_unique_carrier, tail_num, op_carrier_fl_num, origin_city_name, dest_city_name, origin_airport_id, dest_airport_id, crs_dep_time, dep_time, dep_delay, taxi_out, wheels_off, wheels_on, taxi_in, crs_arr_time, arr_time, arr_delay, cancelled, diverted, dup, crs_elapsed_time, actual_elapsed_time, air_time, flights, distance, carrier_delay, weather_delay, nas_delay, security_delay, late_aircraft_delay, fl_date, year, month, total_delay, is_delay]
Index: []

[0 rows x 38 columns]
Empty DataFrame
Columns: [mkt_unique_carrier, branded_code_share, mkt_carrier_fl_num, op_unique_carrier, tail_num, op_carrier_fl_num, origin_city_name, dest_city_name, origin_airport_id, dest_airport_id, crs_dep_time, dep_time, dep_delay, taxi_out, wheels_off, wheels_on, taxi_in, crs_arr_time, arr_time, arr_delay, cancelled, diverted, dup, crs_elapsed_time, actual_elapsed_time, air_time, flights, distance, carrier_delay, weather_d

In [27]:
# Calculate Correlated features from sample
Corr = Flight_sample.corr(method ='pearson')

# Drop 100% correlational values(diagonal values in matrix)
Corr1 = Corr[Corr < 1].unstack().transpose().sort_values(ascending=False).drop_duplicates()

In [29]:
# Get all correlations above threshold value
threshold = 0.50
Corr1 = Corr1[Corr1 > threshold]

# Extract all feature names from correlations above threshold
Corr_data = Corr1.index.values

# Flatten into list format
lst = [Corr_data[x][y] for x in range(len(Corr_data)) for y in range(2)]

# Isolate Unique feature names
unique_features = list(set(lst))
unique_features

['air_time',
 'crs_arr_time',
 'late_aircraft_delay',
 'mkt_carrier_fl_num',
 'wheels_off',
 'wheels_on',
 'dep_delay',
 'dep_time',
 'arr_time',
 'op_carrier_fl_num',
 'total_delay',
 'carrier_delay',
 'crs_elapsed_time',
 'actual_elapsed_time',
 'arr_delay',
 'crs_dep_time',
 'distance']

### Prune Data Sample

In [36]:
# Keep features with high correlations
Sample_data = Flight_sample[unique_features]
Sample_data

Unnamed: 0,air_time,crs_arr_time,late_aircraft_delay,mkt_carrier_fl_num,wheels_off,wheels_on,dep_delay,dep_time,arr_time,op_carrier_fl_num,total_delay,carrier_delay,crs_elapsed_time,actual_elapsed_time,arr_delay,crs_dep_time,distance
5817519,71.0,1618,0.0,4001,1602.0,1613.0,0.0,1548.0,1618.0,4001,0.0,0.0,90.0,90.0,0.0,1548,376.0
3440584,48.0,1750,0.0,4316,1648.0,1736.0,-7.0,1631.0,1737.0,4316,-20.0,0.0,72.0,66.0,-13.0,1638,272.0
2213756,76.0,2250,0.0,2152,2138.0,2254.0,14.0,2123.0,2256.0,2152,20.0,0.0,101.0,93.0,6.0,2109,547.0
4372882,60.0,857,0.0,445,733.0,833.0,-5.0,715.0,849.0,445,-13.0,0.0,97.0,94.0,-8.0,720,370.0
2224046,49.0,955,0.0,4040,848.0,937.0,-4.0,838.0,944.0,4040,-15.0,0.0,73.0,66.0,-11.0,842,190.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3048865,104.0,735,0.0,3405,533.0,717.0,-3.0,522.0,720.0,3405,-18.0,0.0,130.0,118.0,-15.0,525,687.0
7686938,48.0,1424,0.0,5739,1317.0,1405.0,-4.0,1256.0,1411.0,5739,-17.0,0.0,84.0,75.0,-13.0,1300,236.0
5571728,400.0,1300,0.0,328,1003.0,1243.0,0.0,940.0,1253.0,328,-7.0,0.0,440.0,433.0,-7.0,940,3365.0
319962,64.0,1534,0.0,1061,1420.0,1524.0,-2.0,1402.0,1532.0,1061,-4.0,0.0,90.0,90.0,-2.0,1404,481.0
