In [1]:
import pandas as pd
from sqlalchemy import create_engine
from config import db_password

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
file_loc = "resources/flights.csv"
df = pd.read_csv(file_loc, low_memory=False)
df.head(20)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,2354.0,-11.0,21.0,15.0,205.0,194.0,169.0,1448,404.0,4.0,430,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,24.0,-1.0,11.0,35.0,235.0,215.0,199.0,1448,254.0,5.0,320,259.0,-21.0,0,0,,,,,,
5,2015,1,1,4,DL,806,N3730B,SFO,MSP,25,20.0,-5.0,18.0,38.0,217.0,230.0,206.0,1589,604.0,6.0,602,610.0,8.0,0,0,,,,,,
6,2015,1,1,4,NK,612,N635NK,LAS,MSP,25,19.0,-6.0,11.0,30.0,181.0,170.0,154.0,1299,504.0,5.0,526,509.0,-17.0,0,0,,,,,,
7,2015,1,1,4,US,2013,N584UW,LAX,CLT,30,44.0,14.0,13.0,57.0,273.0,249.0,228.0,2125,745.0,8.0,803,753.0,-10.0,0,0,,,,,,
8,2015,1,1,4,AA,1112,N3LAAA,SFO,DFW,30,19.0,-11.0,17.0,36.0,195.0,193.0,173.0,1464,529.0,3.0,545,532.0,-13.0,0,0,,,,,,
9,2015,1,1,4,DL,1173,N826DN,LAS,ATL,30,33.0,3.0,12.0,45.0,221.0,203.0,186.0,1747,651.0,5.0,711,656.0,-15.0,0,0,,,,,,


In [3]:
# 5714008 rows starting
df['MONTH'].count()

5819079

In [4]:
df.isnull().sum()

YEAR                         0
MONTH                        0
DAY                          0
DAY_OF_WEEK                  0
AIRLINE                      0
FLIGHT_NUMBER                0
TAIL_NUMBER              14721
ORIGIN_AIRPORT               0
DESTINATION_AIRPORT          0
SCHEDULED_DEPARTURE          0
DEPARTURE_TIME           86153
DEPARTURE_DELAY          86153
TAXI_OUT                 89047
WHEELS_OFF               89047
SCHEDULED_TIME               6
ELAPSED_TIME            105071
AIR_TIME                105071
DISTANCE                     0
WHEELS_ON                92513
TAXI_IN                  92513
SCHEDULED_ARRIVAL            0
ARRIVAL_TIME             92513
ARRIVAL_DELAY           105071
DIVERTED                     0
CANCELLED                    0
CANCELLATION_REASON    5729195
AIR_SYSTEM_DELAY       4755640
SECURITY_DELAY         4755640
AIRLINE_DELAY          4755640
LATE_AIRCRAFT_DELAY    4755640
WEATHER_DELAY          4755640
dtype: int64

In [5]:
df.columns

Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')

# Goal: predict delay time

## clean rows/columns with data irrelavent to our analysis

In [6]:
df['CANCELLED'].value_counts()

0    5729195
1      89884
Name: CANCELLED, dtype: int64

In [7]:
df['DIVERTED'].value_counts()

0    5803892
1      15187
Name: DIVERTED, dtype: int64

In [8]:
# drop all rows with cancelled flights

df = df.drop(df[df['CANCELLED'] == 1].index)
df = df.drop(df[df['DIVERTED'] == 1].index)

In [9]:
# Drop year: data is from 2015, 
# drop cancelled and diverted rows / columns as we are concerned with flights that were completed, with a delay time
# Departure time: predicting delay, having sceduled time and departure time will be giving the answer to the ml model
# Linear dependancy, drop redundant columns 
#     SCHEDULED_DEPARTURE - DEPARTURE_TIME = DEPARTURE_DELAY   drop DEPARTURE_TIME
#     TAXI_OUT + WHEELS_OFF + SCHEDULED_TIME = ELAPSED_TIME    drop TAXI_OUT, WHEELS_OFF, SCHEDULED_TIME
#     ARRIVAL_DELAY = AIR_SYSTEM_DELAY + SECURITY_DELAY + AIRLINE_DELAY + LATE_AIRCRAFT_DELAY + WEATHER_DELAY      drop ?

df.drop(columns=['YEAR', 'CANCELLED', 'CANCELLATION_REASON', 'DIVERTED', 'DEPARTURE_TIME', 'TAXI_OUT', 'WHEELS_OFF', 'SCHEDULED_TIME'], inplace=True)

In [10]:
df.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,1,1,4,AS,98,N407AS,ANC,SEA,5,-11.0,194.0,169.0,1448,404.0,4.0,430,408.0,-22.0,,,,,
1,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,-8.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,,,,,
2,1,1,4,US,840,N171US,SFO,CLT,20,-2.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,,,,,
3,1,1,4,AA,258,N3HYAA,LAX,MIA,20,-5.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,,,,,
4,1,1,4,AS,135,N527AS,SEA,ANC,25,-1.0,215.0,199.0,1448,254.0,5.0,320,259.0,-21.0,,,,,


In [30]:
# Explore vlues in the following columns
# These 5 columns all have 4755640/5714008 null values
columns = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']

for column in columns:
    print(column + '\n')
    print(df[column].value_counts())
    print('\n\n-----------------------\n\n')

AIR_SYSTEM_DELAY

0.0       498613
1.0        28003
15.0       23199
2.0        22981
3.0        21446
16.0       21357
4.0        20305
17.0       18738
5.0        18737
6.0        17671
18.0       17139
7.0        16582
8.0        15644
19.0       15381
9.0        14716
20.0       14183
10.0       13677
21.0       12641
11.0       12309
22.0       11819
12.0       11748
13.0       10903
23.0       10660
14.0       10231
24.0        9882
25.0        9000
26.0        8307
27.0        7595
28.0        7129
29.0        6534
30.0        6156
31.0        5661
32.0        5387
33.0        4930
34.0        4537
35.0        4380
36.0        3972
37.0        3862
38.0        3577
39.0        3324
40.0        3237
41.0        3084
42.0        2820
43.0        2685
44.0        2668
45.0        2432
46.0        2381
47.0        2263
48.0        2137
49.0        2048
50.0        1957
51.0        1905
52.0        1762
53.0        1755
54.0        1630
55.0        1596
56.0        1519
57.0        1

0.0       506486
15.0       14522
16.0       13824
17.0       12908
18.0       12259
19.0       11794
14.0       11183
20.0       11079
13.0       10930
11.0       10517
21.0       10451
12.0       10342
10.0       10271
22.0        9945
8.0         9912
9.0         9897
7.0         9629
1.0         9575
23.0        9563
6.0         9501
2.0         9388
24.0        9100
5.0         9038
3.0         9012
4.0         8854
25.0        8656
26.0        8197
27.0        8083
28.0        7553
29.0        7280
30.0        7059
31.0        6681
32.0        6377
33.0        6322
34.0        6107
35.0        5932
36.0        5469
37.0        5260
38.0        5148
39.0        5039
40.0        5002
41.0        4596
43.0        4474
42.0        4412
44.0        4174
45.0        4113
46.0        4036
47.0        3935
48.0        3695
49.0        3636
50.0        3540
51.0        3516
52.0        3349
53.0        3265
54.0        3193
55.0        3045
56.0        2984
57.0        2899
58.0        28

In [12]:
df_delays = df.groupby(['LATE_AIRCRAFT_DELAY'])

In [13]:
#df_delays.head(10)

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,1,1,4,AS,98,N407AS,ANC,SEA,5,-11.0,194.0,169.0,1448,404.0,4.0,430,408.0,-22.0,,,,,
1,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,-8.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,,,,,
2,1,1,4,US,840,N171US,SFO,CLT,20,-2.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,,,,,
3,1,1,4,AA,258,N3HYAA,LAX,MIA,20,-5.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,,,,,
4,1,1,4,AS,135,N527AS,SEA,ANC,25,-1.0,215.0,199.0,1448,254.0,5.0,320,259.0,-21.0,,,,,
5,1,1,4,DL,806,N3730B,SFO,MSP,25,-5.0,230.0,206.0,1589,604.0,6.0,602,610.0,8.0,,,,,
6,1,1,4,NK,612,N635NK,LAS,MSP,25,-6.0,170.0,154.0,1299,504.0,5.0,526,509.0,-17.0,,,,,
7,1,1,4,US,2013,N584UW,LAX,CLT,30,14.0,249.0,228.0,2125,745.0,8.0,803,753.0,-10.0,,,,,
8,1,1,4,AA,1112,N3LAAA,SFO,DFW,30,-11.0,193.0,173.0,1464,529.0,3.0,545,532.0,-13.0,,,,,
9,1,1,4,DL,1173,N826DN,LAS,ATL,30,3.0,203.0,186.0,1747,651.0,5.0,711,656.0,-15.0,,,,,


In [36]:
df_no_null = df.dropna()

In [37]:
df_no_null.head(5)

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
27,1,1,4,NK,597,N528NK,MSP,FLL,115,12.0,220.0,166.0,1487,527.0,40.0,542,607.0,25.0,25.0,0.0,0.0,0.0,0.0
30,1,1,4,NK,168,N629NK,PHX,ORD,125,72.0,175.0,156.0,1440,622.0,10.0,549,632.0,43.0,43.0,0.0,0.0,0.0,0.0
35,1,1,4,HA,17,N389HA,LAS,HNL,145,0.0,385.0,361.0,2762,602.0,8.0,555,610.0,15.0,0.0,0.0,15.0,0.0,0.0
50,1,1,4,B6,1030,N239JB,BQN,MCO,307,-3.0,196.0,160.0,1129,509.0,11.0,500,520.0,20.0,20.0,0.0,0.0,0.0,0.0
52,1,1,4,B6,2134,N307JB,SJU,MCO,400,95.0,175.0,163.0,1189,727.0,3.0,605,730.0,85.0,0.0,0.0,85.0,0.0,0.0


In [38]:
df_no_null['MONTH'].count()

1063439

In [39]:
df.df_no_null().sum()

AttributeError: 'DataFrame' object has no attribute 'df_no_null'

In [40]:
# how to deal with these null value??

In [41]:
df_no_null.head(40)

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
27,1,1,4,NK,597,N528NK,MSP,FLL,115,12.0,220.0,166.0,1487,527.0,40.0,542,607.0,25.0,25.0,0.0,0.0,0.0,0.0
30,1,1,4,NK,168,N629NK,PHX,ORD,125,72.0,175.0,156.0,1440,622.0,10.0,549,632.0,43.0,43.0,0.0,0.0,0.0,0.0
35,1,1,4,HA,17,N389HA,LAS,HNL,145,0.0,385.0,361.0,2762,602.0,8.0,555,610.0,15.0,0.0,0.0,15.0,0.0,0.0
50,1,1,4,B6,1030,N239JB,BQN,MCO,307,-3.0,196.0,160.0,1129,509.0,11.0,500,520.0,20.0,20.0,0.0,0.0,0.0,0.0
52,1,1,4,B6,2134,N307JB,SJU,MCO,400,95.0,175.0,163.0,1189,727.0,3.0,605,730.0,85.0,0.0,0.0,85.0,0.0,0.0
55,1,1,4,B6,2276,N646JB,SJU,BDL,438,72.0,258.0,237.0,1666,902.0,6.0,739,908.0,89.0,17.0,0.0,72.0,0.0,0.0
70,1,1,4,AA,1057,N3ASAA,DFW,MIA,515,108.0,155.0,133.0,1121,1031.0,7.0,856,1038.0,102.0,0.0,0.0,0.0,0.0,102.0
73,1,1,4,US,425,N174US,PDX,PHX,520,60.0,150.0,132.0,1009,945.0,5.0,850,950.0,60.0,0.0,0.0,60.0,0.0,0.0
74,1,1,4,AA,89,N3KVAA,IAH,MIA,520,58.0,137.0,111.0,964,928.0,7.0,841,935.0,54.0,0.0,0.0,54.0,0.0,0.0
86,1,1,4,AA,328,N4XKAA,DEN,DFW,530,53.0,138.0,96.0,641,931.0,10.0,835,941.0,66.0,13.0,0.0,53.0,0.0,0.0


In [34]:
# Export cleaned dataset
df.to_csv("resources/flights_cleaned.csv")

In [42]:
# Export cleaned no null dataset
df_no_null.to_csv("resources/flights_cleaned_no_null.csv")

In [None]:
# Connect to SQL and export
db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/flight_data"
engine = create_engine(db_string)
movies_df.to_sql(name='flight', con=engine, if_exists='replace')