Prior to loading this data into jupyter notebook first we created a SQLite database and inserted the 3 tables (airlines, airports & flights) tables into with the following steps in SQL command prompt:

To check for existing dbs: > .databases
To open or create db > .open DATABASE_NAME.db
To import csv > .mode csv
 .import CSV_NAME.csv TABLENAME
To check tables in db > .tables
To view table content > . mode columns
 .header on
SELECT * FROM TABLENAME;
To save be db > .backup DATABASE_NAME.db

In the final setep of the code we also show how to export tables from Python into mysql

# Setup

In [1]:
#import dependencies
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func
import MySQLdb
import mysql.connector

In [2]:
#connect to the database
database_path = "flights.db"
engine = create_engine(f"sqlite:///{database_path}")
conn = engine.connect()

In [3]:
#Double check it worked
inspector = inspect(engine)
inspector.get_table_names()

['airlines', 'airports', 'flights']

In [4]:
# Get a list of column names and types
columns = inspector.get_columns('airlines')
for c in columns:
    print(c['name'], c["type"])
# columns

IATA_CODE TEXT
AIRLINE TEXT


In [5]:
# Get a list of column names and types
columns = inspector.get_columns('airports')
for c in columns:
    print(c['name'], c["type"])
# columns

IATA_CODE TEXT
AIRPORT TEXT
CITY TEXT
STATE TEXT
COUNTRY TEXT
LATITUDE TEXT
LONGITUDE TEXT


In [6]:
# Get a list of column names and types
columns = inspector.get_columns('flights')
for c in columns:
    print(c['name'], c["type"])
# columns

YEAR TEXT
MONTH TEXT
DAY TEXT
DAY_OF_WEEK TEXT
AIRLINE TEXT
FLIGHT_NUMBER TEXT
TAIL_NUMBER TEXT
ORIGIN_AIRPORT TEXT
DESTINATION_AIRPORT TEXT
SCHEDULED_DEPARTURE TEXT
DEPARTURE_TIME TEXT
DEPARTURE_DELAY TEXT
TAXI_OUT TEXT
WHEELS_OFF TEXT
SCHEDULED_TIME TEXT
ELAPSED_TIME TEXT
AIR_TIME TEXT
DISTANCE TEXT
WHEELS_ON TEXT
TAXI_IN TEXT
SCHEDULED_ARRIVAL TEXT
ARRIVAL_TIME TEXT
ARRIVAL_DELAY TEXT
DIVERTED TEXT
CANCELLED TEXT
CANCELLATION_REASON TEXT
AIR_SYSTEM_DELAY TEXT
SECURITY_DELAY TEXT
AIRLINE_DELAY TEXT
LATE_AIRCRAFT_DELAY TEXT
WEATHER_DELAY TEXT


# Cancellations by Airline

In [7]:
#find the number of flights cancelled for which the airline was responsible (i.e. not weather or security related etc.)
cancelled_count_by_airline = pd.read_sql_query('SELECT a.AIRLINE as `Airline Name`, a.IATA_CODE as `Airline Code`,\
                                        SUM(f.CANCELLED) as `SUM_CANCELLED`\
                                        FROM airlines as a\
                                        INNER JOIN flights as f ON a.IATA_CODE = f.AIRLINE\
                                        WHERE CANCELLATION_REASON = "A"\
                                        GROUP BY f.AIRLINE\
                                        ORDER BY(SUM_CANCELLED) desc;', con = engine)
cancelled_count_by_airline

Unnamed: 0,Airline Name,Airline Code,SUM_CANCELLED
0,Southwest Airlines Co.,WN,6122
1,Atlantic Southeast Airlines,EV,3604
2,Skywest Airlines Inc.,OO,3205
3,American Airlines Inc.,AA,2879
4,United Air Lines Inc.,UA,2870
5,American Eagle Airlines Inc.,MQ,2475
6,US Airways Inc.,US,1007
7,JetBlue Airways,B6,883
8,Spirit Air Lines,NK,654
9,Delta Air Lines Inc.,DL,594


In [8]:
#insert this table into the sqlite database to perform further queries
cancelled_count_by_airline.to_sql(name='cancelled_count_by_airline', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airlines', 'airports', 'cancelled_count_by_airline', 'flights']

In [9]:
#combine the avg delay to show the actual airline as opposed to the shortcode only 
cancelled_count = pd.read_sql_query('SELECT airlines.AIRLINE, airlines.IATA_CODE, flights.CANCELLED as total_cancelled\
                                     FROM airlines \
                                     JOIN flights \
                                     ON airlines.IATA_code = flights.AIRLINE\
                                     WHERE flights.CANCELLED = 1;', con = engine)

#insert this table into the sqlite database to perform further queries
cancelled_count.to_sql(name='cancelled_count', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airlines',
 'airports',
 'cancelled_count',
 'cancelled_count_by_airline',
 'flights']

In [10]:
cancelled_count.head()

Unnamed: 0,AIRLINE,IATA_CODE,total_cancelled
0,Alaska Airlines Inc.,AS,1
1,American Airlines Inc.,AA,1
2,Skywest Airlines Inc.,OO,1
3,American Eagle Airlines Inc.,MQ,1
4,Skywest Airlines Inc.,OO,1


In [11]:
#Get an idea on which airlines cancel the most flights
total_cancelled = pd.read_sql_query('SELECT AIRLINE, IATA_CODE, SUM(total_cancelled) as cancelled_flights\
                                     FROM cancelled_count \
                                     GROUP BY AIRLINE\
                                     ORDER BY SUM(total_cancelled) desc;', con = engine)
total_cancelled

Unnamed: 0,AIRLINE,IATA_CODE,cancelled_flights
0,Southwest Airlines Co.,WN,16043
1,Atlantic Southeast Airlines,EV,15231
2,American Eagle Airlines Inc.,MQ,15025
3,American Airlines Inc.,AA,10919
4,Skywest Airlines Inc.,OO,9960
5,United Air Lines Inc.,UA,6573
6,JetBlue Airways,B6,4276
7,US Airways Inc.,US,4067
8,Delta Air Lines Inc.,DL,3824
9,Spirit Air Lines,NK,2004


In [12]:
#insert this table into the sqlite database to perform further queries
total_cancelled.to_sql(name='total_cancelled', con=engine, if_exists='replace', index=False)
#drop the previous table to keep the database clean
engine.execute("DROP TABLE cancelled_count")
#check it worked
engine.table_names()

['airlines',
 'airports',
 'cancelled_count_by_airline',
 'flights',
 'total_cancelled']

In [13]:
#get the total count of flights to get a ratio of total cancelled
#Get an idea on which airlines cancel the most flights
total_flights = pd.read_sql_query("SELECT AIRLINE, COUNT(AIRLINE) as flight_totals\
                                     FROM flights \
                                     GROUP BY AIRLINE", con = engine)
#insert this table into the sqlite database to perform further queries
total_flights.to_sql(name='total_flights', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airlines',
 'airports',
 'cancelled_count_by_airline',
 'flights',
 'total_cancelled',
 'total_flights']

In [14]:
total_flights

Unnamed: 0,AIRLINE,flight_totals
0,AA,725984
1,AS,172521
2,B6,267048
3,DL,875881
4,EV,571977
5,F9,90836
6,HA,76272
7,MQ,294632
8,NK,117379
9,OO,588353


In [15]:
#merge the total cancellation and total number of flights to get the cancellation ratio
#combine the avg delay to show the actual airline as opposed to the shortcode only 
cancelled_and_count = pd.read_sql_query("SELECT total_cancelled.AIRLINE, total_cancelled.IATA_CODE,\
                                        total_cancelled.cancelled_flights,total_flights.flight_totals \
                                        FROM total_cancelled JOIN total_flights \
                                        ON total_cancelled.IATA_CODE = total_flights.AIRLINE", con = engine)
cancelled_and_count

Unnamed: 0,AIRLINE,IATA_CODE,cancelled_flights,flight_totals
0,Southwest Airlines Co.,WN,16043,1261855
1,Atlantic Southeast Airlines,EV,15231,571977
2,American Eagle Airlines Inc.,MQ,15025,294632
3,American Airlines Inc.,AA,10919,725984
4,Skywest Airlines Inc.,OO,9960,588353
5,United Air Lines Inc.,UA,6573,515723
6,JetBlue Airways,B6,4276,267048
7,US Airways Inc.,US,4067,198715
8,Delta Air Lines Inc.,DL,3824,875881
9,Spirit Air Lines,NK,2004,117379


In [16]:
#clean up in pandas for ratio
cancelled_and_count['cancellation ratio'] = round(cancelled_and_count['cancelled_flights'] / cancelled_and_count['flight_totals'] *100,2)
cancelled_and_count = cancelled_and_count.sort_values(['cancellation ratio'])
cancelled_and_count

Unnamed: 0,AIRLINE,IATA_CODE,cancelled_flights,flight_totals,cancellation ratio
13,Hawaiian Airlines Inc.,HA,171,76272,0.22
10,Alaska Airlines Inc.,AS,669,172521,0.39
8,Delta Air Lines Inc.,DL,3824,875881,0.44
11,Frontier Airlines Inc.,F9,588,90836,0.65
12,Virgin America,VX,534,61903,0.86
0,Southwest Airlines Co.,WN,16043,1261855,1.27
5,United Air Lines Inc.,UA,6573,515723,1.27
3,American Airlines Inc.,AA,10919,725984,1.5
6,JetBlue Airways,B6,4276,267048,1.6
4,Skywest Airlines Inc.,OO,9960,588353,1.69


In [17]:
#insert this table into the sqlite database to perform further queries
cancelled_and_count.to_sql(name='cancelled_and_count', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airlines',
 'airports',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'flights',
 'total_cancelled',
 'total_flights']

In [18]:
#remove dbs which have double information we used in transformation for a tidy database
engine.execute("DROP TABLE total_cancelled")

<sqlalchemy.engine.result.ResultProxy at 0x29df2283390>

In [19]:
#check it worked
engine.table_names()

['airlines',
 'airports',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'flights',
 'total_flights']

# Average Delay by Airline

In [20]:
#check out the average delay on airline to see what airlines have the longest delays so we can avoid them
avg_airline_delays = pd.read_sql_query('SELECT a.AIRLINE as `Airline Name`, a.IATA_CODE as `Airline Code`,\
                  COUNT(f.AIRLINE) as `Total Flights not cancelled`,\
                  AVG(f.DEPARTURE_DELAY) as `Avg DEPARTURE_DELAY`,\
                  AVG(f.ARRIVAL_DELAY) as `Avg ARRIVAL_DELAY`\
                  FROM airlines as a\
                  INNER JOIN flights as f ON a.IATA_CODE = f.AIRLINE\
                  WHERE f.CANCELLED = 0\
                  GROUP BY f.AIRLINE\
                  ORDER BY(`AVG DEPARTURE_DELAY`) desc;', con = engine)

In [21]:
#double check the table
avg_airline_delays

Unnamed: 0,Airline Name,Airline Code,Total Flights not cancelled,Avg DEPARTURE_DELAY,Avg ARRIVAL_DELAY
0,Spirit Air Lines,NK,115375,15.911281,14.448971
1,United Air Lines Inc.,UA,509150,14.379658,5.416787
2,Frontier Airlines Inc.,F9,90248,13.31747,12.482814
3,JetBlue Airways,B6,262772,11.499049,6.659309
4,Southwest Airlines Co.,WN,1245812,10.571461,4.362992
5,American Eagle Airlines Inc.,MQ,279607,10.01491,6.439027
6,Virgin America,VX,61369,9.006274,4.728364
7,American Airlines Inc.,AA,715065,8.864029,3.441091
8,Atlantic Southeast Airlines,EV,556746,8.66331,6.561793
9,Skywest Airlines Inc.,OO,578393,7.768222,5.829694


In [22]:
#insert this table into the sqlite database to perform further queries
#Drop the table if it already exists and update it with new data
avg_airline_delays.to_sql(name='avg_airline_delays', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airlines',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'flights',
 'total_flights']

In [23]:
#Check which airlines experience the most departure delays, do not include any delays 0 or under as those may have experienced some type of delay
#but were on time in the end
airline_departure_delay_counts = pd.read_sql_query('SELECT a.AIRLINE as `Airline_Name`, a.IATA_CODE as `Airline_Code`,\
                       COUNT(f.DEPARTURE_DELAY) as `Count_DEPARTURE_DELAY`\
                       FROM airlines as a\
                       INNER JOIN flights as f ON a.IATA_CODE = f.AIRLINE\
                       WHERE f.DEPARTURE_DELAY <> 0\
                       GROUP BY f.AIRLINE\
                       ORDER BY(`Count_DEPARTURE_DELAY`) desc;', con = engine)
airline_departure_delay_counts

Unnamed: 0,Airline_Name,Airline_Code,Count_DEPARTURE_DELAY
0,Southwest Airlines Co.,WN,1170316
1,Delta Air Lines Inc.,DL,808748
2,American Airlines Inc.,AA,692329
3,Skywest Airlines Inc.,OO,561209
4,Atlantic Southeast Airlines,EV,550173
5,United Air Lines Inc.,UA,487593
6,American Eagle Airlines Inc.,MQ,279588
7,JetBlue Airways,B6,253426
8,US Airways Inc.,US,189487
9,Alaska Airlines Inc.,AS,165430


In [24]:
#insert this table into the sqlite database to perform further queries
#Drop the table if it already exists and update it with new data
airline_departure_delay_counts.to_sql(name='airline_departure_delay_counts', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airline_departure_delay_counts',
 'airlines',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'flights',
 'total_flights']

In [25]:
#add the total number of flights to the table to get a delay ratio
airline_departure_delay_ratio = pd.read_sql_query("SELECT airline_departure_delay_counts.Airline_Name, airline_departure_delay_counts.Airline_Code,\
                                        airline_departure_delay_counts.Count_DEPARTURE_DELAY, total_flights.flight_totals\
                                        FROM airline_departure_delay_counts JOIN total_flights \
                                        ON airline_departure_delay_counts.Airline_Code = total_flights.AIRLINE", con = engine)
airline_departure_delay_ratio

Unnamed: 0,Airline_Name,Airline_Code,Count_DEPARTURE_DELAY,flight_totals
0,Southwest Airlines Co.,WN,1170316,1261855
1,Delta Air Lines Inc.,DL,808748,875881
2,American Airlines Inc.,AA,692329,725984
3,Skywest Airlines Inc.,OO,561209,588353
4,Atlantic Southeast Airlines,EV,550173,571977
5,United Air Lines Inc.,UA,487593,515723
6,American Eagle Airlines Inc.,MQ,279588,294632
7,JetBlue Airways,B6,253426,267048
8,US Airways Inc.,US,189487,198715
9,Alaska Airlines Inc.,AS,165430,172521


In [26]:
#clean up in pandas for ratio
airline_departure_delay_ratio['departure_delay_ratio'] = round(airline_departure_delay_ratio['Count_DEPARTURE_DELAY'] / airline_departure_delay_ratio['flight_totals'] *100,2)
airline_departure_delay_ratio = airline_departure_delay_ratio.sort_values(['departure_delay_ratio'])
airline_departure_delay_ratio

Unnamed: 0,Airline_Name,Airline_Code,Count_DEPARTURE_DELAY,flight_totals,departure_delay_ratio
1,Delta Air Lines Inc.,DL,808748,875881,92.34
0,Southwest Airlines Co.,WN,1170316,1261855,92.75
13,Virgin America,VX,58020,61903,93.73
5,United Air Lines Inc.,UA,487593,515723,94.55
6,American Eagle Airlines Inc.,MQ,279588,294632,94.89
7,JetBlue Airways,B6,253426,267048,94.9
2,American Airlines Inc.,AA,692329,725984,95.36
8,US Airways Inc.,US,189487,198715,95.36
3,Skywest Airlines Inc.,OO,561209,588353,95.39
9,Alaska Airlines Inc.,AS,165430,172521,95.89


In [27]:
#insert this table into the sqlite database to perform further queries
#Drop the table if it already exists and update it with new data
airline_departure_delay_ratio.to_sql(name='airline_departure_delay_ratio', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airline_departure_delay_counts',
 'airline_departure_delay_ratio',
 'airlines',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'flights',
 'total_flights']

In [28]:
#Check which airlines experience the most arrival delays, do not include any delays 0 or under as those may have experienced some type of delay
#but were on time in the end
airline_arrival_delay_counts = pd.read_sql_query('SELECT a.AIRLINE as `Airline_Name`, a.IATA_CODE as `Airline_Code`,COUNT(f.ARRIVAL_DELAY) as `Count_ARRIVAL_DELAY`\
                    FROM airlines as a\
                    INNER JOIN flights as f ON a.IATA_CODE = f.AIRLINE\
                    WHERE f.ARRIVAL_DELAY <> 0\
                    GROUP BY f.AIRLINE\
                    ORDER BY(`Count_ARRIVAL_DELAY`) desc;', con = engine)

airline_arrival_delay_counts

Unnamed: 0,Airline_Name,Airline_Code,Count_ARRIVAL_DELAY
0,Southwest Airlines Co.,WN,1232551
1,Delta Air Lines Inc.,DL,858305
2,American Airlines Inc.,AA,711895
3,Skywest Airlines Inc.,OO,573941
4,Atlantic Southeast Airlines,EV,558743
5,United Air Lines Inc.,UA,506347
6,American Eagle Airlines Inc.,MQ,289448
7,JetBlue Airways,B6,262174
8,US Airways Inc.,US,194176
9,Alaska Airlines Inc.,AS,168197


In [29]:
#insert this table into the sqlite database to perform further queries
#Drop the table if it already exists and update it with new data
airline_arrival_delay_counts.to_sql(name='airline_arrival_delay_counts', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airline_arrival_delay_counts',
 'airline_departure_delay_counts',
 'airline_departure_delay_ratio',
 'airlines',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'flights',
 'total_flights']

In [30]:
airline_arrival_delay_ratio = pd.read_sql_query("SELECT airline_arrival_delay_counts.Airline_Name, airline_arrival_delay_counts.Airline_Code,\
                                        airline_arrival_delay_counts.Count_ARRIVAL_DELAY, total_flights.flight_totals\
                                        FROM airline_arrival_delay_counts JOIN total_flights \
                                        ON airline_arrival_delay_counts.Airline_Code = total_flights.AIRLINE", con = engine)
airline_arrival_delay_ratio

Unnamed: 0,Airline_Name,Airline_Code,Count_ARRIVAL_DELAY,flight_totals
0,Southwest Airlines Co.,WN,1232551,1261855
1,Delta Air Lines Inc.,DL,858305,875881
2,American Airlines Inc.,AA,711895,725984
3,Skywest Airlines Inc.,OO,573941,588353
4,Atlantic Southeast Airlines,EV,558743,571977
5,United Air Lines Inc.,UA,506347,515723
6,American Eagle Airlines Inc.,MQ,289448,294632
7,JetBlue Airways,B6,262174,267048
8,US Airways Inc.,US,194176,198715
9,Alaska Airlines Inc.,AS,168197,172521


In [31]:
#clean up in pandas for ratio
airline_arrival_delay_ratio['arrival_delay_ratio'] = round(airline_arrival_delay_ratio['Count_ARRIVAL_DELAY'] / airline_arrival_delay_ratio['flight_totals'] *100,2)
airline_arrival_delay_ratio = airline_arrival_delay_ratio.sort_values(['arrival_delay_ratio'])
airline_arrival_delay_ratio

Unnamed: 0,Airline_Name,Airline_Code,Count_ARRIVAL_DELAY,flight_totals,arrival_delay_ratio
12,Hawaiian Airlines Inc.,HA,72955,76272,95.65
13,Virgin America,VX,60317,61903,97.44
9,Alaska Airlines Inc.,AS,168197,172521,97.49
3,Skywest Airlines Inc.,OO,573941,588353,97.55
0,Southwest Airlines Co.,WN,1232551,1261855,97.68
4,Atlantic Southeast Airlines,EV,558743,571977,97.69
8,US Airways Inc.,US,194176,198715,97.72
11,Frontier Airlines Inc.,F9,88851,90836,97.81
10,Spirit Air Lines,NK,114966,117379,97.94
1,Delta Air Lines Inc.,DL,858305,875881,97.99


In [32]:
#insert this table into the sqlite database to perform further queries
#Drop the table if it already exists and update it with new data
airline_arrival_delay_ratio.to_sql(name='airline_arrival_delay_ratio', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airline_arrival_delay_counts',
 'airline_arrival_delay_ratio',
 'airline_departure_delay_counts',
 'airline_departure_delay_ratio',
 'airlines',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'flights',
 'total_flights']

In [33]:
airline_avg_DeptAndArr_TotalFlights = pd.read_sql_query('SELECT a.AIRLINE as `Airline Name`, a.IATA_CODE as `Airline Code`,\
                                                 COUNT(f.AIRLINE) as `Total Flights`,\
                                                 AVG(f.DEPARTURE_DELAY) as `Avg DEPARTURE_DELAY`,\
                                                 AVG(f.ARRIVAL_DELAY) as `Avg ARRIVAL_DELAY`\
                                                 FROM airlines as a\
                                                 INNER JOIN flights as f ON a.IATA_CODE = f.AIRLINE\
                                                 WHERE f.CANCELLED = 0 AND AIRLINE_DELAY <> 0\
                                                 GROUP BY f.AIRLINE\
                                                 ORDER BY(`Total Flights`) desc;', con = engine)
airline_avg_DeptAndArr_TotalFlights

Unnamed: 0,Airline Name,Airline Code,Total Flights,Avg DEPARTURE_DELAY,Avg ARRIVAL_DELAY
0,Southwest Airlines Co.,WN,1153710,7.506802,0.489224
1,Delta Air Lines Inc.,DL,817162,4.871584,-3.544768
2,American Airlines Inc.,AA,653442,5.770684,-1.40131
3,Skywest Airlines Inc.,OO,509725,2.385777,-0.828021
4,Atlantic Southeast Airlines,EV,501505,4.420486,1.406365
5,United Air Lines Inc.,UA,470427,10.791878,0.41155
6,American Eagle Airlines Inc.,MQ,246296,5.061938,-0.934376
7,JetBlue Airways,B6,242262,8.240459,1.944692
8,US Airways Inc.,US,177816,3.878684,-0.428044
9,Alaska Airlines Inc.,AS,157863,-1.025408,-4.937959


In [34]:
#insert this table into the sqlite database to perform further queries
#Drop the table if it already exists and update it with new data
airline_avg_DeptAndArr_TotalFlights.to_sql(name='airline_avg_DeptAndArr_TotalFlights', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airline_arrival_delay_counts',
 'airline_arrival_delay_ratio',
 'airline_avg_DeptAndArr_TotalFlights',
 'airline_departure_delay_counts',
 'airline_departure_delay_ratio',
 'airlines',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'flights',
 'total_flights']

In [35]:
#count the number of departure delay incidents by airline
count_departure_delay = pd.read_sql_query('SELECT a.AIRLINE as `Airline Name`, a.IATA_CODE as `Airline Code`,COUNT(f.DEPARTURE_DELAY) as `Count DEPARTURE_DELAY`\
                                        FROM airlines as a\
                                        INNER JOIN flights as f ON a.IATA_CODE = f.AIRLINE\
                                        WHERE f.DEPARTURE_DELAY <> 0 AND AIRLINE_DELAY <> 0\
                                        GROUP BY f.AIRLINE\
                                        ORDER BY(`Count DEPARTURE_DELAY`) desc;', con = engine)
count_departure_delay

Unnamed: 0,Airline Name,Airline Code,Count DEPARTURE_DELAY
0,Southwest Airlines Co.,WN,1080653
1,Delta Air Lines Inc.,DL,756467
2,American Airlines Inc.,AA,632995
3,Atlantic Southeast Airlines,EV,496694
4,Skywest Airlines Inc.,OO,494568
5,United Air Lines Inc.,UA,450084
6,American Eagle Airlines Inc.,MQ,247495
7,JetBlue Airways,B6,233921
8,US Airways Inc.,US,173514
9,Alaska Airlines Inc.,AS,151904


In [36]:
#insert this table into the sqlite database to perform further queries
#Drop the table if it already exists and update it with new data
count_departure_delay.to_sql(name='count_departure_delay', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airline_arrival_delay_counts',
 'airline_arrival_delay_ratio',
 'airline_avg_DeptAndArr_TotalFlights',
 'airline_departure_delay_counts',
 'airline_departure_delay_ratio',
 'airlines',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'count_departure_delay',
 'flights',
 'total_flights']

In [37]:
#count the number of departure delay incidents by airline
count_arrival_delay = pd.read_sql_query('SELECT a.AIRLINE as `Airline Name`, a.IATA_CODE as `Airline Code`,\
                                            COUNT(f.ARRIVAL_DELAY) as `Count ARRIVAL_DELAY`\
                                            FROM airlines as a\
                                            INNER JOIN flights as f ON a.IATA_CODE = f.AIRLINE\
                                            WHERE f.ARRIVAL_DELAY <> 0 AND AIRLINE_DELAY <> 0\
                                            GROUP BY f.AIRLINE\
                                            ORDER BY(`Count ARRIVAL_DELAY`) desc;', con = engine)
count_arrival_delay

Unnamed: 0,Airline Name,Airline Code,Count ARRIVAL_DELAY
0,Southwest Airlines Co.,WN,1140449
1,Delta Air Lines Inc.,DL,803410
2,American Airlines Inc.,AA,650272
3,Skywest Airlines Inc.,OO,505273
4,Atlantic Southeast Airlines,EV,503502
5,United Air Lines Inc.,UA,467624
6,American Eagle Airlines Inc.,MQ,256137
7,JetBlue Airways,B6,241664
8,US Airways Inc.,US,177344
9,Alaska Airlines Inc.,AS,154208


In [38]:
#insert this table into the sqlite database to perform further queries
#Drop the table if it already exists and update it with new data
count_arrival_delay.to_sql(name='count_arrival_delay', con=engine, if_exists='replace', index=False)
#check it worked
engine.table_names()

['airline_arrival_delay_counts',
 'airline_arrival_delay_ratio',
 'airline_avg_DeptAndArr_TotalFlights',
 'airline_departure_delay_counts',
 'airline_departure_delay_ratio',
 'airlines',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'count_arrival_delay',
 'count_departure_delay',
 'flights',
 'total_flights']

# Airport Cancellations

In [39]:
# Query airport cancellations from origin and departure destination, total and situational
airport_cancellations_df = pd.read_sql_query('''SELECT O.ORIGIN_AIRPORT AS IATA_Code, 
A.AIRPORT AS Airport, A.CITY AS City, A.STATE AS State, O.TOTAL AS Cancellations_As_Origin, 
D.TOTAL AS Cancellations_As_Destination, WO.TOTAL AS Weather_Cancellations_Origin, 
CO.TOTAL AS Air_Traffic_Cancellations_Origin, YO.TOTAL AS Security_Cancellations_Origin, 
WD.TOTAL AS Weather_Cancellations_Destination, 
CD.TOTAL AS Air_Traffic_Cancellations_Destination 
FROM 
    (SELECT ORIGIN_AIRPORT, COUNT(CANCELLED) AS TOTAL 
    FROM flights 
    WHERE CANCELLED == '1' 
    GROUP BY ORIGIN_AIRPORT) AS O 
LEFT JOIN 
    (SELECT ORIGIN_AIRPORT, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'B' 
    GROUP BY ORIGIN_AIRPORT) AS WO 
    ON O.ORIGIN_AIRPORT == WO.ORIGIN_AIRPORT 
LEFT JOIN 
    (SELECT ORIGIN_AIRPORT, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'C' GROUP BY ORIGIN_AIRPORT) AS CO 
    ON O.ORIGIN_AIRPORT == CO.ORIGIN_AIRPORT 
LEFT JOIN 
    (SELECT ORIGIN_AIRPORT, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'D' 
    GROUP BY ORIGIN_AIRPORT) AS YO 
    ON O.ORIGIN_AIRPORT == YO.ORIGIN_AIRPORT 
LEFT JOIN 
    (SELECT DESTINATION_AIRPORT, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLED == '1' 
    GROUP BY DESTINATION_AIRPORT) AS D 
    ON O.ORIGIN_AIRPORT==D.DESTINATION_AIRPORT 
LEFT JOIN 
    (SELECT DESTINATION_AIRPORT, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'B' 
    GROUP BY DESTINATION_AIRPORT) AS WD 
    ON O.ORIGIN_AIRPORT == WD.DESTINATION_AIRPORT 
LEFT JOIN 
    (SELECT DESTINATION_AIRPORT, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'C' GROUP BY DESTINATION_AIRPORT) AS CD 
    ON O.ORIGIN_AIRPORT == CD.DESTINATION_AIRPORT 
LEFT JOIN 
    (SELECT * FROM airports) AS A 
    ON O.ORIGIN_AIRPORT == A.IATA_CODE 
ORDER BY O.TOTAL DESC;''', con = engine)

# View to verify
airport_cancellations_df.head()

Unnamed: 0,IATA_Code,Airport,City,State,Cancellations_As_Origin,Cancellations_As_Destination,Weather_Cancellations_Origin,Air_Traffic_Cancellations_Origin,Security_Cancellations_Origin,Weather_Cancellations_Destination,Air_Traffic_Cancellations_Destination
0,ORD,Chicago O'Hare International Airport,Chicago,IL,8548,9273.0,4769.0,2334.0,,4934.0,2442.0
1,DFW,Dallas/Fort Worth International Airport,Dallas-Fort Worth,TX,6254,6749.0,4664.0,135.0,,4745.0,153.0
2,LGA,LaGuardia Airport (Marine Air Terminal),New York,NY,4531,4418.0,2191.0,1197.0,1.0,2090.0,1199.0
3,EWR,Newark Liberty International Airport,Newark,NJ,3110,3350.0,1144.0,1597.0,,1222.0,1676.0
4,BOS,Gen. Edward Lawrence Logan International Airport,Boston,MA,2654,2658.0,1882.0,333.0,,1873.0,329.0


In [40]:
# Find total cancellations
TOTAL_CANCELLED = conn.execute("""SELECT COUNT(CANCELLED) FROM flights WHERE CANCELLED == '1';""").fetchall()
TOTAL_CANCELLED = TOTAL_CANCELLED[0][0]

# Calculate and create origin & destination percentage columns
airport_cancellations_df['Cancellations(Origin_or_Destination)'] = airport_cancellations_df['Cancellations_As_Destination'] + airport_cancellations_df['Cancellations_As_Origin']
airport_cancellations_df['%_Cancellations_Origin'] = airport_cancellations_df['Cancellations_As_Origin']/TOTAL_CANCELLED*100
airport_cancellations_df['%_Cancellations_Origin'] = round(airport_cancellations_df['%_Cancellations_Origin'], 2)
airport_cancellations_df['%_Cancellations_Destination'] = airport_cancellations_df['Cancellations_As_Destination']/TOTAL_CANCELLED*100
airport_cancellations_df['%_Cancellations_Destination'] = round(airport_cancellations_df['%_Cancellations_Destination'], 2)

# View to verify
airport_cancellations_df.head()

Unnamed: 0,IATA_Code,Airport,City,State,Cancellations_As_Origin,Cancellations_As_Destination,Weather_Cancellations_Origin,Air_Traffic_Cancellations_Origin,Security_Cancellations_Origin,Weather_Cancellations_Destination,Air_Traffic_Cancellations_Destination,Cancellations(Origin_or_Destination),%_Cancellations_Origin,%_Cancellations_Destination
0,ORD,Chicago O'Hare International Airport,Chicago,IL,8548,9273.0,4769.0,2334.0,,4934.0,2442.0,17821.0,9.51,10.32
1,DFW,Dallas/Fort Worth International Airport,Dallas-Fort Worth,TX,6254,6749.0,4664.0,135.0,,4745.0,153.0,13003.0,6.96,7.51
2,LGA,LaGuardia Airport (Marine Air Terminal),New York,NY,4531,4418.0,2191.0,1197.0,1.0,2090.0,1199.0,8949.0,5.04,4.92
3,EWR,Newark Liberty International Airport,Newark,NJ,3110,3350.0,1144.0,1597.0,,1222.0,1676.0,6460.0,3.46,3.73
4,BOS,Gen. Edward Lawrence Logan International Airport,Boston,MA,2654,2658.0,1882.0,333.0,,1873.0,329.0,5312.0,2.95,2.96


In [41]:
# Calculate and create perncetage situational columns
airport_cancellations_df['%_Weather_Cancellations_Origin'] = airport_cancellations_df['Weather_Cancellations_Origin']/airport_cancellations_df['Cancellations_As_Origin'] * 100
airport_cancellations_df['%_Weather_Cancellations_Origin'] = round(airport_cancellations_df['%_Weather_Cancellations_Origin'], 2)
airport_cancellations_df['%_Weather_Cancellations_Destination'] = airport_cancellations_df['Weather_Cancellations_Destination']/airport_cancellations_df['Cancellations_As_Destination'] * 100
airport_cancellations_df['%_Weather_Cancellations_Destination'] = round(airport_cancellations_df['%_Weather_Cancellations_Destination'], 2)

airport_cancellations_df['%_Air_Traffic_Cancellations_Origin'] = airport_cancellations_df['Air_Traffic_Cancellations_Origin']/airport_cancellations_df['Cancellations_As_Origin'] * 100
airport_cancellations_df['%_Air_Traffic_Cancellations_Origin'] = round(airport_cancellations_df['%_Air_Traffic_Cancellations_Origin'], 2)
airport_cancellations_df['%_Air_Traffic_Cancellations_Destination'] = airport_cancellations_df['Air_Traffic_Cancellations_Destination']/airport_cancellations_df['Cancellations_As_Destination'] * 100
airport_cancellations_df['%_Air_Traffic_Cancellations_Destination'] = round(airport_cancellations_df['%_Air_Traffic_Cancellations_Destination'], 2)

airport_cancellations_df['%_Security_Cancellations_Origin'] = airport_cancellations_df['Security_Cancellations_Origin']/airport_cancellations_df['Cancellations_As_Origin'] * 100
airport_cancellations_df['%_Security_Cancellations_Origin'] = round(airport_cancellations_df['%_Security_Cancellations_Origin'], 2)

# Reorder dataframe
airport_cancellations_df = airport_cancellations_df[['IATA_Code','Airport','City','State',
                                                     'Cancellations(Origin_or_Destination)',
                                                     'Cancellations_As_Origin',
                                                     '%_Cancellations_Origin', 
                                                     'Cancellations_As_Destination', 
                                                     '%_Cancellations_Destination', 
                                                     'Weather_Cancellations_Origin', 
                                                     '%_Weather_Cancellations_Origin', 
                                                     'Weather_Cancellations_Destination', 
                                                     '%_Weather_Cancellations_Destination', 
                                                     'Air_Traffic_Cancellations_Origin', 
                                                     '%_Air_Traffic_Cancellations_Origin', 
                                                     'Air_Traffic_Cancellations_Destination', 
                                                     '%_Air_Traffic_Cancellations_Destination', 
                                                     'Security_Cancellations_Origin', 
                                                     '%_Security_Cancellations_Origin']]

# Resort and reindex dataframe
airport_cancellations_df = airport_cancellations_df.sort_values('Cancellations(Origin_or_Destination)', ascending=False)
airport_cancellations_df = airport_cancellations_df.reset_index(drop=True)

# View to verify
airport_cancellations_df

Unnamed: 0,IATA_Code,Airport,City,State,Cancellations(Origin_or_Destination),Cancellations_As_Origin,%_Cancellations_Origin,Cancellations_As_Destination,%_Cancellations_Destination,Weather_Cancellations_Origin,%_Weather_Cancellations_Origin,Weather_Cancellations_Destination,%_Weather_Cancellations_Destination,Air_Traffic_Cancellations_Origin,%_Air_Traffic_Cancellations_Origin,Air_Traffic_Cancellations_Destination,%_Air_Traffic_Cancellations_Destination,Security_Cancellations_Origin,%_Security_Cancellations_Origin
0,ORD,Chicago O'Hare International Airport,Chicago,IL,17821.0,8548,9.51,9273.0,10.32,4769.0,55.79,4934.0,53.21,2334.0,27.30,2442.0,26.33,,
1,DFW,Dallas/Fort Worth International Airport,Dallas-Fort Worth,TX,13003.0,6254,6.96,6749.0,7.51,4664.0,74.58,4745.0,70.31,135.0,2.16,153.0,2.27,,
2,LGA,LaGuardia Airport (Marine Air Terminal),New York,NY,8949.0,4531,5.04,4418.0,4.92,2191.0,48.36,2090.0,47.31,1197.0,26.42,1199.0,27.14,1.0,0.02
3,EWR,Newark Liberty International Airport,Newark,NJ,6460.0,3110,3.46,3350.0,3.73,1144.0,36.78,1222.0,36.48,1597.0,51.35,1676.0,50.03,,
4,BOS,Gen. Edward Lawrence Logan International Airport,Boston,MA,5312.0,2654,2.95,2658.0,2.96,1882.0,70.91,1873.0,70.47,333.0,12.55,329.0,12.38,,
5,ATL,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,5272.0,2557,2.84,2715.0,3.02,1707.0,66.76,1775.0,65.38,155.0,6.06,173.0,6.37,,
6,IAH,George Bush Intercontinental Airport,Houston,TX,4562.0,2130,2.37,2432.0,2.71,1200.0,56.34,1294.0,53.21,447.0,20.99,531.0,21.83,,
7,SFO,San Francisco International Airport,San Francisco,CA,4453.0,2148,2.39,2305.0,2.56,900.0,41.90,945.0,41.00,369.0,17.18,376.0,16.31,1.0,0.05
8,DEN,Denver International Airport,Denver,CO,4433.0,2123,2.36,2310.0,2.57,1383.0,65.14,1409.0,61.00,202.0,9.51,231.0,10.00,,
9,LAX,Los Angeles International Airport,Los Angeles,CA,4423.0,2164,2.41,2259.0,2.51,609.0,28.14,676.0,29.92,257.0,11.88,258.0,11.42,,


In [42]:
# Insert into database file
airport_cancellations_df.to_sql(name='airport_cancellations', con=engine, if_exists='replace', index=False)

# Verify if successful
engine.table_names()

['airline_arrival_delay_counts',
 'airline_arrival_delay_ratio',
 'airline_avg_DeptAndArr_TotalFlights',
 'airline_departure_delay_counts',
 'airline_departure_delay_ratio',
 'airlines',
 'airport_cancellations',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'count_arrival_delay',
 'count_departure_delay',
 'flights',
 'total_flights']

# Airport Delays

In [43]:
# Query and create dataframe of airport arrival and departure delay averages and counts
airport_delays_df = pd.read_sql_query('''SELECT O.ORIGIN_AIRPORT AS IATA_Code, A.AIRPORT AS Airport, 
A.CITY AS City, A.STATE AS State, ROUND(O.AVERAGE, 2) AS Average_Departure_Delay, 
O.TOTAL AS Departure_Delay_Count, ROUND(D.AVERAGE, 2) AS Average_Arrival_Delay, 
D.TOTAL AS Arrival_Delay_Count 
FROM 
    (SELECT ORIGIN_AIRPORT, AVG(DEPARTURE_DELAY) AS AVERAGE, COUNT(DEPARTURE_DELAY) AS TOTAL 
    FROM flights 
    WHERE DEPARTURE_DELAY != '' 
    GROUP BY ORIGIN_AIRPORT) AS O 
LEFT JOIN 
    (SELECT DESTINATION_AIRPORT, AVG(ARRIVAL_DELAY) AS AVERAGE, COUNT(ARRIVAL_DELAY) AS TOTAL 
    FROM flights 
    WHERE ARRIVAL_DELAY != '' 
    GROUP BY DESTINATION_AIRPORT) AS D 
ON O.ORIGIN_AIRPORT == D.DESTINATION_AIRPORT 
LEFT JOIN 
    (SELECT * FROM airports) AS A 
ON O.ORIGIN_AIRPORT == A.IATA_CODE 
ORDER BY O.TOTAL DESC;''', con = engine)

# View to verify
airport_delays_df.head()

Unnamed: 0,IATA_Code,Airport,City,State,Average_Departure_Delay,Departure_Delay_Count,Average_Arrival_Delay,Arrival_Delay_Count
0,ATL,Hartsfield-Jackson Atlanta International Airport,Atlanta,GA,9.34,344384,2.23,343076
1,ORD,Chicago O'Hare International Airport,Chicago,IL,14.15,277748,7.24,275864
2,DFW,Dallas/Fort Worth International Airport,Dallas-Fort Worth,TX,11.53,233658,5.87,231764
3,DEN,Denver International Airport,Denver,CO,11.84,194077,5.08,193033
4,LAX,Los Angeles International Airport,Los Angeles,CA,10.67,192585,6.11,192136


In [44]:
# Insert into database file
airport_delays_df.to_sql(name='airport_delays', con=engine, if_exists='replace', index=False)

# Verify if successful
engine.table_names()

['airline_arrival_delay_counts',
 'airline_arrival_delay_ratio',
 'airline_avg_DeptAndArr_TotalFlights',
 'airline_departure_delay_counts',
 'airline_departure_delay_ratio',
 'airlines',
 'airport_cancellations',
 'airport_delays',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'count_arrival_delay',
 'count_departure_delay',
 'flights',
 'total_flights']

# Weekly Average Delays and Cancellation Totals

In [45]:
# Query and create dataframe of weekly arrival and departure delay averages and cancellation counts
weekly_delays_cancellations_df = pd.read_sql_query('''SELECT AV.MONTH AS Month, AV.DAY_OF_WEEK AS Day_Of_Week, 
ROUND(AV.AVERAGE, 2) AS Average_Arrival_Delay, ROUND(DP.AVERAGE, 2) AS Average_Departure_Delay, 
ROUND(RDP.AVERAGE, 2) AS Average_Air_System_Delay, ROUND(SDP.AVERAGE, 2) AS Average_Security_Delay, 
ROUND(ADP.AVERAGE, 2) AS Average_Airline_Delay, ROUND(LDP.AVERAGE, 2) AS Average_Late_Airline_Delay, 
ROUND(WDP.AVERAGE, 2) AS Average_Weather_Delay, TC.TOTAL AS Total_Cancellations, 
CC.TOTAL AS Total_Carrier_Cancellations, WC.TOTAL AS Total_Weather_Cancellations, 
AC.TOTAL AS Total_Air_Traffic_Cancellations, SC.TOTAL AS Total_Security_Cancellations 
FROM
    (SELECT MONTH, DAY_OF_WEEK, AVG(ARRIVAL_DELAY) AS AVERAGE FROM flights 
    WHERE ARRIVAL_DELAY != '' 
    GROUP BY MONTH, DAY_OF_WEEK) AS AV 
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, AVG(DEPARTURE_DELAY) AS AVERAGE FROM flights 
    WHERE DEPARTURE_DELAY != '' 
    GROUP BY MONTH, DAY_OF_WEEK) AS DP 
USING(MONTH, DAY_OF_WEEK) 
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, AVG(AIR_SYSTEM_DELAY) AS AVERAGE FROM flights 
    WHERE AIR_SYSTEM_DELAY != '' 
    GROUP BY MONTH, DAY_OF_WEEK) AS RDP 
USING(MONTH, DAY_OF_WEEK) 
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, AVG(SECURITY_DELAY) AS AVERAGE FROM flights 
    WHERE SECURITY_DELAY != '' 
    GROUP BY MONTH, DAY_OF_WEEK) AS SDP 
USING(MONTH, DAY_OF_WEEK) 
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, AVG(AIRLINE_DELAY) AS AVERAGE FROM flights 
    WHERE AIRLINE_DELAY != '' 
    GROUP BY MONTH, DAY_OF_WEEK) AS ADP 
USING(MONTH, DAY_OF_WEEK)
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, AVG(LATE_AIRCRAFT_DELAY) AS AVERAGE FROM flights 
    WHERE LATE_AIRCRAFT_DELAY != '' 
    GROUP BY MONTH, DAY_OF_WEEK) AS LDP 
USING(MONTH, DAY_OF_WEEK)
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, AVG(WEATHER_DELAY) AS AVERAGE FROM flights 
    WHERE WEATHER_DELAY != '' 
    GROUP BY MONTH, DAY_OF_WEEK) AS WDP 
USING(MONTH, DAY_OF_WEEK)
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLED == '1' 
    GROUP BY MONTH, DAY_OF_WEEK) AS TC 
USING (MONTH, DAY_OF_WEEK) 
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'A' 
    GROUP BY MONTH, DAY_OF_WEEK) AS CC 
USING(MONTH, DAY_OF_WEEK) 
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'B' 
    GROUP BY MONTH, DAY_OF_WEEK) AS WC 
USING(MONTH, DAY_OF_WEEK) 
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'C' 
    GROUP BY MONTH, DAY_OF_WEEK) AS AC 
USING(MONTH, DAY_OF_WEEK) 
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'D' 
    GROUP BY MONTH, DAY_OF_WEEK) AS SC 
USING(MONTH, DAY_OF_WEEK)
ORDER BY AV.MONTH, AV.DAY_OF_WEEK;''', con = engine)

# Transform Month and Day_Of_Week from string to integer, re-sort and reset index
weekly_delays_cancellations_df['Month'] = [int(x) for x in weekly_delays_cancellations_df['Month']]
weekly_delays_cancellations_df['Day_Of_Week'] = [int(x) for x in weekly_delays_cancellations_df['Day_Of_Week']]
weekly_delays_cancellations_df = weekly_delays_cancellations_df.sort_values(['Month', 'Day_Of_Week'])
weekly_delays_cancellations_df = weekly_delays_cancellations_df.reset_index(drop=True)

# View to verify
weekly_delays_cancellations_df.head()

Unnamed: 0,Month,Day_Of_Week,Average_Arrival_Delay,Average_Departure_Delay,Average_Air_System_Delay,Average_Security_Delay,Average_Airline_Delay,Average_Late_Airline_Delay,Average_Weather_Delay,Total_Cancellations,Total_Carrier_Cancellations,Total_Weather_Cancellations,Total_Air_Traffic_Cancellations,Total_Security_Cancellations
0,1,1,9.49,12.72,14.51,0.04,16.78,23.66,3.21,2539,593,1638,308,
1,1,2,4.36,8.33,14.35,0.06,18.02,23.07,3.08,3568,443,2755,370,
2,1,3,2.22,6.5,13.61,0.04,16.89,19.87,3.1,1342,288,869,185,
3,1,4,3.74,8.03,12.8,0.08,18.13,20.36,3.34,1570,496,758,316,
4,1,5,5.86,9.56,12.51,0.04,16.74,21.91,2.06,1008,422,343,243,


In [46]:
# Insert into database file
weekly_delays_cancellations_df.to_sql(name='weekly_delays_cancellations', con=engine, if_exists='replace', index=False)

# Verify if successful
engine.table_names()

['airline_arrival_delay_counts',
 'airline_arrival_delay_ratio',
 'airline_avg_DeptAndArr_TotalFlights',
 'airline_departure_delay_counts',
 'airline_departure_delay_ratio',
 'airlines',
 'airport_cancellations',
 'airport_delays',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'count_arrival_delay',
 'count_departure_delay',
 'flights',
 'total_flights',
 'weekly_delays_cancellations']

# Monthly Average Delays and Cancellation Counts

In [47]:
# Query and create dataframe of monthly arrival and departure delay averages and cancellation counts
month_delays_cancellations_df = pd.read_sql_query('''SELECT AV.MONTH AS Month, 
ROUND(AV.AVERAGE, 2) AS Average_Arrival_Delay, ROUND(DP.AVERAGE, 2) AS Average_Departure_Delay, 
ROUND(RDP.AVERAGE, 2) AS Average_Air_System_Delay, ROUND(SDP.AVERAGE, 2) AS Average_Security_Delay, 
ROUND(ADP.AVERAGE, 2) AS Average_Airline_Delay, ROUND(LDP.AVERAGE, 2) AS Average_Late_Airline_Delay, 
ROUND(WDP.AVERAGE, 2) AS Average_Weather_Delay, TC.TOTAL AS Total_Cancellations, 
CC.TOTAL AS Total_Carrier_Cancellations, WC.TOTAL AS Total_Weather_Cancellations, 
AC.TOTAL AS Total_Air_Traffic_Cancellations, SC.TOTAL AS Total_Security_Cancellations 
FROM
    (SELECT MONTH, AVG(ARRIVAL_DELAY) AS AVERAGE FROM flights 
    WHERE ARRIVAL_DELAY != '' 
    GROUP BY MONTH) AS AV 
LEFT JOIN 
    (SELECT MONTH, AVG(DEPARTURE_DELAY) AS AVERAGE FROM flights 
    WHERE DEPARTURE_DELAY != '' 
    GROUP BY MONTH) AS DP 
USING(MONTH) 
LEFT JOIN 
    (SELECT MONTH, AVG(AIR_SYSTEM_DELAY) AS AVERAGE FROM flights 
    WHERE AIR_SYSTEM_DELAY != '' 
    GROUP BY MONTH) AS RDP 
USING(MONTH) 
LEFT JOIN 
    (SELECT MONTH, AVG(SECURITY_DELAY) AS AVERAGE FROM flights 
    WHERE SECURITY_DELAY != '' 
    GROUP BY MONTH) AS SDP 
USING(MONTH) 
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, AVG(AIRLINE_DELAY) AS AVERAGE FROM flights 
    WHERE AIRLINE_DELAY != '' 
    GROUP BY MONTH) AS ADP 
USING(MONTH)
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, AVG(LATE_AIRCRAFT_DELAY) AS AVERAGE FROM flights 
    WHERE LATE_AIRCRAFT_DELAY != '' 
    GROUP BY MONTH) AS LDP 
USING(MONTH)
LEFT JOIN 
    (SELECT MONTH, AVG(WEATHER_DELAY) AS AVERAGE FROM flights 
    WHERE WEATHER_DELAY != '' 
    GROUP BY MONTH) AS WDP 
USING(MONTH)
LEFT JOIN 
    (SELECT MONTH, DAY_OF_WEEK, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLED == '1' 
    GROUP BY MONTH) AS TC 
USING (MONTH) 
LEFT JOIN 
    (SELECT MONTH, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'A' 
    GROUP BY MONTH) AS CC 
USING(MONTH) 
LEFT JOIN 
    (SELECT MONTH, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'B' 
    GROUP BY MONTH) AS WC 
USING(MONTH) 
LEFT JOIN 
    (SELECT MONTH, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'C' 
    GROUP BY MONTH) AS AC 
USING(MONTH) 
LEFT JOIN 
    (SELECT MONTH, COUNT(CANCELLED) AS TOTAL FROM flights 
    WHERE CANCELLATION_REASON == 'D' 
    GROUP BY MONTH) AS SC 
USING(MONTH)
ORDER BY AV.MONTH;''', con = engine)

# Transform Month from string to integer, re-sort and reset index
month_delays_cancellations_df['Month'] = [int(x) for x in month_delays_cancellations_df['Month']]
month_delays_cancellations_df = month_delays_cancellations_df.sort_values('Month')
month_delays_cancellations_df = month_delays_cancellations_df.reset_index(drop=True)

# View to verify
month_delays_cancellations_df.head()

Unnamed: 0,Month,Average_Arrival_Delay,Average_Departure_Delay,Average_Air_System_Delay,Average_Security_Delay,Average_Airline_Delay,Average_Late_Airline_Delay,Average_Weather_Delay,Total_Cancellations,Total_Carrier_Cancellations,Total_Weather_Cancellations,Total_Air_Traffic_Cancellations,Total_Security_Cancellations
0,1,5.81,9.76,13.32,0.07,17.8,22.76,2.74,11982,2874,7020,2087,1.0
1,2,8.32,11.89,14.18,0.05,17.99,22.67,4.32,20517,2815,15447,2254,1.0
2,3,4.92,9.66,12.87,0.07,19.05,22.59,2.4,11002,2494,6864,1639,5.0
3,4,3.16,7.72,13.59,0.04,18.12,21.71,2.69,4520,1796,1789,935,
4,5,4.49,9.45,14.0,0.06,18.61,24.23,3.75,5694,2007,2780,906,1.0


In [48]:
# Insert into database file
month_delays_cancellations_df.to_sql(name='month_delays_cancellations', con=engine, if_exists='replace', index=False)

# Verify if successful
engine.table_names()

['airline_arrival_delay_counts',
 'airline_arrival_delay_ratio',
 'airline_avg_DeptAndArr_TotalFlights',
 'airline_departure_delay_counts',
 'airline_departure_delay_ratio',
 'airlines',
 'airport_cancellations',
 'airport_delays',
 'airports',
 'avg_airline_delays',
 'cancelled_and_count',
 'cancelled_count_by_airline',
 'count_arrival_delay',
 'count_departure_delay',
 'flights',
 'month_delays_cancellations',
 'total_flights',
 'weekly_delays_cancellations']

## Connecting to MySQL (Workbench)

#### Create empty database on MySQL

#### Uploading to our newly created MySQL database

In [None]:
db1 = MySQLdb.connect(host="localhost",user="root",passwd="")
cursor = db1.cursor()
sql = 'CREATE DATABASE Overview'
cursor.execute(sql)

In [51]:
database = {'user': 'root', 
            'password': '', 
            'port': '3306',
            'host': 'localhost',
            'database': 'Overview' }

db_engine = create_engine("""mysql://%s:%s@%s:%s/%s
    """ % (database["user"], database["password"], database["host"], database["port"], database["database"]),
    echo=False)

In [None]:
## Upload dome dataframes into the MySQL workbench

total_cancelled.to_sql(name='total_cancelled', con=db_engine, if_exists = 'replace', index=False)
total_flights.to_sql(name='total_flights', con=db_engine, if_exists='replace', index=False)
cancelled_and_count.to_sql(name='cancelled_and_count', con=db_engine, if_exists='replace', index=False)
avg_airline_delays.to_sql(name='avg_airline_delays', con=db_engine, if_exists='replace', index=False)
airline_departure_delay_counts.to_sql(name='airline_departure_delay_counts', con=db_engine, if_exists='replace', index=False)
airline_departure_delay_ratio.to_sql(name='airline_departure_delay_ratio', con=engine, if_exists='replace', index=False)
airline_arrival_delay_counts.to_sql(name='airline_arrival_delay_counts', con=engine, if_exists='replace', index=False)
airline_arrival_delay_ratio.to_sql(name='airline_arrival_delay_ratio', con=engine, if_exists='replace', index=False)
count_departure_delay.to_sql(name='count_departure_delay', con=db_engine, if_exists='replace', index=False)
count_arrival_delay.to_sql(name='count_arrival_delay', con=db_engine, if_exists='replace', index=False)
airport_cancellations_df.to_sql(name='airport_cancellations', con=db_engine, if_exists='replace', index=False)
airport_delays_df.to_sql(name='airport_delays', con=db_engine, if_exists='replace', index=False)
weekly_delays_cancellations_df.to_sql(name='weekly_delays_cancellations', con=db_engine, if_exists='replace', index=False)
month_delays_cancellations_df.to_sql(name='month_delays_cancellations', con=db_engine, if_exists='replace', index=False)


In [53]:
## Checking out the database after uploading

db_engine.table_names()

['airline_avg_deptandarr_totalflights',
 'airline_departure_delay_counts',
 'airport_cancellations',
 'airport_delays',
 'avg_airline_delays',
 'cancelled_and_count',
 'count_arrival_delay',
 'count_departure_delay',
 'month_delays_cancellations',
 'total_cancelled',
 'total_flights',
 'weekly_delays_cancellations']

In [54]:
## test query sting 

query_string = "SELECT * FROM total_flights;"

In [55]:
pd.read_sql_query(query_string, con=db_engine)

Unnamed: 0,AIRLINE,flight_totals
0,AA,725984
1,AS,172521
2,B6,267048
3,DL,875881
4,EV,571977
5,F9,90836
6,HA,76272
7,MQ,294632
8,NK,117379
9,OO,588353
