In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from datetime import datetime
from datetime import timedelta
% matplotlib inline
sns.set(style="darkgrid")
from bs4 import BeautifulSoup
import requests
import pprint
import re

In [None]:
# Load the cleaned flight table.
# NOTE(review): nrows=20 limits the read to the first 20 rows — looks like a
# debugging leftover; confirm before relying on the plots below.
flight_csv = '../big_data/cleaned_flight_data_updated.csv'
df = pd.read_csv(flight_csv, nrows=20)

In [None]:
# Drop records flagged as erroneous (by origin airport or tail number), then
# derive the aircraft age.
airport_lst = ['TKI']
tail_num_lst = ['PLANET', 'N101NK', 'N999FR', 'N700TA', 'N187SW', 'N399FR']
# .copy() gives an independent frame, so adding the 'age' column below does not
# write into a slice of the unfiltered frame (avoids SettingWithCopyWarning).
df = df[(~df['ORIGIN'].isin(airport_lst))
        & (~df['TAIL_NUM'].isin(tail_num_lst))].copy()
# Age = flight year minus manufacture year
df['age'] = df['YEAR'] - df['mfr_year']

In [None]:
# Delayed flights: rows whose DEP_DELAY is greater than 15, re-indexed from 0
is_delayed = df['DEP_DELAY'] > 15
delayed_flight = df.loc[is_delayed].reset_index(drop=True)

In [None]:
# Tally delayed flights and their total departure delay per weather code.
#   res_dict[code]     -> number of delayed flights observed with that code
#   minutes_dict[code] -> sum of DEP_DELAY over those flights
# Code 0 collects flights with no usable weather-type string.
res_dict = dict()
minutes_dict = dict()
# Segment between the first two '|' separators of the weather-type string
section_pattern = re.compile(r'\|.*?(([A-Za-z]{2,}:(\d{2})\s)*)\|')
# Every two-digit run inside that segment is treated as a weather code
pattern = re.compile(r'\d{2}')

for _, row in delayed_flight.iterrows():
    weather = row['HOURLYPRSENTWEATHERTYPE_origin']
    codes = []
    # Missing values are not str (e.g. NaN), so they fall through to code 0
    if isinstance(weather, str):
        section = section_pattern.search(weather)  # searched once, not twice
        if section:
            codes = [int(num) for num in pattern.findall(section.group(0))]
    if not codes:
        codes = [0]
    # A flight reporting several codes contributes to each of them
    for code in codes:
        res_dict[code] = res_dict.get(code, 0) + 1
        minutes_dict[code] = minutes_dict.get(code, 0) + row['DEP_DELAY']

In [None]:
# Mean delay per weather code = summed delay / number of delayed flights
avr_dict = {code: minutes_dict[code] / res_dict[code] for code in res_dict}

# Lookup table read from a tab-separated file; columns 'weather code' and
# 'weather type' are used below
weather_type_df = pd.read_csv('../../Weather Data/weather_type_codes.csv', sep='\t')
code_type_pairs = list(zip(weather_type_df['weather code'],
                           weather_type_df['weather type']))

# Re-key both dictionaries by weather type, keeping only codes that were observed
new_avr_dict = {
    weather_type: avr_dict[code]
    for code, weather_type in code_type_pairs
    if code in avr_dict
}
new_res_dict = {
    weather_type: res_dict[code]
    for code, weather_type in code_type_pairs
    if code in res_dict
}

# One-column frames indexed by weather type, sorted ascending for plotting
new_res_df = pd.DataFrame.from_dict(new_res_dict, orient='index', columns=['count'])
new_res_df = new_res_df.sort_values(by='count')
new_avr_df = pd.DataFrame.from_dict(new_avr_dict, orient='index', columns=['average delay minutes'])
new_avr_df = new_avr_df.sort_values(by='average delay minutes')

In [None]:
# Horizontal bar chart of the average delay per weather type
new_avr_df.plot.barh(
    title='Average Delay Minutes of Different Weather Types',
    figsize=(10, 7),
    fontsize=14,
)

In [None]:
# Horizontal bar chart of the delayed-flight count per weather type
new_res_df.plot.barh(
    title='Count of Delayed Flights of Different Weather Types',
    figsize=(10, 7),
    fontsize=14,
)

## Delay to plane age

In [None]:
# Share of flights that were delayed, per aircraft age.
# The first (lowest) age group is skipped, as in the original analysis.
# NOTE(review): presumably that group holds invalid ages — confirm.
age_total = df.groupby('age').DEP_DELAY.count().iloc[1:]
# Align delayed counts to the same ages; an age with no delayed flights gets 0
# instead of shifting every later value by one position.
age_delayed = (delayed_flight.groupby('age').DEP_DELAY.count()
               .reindex(age_total.index, fill_value=0))
all_flights_age_count = age_total.values
delay_flights_age_count = age_delayed.values
# Plot against the ages actually present rather than a hard-coded range(0, 33)
plt.plot(age_total.index, delay_flights_age_count / all_flights_age_count)
plt.xlabel('Age of Aircraft', fontsize=20)
plt.ylabel('Delay Percentage', fontsize=20)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)

In [None]:
# Mean DEP_DELAY of delayed flights for each aircraft age
age_groups = delayed_flight.groupby('age')
delayed_flight_age = age_groups['DEP_DELAY'].mean()
fig = plt.gcf()
plt.plot(delayed_flight_age.index, delayed_flight_age.values)
plt.ylabel('Average Delay Minutes', fontsize=20)
plt.xlabel('Age of Aircraft', fontsize=20)
fig.set_size_inches(18.5, 10.5)

## Delay to carrier

In [None]:
# Share of flights that were delayed, per operating carrier.
carrier_total = df.groupby('OP_CARRIER').DEP_DELAY.count()
# Group the delayed flights by the same key as the totals ('OP_CARRIER') and
# align to the full carrier list; a carrier with no delayed flights gets 0.
carrier_delayed = (delayed_flight.groupby('OP_CARRIER').DEP_DELAY.count()
                   .reindex(carrier_total.index, fill_value=0))
all_flights_carrier_count = carrier_total.values
delay_flights_carrier_count = carrier_delayed.values
carrier_names = carrier_total.index.tolist()
carrier_delay_percentage = pd.DataFrame(delay_flights_carrier_count / all_flights_carrier_count, index=carrier_names, columns=['delay_percentage'])
carrier_delay_percentage = carrier_delay_percentage.sort_values(by='delay_percentage')
plt.bar(carrier_delay_percentage.index, carrier_delay_percentage.delay_percentage)
# Tick at the bars drawn in this cell (not at a frame defined in a later cell)
plt.xticks(carrier_delay_percentage.index, fontsize=15)
plt.xlabel('Carrier', fontsize=20)
plt.ylabel('Delay Percentage', fontsize=20)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)

In [None]:
# Mean DEP_DELAY of delayed flights per operating carrier, sorted ascending
carrier_groups = delayed_flight.groupby('OP_CARRIER')
delayed_flight_carrier = carrier_groups['DEP_DELAY'].mean().sort_values()
fig = plt.gcf()
plt.bar(delayed_flight_carrier.index, delayed_flight_carrier.values)
plt.xticks(delayed_flight_carrier.index, fontsize=15)
plt.ylabel('Average Delay Minutes', fontsize=20)
plt.xlabel('Carrier', fontsize=20)
fig.set_size_inches(18.5, 10.5)

## Delay to plane manufacturer

In [None]:
# Share of flights that were delayed, per aircraft manufacturer.
mfr_total = df.groupby('mfr_name').DEP_DELAY.count()
# Align delayed counts to the full manufacturer list; a manufacturer with no
# delayed flights gets 0 instead of breaking the positional division.
mfr_delayed = (delayed_flight.groupby('mfr_name').DEP_DELAY.count()
               .reindex(mfr_total.index, fill_value=0))
all_flights_mfr_count = mfr_total.values
delay_flights_mfr_count = mfr_delayed.values
mfr_names = mfr_total.index.tolist()
mfr_delay_percentage = pd.DataFrame(delay_flights_mfr_count / all_flights_mfr_count, index=mfr_names, columns=['delay_percentage'])
mfr_delay_percentage = mfr_delay_percentage.sort_values(by='delay_percentage')
plt.bar(mfr_delay_percentage.index, mfr_delay_percentage.delay_percentage)
plt.xticks(mfr_delay_percentage.index)
plt.xlabel('Manufacturer', fontsize=20)
plt.ylabel('Delay Percentage', fontsize=20)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)

In [None]:
# Mean DEP_DELAY of delayed flights per aircraft manufacturer, sorted ascending
mfr_groups = delayed_flight.groupby('mfr_name')
delayed_flight_mfr = mfr_groups['DEP_DELAY'].mean().sort_values()
fig = plt.gcf()
plt.bar(delayed_flight_mfr.index, delayed_flight_mfr.values)
plt.xticks(delayed_flight_mfr.index)
plt.ylabel('Average Delay Minutes', fontsize=20)
plt.xlabel('Manufacturer', fontsize=20)
fig.set_size_inches(18.5, 10.5)

## Delay to aircraft model

In [None]:
# Share of flights that were delayed, per aircraft model.
model_total = df.groupby('model_2').DEP_DELAY.count()
# Align delayed counts to the full model list; a model with no delayed flights
# gets 0 instead of breaking the positional division.
model_delayed = (delayed_flight.groupby('model_2').DEP_DELAY.count()
                 .reindex(model_total.index, fill_value=0))
all_flights_model_count = model_total.values
delay_flights_model_count = model_delayed.values
model_names = model_total.index.tolist()
model_delay_percentage = pd.DataFrame(delay_flights_model_count / all_flights_model_count, index=model_names, columns=['delay_percentage'])
model_delay_percentage = model_delay_percentage.sort_values(by='delay_percentage')
plt.bar(model_delay_percentage.index, model_delay_percentage.delay_percentage)
plt.xticks(model_delay_percentage.index, rotation=90, fontsize=15)
plt.xlabel('Aircraft Model', fontsize=20)
plt.ylabel('Delay Percentage', fontsize=20)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)

In [None]:
# Mean DEP_DELAY of delayed flights per aircraft model, sorted ascending
model_groups = delayed_flight.groupby('model_2')
delayed_flight_model = model_groups['DEP_DELAY'].mean().sort_values()
fig = plt.gcf()
plt.bar(delayed_flight_model.index, delayed_flight_model.values)
plt.xticks(delayed_flight_model.index, rotation=90, fontsize=15)
plt.ylabel('Average Delay Minutes', fontsize=20)
plt.xlabel('Aircraft Model', fontsize=20)
fig.set_size_inches(18.5, 10.5)

## Delay to engine manufacturer

In [None]:
# Share of flights that were delayed, per engine manufacturer.
eng_mfr_total = df.groupby('eng_mfr_name').DEP_DELAY.count()
# Align delayed counts to the full list; an engine manufacturer with no delayed
# flights gets 0 instead of breaking the positional division.
eng_mfr_delayed = (delayed_flight.groupby('eng_mfr_name').DEP_DELAY.count()
                   .reindex(eng_mfr_total.index, fill_value=0))
all_flights_eng_mfr_count = eng_mfr_total.values
delay_flights_eng_mfr_count = eng_mfr_delayed.values
eng_mfr_names = eng_mfr_total.index.tolist()
eng_mfr_delay_percentage = pd.DataFrame(delay_flights_eng_mfr_count / all_flights_eng_mfr_count, index=eng_mfr_names, columns=['delay_percentage'])
eng_mfr_delay_percentage = eng_mfr_delay_percentage.sort_values(by='delay_percentage')
plt.bar(eng_mfr_delay_percentage.index, eng_mfr_delay_percentage.delay_percentage)
plt.xticks(eng_mfr_delay_percentage.index, rotation=45)
plt.xlabel('Engine Manufacturer')
plt.ylabel('Delay Percentage')

In [None]:
# Mean DEP_DELAY of delayed flights per engine manufacturer, sorted ascending
eng_mfr_groups = delayed_flight.groupby('eng_mfr_name')
delayed_flight_eng_mfr = eng_mfr_groups['DEP_DELAY'].mean().sort_values()
plt.bar(delayed_flight_eng_mfr.index, delayed_flight_eng_mfr.values)
plt.xticks(delayed_flight_eng_mfr.index, rotation=45)
plt.xlabel('Engine Manufacturer')
plt.ylabel('Average Delay Minutes')

## Delay to engine model

In [None]:
# Share of flights that were delayed, per engine model.
eng_model_total = df.groupby('eng_model').DEP_DELAY.count()
# Align delayed counts to the full list; an engine model with no delayed flights
# gets 0 instead of breaking the positional division.
eng_model_delayed = (delayed_flight.groupby('eng_model').DEP_DELAY.count()
                     .reindex(eng_model_total.index, fill_value=0))
all_flights_eng_model_count = eng_model_total.values
delay_flights_eng_model_count = eng_model_delayed.values
eng_model_names = eng_model_total.index.tolist()
eng_model_delay_percentage = pd.DataFrame(delay_flights_eng_model_count / all_flights_eng_model_count, index=eng_model_names, columns=['delay_percentage'])
eng_model_delay_percentage = eng_model_delay_percentage.sort_values(by='delay_percentage')
plt.bar(eng_model_delay_percentage.index, eng_model_delay_percentage.delay_percentage)
plt.xticks(eng_model_delay_percentage.index, rotation=90, fontsize=15)
plt.xlabel('Engine Model', fontsize=20)
plt.ylabel('Delay Percentage', fontsize=20)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)

In [None]:
# Mean DEP_DELAY of delayed flights per engine model, sorted ascending
eng_model_groups = delayed_flight.groupby('eng_model')
delayed_flight_eng_model = eng_model_groups['DEP_DELAY'].mean().sort_values()
fig = plt.gcf()
plt.bar(delayed_flight_eng_model.index, delayed_flight_eng_model.values)
plt.xticks(delayed_flight_eng_model.index, rotation=90, fontsize=15)
plt.ylabel('Average Delay Minutes', fontsize=20)
plt.xlabel('Engine Model', fontsize=20)
fig.set_size_inches(18.5, 10.5)

# Delayed flights on US map

In [None]:
from mpl_toolkits.basemap import Basemap as Basemap
from math import log10

## Delay by origin airport

In [None]:
# Per-airport delay summary; delay_percentage = delayed flights / all flights
avr_delay = pd.read_csv('../airport_delay.csv')
avr_delay['delay_percentage'] = avr_delay['delay_count'] / avr_delay['total_count']
# drop airports with too few flights
# .copy() makes the filtered frame independent, so the in-place assignment to
# 'avr_delay' in the following cell does not raise SettingWithCopyWarning.
avr_delay = avr_delay[avr_delay['total_count'] >= 10].copy()

In [None]:
# Cap outliers: any airport whose avr_delay lies more than 3 standard deviations
# from the mean is set to mean + 3 * std.
# NOTE(review): values more than 3 std *below* the mean are also raised to the
# upper cap — confirm that is intended.
delay_mean = avr_delay.avr_delay.mean()
delay_std = avr_delay.avr_delay.std()
is_outlier = np.abs(avr_delay.avr_delay - delay_mean) > (3 * delay_std)
inx = avr_delay[is_outlier].index.tolist()
set_value = delay_mean + 3 * delay_std
avr_delay.loc[inx, 'avr_delay'] = set_value

## delay percentage to origin airports

In [None]:
# continental US
# lon / lat / size / colors defined here are reused by the region cells below.
lat = avr_delay['LATITUDE'].values
lon = avr_delay['LONGITUDE'].values
# Marker area grows as 2.5 ** log10(passenger_count)
size = avr_delay['passenger_count'].apply(log10).values
colors = avr_delay['delay_percentage'].values

m = Basemap(
    projection='lcc', resolution='l', epsg=4687,
    llcrnrlon=-128, llcrnrlat=22, urcrnrlon=-64, urcrnrlat=51,
    lat_1=33, lat_2=45, lon_0=-96,
)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=950)
m.drawcoastlines(color='gray')
m.drawcountries(color='black')
m.drawstates(color='grey')
m.scatter(lon, lat, latlon=True, s=2.5 ** size,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7)

# make a color bar
plt.colorbar(label=r'Delay Percentage')
plt.clim(0, 0.4)

# make a legend: empty scatters supply marker handles of the reference sizes
for passengers in (50000, 500000, 5000000):
    plt.scatter([], [], c='k', alpha=0.5, s=2.5 ** log10(passengers),
                label=str(2*passengers))
plt.legend(title='Annual Passenger Count', loc='lower right',
           scatterpoints=1, frameon=False, labelspacing=1)

fig = plt.gcf()
fig.set_size_inches(18.5, 7.5)
plt.show()

In [None]:
# Alaska
# lon, lat, colors and size come from the preceding continental-US cell.
m = Basemap(
    projection='lcc', resolution='l', epsg=2964,
    llcrnrlon=-170, llcrnrlat=50, urcrnrlon=-110, urcrnrlat=72,
    lat_1=59, lat_2=66, lon_0=-142,
)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=950)
m.drawcountries(color='black')
m.drawstates(color='grey')
# Markers are drawn 4x larger than on the continental map
marker_area = 2.5 ** size * 4
m.scatter(lon, lat, latlon=True, s=marker_area,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7)
ax_2 = plt.gca()
ax_2.text(.7, .15, 'Alaska', color='red', fontsize=25,
          horizontalalignment='center', transform=ax_2.transAxes)

In [None]:
# Hawaii
# lon, lat, colors and size come from the preceding continental-US cell.
m = Basemap(
    projection='lcc', resolution='l', epsg=2782,
    llcrnrlon=-160, llcrnrlat=18, urcrnrlon=-154, urcrnrlat=23,
    lat_1=19.5, lat_2=22, lon_0=-156,
)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=500)
m.drawcountries(color='black')
m.drawstates(color='grey')
# Markers are drawn 4x larger than on the continental map
marker_area = 2.5 ** size * 4
m.scatter(lon, lat, latlon=True, s=marker_area,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7)
ax_3 = plt.gca()
ax_3.text(.35, .4, 'Hawaii', color='red', fontsize=25,
          horizontalalignment='center', transform=ax_3.transAxes)

In [None]:
# Puerto Rico & US Virgin Islands
# lon, lat, colors and size come from the preceding continental-US cell.
m = Basemap(
    projection='lcc', resolution='l', epsg=2866,
    llcrnrlon=-68, llcrnrlat=17.5, urcrnrlon=-64, urcrnrlat=18.6,
    lat_1=18, lat_2=18.5, lon_0=-66,
)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=1000)
m.drawcountries(color='black')
m.drawstates(color='grey')
# Markers are drawn 4x larger than on the continental map
marker_area = 2.5 ** size * 4
m.scatter(lon, lat, latlon=True, s=marker_area,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7)
ax_4 = plt.gca()
ax_4.text(.37, .15, 'Puerto Rico & US Virgin Islands', color='red', fontsize=17,
          horizontalalignment='center', transform=ax_4.transAxes)

In [None]:
 # American Samoa
# lon, lat, colors and size come from the preceding continental-US cell.
m = Basemap(
    projection='lcc', resolution='l', epsg=3102,
    llcrnrlon=-171, llcrnrlat=-14.4, urcrnrlon=-170.5, urcrnrlat=-14.2,
    lat_1=-14.4, lat_2=-14.2, lon_0=-170.75,
)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=1000)
m.drawcountries(color='black')
m.drawstates(color='grey')
# Markers are scaled by 100 / 16 relative to the continental map
marker_area = 2.5 ** size * 100 / 16
m.scatter(lon, lat, latlon=True, s=marker_area,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7)
ax_5 = plt.gca()
ax_5.text(.35, .75, 'American Samoa', color='red', fontsize=25,
          horizontalalignment='center', transform=ax_5.transAxes)

In [None]:
 # Guam
# lon, lat, colors and size come from the preceding continental-US cell.
m = Basemap(
    projection='lcc', resolution='l', epsg=4675,
    llcrnrlon=144.5, llcrnrlat=13.2, urcrnrlon=145, urcrnrlat=13.7,
    lat_1=13.3, lat_2=13.6, lon_0=144.75,
)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=1000)
m.drawcountries(color='black')
m.drawstates(color='grey')
# Markers are scaled by 100 / 16 relative to the continental map
marker_area = 2.5 ** size * 100 / 16
m.scatter(lon, lat, latlon=True, s=marker_area,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7)
ax_6 = plt.gca()
ax_6.text(.35, .75, 'Guam', color='red', fontsize=25,
          horizontalalignment='center', transform=ax_6.transAxes)

## average delay minutes for delayed flights to origin airports

In [None]:
# continental US — airports coloured by the average delay of delayed flights.
# lon / lat / size / colors defined here are reused by the region cells below.
lat = avr_delay['LATITUDE'].values
lon = avr_delay['LONGITUDE'].values
# Marker area grows as 2.5 ** log10(passenger_count)
size = avr_delay['passenger_count'].apply(lambda x: log10(x)).values
colors = avr_delay['avr_delay'].values
m = Basemap(llcrnrlon=-128, llcrnrlat=22, urcrnrlon=-64, urcrnrlat=51,
            projection='lcc', lat_1=33, lat_2=45, lon_0=-96,
            resolution='l', epsg=4687)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=950)
m.drawcoastlines(color='gray')
m.drawcountries(color='black')
m.drawstates(color='grey')
m.scatter(lon, lat, latlon=True,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7, s=2.5 ** size)
# make a color bar
# The colours are average delay values, not a 0-1 share: label them as such and
# span the colour scale over the data instead of the (0, 0.4) range that was
# copied from the delay-percentage map and saturated every point.
plt.colorbar(label=r'Average Delay Minutes')
plt.clim(np.nanmin(colors), np.nanmax(colors))
# make a legend: empty scatters supply marker handles of the reference sizes
for a in [50000, 500000, 5000000]:
    plt.scatter([], [], c='k', alpha=0.5, s=2.5 ** log10(a),
                label=str(2 * a))
plt.legend(scatterpoints=1, frameon=False,
           labelspacing=1, loc='lower right',
           title='Annual Passenger Count')
fig = plt.gcf()
fig.set_size_inches(18.5, 7.5)
plt.show()

In [None]:
# Alaska
# lon, lat, colors and size come from the preceding continental-US cell.
m = Basemap(
    projection='lcc', resolution='l', epsg=2964,
    llcrnrlon=-170, llcrnrlat=50, urcrnrlon=-110, urcrnrlat=72,
    lat_1=59, lat_2=66, lon_0=-142,
)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=950)
m.drawcountries(color='black')
m.drawstates(color='grey')
# Markers are drawn 4x larger than on the continental map
marker_area = 2.5 ** size * 4
m.scatter(lon, lat, latlon=True, s=marker_area,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7)
ax_2 = plt.gca()
ax_2.text(.7, .15, 'Alaska', color='red', fontsize=25,
          horizontalalignment='center', transform=ax_2.transAxes)

In [None]:
# Hawaii
# lon, lat, colors and size come from the preceding continental-US cell.
m = Basemap(
    projection='lcc', resolution='l', epsg=2782,
    llcrnrlon=-160, llcrnrlat=18, urcrnrlon=-154, urcrnrlat=23,
    lat_1=19.5, lat_2=22, lon_0=-156,
)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=500)
m.drawcountries(color='black')
m.drawstates(color='grey')
# Markers are drawn 4x larger than on the continental map
marker_area = 2.5 ** size * 4
m.scatter(lon, lat, latlon=True, s=marker_area,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7)
ax_3 = plt.gca()
ax_3.text(.35, .4, 'Hawaii', color='red', fontsize=25,
          horizontalalignment='center', transform=ax_3.transAxes)

In [None]:
# Puerto Rico & US Virgin Islands
# lon, lat, colors and size come from the preceding continental-US cell.
m = Basemap(
    projection='lcc', resolution='l', epsg=2866,
    llcrnrlon=-68, llcrnrlat=17.5, urcrnrlon=-64, urcrnrlat=18.6,
    lat_1=18, lat_2=18.5, lon_0=-66,
)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=1000)
m.drawcountries(color='black')
m.drawstates(color='grey')
# Markers are drawn 4x larger than on the continental map
marker_area = 2.5 ** size * 4
m.scatter(lon, lat, latlon=True, s=marker_area,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7)
ax_4 = plt.gca()
ax_4.text(.37, .15, 'Puerto Rico & US Virgin Islands', color='red', fontsize=17,
          horizontalalignment='center', transform=ax_4.transAxes)

In [None]:
# American Samoa
# lon, lat, colors and size come from the preceding continental-US cell.
m = Basemap(
    projection='lcc', resolution='l', epsg=3102,
    llcrnrlon=-171, llcrnrlat=-14.4, urcrnrlon=-170.5, urcrnrlat=-14.2,
    lat_1=-14.4, lat_2=-14.2, lon_0=-170.75,
)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=1000)
m.drawcountries(color='black')
m.drawstates(color='grey')
# Markers are scaled by 100 / 16 relative to the continental map
marker_area = 2.5 ** size * 100 / 16
m.scatter(lon, lat, latlon=True, s=marker_area,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7)
ax_5 = plt.gca()
ax_5.text(.35, .75, 'American Samoa', color='red', fontsize=25,
          horizontalalignment='center', transform=ax_5.transAxes)

In [None]:
# Guam
# lon, lat, colors and size come from the preceding continental-US cell.
m = Basemap(
    projection='lcc', resolution='l', epsg=4675,
    llcrnrlon=144.5, llcrnrlat=13.2, urcrnrlon=145, urcrnrlat=13.7,
    lat_1=13.3, lat_2=13.6, lon_0=144.75,
)
m.arcgisimage(service="ESRI_StreetMap_World_2D", xpixels=1000)
m.drawcountries(color='black')
m.drawstates(color='grey')
# Markers are scaled by 100 / 16 relative to the continental map
marker_area = 2.5 ** size * 100 / 16
m.scatter(lon, lat, latlon=True, s=marker_area,
          c=colors, cmap=plt.cm.hot_r, alpha=0.7)
ax_6 = plt.gca()
ax_6.text(.35, .75, 'Guam', color='red', fontsize=25,
          horizontalalignment='center', transform=ax_6.transAxes)
