In [None]:
!pip install -U kaleido



In [None]:
# library imports
import os

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import kaleido

In [None]:
# Mapbox access token used when rendering the map.
# Prefer the MAPBOX_ACCESS_TOKEN environment variable so the credential does
# not have to live in the notebook. The literal below is the original free
# token, kept only as a fallback so existing runs keep working; remove it (and
# rotate the token) before sharing the notebook more widely.
mapbox_token = (
    os.environ.get('MAPBOX_ACCESS_TOKEN')
    or 'pk.eyJ1Ijoic3VtaS12b3JhIiwiYSI6ImNsbzBxcjJkNzE3ZjQya3MzaHRiamprM2gifQ.qBcbpDQPfbk3PeSs2LMyRA'
)

In [None]:
# Load the two inputs in one go:
#   dataset - the datathon flight records
#   cities  - US cities table, used for its latitude/longitude columns
dataset, cities = (
    pd.read_csv(csv_name)
    for csv_name in ("datathon_train.csv", "uscities.csv")
)

In [None]:
# Per-origin-city aggregates.
# NOTE: despite its name, city_delay_counts holds the *mean* of DEP_DELAY_NEW
# for each origin city, not a count.
city_delay_counts = (
    dataset
    .groupby('ORIGIN_CITY_NAME')['DEP_DELAY_NEW']
    .mean()
    .reset_index()
)
# Number of rows (flights) recorded for each origin city.
city_counts = (
    dataset['ORIGIN_CITY_NAME']
    .value_counts()
    .reset_index()
)

In [None]:
# Label the per-city flight counts so they can be merged on ORIGIN_CITY_NAME.
# The labels are assigned by position (city name first, count second) because
# the column names coming out of value_counts().reset_index() depend on the
# pandas version: ('index', 'ORIGIN_CITY_NAME') before 2.0 and
# ('ORIGIN_CITY_NAME', 'count') from 2.0 on. A name-based rename only works on
# the former and silently mislabels the city column on the latter.
city_counts = city_counts.set_axis(['ORIGIN_CITY_NAME', 'num_flights'], axis=1)

In [None]:
# Build a "City, ST" key (the format of ORIGIN_CITY_NAME in the flight data)
# and keep only that key plus the coordinates.
# The guard keeps this cell re-runnable: after the first run `cities` no longer
# carries the raw `city` / `state_id` columns, so the key is built only once.
if {'city', 'state_id'}.issubset(cities.columns):
    cities = cities.assign(ORIGIN_CITY_NAME=cities['city'] + ', ' + cities['state_id'])
# Keep one row per key: repeated "City, ST" entries would otherwise multiply
# rows in every later merge against this table. The first occurrence in file
# order wins.
cities = (
    cities[['ORIGIN_CITY_NAME', 'lat', 'lng']]
    .drop_duplicates(subset='ORIGIN_CITY_NAME')
)
cities.head()

Unnamed: 0,ORIGIN_CITY_NAME,lat,lng
0,"New York, NY",40.6943,-73.9249
1,"Los Angeles, CA",34.1139,-118.4068
2,"Chicago, IL",41.8373,-87.6862
3,"Miami, FL",25.7839,-80.2102
4,"Dallas, TX",32.7936,-96.7662


In [None]:
# Per-city table for the map: origin city, mean delay, coordinates and
# number of flights.
# Step 1: attach lat/lng to each city's mean delay (left join keeps every city).
delays = pd.merge(city_delay_counts, cities, how="left", on="ORIGIN_CITY_NAME")
# Step 2: attach flight counts, discard rows with any missing value (e.g. cities
# without coordinates) and give the delay column a readable name.
all_data = (
    delays
    .merge(city_counts, how="left", on="ORIGIN_CITY_NAME")
    .dropna()
    .rename(columns={"DEP_DELAY_NEW": "average delay"})
)
all_data.head()

Unnamed: 0,ORIGIN_CITY_NAME,average delay,lat,lng,num_flights
0,"Albuquerque, NM",12.169992,35.1053,-106.6464,2606
1,"Anchorage, AK",7.6644,61.1508,-149.1091,2059
2,"Atlanta, GA",12.086137,33.7627,-84.4224,41399
3,"Austin, TX",14.846112,30.3004,-97.7522,7278
4,"Baltimore, MD",14.1363,39.3051,-76.6144,11394


In [None]:
# Route table: origin city, destination city and the lat/lng of both ends,
# reduced to a random sample for plotting.
ROUTE_SAMPLE_SIZE = 1000  # maximum number of routes to draw
ROUTE_SAMPLE_SEED = 42    # fixed seed so the sampled routes are the same on every run

routes = dataset[['ORIGIN_CITY_NAME', 'DEST_CITY_NAME']]
# Inner join: flights whose origin city is missing from `cities` are dropped.
routes = routes.merge(cities, on='ORIGIN_CITY_NAME')
routes = routes.rename(columns={'lat': 'origin_lat', 'lng': 'origin_lng'})
# Second lookup for the destination. The right-hand key column comes back as
# ORIGIN_CITY_NAME2 (because of the suffix) and is redundant with DEST_CITY_NAME.
routes = routes.merge(
    cities,
    left_on='DEST_CITY_NAME',
    right_on='ORIGIN_CITY_NAME',
    suffixes=('', '2'),
)
routes = routes.drop(['ORIGIN_CITY_NAME2'], axis=1)
routes = routes.rename(columns={'lat': 'dest_lat', 'lng': 'dest_lng'})
routes = routes.dropna()
# Cap the sample at the number of available rows: sample() raises ValueError
# when asked for more rows than exist (without replacement).
# reset_index() gives a 0..n-1 index and keeps the old one as an 'index' column.
routes = routes.sample(
    n=min(ROUTE_SAMPLE_SIZE, len(routes)),
    random_state=ROUTE_SAMPLE_SEED,
).reset_index()

In [254]:
# plotting using mapbox

# Scatter layer: dot size = busyness (number of flights), colour = average delay.
fig = px.scatter_mapbox(all_data,
                        lat="lat",
                        lon="lng",
                        color="average delay",
                        size="num_flights",
                        color_continuous_scale=px.colors.sequential.YlOrRd,
                        title = "Busiest Cities, Longest Delays, and Airplane Routes in the US",
                        zoom=3)

# Scale up the marker sizes set by px (3x the flight counts).
fig.update_traces(marker=dict(size=(all_data['num_flights']*3)))

# Draw the sampled routes as ONE line trace instead of one trace per route;
# adding ~1000 separate traces one at a time is slow to build and to render.
# Each route contributes (origin, destination, None): the None entries are
# gaps that break the line so consecutive routes are not joined together.
route_lons, route_lats = [], []
for origin_lng, origin_lat, dest_lng, dest_lat in zip(
        routes['origin_lng'], routes['origin_lat'],
        routes['dest_lng'], routes['dest_lat']):
    route_lons.extend((origin_lng, dest_lng, None))
    route_lats.extend((origin_lat, dest_lat, None))

# Kept as a list so `line_traces` still holds the route trace object(s).
line_traces = [
    go.Scattermapbox(
        mode="lines",
        lon=route_lons,
        lat=route_lats,
        line=dict(width=0.01, color='white'),
        showlegend=False
    )
]

# add lines to graph in a single call
fig.add_traces(line_traces)

# Centre the map on the mean city coordinates and apply the Mapbox token/style.
fig.update_layout(
    mapbox=dict(
        center={"lat": all_data['lat'].mean(), "lon": all_data['lng'].mean()},
        accesstoken=mapbox_token,
        style="dark",
    )
)

fig.show()

