# Flight delay time exploratory data analysis


**Scroll down to Part 3 for this week's work**

In [None]:
import numpy as np
import pandas as pd
import glob
import seaborn as sns
import networkx as nx
import tensorflow as tf
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

First we read in the input files. We can use the `glob` package with `*` as a wildcard to make a list of all the csv files, and then open and concatenate all the files in the list to get a single dataframe.

In [None]:
# Read every CSV in the dataset folder and stack them into one dataframe.
# The file list is sorted so that row order is reproducible (glob returns paths
# in arbitrary, filesystem-dependent order), and ignore_index=True gives the
# result a unique 0..n-1 index instead of repeating each file's own row labels.
csv_paths = sorted(glob.glob("/kaggle/input/historical-flight-and-weather-data/*.csv"))
df = pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

Next, let's explore some basic characteristics of our data.

In [None]:
df.head()

In [None]:
df.hist(figsize=(20,20)); # Tip: put a semicolon at the end of the line to avoid printing a bunch of text output.

So from the initial analysis above, we can see that we've got a database of about 5.5 million flights, with each record including information about the airline ("carrier_code"), origin and destination airport, date and time, and weather information. This dataset is not well documented, but we'll assume that `*_x` corresponds to weather at the origin airport and `*_y` corresponds to weather at the destination airport. There is also information about flight delays and cancellations.

Our goal is always to do something useful. Some useful things we could do with this dataset could be to gain insight into what conditions are related to delayed and canceled flights, and potentially predict or avoid those delays in the future, so we will explore the dataset with that goal in mind.

First, we'll look into the frequency of delays and cancellations:

In [None]:
(df.arrival_delay > 0).sum() / df.shape[0]

In [None]:
(df.arrival_delay > 30).sum() / df.shape[0]

In [None]:
(df.arrival_delay > 60).sum() / df.shape[0]

In [None]:
(df.departure_delay > 0).sum() / df.shape[0]

In [None]:
((df.arrival_delay > 0) & (df.departure_delay > 0)).sum() / df.shape[0]

In [None]:
df.cancelled_code.value_counts()

In [None]:
(df.cancelled_code != "N").sum() / df.shape[0]

From the above, we can see that 34% of flight arrivals are delayed, 12% are delayed by more than 30 minutes, and 7% are delayed by more than one hour. (We're assuming the times are in minutes. Hopefully the benefit of having a well-documented dataset is apparent here.)

If we assume that a cancelled code of "N" means not cancelled, and everything else is cancelled, then about 1.5% of flights are cancelled.

We can start out by looking at how conditions were different for flights that were canceled compared to other flights. One way to do this is to create two sets of histograms:

In [None]:
# Keep the flights whose cancellation code is anything other than "N",
# then draw one histogram per numeric column.
is_cancelled = df["cancelled_code"] != "N"
df_cancel = df[is_cancelled]
df_cancel.hist(figsize=(20, 20));

In [None]:
# Keep the flights whose cancellation code is exactly "N",
# then draw one histogram per numeric column.
is_not_cancelled = df["cancelled_code"] == "N"
df_nocancel = df[is_not_cancelled]
df_nocancel.hist(figsize=(20, 20));

One insight this gives us is that the max windspeed for non-canceled flights appears much higher than the max windspeed for flights that were canceled. We can investigate this further:

In [None]:
# Print mean, median and max of HourlyWindSpeed_x: first for the cancelled
# flights, then for the flights that were not cancelled.
for flights in (df_cancel, df_nocancel):
    wind = flights.HourlyWindSpeed_x
    print(wind.mean(), wind.median(), wind.max())

## Part 2: Network analysis

Last week, we started an exploratory analysis of this dataset, treating it as tabular data. However, there is also a graph or network aspect of this dataset—it's a "transportation network". This week, we will explore that aspect.

First, let's calculate the number of flights on each "route", which is the number of flights that share an origin and destination airport:

In [None]:
# Count flights per (origin, destination) route. Selecting `flight_number`
# before aggregating avoids counting every other column only to discard the
# results. count() tallies the non-null `flight_number` values on each route.
num_flights = df.groupby(by=["origin_airport", "destination_airport"])["flight_number"].count()

num_flights.head()

In [None]:
num_flights.reset_index().head()

Next, let's create a directed graph of the different routes.

In [None]:
# Directed graph with one edge per route; the edge's "weight" attribute is the
# number of flights on that route.
g = nx.DiGraph()

route_edges = num_flights.reset_index()[["origin_airport", "destination_airport", "flight_number"]]
g.add_weighted_edges_from(route_edges.itertuples(index=False, name=None))

We can make a plot of the graph:

Next, let's calculate the degree centrality and betweenness centrality of each airport and create a data frame that includes the columns `airport`, `deg_cen`, and `bet_cen`:

In [None]:
deg_cen = nx.degree_centrality(g)

# One row per airport, indexed by airport code, holding its degree centrality.
df_deg_cen = pd.DataFrame(
    {"airport": list(deg_cen.keys()), "deg_cen": list(deg_cen.values())}
).set_index("airport")

df_deg_cen.head()

In [None]:
# NOTE(review): networkx treats edge weights as distances when it searches for
# weighted shortest paths, so with flight counts as weights a busier route
# counts as "longer" — confirm that this is the intended meaning.
bet_cen = nx.betweenness_centrality(g, weight="weight")

# One row per airport, indexed by airport code, holding its betweenness centrality.
df_bet_cen = pd.DataFrame(
    {"airport": list(bet_cen.keys()), "bet_cen": list(bet_cen.values())}
).set_index("airport")

df_bet_cen.head()

In [None]:
# Combine both centrality measures into one table with columns
# `airport`, `deg_cen` and `bet_cen`.
# join() aligns the two frames on their shared airport index and returns a new
# frame, so `df_deg_cen` is left untouched and this cell can be re-run safely
# (assigning into an alias of `df_deg_cen` and resetting its index in place
# would corrupt the result on a second run).
net_stats = df_deg_cen.join(df_bet_cen["bet_cen"]).reset_index()

net_stats.head()

Now, let's add our network statistics for each airport to the data frame of flights, and see whether they are correlated with our "arrival delay" dependent variable:

In [None]:
# Give the centrality columns their origin-specific names before merging, so
# the merged frame only needs the helper `airport` key column removed.
origin_stats = net_stats[["airport", "bet_cen", "deg_cen"]].rename(
    columns={"bet_cen": "origin_bet_cen", "deg_cen": "origin_deg_cen"}
)
df_net_stats = df.merge(
    origin_stats, left_on="origin_airport", right_on="airport"
).drop(columns="airport")

df_net_stats.head()

In [None]:
# Same as for the origin airport: rename the centrality columns for the
# destination, merge on the destination airport code, and drop the key column.
destination_stats = net_stats[["airport", "bet_cen", "deg_cen"]].rename(
    columns={"bet_cen": "destination_bet_cen", "deg_cen": "destination_deg_cen"}
)
df_net_stats = df_net_stats.merge(
    destination_stats, left_on="destination_airport", right_on="airport"
).drop(columns="airport")

df_net_stats.head()

In [None]:
df_net_stats[["arrival_delay", "destination_bet_cen","destination_deg_cen", "origin_bet_cen","origin_deg_cen"]].corr()

Can you conclude anything from these correlations?

# Part 3: Spatial analysis

So far, we have explored this dataset as tabular data and as network data. However, this dataset also has a spatial component, which we will explore today.

First, we will merge it with a dataset of airport locations and calculate the distance of each flight. For this, we will use data from: https://openflights.org/data.html.

In [None]:
! wget https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat

In [None]:
# Column names for airports.dat, in file order, with the description of each field.
cols = [
    'Airport ID',  # Unique OpenFlights identifier for this airport.
    'Name',        # Name of airport. May or may not contain the City name.
    'City',        # Main city served by airport. May be spelled differently from Name.
    'Country',     # Country or territory where airport is located (cross-reference: ISO 3166-1 codes).
    'IATA',        # 3-letter IATA code. Null if not assigned/unknown.
    'ICAO',        # 4-letter ICAO code. Null if not assigned.
    'Latitude',    # Decimal degrees, usually to six significant digits. Negative is South, positive is North.
    'Longitude',   # Decimal degrees, usually to six significant digits. Negative is West, positive is East.
    'Altitude',    # In feet.
    'Timezone',    # Hours offset from UTC. Fractional hours are expressed as decimals, e.g. India is 5.5.
    'DST',         # Daylight savings time: E (Europe), A (US/Canada), S (South America), O (Australia),
                   # Z (New Zealand), N (None) or U (Unknown).
    'Tz',          # Tz database time zone: timezone in "tz" (Olson) format, e.g. "America/Los_Angeles".
    'Type',        # "airport" for air terminals, "station" for train stations, "port" for ferry
                   # terminals, "unknown" if not known. In airports.csv, only type=airport is included.
    'Source',      # "OurAirports", "Legacy" (old data not matched to OurAirports, mostly DAFIF) or
                   # "User" (unverified user contributions). In airports.csv, only source=OurAirports is included.
]

# Every line of the file is read as data; `cols` supplies the column names.
airports = pd.read_csv("airports.dat", header=None, names=cols)

airports.head()

First, let's make a map of our routes. It could be useful to make a map of the routes between each airport, where the thickness of the line along each route is proportional to the number of flights (`flight_number` in the above dataframe). The first thing we can do is add the latitude and longitudes of the origin and destination airports to the `num_flights` dataframe we created last week.

In [None]:
airport_coords = airports[["IATA", "Latitude", "Longitude"]]

# Attach coordinates once per route endpoint: rename Latitude/Longitude to the
# endpoint-specific names, inner-merge on the airport's IATA code, then drop
# the helper IATA column.
num_flights_spatial = num_flights.reset_index()
for airport_col, lat_col, lon_col in (
    ("origin_airport", "lat_origin", "lon_origin"),
    ("destination_airport", "lat_destination", "lon_destination"),
):
    endpoint_coords = airport_coords.rename(columns={"Latitude": lat_col, "Longitude": lon_col})
    num_flights_spatial = num_flights_spatial.merge(
        endpoint_coords, how="inner", left_on=airport_col, right_on="IATA"
    ).drop(columns="IATA")

num_flights_spatial.head()

Now we're ready to make our map. Let's try plotting the routes using arcs like those shown in the maps you see on airplanes next to the lunch menu, which we know are called "great circle" paths because (for this hour, at least) we're transportation engineers.

However, making plots of great circle routes wasn't in the readings for this week. What should we do?

To do this, we can use one of the great secrets of IT professionals everywhere: when you encounter a problem you've never seen before, Google `[tool] [verb] [noun]`, where `[tool]` is the programming language or software library you want to use, `[verb]` is what you want to do, and `[noun]` is what you want to do it to.

In this case, our tool is "Geopandas" (or "Python", but more specific is usually better), our verb is "plot" or "map", and our noun is "great circle routes". So we can put "`geopandas plot great circle routes`" or "`geopandas map great circle routes`" into google, and see what comes out. More often than not, there will be a similar question on stackoverflow.com or a blog post by an unemployed software engineer describing something very similar to what you're looking for, so you just have to adapt it to your use case.

(As an aside, "`[tool] [context (optional)] [error message]`" is a good google pattern for debugging your code. {As a double aside, "`what noise does [animal] make`" is a good google pattern for occupying a toddler for an extended period of time.})

In [None]:
num_flights_spatial.describe()

When you give a transportation engineer a map of great circle routes, they might wonder what the difference is between the great circle distance traveled on these routes and the distance it looks like we would travel if we go in a straight line using the "Conus Albers" projection (`epsg=5070`). Let's `describe` that, assuming that the original data use the NAD83 coordinate reference system (`epsg=4269`).

First, let's calculate the great circle distance of each route. We need to do some auxiliary googling to figure out how to do that (e.g. "`python great circle distance`"), which I'll save you by telling you that it's [sklearn.metrics.pairwise.haversine_distances](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.haversine_distances.html). (There's an example at the bottom of that page.)

In [None]:
from sklearn.metrics.pairwise import haversine_distances
from math import radians


def great_circle(row):
    """Return the great-circle distance between a route's two airports.

    `row` must expose `lat_origin`, `lon_origin`, `lat_destination` and
    `lon_destination`; each is converted with `radians` before being passed
    to `haversine_distances`. The result is scaled by 6371000/1000
    (presumably the Earth's mean radius in metres, converted to kilometres).
    """
    origin = [radians(row.lat_origin), radians(row.lon_origin)]
    destination = [radians(row.lat_destination), radians(row.lon_destination)]

    # haversine_distances returns the full pairwise matrix for the two points;
    # entry [0][1] is the origin-to-destination value.
    pairwise = haversine_distances([origin, destination])
    scaled = pairwise * 6371000/1000
    return scaled[0][1]

num_flights_spatial["great_circle_km"] = num_flights_spatial.apply(great_circle, axis=1)

num_flights_spatial["great_circle_km"].describe()

Well, that was enlightening.

Finally, let's see if our flight delay variable is correlated with the distance of the flight:

In [None]:
# Attach each route's flight count, coordinates and great-circle distance to
# every flight on that route (inner merge on the origin/destination pair).
# Both frames have a `flight_number` column, so pandas applies its default
# suffixes: `flight_number_x` comes from the per-flight table and
# `flight_number_y` is the per-route flight count from `num_flights_spatial`.
df_spatial = df_net_stats.merge(num_flights_spatial, how="inner", on=["origin_airport","destination_airport"])

df_spatial.head()

In [None]:
df_spatial[["departure_delay","arrival_delay", "great_circle_km", "flight_number_y"]].corr()

## Part 4: Linear regression

In this class, we're going to make a model of this dataset using linear regression.

First, let's refamiliarize ourselves with the variables we're working with:

In [None]:
df_spatial.columns

Now, let's do some feature engineering. We'll choose some of the variables that we think are likely to be predictive of flight delays (based on our exploratory data analysis above) and prepare them for use in a model:

In [None]:
feature_columns = []
feature_data = {}

def add_categorical_column(df, key):
    """Register column `key` of `df` as a one-hot (indicator) model feature.

    Appends the indicator column to the module-level `feature_columns` list
    and stores the column's values, as a NumPy array, in the module-level
    `feature_data` dict under `key`. The vocabulary is the set of unique
    values found in `df[key]`.
    """
    vocabulary = df[key].unique()
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(
        key=key, vocabulary_list=vocabulary)
    feature_columns.append(tf.feature_column.indicator_column(categorical))
    feature_data[key] = np.array(df[key])
    
add_categorical_column(df_spatial, "carrier_code")

# add more here...


feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

What other columns can we add to the cell above?

Finally, let's define our label data:

In [None]:
label_data = np.array(df_spatial["arrival_delay"])

#### Modeling

Now, let's use these features to make a model of our data.

In [None]:
# These functions are adapted from the machine learning crash course.

def create_model(learning_rate, feature_layer):
  """Build and compile a single-unit linear regression model.

  Args:
    learning_rate: Learning rate passed to the RMSprop optimizer.
    feature_layer: Keras layer added as the model's first layer; its output
      feeds the linear layer.

  Returns:
    A compiled `tf.keras` Sequential model that minimizes mean squared error
    and reports root mean squared error as a metric.
  """
  # Most simple tf.keras models are sequential.
  model = tf.keras.models.Sequential()

  # Add the layer containing the feature columns to the model.
  model.add(feature_layer)

  # Add one linear layer to the model to yield a simple linear regressor.
  # No input_shape is given: this is not the first layer, so its input size
  # is determined by the output of the feature layer.
  model.add(tf.keras.layers.Dense(units=1))

  # Construct the layers into a model that TensorFlow can execute.
  # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
  # newer Keras releases no longer accept.
  model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=learning_rate),
                loss="mean_squared_error",
                metrics=[tf.keras.metrics.RootMeanSquaredError()])

  return model

def plot_the_loss_curve(epochs, rmse):
  """Draw the RMSE for each epoch as a line chart and show it.

  `epochs` supplies the x values and `rmse` the y values; `rmse` must provide
  `.min()` and `.max()`, which are used to set the y-axis limits.
  """
  plt.figure()
  plt.xlabel("Epoch")
  plt.ylabel("Root Mean Squared Error")

  plt.plot(epochs, rmse, label="Loss")
  plt.legend()

  # Pad the y-axis slightly beyond the observed RMSE range.
  lower_limit = rmse.min()*0.94
  upper_limit = rmse.max()* 1.05
  plt.ylim([lower_limit, upper_limit])
  plt.show()

In [None]:
# The following variables are the hyperparameters.
learning_rate = 0.05  # passed to create_model, which hands it to the optimizer
epochs = 10           # passed to model.fit as the number of epochs
batch_size = 1000     # passed to model.fit as the batch size

# Create and compile the model's topology.
model = create_model(learning_rate, feature_layer)

In [None]:
# Train the model. No train/test split is made here: the features and labels
# passed in are the full `feature_data` / `label_data` arrays.
history = model.fit(x=feature_data, y=label_data, batch_size=batch_size,
                  epochs=epochs, shuffle=True, steps_per_epoch=100)
# We wouldn't normally use the 'steps_per_epoch' argument, but we're using it
# here so the training goes faster. (Basically we're only training on part of the data).

# The list of epochs is stored separately from the rest of history.
# NOTE(review): this rebinds `epochs` from the integer hyperparameter to a list,
# so re-running this cell without first re-running the hyperparameter cell would
# pass a list to model.fit(epochs=...) — confirm and consider a separate name.
epochs = history.epoch

# Isolate the root mean squared error for each epoch.
hist = pd.DataFrame(history.history)
rmse = hist["root_mean_squared_error"]

In [None]:
model.summary()

In [None]:
plot_the_loss_curve(epochs, rmse)

In [None]:
# Run the model on the first 1000 entries of every feature array.
plot_data = {name: values[0:1000] for name, values in feature_data.items()}
y_prediction = model.predict(plot_data)

In [None]:
plt.scatter(label_data[0:1000], y_prediction);