# Flight Data Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os
from datetime import datetime

# Notebook-wide display settings.
# Allow up to 500 columns to be shown when a DataFrame is rendered.
pd.options.display.max_columns = 500
# plt.style.use("seaborn-colorblind")  # disabled by the author; kept for reference
# Seaborn's current colour palette; later cells pick bar colours from it by index.
pal = sns.color_palette()

# Read in and Format Data

In [None]:
from pathlib import Path

# Location of the raw parquet files. The original notebook hard-coded one
# user's home directory; that path stays as the default so existing runs are
# unchanged, but it can now be overridden with the FLIGHT_DATA_DIR environment
# variable (os is imported in the notebook's first cell).
data_dir = Path(
    os.environ.get(
        "FLIGHT_DATA_DIR",
        "/Users/Praveen/bd_project/flight_analysis_project/data/raw",
    )
)

# Recursively collect every .parquet file. Sorting makes the load order (and
# therefore the row order of the concatenated frame) independent of the
# filesystem's directory-listing order.
parquet_files = sorted(data_dir.rglob("*.parquet"))

# Fail here with an actionable message instead of letting pd.concat raise a
# generic "No objects to concatenate" error in a later cell.
if not parquet_files:
    raise FileNotFoundError(f"No .parquet files found under {data_dir}")

# Print the list of files to verify
print(parquet_files)

In [None]:
# Sanity check on the discovery step: report how many files were found.
# (Replaces a second dump of the full list and a leftover placeholder print.)
print(f"Found {len(parquet_files)} parquet file(s) under {data_dir}", flush=True)


In [None]:
# Columns read from each parquet file; everything else is skipped at load time.
column_subset = [
    # flight identity
    "FlightDate", "Airline", "Flight_Number_Marketing_Airline",
    "Origin", "Dest",
    # outcome flags
    "Cancelled", "Diverted",
    # departure
    "CRSDepTime", "DepTime", "DepDelayMinutes",
    # origin / destination details
    "OriginAirportID", "OriginCityName", "OriginStateName",
    "DestAirportID", "DestCityName", "DestStateName",
    # taxi times
    "TaxiOut", "TaxiIn",
    # arrival
    "CRSArrTime", "ArrTime", "ArrDelayMinutes",
]

# Read every discovered file, then stack them into one frame with a fresh
# 0..n-1 index.
dfs = [pd.read_parquet(path, columns=column_subset) for path in parquet_files]
df = pd.concat(dfs, ignore_index=True)

# Convert the listed text columns to pandas' categorical dtype.
cat_cols = ["Airline", "Origin", "Dest", "OriginStateName", "DestStateName"]
for col_name in cat_cols:
    df[col_name] = df[col_name].astype("category")

# Understand the `DepDelayMinutes` Variable

In [None]:

# Directory that receives every figure exported by this notebook (later cells
# reuse `folder_path`). The original hard-coded path remains the default; set
# the FLIGHT_PRESENTATION_DIR environment variable to write somewhere else.
folder_path = os.environ.get(
    "FLIGHT_PRESENTATION_DIR",
    "/Users/Praveen/bd_project/flight_analysis_project/data/presentation",
)
# Create the folder if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Histogram of departure delays below 30 minutes, drawn on an explicit
# figure/axes pair so the save call does not depend on pyplot's "current figure".
fig, ax = plt.subplots()
df.query("DepDelayMinutes < 30")["DepDelayMinutes"].plot(
    kind="hist", bins=30, title="Distribution of Flight Delays < 30 Min", ax=ax
)
ax.set_xlabel("Departure delay (minutes)")

file_name = "flight_delay_distribution.png"
file_path = os.path.join(folder_path, file_name)

# Save the plot
fig.savefig(file_path)

# Show the plot
plt.show()

print(f"Plot saved at {file_path}")


In [None]:
# Histogram of departure delays that fall strictly between 1 and 61 minutes.
# NOTE(review): `> 1` leaves out delays of exactly 1 minute, while the output
# file name says "1_60" — confirm whether `>= 1` was intended.
delays_within_hour = df.query("DepDelayMinutes > 1 and DepDelayMinutes < 61")
ax = delays_within_hour["DepDelayMinutes"].plot(
    kind="hist",
    bins=30,
    title="Distribution of Flight Delays - by an hour",
)

# Write the figure into the presentation folder created in an earlier cell.
file_name = "flight_delay_distribution_1_60_min.png"
file_path = os.path.join(folder_path, file_name)
ax.figure.savefig(file_path)

# Render the figure inline.
plt.show()

print(f"Plot saved at {file_path}")

# Grouping of Delays
Per Wikipedia https://en.wikipedia.org/wiki/Flight_cancellation_and_delay:
```
Delays are divided into three categories, namely "on time or small delay" (up to 15 minutes delay), "Medium delay" (15 – 45 minutes delay) and "Large delay" (> 45 minutes delay).
```

In [None]:
# Label every flight with a delay bucket based on its departure delay.
# Rules are applied in order, so a later rule overwrites an earlier one;
# "Cancelled" comes last and therefore wins for cancelled flights.
# Rows matched by no rule keep the initial value None.
dep_delay = df["DepDelayMinutes"]
delay_rules = [
    ("OnTime_Early", dep_delay == 0),
    ("Small_Delay", (dep_delay > 0) & (dep_delay <= 15)),
    ("Medium_Delay", (dep_delay > 15) & (dep_delay <= 45)),
    ("Large_Delay", dep_delay > 45),
    ("Cancelled", df["Cancelled"]),  # used directly as a boolean row mask
]

df["DelayGroup"] = None
for label, mask in delay_rules:
    df.loc[mask, "DelayGroup"] = label

In [None]:
# Horizontal bar chart of how many flights fall into each delay bucket,
# smallest bucket first.
delay_group_counts = df["DelayGroup"].value_counts(ascending=True)
ax = delay_group_counts.plot(
    kind="barh",
    figsize=(10, 5),
    color=pal[1],
    title="Flight Results (2018-2022)",
)

# Write the figure into the presentation folder created in an earlier cell.
file_name = "flight_results_delaygroup.png"
file_path = os.path.join(folder_path, file_name)
ax.figure.savefig(file_path)

# Render the figure inline.
plt.show()

print(f"Plot saved at {file_path}")

# How Many Flights Per Year?

In [None]:
# Derive the calendar year of each flight, then chart the number of rows per
# year in chronological order.
df["Year"] = df["FlightDate"].dt.year
flights_per_year = df["Year"].value_counts().sort_index()
ax = flights_per_year.plot(
    kind="bar",
    figsize=(10, 5),
    title="Scheduled Flights Per Year",
)

# Write the figure into the presentation folder created in an earlier cell.
file_name = "Scheduled_Flights_Per_Year.png"
file_path = os.path.join(folder_path, file_name)
ax.figure.savefig(file_path)

# Render the figure inline.
plt.show()

print(f"Plot saved at {file_path}")

# What is the % of Flight Results by Year

In [None]:
# Percentage of flights in each delay bucket, per year (each row sums to 100
# over the non-null buckets).
df["Year"] = df["FlightDate"].dt.year
yearly_shares = df.groupby("Year")["DelayGroup"].value_counts(normalize=True)
df_agg = yearly_shares.unstack().mul(100)

# Show the buckets from best to worst outcome, shaded by magnitude.
col_order = ["OnTime_Early", "Small_Delay", "Medium_Delay", "Large_Delay", "Cancelled"]
df_agg.loc[:, col_order].style.background_gradient(cmap="Greens")

# Results by Month

In [None]:
# Percentage of flights in each delay bucket, per calendar month (all years
# pooled together).
df["Month"] = df["FlightDate"].dt.month
monthly_shares = df.groupby("Month")["DelayGroup"].value_counts(normalize=True)
df_agg = monthly_shares.unstack().mul(100)

# Show the buckets from best to worst outcome, shaded by magnitude.
col_order = ["OnTime_Early", "Small_Delay", "Medium_Delay", "Large_Delay", "Cancelled"]
df_agg.loc[:, col_order].style.background_gradient(cmap="Blues")

## Exclude 2020

In [None]:
# Same monthly breakdown as before, but with all 2020 flights removed first.
# Relies on the "Year" column created in an earlier cell.
df["Month"] = df["FlightDate"].dt.month
flights_without_2020 = df.query("Year != 2020")
monthly_shares = flights_without_2020.groupby("Month")["DelayGroup"].value_counts(
    normalize=True
)
df_agg = monthly_shares.unstack().mul(100)

# Show the buckets from best to worst outcome, shaded by magnitude.
col_order = ["OnTime_Early", "Small_Delay", "Medium_Delay", "Large_Delay", "Cancelled"]
df_agg.loc[:, col_order].style.background_gradient(cmap="Blues")

## Why High Cancellation in March/April? What does it look like by year for these months?

In [None]:
# Restrict to March and April, then break the delay buckets down by year.
# Relies on the "Year" column created in an earlier cell.
df["Month"] = df["FlightDate"].dt.month
march_april = df.query("3 <= Month <= 4")
yearly_shares = march_april.groupby("Year")["DelayGroup"].value_counts(normalize=True)
df_agg = yearly_shares.unstack().mul(100)

# axis=0 shades each column on its own scale, so years are compared within a bucket.
col_order = ["OnTime_Early", "Small_Delay", "Medium_Delay", "Large_Delay", "Cancelled"]
df_agg.loc[:, col_order].style.background_gradient(cmap="Oranges", axis=0)

# Plot using `calmap`

In [None]:
!pip install calmap plotly_calplot -q

In [None]:
import calmap

# Daily cancellation rate: the mean of the Cancelled flag per flight date.
events = df.groupby("FlightDate")["Cancelled"].mean()

# The heatmap is drawn on a log scale. A day with no cancellations has a rate
# of 0 and log(0) is -inf, which can wreck the colour normalisation for every
# year. Drop those days before taking the log; they are then simply absent
# from the plotted series instead of contributing -inf.
log_events = np.log(events[events > 0])

# One calendar strip per year, stacked vertically.
years = [2018, 2019, 2020, 2021, 2022]
fig, axs = plt.subplots(len(years), 1, figsize=(10, 10))
for ax, year in zip(axs, years):
    calmap.yearplot(
        log_events, year=year, cmap="YlOrRd", monthly_border=True, ax=ax
    )
    ax.set_title(year)
fig.patch.set_facecolor("white")
fig.suptitle("US Flight Cancellations", y=0.92, fontsize=20)
file_name = "flight_cancellations.png"
file_path = os.path.join(folder_path, file_name)

# Save the plot
fig.savefig(file_path)

# Show the plot
plt.show()

print(f"Plot saved at {file_path}")

# Interactive Calendar Heatmap

In [None]:
from plotly_calplot import calplot

# Build the interactive calendar heatmap from the log of the daily
# cancellation rate (`events` comes from the calmap cell).
# NOTE(review): a day with a rate of 0 becomes -inf under the log — confirm
# how plotly_calplot renders such values.
log_cancel_rate = events.apply(np.log).reset_index()
fig = calplot(log_cancel_rate, x="FlightDate", y="Cancelled")

# Export the figure as a standalone HTML page in the presentation folder.
file_name = "interactive_flight_cancellations.html"
file_path = os.path.join(folder_path, file_name)
fig.write_html(file_path)

# Render the figure inline as well.
fig.show()

print(f"Interactive plot saved at {file_path}")

# Compare Airlines
- Who has the most delays?
- Who has the most cancellations?
- Who is the most reliable? (on time)

In [None]:
# Number of rows per airline, expressed in units of 100,000 flights and
# sorted so the largest carrier ends up at the top of the horizontal bar chart.
flights_per_airline = df["Airline"].value_counts(ascending=True) / 100_000

fig, ax = plt.subplots(figsize=(10, 10))
# Note: despite its name, `airlines_ordered` holds the Axes returned by .plot().
airlines_ordered = flights_per_airline.plot(
    kind="barh",
    ax=ax,
    color=pal[2],
    width=1,
    edgecolor="black",
)
ax.set(title="Number of Flights in Dataset", xlabel="Flights (100k)")

# Export as PNG with tight bounding box so the long airline labels are not clipped.
file_name = "number_of_flights.png"
file_path = os.path.join(folder_path, file_name)
fig.savefig(file_path, bbox_inches="tight")

# Render the figure inline.
plt.show()

print(f"Plot saved at {file_path}")
