In [None]:
%matplotlib inline
import os

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set_theme()
from mpl_toolkits.basemap import Basemap
from scipy import stats

In [None]:
# Location of the 2013 US flights CSV. The default below is a machine-specific
# absolute path, so the US_FLIGHTS_CSV environment variable can be set to point
# at a local copy of the file; when it is unset the original path is used.
path = os.environ.get(
    'US_FLIGHTS_CSV',
    '/home/nyangweso/Desktop/Ds_1/Data_Science-In-Python/Python projects/Python+Tableau/data/USA Flights data/US_Flights_2013.csv',
)

In [None]:
# Read the flights CSV into the working DataFrame, then preview its first ten rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=10)

In [None]:
# Display the column labels of the loaded frame; later cells select columns by these names.
df.columns

This notebook is a data analysis and visualization exercise using the `US_Flights_2013` dataset. Here's the high-level plan:

**Objectives:**

1. Analyze the flight delay patterns across different airlines, origin and destination airports, and times (month, day of the month, day of the week).
2. Investigate the relationship between departure delay and arrival delay.
3. Understand the cancellation patterns across different airlines and airports.

**Data Analysis & Visualization:**

1. Descriptive statistics of the dataset.
2. Histograms or bar charts for categorical variables like `Carrier`, `OriginCity`, `DestCity`, `Month`, `DayOfWeek`.
3. Scatter plots to visualize the relationship between `DepDelay` and `ArrDelay`.
4. Heatmaps to visualize the delay patterns across different times.
5. Pie charts to visualize the proportion of delayed and cancelled flights.
6. Box plots to visualize the distribution of delay times across different airlines or airports.
7. Line charts to visualize the trend of delays over time.
8. Geographic maps to visualize the delay patterns across different airports.

**Application of Principles:**

1. Use effective visuals: Choose the right type of chart for each visualization based on the nature of the data.
2. Apply Gestalt principles and Preattentive attributes: Use color, size, and position effectively to highlight important information.
3. Storytelling: Arrange the visualizations in a logical order to tell a coherent story about the flight delays and cancellations.

**Explanation of Results:**

1. Provide detailed comments explaining each visualization and the insights derived from it.

**Conclusions:**

1. Summarize the key findings from the analysis.
2. Provide recommendations for airlines to improve their on-time performance based on the findings.

**References:**

1. Include all the references used in the analysis.

Please note that this is a high-level plan. The actual analysis involves writing code in Python (the cells below) and creating visualizations in Tableau.


#### Count based on airlines


In [None]:
# Tally the number of rows recorded for each carrier.
carrier_column = df['Carrier']
flight_counts = carrier_column.value_counts()

In [None]:
# Bar chart of flight volume per carrier, drawn through an explicit Axes object.
carrier_fig, carrier_ax = plt.subplots(figsize=(10, 6))
carrier_ax.bar(flight_counts.index, flight_counts.values, color='b')
carrier_ax.set_xlabel('Carrier')
carrier_ax.set_ylabel('Number of Flights')
carrier_ax.set_title('Number of Flights for Each Carrier')
plt.show()

#### Count based on airports


In [None]:
# Flights per airport, counted separately as origin and as destination
origin_airport_counts = df['OriginAirportName'].value_counts()
dest_airport_counts = df['DestAirportName'].value_counts()

# Every airport that appears on either side. The union is sorted because a
# bare set of strings has no guaranteed iteration order (string hashing can
# differ between interpreter sessions), which would reshuffle the x-axis
# each time the notebook is re-run.
airports = sorted(set(origin_airport_counts.index)
                  | set(dest_airport_counts.index))

# Counts aligned to `airports`; an airport absent from one side counts as 0
origin_counts = [origin_airport_counts.get(airport, 0) for airport in airports]
dest_counts = [dest_airport_counts.get(airport, 0) for airport in airports]

# One x-axis position per airport
r = np.arange(len(airports))

# Create the figure and a single subplot; all drawing below goes through `ax`
fig, ax = plt.subplots(figsize=(15, 10))

# Width of a bar; the two series sit half a width either side of each tick
width = 0.4

# Plotting
ax.bar(r - width/2, origin_counts, color='b', width=width, label='origin')
ax.bar(r + width/2, dest_counts, color='r', width=width, label='destination')

# Adding labels and title
ax.set_xlabel('Airport')
ax.set_ylabel('Number of Flights')
ax.set_title('Number of Flights for Each Airport as Origin and Destination')
ax.set_xticks(r)
ax.set_xticklabels(airports, rotation=90)

# Show the legend
ax.legend()

# Show the plot
plt.show()

Line Chart: Plot the average DepDelay and ArrDelay for each Month. This can show whether delays are more common in certain months.


In [None]:
# Mean departure and arrival delay for each month
average_delays = df.groupby('Month')[['DepDelay', 'ArrDelay']].mean()

# One line per delay column, both on the same axes
monthly_fig, monthly_ax = plt.subplots(figsize=(10, 6))
for delay_column, legend_label in (('DepDelay', 'Departure Delays'),
                                   ('ArrDelay', 'Arrival Delays')):
    monthly_ax.plot(average_delays.index, average_delays[delay_column],
                    marker='o', label=legend_label)
monthly_ax.set_xlabel('Month')
monthly_ax.set_ylabel('Average Delay (in minutes)')
monthly_ax.set_title('Average Departure and Arrival Delays Over Months')
monthly_ax.legend()
monthly_ax.grid(True)
plt.show()

Heatmap: Show DayOfWeek vs Carrier with color indicating average DepDelay. This can reveal if certain carriers have more delays on specific days of the week.


In [None]:
# Mean departure delay per (day of week, carrier) pair, then pivoted so that
# days of the week are rows and carriers are columns
delay_by_day_and_carrier = df.groupby(['DayOfWeek', 'Carrier'])['DepDelay'].mean()
average_dep_delay = delay_by_day_and_carrier.unstack()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(average_dep_delay, cmap='coolwarm', annot=True, fmt=".1f",
            ax=heatmap_ax)
heatmap_ax.set_title(
    'Average Departure Delay for Each Carrier on Each Day of the Week')
plt.show()

Pie Chart: Show the proportion of flights that are Cancelled. This gives a quick view of how many flights are cancelled.


In [None]:
# Number of flights for each value of the Cancelled flag
cancelled_flights = df['Cancelled'].value_counts()

# Derive each wedge's label from the flag value it represents. value_counts
# orders by frequency and leaves out values that never occur, so a fixed
# two-item label list could be attached to the wrong wedge or have the wrong
# length. Assumes Cancelled is a 0/1 (or boolean) flag -- TODO confirm.
cancellation_labels = ['Cancelled' if flag else 'Not Cancelled'
                       for flag in cancelled_flights.index]

plt.figure(figsize=(6, 6))
plt.pie(cancelled_flights, labels=cancellation_labels, autopct='%1.1f%%')
plt.title('Proportion of Flights Cancelled')
plt.show()

#### Cancellation % and average delays by carrier


In [None]:
# Per-carrier view of the flights table, shared by the three aggregates below
grouped = df.groupby('Carrier')

# Mean of the Cancelled column per carrier, scaled to a percentage
cancel_percent = grouped['Cancelled'].mean().mul(100)

# Mean departure and arrival delay per carrier
avg_dep_delay, avg_arr_delay = (
    grouped[delay_column].mean() for delay_column in ('DepDelay', 'ArrDelay')
)

In [None]:
# Assemble the per-carrier aggregates into one table, one column per metric
summary_columns = dict([
    ('Cancellation %', cancel_percent),
    ('Average Departure Delay', avg_dep_delay),
    ('Average Arrival Delay', avg_arr_delay),
])
result = pd.DataFrame(data=summary_columns)
result

Scatter Plot: Plot DepDelay vs ArrDelay to see if there is a correlation between departure delay and arrival delay.


In [None]:
# Fit the line of best fit only on rows where both delays are present:
# no NaN handling is requested from linregress, so a single missing delay
# would otherwise propagate into the fitted slope and intercept.
delay_pairs = df[['DepDelay', 'ArrDelay']].dropna()
slope, intercept, r_value, p_value, std_err = stats.linregress(
    delay_pairs['DepDelay'], delay_pairs['ArrDelay'])

# Column used for the colour gradient: absolute gap between departure and arrival delay
df['DelayDifference'] = (df['DepDelay'] - df['ArrDelay']).abs()

# A straight line is fully described by its two end points, so evaluate the
# fit only at the smallest and largest departure delay instead of every row
fit_x = np.array([delay_pairs['DepDelay'].min(),
                  delay_pairs['DepDelay'].max()])

plt.figure(figsize=(10, 6))
scatter = plt.scatter(df['DepDelay'], df['ArrDelay'],
                      c=df['DelayDifference'], cmap='RdYlGn_r')
plt.plot(fit_x, intercept + slope * fit_x, 'r', label='fitted line')
plt.colorbar(scatter, label='|Departure Delay - Arrival Delay|')
plt.xlabel('Departure Delay')
plt.ylabel('Arrival Delay')
plt.title('Departure Delay vs Arrival Delay')
# Without a legend the 'fitted line' label is never displayed
plt.legend()
plt.show()

Histogram: Show the distribution of ArrDelay. This can give an idea of the common delay times.


In [None]:
# Histogram of arrival delays in 30 bins, drawn through an explicit Axes object
arrival_fig, arrival_ax = plt.subplots(figsize=(10, 6))
arrival_ax.hist(df['ArrDelay'], bins=30, edgecolor='black')
arrival_ax.set_xlabel('Arrival Delay')
arrival_ax.set_ylabel('Frequency')
arrival_ax.set_title('Distribution of Arrival Delays')
plt.show()

Box Plot: Show the distribution of DepDelay for each Carrier. This can reveal which carriers have the most variation in departure delays.


In [None]:
# One box per carrier summarising its departure delays
boxplot_fig, boxplot_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Carrier', y='DepDelay', ax=boxplot_ax)
boxplot_ax.set_xlabel('Carrier')
boxplot_ax.set_ylabel('Departure Delay')
boxplot_ax.set_title('Distribution of Departure Delays for Each Carrier')
plt.show()