In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import yaml
import sys
import os
from datetime import datetime, timedelta

# Resolve the directory two levels above the current working directory and
# make sure it is on sys.path so project-local modules can be imported.
two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
project_root = os.path.abspath(two_levels_up)

# Append only once, so re-running the cell does not grow sys.path.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from functions import detect_outliers_iqr

# Load the YAML configuration, then read the cleaned dataset it points to.
#
# Only a missing config file is handled here: the message is printed and the
# error is re-raised so the notebook stops at the real cause. Any other failure
# (missing config key, missing CSV, parse error) propagates with its own
# traceback instead of being reported as a missing YAML file and leaving `df`
# undefined for the cells below.
try:
    with open("../../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print("Yaml configuration file not found!")
    raise

# The CSV path is taken from config['data']['clean_data']['full_clean'];
# the file is semicolon-separated.
df = pd.read_csv(config['data']['clean_data']['full_clean'], sep=";")

In [None]:
# Economy-class flights with no stops
# NOTE(review): 'stops' is compared against the string '0' — confirm that
# read_csv does not parse this column as an integer, otherwise nothing matches.
economy_mask = df['class'] == 'Economy'
nonstop_mask = df['stops'] == '0'
df_nonstop = df[economy_mask & nonstop_mask]

In [None]:
from scipy.stats import spearmanr

# Keep only the two columns under test and discard rows missing either one
df_valid = df_nonstop.loc[:, ['lead_time_days', 'price']].dropna()

# Spearman (rank-based) correlation between lead time and price
corr, pval = spearmanr(df_valid['lead_time_days'], df_valid['price'])

print(f"Spearman correlation = {corr:.3f}")
print(f"p-value = {pval:.5f}")

# Decision at the 0.05 significance level
is_significant = pval < 0.05
print(
    "Reject H0 → There is a statistically significant relationship between lead time and price."
    if is_significant
    else "Fail to reject H0 → No statistically significant relationship found."
)

In [None]:
# --- Apply the project helper detect_outliers_iqr to each airline group ---
# (presumably it returns the outlier rows of the group — verify in functions.py)
by_airline = df_nonstop.groupby('airline', group_keys=False)
outliers = by_airline.apply(detect_outliers_iqr)
n_outliers = len(outliers)
outlier_pct = n_outliers / len(df_nonstop) * 100
print(f"Total outliers detected: {n_outliers} ({outlier_pct:.2f}%)")

# --- Share of rows below each lead-time threshold, in percent ---
lead_days = df_nonstop['lead_time_days']
lt7_pct = lead_days.lt(7).mean() * 100
lt15_pct = lead_days.lt(15).mean() * 100

print(f"Percentage of bookings made <7 days before departure:  {lt7_pct:.2f}%")
print(f"Percentage of bookings made <15 days before departure: {lt15_pct:.2f}%")

In [None]:
# --- Flag every row as outlier / short-lead so the two can be compared ---
# assign() returns a new frame, so the filtered slice is not modified in place.
df_nonstop = df_nonstop.assign(
    is_outlier=df_nonstop.index.isin(outliers.index),
    short_lead_15=df_nonstop['lead_time_days'] < 15,
)

# Row-normalised cross-tab: within each lead-time group the columns sum to 100
cross = pd.crosstab(
    index=df_nonstop['short_lead_15'],
    columns=df_nonstop['is_outlier'],
    normalize='index',
).mul(100)
print("\nOutlier rate by lead-time group:")
print(cross)

In [None]:
# --- Visualization: lead-time density, outliers (red) vs inliers (grey) ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(
    data=df_nonstop,
    x='lead_time_days',
    hue='is_outlier',
    fill=True,
    common_norm=False,  # each group is normalised on its own
    palette={True: 'red', False: 'grey'},
    ax=ax,
)
ax.set_title('Distribution of Lead Times for Outliers vs Inliers')
ax.set_xlabel('Lead Time (days)')
ax.set_ylabel('Density')
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# --- Aggregate: mean price and row count per lead-time bin ---
# NOTE(review): 'lead_time_bin' is not created in any visible cell —
# presumably it is a column of the cleaned CSV; confirm.
leadtime_summary = (
    df_nonstop
    .groupby('lead_time_bin')['price']
    .agg(mean_price='mean', count='count')
    .reset_index()
)

# --- Figure: bars on the left axis, line on a twin right axis ---
fig, ax1 = plt.subplots(figsize=(10, 6))

# Left axis: booking counts as bars
sns.barplot(data=leadtime_summary, x='lead_time_bin', y='count', color='lightblue', ax=ax1)

# Right axis (shares x with ax1): mean price as a marked line
ax2 = ax1.twinx()
sns.lineplot(
    data=leadtime_summary,
    x='lead_time_bin',
    y='mean_price',
    marker='o',
    color='darkblue',
    linewidth=2,
    ax=ax2,
)

# --- Titles and labels ---
ax1.set_title('Average Ticket Price and Booking Volume by Lead-Time Category', fontsize=14)
ax1.set_xlabel('Lead Time (days before departure)')

# Colour each y-axis label and its tick labels consistently
for axis, label, colour in (
    (ax1, 'Number of Bookings', 'steelblue'),
    (ax2, 'Average Ticket Price', 'darkblue'),
):
    axis.set_ylabel(label, color=colour)
    axis.tick_params(axis='y', labelcolor=colour)

fig.tight_layout()
plt.show()