In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import yaml
import sys
import os
from datetime import datetime, timedelta

# Resolve the directory two levels above the current working directory and
# make sure it is on sys.path, so that the local `functions` module imported
# right below can be found.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from functions import detect_outliers_iqr

# Load the project configuration, then the cleaned dataset it points to.
#
# The two steps are deliberately separated: only a genuinely missing config
# file is reported as such, while a malformed YAML file, a missing config key
# or an unreadable CSV surface with their own accurate exception instead of
# being collapsed into one misleading message by a bare `except`.
config_path = "../../config.yaml"

try:
    with open(config_path, "r") as config_file:
        config = yaml.safe_load(config_file)
except FileNotFoundError as exc:
    # Fail fast: every later cell needs `df`, so carrying on would only turn
    # this problem into a confusing NameError further down the notebook.
    raise FileNotFoundError(f"Yaml configuration file not found: {config_path}") from exc

# The CSV location is read from config['data']['clean_data']['full_clean'];
# the file is parsed as semicolon-separated.
df = pd.read_csv(config['data']['clean_data']['full_clean'], sep=";")

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

In [None]:
# Data type pandas inferred for each column when reading the CSV.
df.dtypes

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
df.duplicated().any()

In [None]:
# Per column: True if that column contains at least one missing value.
df.isnull().any()

In [None]:
# Distinct values present in the `airline` column.
df["airline"].unique()

In [None]:
# Number of rows per value of `class`. A notebook cell only renders its last
# expression automatically, so this intermediate result is displayed
# explicitly (`display` is provided by the IPython kernel); previously it was
# computed and silently discarded.
display(df['class'].value_counts())

# Distinct `class` values that occur for each airline.
df.groupby('airline')['class'].unique()

In [None]:
# Economy rows with no stops. `stops` is compared against the string '0'.
is_economy = df['class'] == 'Economy'
is_nonstop = df['stops'] == '0'
df_nonstop = df[is_economy & is_nonstop]
df_nonstop

In [None]:
# Per-airline price summary for the non-stop Economy subset.
price_stats = ['count', 'mean', 'median', 'std', 'min', 'max']
df_nonstop.groupby('airline')['price'].agg(price_stats)

In [None]:
# Economy rows with exactly one stop (`stops` equal to the string '1'),
# followed by the per-airline price summary for that subset.
economy_rows = df['class'] == 'Economy'
one_stop_rows = df['stops'] == '1'
df_1stop = df[economy_rows & one_stop_rows]

one_stop_prices = df_1stop.groupby('airline')['price']
one_stop_prices.agg(['count', 'mean', 'median', 'std', 'min', 'max'])

In [None]:
# Economy rows whose `stops` value is the string '2+', followed by the
# per-airline price summary for that subset.
economy_mask = df['class'] == 'Economy'
multi_stop_mask = df['stops'] == '2+'
df_2more_stop = df[economy_mask & multi_stop_mask]

multi_stop_prices = df_2more_stop.groupby('airline')['price']
multi_stop_prices.agg(['count', 'mean', 'median', 'std', 'min', 'max'])

In [None]:
# Box plot of non-stop prices per airline; points beyond the whiskers give a
# first visual impression of potential outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_nonstop, x='airline', y='price', ax=ax)
plt.xticks(rotation=45)
ax.set_title('Price Distribution by Airline (Detecting Outliers)')
plt.show()

In [None]:
# Run the project's IQR helper separately on each airline's non-stop rows.
# NOTE(review): the helper presumably returns the flagged rows with their
# original index (later cells rely on `outliers.index`) - confirm in functions.py.
nonstop_by_airline = df_nonstop.groupby('airline', group_keys=False)
outliers = nonstop_by_airline.apply(detect_outliers_iqr)
print(f"Total outliers detected: {len(outliers)}")

outliers.head()

In [None]:
# Fraction of the non-stop rows that were flagged as outliers.
outlier_share = len(outliers) / len(df_nonstop)
print(f"Outlier percentage: {outlier_share:.2%}")

In [None]:
from scipy.stats import shapiro, levene

# Price sample of every airline in the non-stop subset, built once and reused
# by both tests below (airline names are the groupby keys, hence unique).
price_by_airline = {name: group['price'] for name, group in df_nonstop.groupby('airline')}

# normality test (Shapiro–Wilk per airline)
for name, prices in price_by_airline.items():
    # Shapiro–Wilk needs at least three observations; depending on the SciPy
    # version a smaller sample raises or yields NaN, so such airlines are
    # reported and skipped instead of aborting the whole cell.
    if len(prices) < 3:
        print(f"{name}: skipped (only {len(prices)} observation(s); Shapiro–Wilk needs at least 3)")
        continue
    stat, p = shapiro(prices)
    print(f"{name}: p(normality) = {p:.4f}")

# homogeneity of variances (Levene’s test)
stat, p = levene(*price_by_airline.values())
print(f"Levene’s test for equal variances: p = {p:.4f}")

In [None]:
# Same non-stop box plot, restricted to rows whose index is not among the
# rows flagged as outliers.
is_outlier = df_nonstop.index.isin(outliers.index)
ax = sns.boxplot(data=df_nonstop[~is_outlier], x='airline', y='price')
plt.xticks(rotation=45)
ax.set_title('Price Distribution by Airline (without Outliers)')
plt.show()

In [None]:
from scipy.stats import kruskal

# Kruskal–Wallis H-test across airlines on the full non-stop subset:
# one price sample per airline is passed to the test.
groups = [prices for _, prices in df_nonstop.groupby('airline')['price']]
stat, p = kruskal(*groups)
print(f"Kruskal–Wallis test (with outliers): p = {p:.4f}")

In [None]:
# Repeat the Kruskal–Wallis test after dropping the rows flagged as outliers.
keep_rows = ~df_nonstop.index.isin(outliers.index)
df_no_outliers = df_nonstop[keep_rows]
groups_no = [prices for _, prices in df_no_outliers.groupby('airline')['price']]
stat_no, p_no = kruskal(*groups_no)
print(f"Kruskal–Wallis test (without outliers): p = {p_no:.4f}")

In [None]:
# Mean non-stop price per airline, ordered from the highest mean to the
# lowest so the bars read left to right in descending order.
mean_prices = (
    df_nonstop
    .groupby('airline')['price']
    .mean()
    .reset_index()
    .sort_values('price', ascending=False)
)

# Bar chart of the means
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=mean_prices, x='airline', y='price', palette='viridis', ax=ax)

ax.set_title('Mean Ticket Price by Airline')
ax.set_xlabel('Airline')
ax.set_ylabel('Mean Price (€ or $)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Box plot of 1-stop prices per airline, used to eyeball potential outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_1stop, x='airline', y='price', ax=ax)
plt.xticks(rotation=45)
ax.set_title('Price Distribution by Airline 1-stop (Detecting Outliers)')
plt.show()

In [None]:
# Run the project's IQR helper on each airline's 1-stop rows. This rebinds
# `outliers`, which until now held the non-stop result.
one_stop_by_airline = df_1stop.groupby('airline', group_keys=False)
outliers = one_stop_by_airline.apply(detect_outliers_iqr)

print(f"Total outliers detected: {len(outliers)}")
outliers.head()

In [None]:
# Fraction of the 1-stop rows that were flagged as outliers. `outliers` was
# computed from df_1stop in the preceding cell, so the denominator has to be
# df_1stop too (dividing by len(df_nonstop) mixed two different subsets).
print(f"Outlier percentage: {len(outliers) / len(df_1stop):.2%}")

In [None]:
# Descriptive statistics of the flagged rows' prices, one row per airline.
outlier_prices_by_airline = outliers.groupby('airline')['price']
outliers_summary = outlier_prices_by_airline.describe()
outliers_summary

In [None]:
# Share of non-stop rows booked with a short lead time.
# The threshold is defined once and used for both the filter and the message:
# previously the filter used 15 days while the comment, the printed text and
# the per-airline breakdown in the next cell all used 7 days.
last_minute_days = 7

# total rows
total_rows = len(df_nonstop)

# number of rows where lead_time_days is below the threshold
last_minute = int((df_nonstop['lead_time_days'] < last_minute_days).sum())

# percentage
percentage_last_minute = (last_minute / total_rows) * 100

print(f"Percentage of bookings made less than {last_minute_days} days before departure: {percentage_last_minute:.2f}%")

In [None]:
# Per airline: percentage of non-stop rows with lead_time_days below 7.
# The mean of a boolean flag is the fraction of True values, scaled to percent.
is_last_minute = df_nonstop['lead_time_days'] < 7

leadtime_summary = (
    df_nonstop
    .assign(last_minute=is_last_minute)
    .groupby('airline')['last_minute']
    .mean()
    .mul(100)
    .rename('percent_lt7days')
    .reset_index()
)

leadtime_summary.sort_values('percent_lt7days', ascending=False)

In [None]:
# LOWESS-smoothed price trend against lead time, one line per airline, for
# the non-stop subset.
#
# sns.lmplot is a figure-level function: it creates its own figure, sized via
# `height` and `aspect`. A preceding plt.figure(...) therefore only produced
# an additional empty figure in the output, so it has been removed.
grid = sns.lmplot(
    data=df_nonstop,
    x='lead_time_days',
    y='price',
    hue='airline',
    lowess=True,
    height=6,
    aspect=1.5,
    scatter=False,  # draw only the smoothed lines, not the individual points
    palette='husl'
)

# The pyplot calls below act on the axes lmplot has just created.
plt.title('Average Price Trend by Lead Time (Smoothed)')
plt.xlabel('Lead Time (days)')
plt.ylabel('Ticket Price')
plt.tight_layout()
plt.show()


In [None]:
# Kruskal–Wallis H-test across airlines on the full 1-stop subset.
groups = [prices for _, prices in df_1stop.groupby('airline')['price']]
stat, p = kruskal(*groups)
print(f"Kruskal–Wallis test (with outliers): p = {p:.4f}")

In [None]:
# Kruskal–Wallis comparison for the 1-stop subset, with and without outliers.
# At this point `outliers` holds the rows flagged within df_1stop, so both
# variants must be built from df_1stop. Filtering df_nonstop by those indices
# (as before) removed nothing and made the two p-values trivially identical.

# With outliers
groups_full = [group['price'] for _, group in df_1stop.groupby('airline')]
stat_full, p_full = kruskal(*groups_full)

# Without outliers
df_no_outliers = df_1stop[~df_1stop.index.isin(outliers.index)]
groups_no = [group['price'] for _, group in df_no_outliers.groupby('airline')]
stat_no, p_no = kruskal(*groups_no)

print(f"With outliers:  p = {p_full:.4f}")
print(f"Without outliers: p = {p_no:.4f}")

In [None]:
# Mean 1-stop price per airline, ordered from the highest mean to the lowest.
airline_means = df_1stop.groupby('airline')['price'].mean()
mean_prices = airline_means.reset_index().sort_values('price', ascending=False)

# Bar chart of the means
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=mean_prices, x='airline', y='price', palette='viridis', ax=ax)

ax.set_title('Mean Ticket Price by Airline')
ax.set_xlabel('Airline')
ax.set_ylabel('Mean Price')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Box plot of prices per airline for the 2+ stops subset, used to eyeball
# potential outliers. The title now names the subset actually plotted
# (it previously said "1-stop", copied from the 1-stop section).
plt.figure(figsize=(10,6))
sns.boxplot(data=df_2more_stop, x='airline', y='price')
plt.xticks(rotation=45)
plt.title('Price Distribution by Airline 2+ stops (Detecting Outliers)')
plt.show()

In [None]:
# Price sample of every airline in the 2+ stops subset, built once and reused
# by both tests below (airline names are the groupby keys, hence unique).
multi_stop_price_by_airline = {
    name: group['price'] for name, group in df_2more_stop.groupby('airline')
}

# normality test (Shapiro–Wilk per airline)
for name, prices in multi_stop_price_by_airline.items():
    # Shapiro–Wilk needs at least three observations; depending on the SciPy
    # version a smaller sample raises or yields NaN, so such airlines are
    # reported and skipped instead of aborting the whole cell.
    if len(prices) < 3:
        print(f"{name}: skipped (only {len(prices)} observation(s); Shapiro–Wilk needs at least 3)")
        continue
    stat, p = shapiro(prices)
    print(f"{name}: p(normality) = {p:.4f}")

# homogeneity of variances (Levene’s test)
stat, p = levene(*multi_stop_price_by_airline.values())
print(f"Levene’s test for equal variances: p = {p:.4f}")

In [None]:
# Run the project's IQR helper on each airline's rows of the 2+ stops subset.
# This cell sits in the 2+ stops section but previously recomputed the 1-stop
# outliers from df_1stop, so the count printed here described the wrong subset.
outliers = df_2more_stop.groupby('airline', group_keys=False).apply(detect_outliers_iqr)

print(f"Total outliers detected: {len(outliers)}")

In [None]:
# Kruskal–Wallis H-test across airlines on the full 2+ stops subset.
groups = [prices for _, prices in df_2more_stop.groupby('airline')['price']]
stat, p = kruskal(*groups)
print(f"Kruskal–Wallis test (with outliers): p = {p:.4f}")

In [None]:
# Mean price per airline for the 2+ stops subset, highest mean first.
mean_prices = (
    df_2more_stop
    .groupby('airline')['price']
    .mean()
    .reset_index()
    .sort_values('price', ascending=False)
)

# Bar chart of the means
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=mean_prices, x='airline', y='price', palette='viridis', ax=ax)

ax.set_title('Mean Ticket Price by Airline')
ax.set_xlabel('Airline')
ax.set_ylabel('Mean Price (€ or $)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Kruskal–Wallis comparison for the 2+ stops subset, with and without outliers.
# This cell closes the 2+ stops section but previously re-tested df_nonstop
# and filtered it with whatever `outliers` happened to hold. The outliers are
# derived here from df_2more_stop itself, so the comparison does not depend on
# the state another cell left in `outliers`.
# NOTE(review): assumes detect_outliers_iqr returns the flagged rows with their
# original index, as the other cells' use of `outliers.index` implies - confirm.
outliers_2more = df_2more_stop.groupby('airline', group_keys=False).apply(detect_outliers_iqr)

# With outliers
groups_full = [group['price'] for _, group in df_2more_stop.groupby('airline')]
stat_full, p_full = kruskal(*groups_full)

# Without outliers
df_no_outliers = df_2more_stop[~df_2more_stop.index.isin(outliers_2more.index)]
groups_no = [group['price'] for _, group in df_no_outliers.groupby('airline')]
stat_no, p_no = kruskal(*groups_no)

print(f"With outliers:  p = {p_full:.4f}")
print(f"Without outliers: p = {p_no:.4f}")