In [None]:
import os
import sys
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from scipy import stats

# Absolute path of the directory two levels above the current working
# directory; it is put on the import path so the `functions` module imported
# below can be found.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)

# Register the path only once, so re-running the cell does not add duplicates.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from functions import detect_outliers_iqr

# Load the YAML configuration, then the cleaned dataset it points to.
# Only a missing config file is reported with the custom message, and the
# error is re-raised: continuing would leave `config` and `df` undefined and
# every later cell would fail with an unrelated NameError.
try:
    with open("../../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print("Yaml configuration file not found!")
    raise

# Kept outside the try block so that a missing key or an unreadable CSV
# surfaces with its own error instead of being reported as a missing YAML file.
df = pd.read_csv(config['data']['clean_data']['full_clean'], sep=";")

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Prepare the frame used by the duration-vs-price cells that follow.
# NOTE: this cell overwrites `df['duration']`, so it must run exactly once per
# data load; numeric input to pd.to_timedelta is read as nanoseconds, so a
# second run would corrupt the minutes computed here.

# Convert duration ("HH:MM:SS") → total minutes
df['duration'] = (
    pd.to_timedelta(df['duration']).dt.total_seconds() / 60
)

# Keep relevant columns
df = df[['airline', 'duration', 'price']]

# Remove invalid rows. The `> 0` comparisons evaluate to False for NaN, so
# rows with a null duration or price are dropped along with non-positive ones.
# `.copy()` makes the result an independent frame, so adding columns to it
# later does not trigger SettingWithCopyWarning.
df = df[(df['duration'] > 0) & (df['price'] > 0)].copy()

df.head()

In [None]:
# Hexagonal-bin density plot of price against duration.
fig, ax = plt.subplots(figsize=(8, 6))

# bins='log' applies a logarithmic colour scale to the per-hexagon counts.
hexes = ax.hexbin(df['duration'], df['price'], gridsize=50, cmap='viridis', bins='log')

# The colorbar is tied explicitly to the hexbin artist. Its label describes the
# scale rather than claiming the tick values are log10-transformed.
fig.colorbar(hexes, ax=ax, label='Number of flights (log scale)')

ax.set_title("Price vs Duration")
ax.set_xlabel("Duration (minutes)")
ax.set_ylabel("Price (INR)")
fig.tight_layout()
plt.show()

In [None]:
# Bin durations (in minutes) into labelled categories.
# The last edge is open-ended: with a finite upper edge, pd.cut assigns NaN to
# anything above it, and those flights would silently vanish from the '>10h'
# box instead of being counted in it.
bins = [0, 60, 120, 180, 240, 300, 600, np.inf]
labels = ['<1h', '1–2h', '2–3h', '3–4h', '4–5h', '5–10h', '>10h']

# assign() returns a new frame rather than writing into a filtered slice.
df = df.assign(duration_bin=pd.cut(df['duration'], bins=bins, labels=labels))

# Box plot of price per duration category; outlier markers are hidden
# (showfliers=False).
plt.figure(figsize=(10,6))
sns.boxplot(x='duration_bin', y='price', data=df, showfliers=False)
plt.title("Ticket Prices by Flight Duration Category")
plt.xlabel("Duration Category")
plt.ylabel("Price (INR)")
plt.tight_layout()
plt.show()


In [None]:
from scipy.stats import pearsonr, spearmanr

# Correlation between flight duration and ticket price.
durations = df['duration']
prices = df['price']

# Pearson measures linear association; Spearman measures monotonic (rank) association.
pearson_r, pearson_p = pearsonr(durations, prices)
spearman_r, spearman_p = spearmanr(durations, prices)

print(f"Pearson r = {pearson_r:.3f} (p = {pearson_p:.3e})")
print(f"Spearman r = {spearman_r:.3f} (p = {spearman_p:.3e})")

In [None]:
# Reload the full cleaned dataset: the duration analysis reduced `df` to the
# airline/duration/price columns, and the cells below need `stops` and `class`.
clean_csv_path = config['data']['clean_data']['full_clean']
df = pd.read_csv(clean_csv_path, sep=";")

In [None]:
# Fold the "2+" label into "2" so the column can be parsed as a number later.
df['stops'] = df['stops'].replace({'2+': '2'})

In [None]:
# Build the stops-vs-price frame in one pipeline:
#   1. keep only the two columns of interest,
#   2. parse `stops` as a number (unparseable values become NaN),
#   3. drop rows where either column is missing.
df_stops = (
    df[['stops', 'price']]
    .assign(stops=lambda frame: pd.to_numeric(frame['stops'], errors='coerce'))
    .dropna(subset=['stops', 'price'])
)

In [None]:
# Box plot of ticket price for each number of stops.
fig, ax = plt.subplots(figsize=(7, 5))
sns.boxplot(data=df_stops, x='stops', y='price', ax=ax)
ax.set_title("Ticket Prices by Number of Stops")
ax.set_xlabel("Number of Stops")
ax.set_ylabel("Price (INR)")
fig.tight_layout()
plt.show()

In [None]:
# Mean, standard deviation and row count of price for each number of stops.
stops_summary_columns = ['mean', 'std', 'count']
stops_price_stats = df_stops.groupby('stops')['price'].describe()
price_by_stops = stops_price_stats[stops_summary_columns]
print(price_by_stops)

In [None]:
# Parse `stops` as a number on the main frame (unparseable values become NaN
# and therefore match none of the groups below).
df['stops'] = pd.to_numeric(df['stops'], errors='coerce')

# One price series per stop count (0, 1 and 2).
price_0, price_1, price_2 = (
    df.loc[df['stops'].eq(n_stops), 'price'] for n_stops in (0, 1, 2)
)

# Report the size of each group.
print(f"0 stops: {len(price_0)} flights")
print(f"1 stop:  {len(price_1)} flights")
print(f"2 stops: {len(price_2)} flights")

In [None]:
# One-way ANOVA on ticket price across the 0-, 1- and 2-stop groups.
# `stats` is scipy.stats, imported with the other dependencies at the top of
# the notebook.
# The groups are sliced from `df`, which has not had null prices removed, and
# a single NaN would make the F-statistic NaN, so drop missing prices first.
anova_groups = [group.dropna() for group in (price_0, price_1, price_2)]
anova_stat, anova_p = stats.f_oneway(*anova_groups)
print(f"ANOVA F-statistic: {anova_stat:.3f}, p-value: {anova_p:.3e}")

In [None]:
# Class-vs-price frame: normalise the class labels (trim whitespace, capitalise
# the first letter and lower-case the rest), then drop rows missing either value.
class_price = df.loc[:, ['class', 'price']].copy()
class_price['class'] = class_price['class'].str.strip().str.capitalize()
df_class = class_price.dropna(subset=['class', 'price'])

In [None]:
# Box plot of ticket price for each travel class.
fig, ax = plt.subplots(figsize=(6, 5))
sns.boxplot(data=df_class, x='class', y='price', ax=ax)
ax.set_title("Ticket Price by Travel Class")
ax.set_xlabel("Class")
ax.set_ylabel("Price (INR)")
fig.tight_layout()
plt.show()

In [None]:
# Mean ticket price per travel class with a 95% confidence interval.
# Uses `df_class` (normalised class labels, rows with missing values removed)
# so the bars match the categories shown in the box plot and the statistics.
# NOTE(review): newer seaborn releases deprecate `ci=` in favour of
# errorbar=('ci', 95); kept as-is because the installed version is not visible here.
fig, ax = plt.subplots(figsize=(7, 5))
sns.barplot(x='class', y='price', data=df_class, ci=95, palette='pastel', capsize=0.2, ax=ax)
ax.set_title("Average Price by Class (with 95% CI)")
ax.set_xlabel("Travel Class")
ax.set_ylabel("Mean Price (INR)")
fig.tight_layout()
plt.show()

In [None]:
# Kernel density estimate of ticket price, one curve per travel class.
# The filter uses `df_class`, whose labels are stripped and capitalised, so
# the 'Economy' / 'Business' comparisons match regardless of the casing or
# padding in the raw column. Filtering the raw frame could yield empty curves.
fig, ax = plt.subplots(figsize=(8, 5))
for travel_class in ('Economy', 'Business'):
    class_prices = df_class.loc[df_class['class'] == travel_class, 'price']
    sns.kdeplot(class_prices, label=travel_class, fill=True, ax=ax)

ax.set_title("Distribution of Ticket Prices by Class")
ax.set_xlabel("Price (INR)")
ax.set_ylabel("Density")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Mean, standard deviation and row count of price for each travel class.
class_summary_columns = ['mean', 'std', 'count']
class_price_stats = df_class.groupby('class')['price'].describe()
price_by_class = class_price_stats[class_summary_columns]
print(price_by_class)

In [None]:
# Split prices by travel class.
is_economy = df_class['class'].eq('Economy')
is_business = df_class['class'].eq('Business')
economy = df_class.loc[is_economy, 'price']
business = df_class.loc[is_business, 'price']

# Two-sample t-test without assuming equal variances (equal_var=False, i.e. Welch's test).
t_stat, p_value = stats.ttest_ind(economy, business, equal_var=False)
print(f"T-statistic: {t_stat:.3f}, p-value: {p_value:.3e}")