# Flight Fare Prediction - Exploratory Data Analysis (EDA)

This notebook provides a step-by-step walkthrough of the EDA process, replicating the logic in `src/eda.py` but in an interactive format.

In [None]:
import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Make `from src import ...` resolvable: the project root is taken to be the
# parent of the current working directory, and it is appended to sys.path
# only when it is not listed there yet (so re-running the cell adds nothing).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src import config
from src.data_loader import load_data
from src.preprocessing import preprocess_data
from src.feature_engineering import engineer_features

# Notebook-wide display defaults:
#   - pandas: no cap on the number of columns rendered for a DataFrame
#   - seaborn: "whitegrid" theme applied to figures drawn after this cell
pd.options.display.max_columns = None
sns.set_theme(style="whitegrid")

## 1. Load and Prepare Data

We use the pipeline functions to load, clean, and engineer features for our data.

In [None]:
# Load Data
# load_data() is called with no arguments, so the data location is decided
# inside src.data_loader (presumably via src.config — verify there).
df = load_data()
print(f"Initial Shape: {df.shape}")

# Preprocess
# The cleaned result is bound to a new name; printing both shapes shows how
# many rows/columns cleaning removed.
# NOTE(review): assumes preprocess_data does not mutate `df` in place — confirm.
df_clean = preprocess_data(df)
print(f"Shape after Cleaning: {df_clean.shape}")

# Feature Engineering (without encoding/scaling for EDA visuals)
# We want human-readable categories for plots, not one-hot encoded columns,
# so we pass encode=False, scale=False.
# `df_eda` is the frame every later cell in this notebook reads from.
df_eda = engineer_features(df_clean, encode=False, scale=False)
print(f"Shape after Feature Engineering: {df_eda.shape}")
# `display` is the IPython rich-display helper, available in a notebook kernel
# without an explicit import.
display(df_eda.head())

## 2. Descriptive Statistics

Let's look at the basic statistics of our dataset.

In [None]:
# Name of the fare column; later cells read `target` when grouping and plotting.
target = 'Total Fare (BDT)'

print("--- Data Description ---")
# DataFrame.describe() with default arguments; rendered through rich display.
stats_table = df_eda.describe()
display(stats_table)

### Fare Summary by Group
How does the price vary by Airline, Source, etc.?

In [None]:
# Candidate grouping columns; any that are absent from df_eda are skipped.
groups = ['Airline', 'Source', 'Destination', 'Season', 'Class']

for column in (name for name in groups if name in df_eda.columns):
    print(f"\n--- Fare Summary by {column} ---")
    # describe() statistics of the fare column, one row per category of `column`.
    display(df_eda.groupby(column)[target].describe())

## 3. Visual Analysis

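This section visualizes the distributions of the numerical columns and how the fare relates to airline, class, and month, followed by a correlation heatmap of the numeric features.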

In [None]:
# Distributions of Numerical Data
# Draws one histogram with a KDE overlay per column of interest, side by side
# in a single row.
cols_to_plot = [target, 'Base Fare (BDT)', 'Tax & Surcharge (BDT)', 'Duration (hrs)']
# Columns missing from the frame are skipped instead of raising a KeyError.
cols_present = [c for c in cols_to_plot if c in df_eda.columns]

if cols_present:
    # squeeze=False makes plt.subplots return a 2-D array of Axes for every
    # grid size, so a single-column figure needs no special-casing.
    fig, axes = plt.subplots(
        1,
        len(cols_present),
        figsize=(6 * len(cols_present), 5),  # 6 inches of width per panel
        squeeze=False,
    )

    # axes[0] is the only row of the 1 x N grid.
    for ax, col in zip(axes[0], cols_present):
        sns.histplot(df_eda[col], kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')

    fig.tight_layout()
    plt.show()

In [None]:
# Boxplot: Fare Variation across Airlines
# Drawn only when df_eda has an 'Airline' column.
if 'Airline' in df_eda.columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=df_eda, x='Airline', y=target, ax=ax)
    # Tilt the category labels on the x axis by 45 degrees.
    plt.xticks(rotation=45)
    ax.set_title('Fare Variation across Airlines')
    fig.tight_layout()
    plt.show()

In [None]:
# Boxplot: Fare by Class
# Drawn only when df_eda has a 'Class' column.
if 'Class' in df_eda.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df_eda, x='Class', y=target, ax=ax)
    ax.set_title('Fare Variation by Class')
    plt.show()

In [None]:
# Average Fare by Month
# One bar per value of 'Month'; bar height is the mean of the fare column and
# error bars are turned off.
if 'Month' in df_eda.columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(
        data=df_eda,
        x='Month',
        y=target,
        estimator='mean',
        errorbar=None,
        ax=ax,
    )
    ax.set_title('Average Fare by Month')
    plt.show()

In [None]:
# Correlation Heatmap
# Only numeric columns take part; the plot is skipped when that selection is empty.
numeric_df = df_eda.select_dtypes(include=['number'])
if not numeric_df.empty:
    fig, ax = plt.subplots(figsize=(12, 10))
    corr_matrix = numeric_df.corr()
    # Each cell is annotated with its coefficient, formatted to two decimals.
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
    ax.set_title('Correlation Heatmap')
    fig.tight_layout()
    plt.show()

## 4. KPI Exploration

This section derives route-level KPIs: the routes that appear most often in the data and the routes with the highest average fare.

In [None]:
# Route-level KPIs. A route is a (Source, Destination) pair; both columns must
# be present for either KPI to be computed.
route_keys = ['Source', 'Destination']
if all(key in df_eda.columns for key in route_keys):
    by_route = df_eda.groupby(route_keys)

    # 1. Most popular routes: number of rows per route, largest first.
    popular_routes = by_route.size().sort_values(ascending=False)
    print("\n--- Most Popular Routes ---")
    display(popular_routes.head(5))

    # 2. Most expensive routes: mean of the fare column per route, largest first.
    expensive_routes = by_route[target].mean().sort_values(ascending=False)
    print("\n--- Top 5 Most Expensive Routes ---")
    display(expensive_routes.head(5))