# EDA Comparison: Original vs Processed Flight Data

This notebook provides a comprehensive comparison between the original `flights_sample_3m.csv` dataset and the processed `flights_processed.csv` dataset.

## Objectives:
1. Load and analyze both datasets
2. Compare data quality improvements
3. Analyze feature engineering changes
4. Visualize distributions and relationships
5. Generate summary insights

---


## 1. Setup and Imports


In [None]:
# Import required libraries
import json
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

# Notebook-wide configuration: silence warnings and widen pandas' display limits
warnings.filterwarnings("ignore")
for option_name, option_value in (
    ("display.max_columns", 50),
    ("display.max_rows", 100),
    ("display.float_format", "{:,.3f}".format),
):
    pd.set_option(option_name, option_value)

# Plot appearance: matplotlib style sheet plus seaborn colour palette
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

# Every figure written by the cells below is saved into this directory
Path("eda_comparison_outputs").mkdir(parents=True, exist_ok=True)

# Record the library versions in the notebook output
print(
    "Libraries imported successfully!",
    f"Pandas version: {pd.__version__}",
    f"Numpy version: {np.__version__}",
    sep="\n",
)

## 2. Load Datasets


In [None]:
# Locations of the raw and the processed CSV, relative to the working directory
original_path = Path("./dataset/flights_sample_3m.csv")
processed_path = Path("./dataset/flights_processed.csv")

# Show the absolute locations so a wrong working directory is obvious
print(
    "Dataset paths:",
    f"Original: {original_path.resolve()}",
    f"Processed: {processed_path.resolve()}",
    sep="\n",
)

# Report whether each file is present before any load is attempted
print(
    f"\nOriginal exists: {original_path.exists()}",
    f"Processed exists: {processed_path.exists()}",
    sep="\n",
)

# File sizes (bytes / 1024**2) are only computed for files that exist
if original_path.exists():
    original_size = original_path.stat().st_size / 1024**2
    print(f"Original file size: {original_size:.2f} MB")

if processed_path.exists():
    processed_size = processed_path.stat().st_size / 1024**2
    print(f"Processed file size: {processed_size:.2f} MB")

In [None]:
# Read the original CSV in one call; on MemoryError retry in 100k-row chunks
print("Loading original dataset...")
try:
    df_original = pd.read_csv(original_path)
except MemoryError:
    print("Memory error loading original dataset. Using chunked loading...")
    chunks = list(pd.read_csv(original_path, chunksize=100_000))
    df_original = pd.concat(chunks, ignore_index=True)
    print(f"Original dataset loaded via chunks: {df_original.shape}")
else:
    print(f"Original dataset loaded: {df_original.shape}")

# The processed CSV is read in a single call with no fallback
print("\nLoading processed dataset...")
df_processed = pd.read_csv(processed_path)
print(f"Processed dataset loaded: {df_processed.shape}")

# Side-by-side shape overview of both frames
overview_rule = "=" * 50
print("\n" + overview_rule)
print("DATASET OVERVIEW")
print(overview_rule)
for overview_label, overview_frame in (("Original: ", df_original), ("Processed:", df_processed)):
    print(f"{overview_label} {overview_frame.shape[0]:,} rows × {overview_frame.shape[1]} columns")

## 3. Data Structure Comparison


In [None]:
# Column names as sets: the added/removed/shared split is plain set algebra
original_cols = set(df_original.columns)
processed_cols = set(df_processed.columns)
common_cols = original_cols & processed_cols
removed_cols = original_cols - processed_cols
added_cols = processed_cols - original_cols

print("COLUMN COMPARISON")
print("=" * 50)
print(f"Original columns: {len(original_cols)}")
print(f"Processed columns: {len(processed_cols)}")

print(f"\nCommon columns: {len(common_cols)}")
print(f"Removed columns: {len(removed_cols)}")
print(f"Added columns: {len(added_cols)}")

# Name the columns that differ, alphabetically; empty groups are not printed
for change_label, changed_names in (("Removed", removed_cols), ("Added", added_cols)):
    if changed_names:
        print(f"\n{change_label} columns: {sorted(changed_names)}")

# How many columns of each dtype every dataset holds
print("\n" + "=" * 50)
print("DATA TYPES COMPARISON")
print("=" * 50)

for dataset_label, dataset_frame in (("Original", df_original), ("Processed", df_processed)):
    print(f"\n{dataset_label} dataset dtypes:")
    print(dataset_frame.dtypes.value_counts())

## 4. Missing Values Analysis


In [None]:
# Missing values comparison between the original and the processed dataset
print("MISSING VALUES COMPARISON")
print("=" * 50)

# Per-column missing counts, most affected columns first
missing_original = df_original.isnull().sum().sort_values(ascending=False)
missing_processed = df_processed.isnull().sum().sort_values(ascending=False)

# The same counts as a percentage of each dataset's row count
missing_orig_pct = (missing_original / len(df_original) * 100).round(2)
missing_proc_pct = (missing_processed / len(df_processed) * 100).round(2)

# One row per shared column. Iterating in sorted order keeps the table
# reproducible between runs (set iteration order is not).
missing_comparison = [
    {
        "Column": col,
        "Original_Missing": missing_original[col],
        "Original_Pct": missing_orig_pct[col],
        "Processed_Missing": missing_processed[col],
        "Processed_Pct": missing_proc_pct[col],
        # Positive values mean the processed data has relatively fewer gaps
        "Improvement": missing_orig_pct[col] - missing_proc_pct[col],
    }
    for col in sorted(common_cols)
]

# Explicit column names keep the frame sortable even when the two datasets
# share no columns (an empty record list would otherwise have no columns).
missing_df = pd.DataFrame(
    missing_comparison,
    columns=[
        "Column",
        "Original_Missing",
        "Original_Pct",
        "Processed_Missing",
        "Processed_Pct",
        "Improvement",
    ],
)
# mergesort is stable, so ties stay in alphabetical column order
missing_df = missing_df.sort_values("Improvement", ascending=False, kind="mergesort")

print("Missing values comparison (top 20 columns):")
print(missing_df.head(20).to_string(index=False))

# Dataset-wide totals, reusing the per-column counts instead of rescanning
total_missing_orig = missing_original.sum()
total_missing_proc = missing_processed.sum()

print("\nOVERALL MISSING VALUES SUMMARY:")
print(f"Original dataset:  {total_missing_orig:,} missing values")
print(f"Processed dataset: {total_missing_proc:,} missing values")
print(f"Reduction: {total_missing_orig - total_missing_proc:,} missing values")

## 5. Statistical Summary Comparison


In [None]:
# Descriptive statistics for the numeric columns both datasets have in common
print("STATISTICAL SUMMARY COMPARISON")
print("=" * 50)

# Numeric column names per dataset, then their intersection
numeric_orig = df_original.select_dtypes(include="number").columns
numeric_proc = df_processed.select_dtypes(include="number").columns
common_numeric = list(set(numeric_orig) & set(numeric_proc))

print(
    f"Common numeric columns: {len(common_numeric)}",
    f"Original numeric columns: {len(numeric_orig)}",
    f"Processed numeric columns: {len(numeric_proc)}",
    sep="\n",
)

if common_numeric:
    print(f"\nCommon numeric columns: {sorted(common_numeric)}")

# describe() tables restricted to the shared numeric columns
print("\nOriginal dataset - Numeric summary:")
orig_stats = df_original[common_numeric].describe()
print(orig_stats)

print("\nProcessed dataset - Numeric summary:")
proc_stats = df_processed[common_numeric].describe()
print(proc_stats)

## 6. Distribution Analysis


In [None]:
# Mean / std / median before and after processing for the headline columns
key_columns = ["DEP_DELAY", "ARR_DELAY", "DISTANCE", "AIR_TIME"]
available_key_cols = [col for col in key_columns if col in common_numeric]

if not available_key_cols:
    print("No common key columns found for detailed comparison.")
else:
    print("KEY STATISTICS COMPARISON")
    print("=" * 50)

    stats_comparison = []
    for col in available_key_cols:
        series_before, series_after = df_original[col], df_processed[col]

        # Each statistic contributes three cells: original, processed, change
        summary_row = {"Column": col}
        for stat_name in ("mean", "std", "median"):
            stat_label = stat_name.capitalize()
            value_before = getattr(series_before, stat_name)()
            value_after = getattr(series_after, stat_name)()
            summary_row[f"Orig_{stat_label}"] = value_before
            summary_row[f"Proc_{stat_label}"] = value_after
            summary_row[f"{stat_label}_Change"] = value_after - value_before
        stats_comparison.append(summary_row)

    stats_df = pd.DataFrame(stats_comparison)
    print(stats_df.round(3).to_string(index=False))

In [None]:
# Histogram grid: one row per key column, original on the left, processed on the right
if not available_key_cols:
    print("No key columns available for distribution comparison.")
else:
    n_cols = len(available_key_cols)
    # squeeze=False keeps `axes` two-dimensional even when there is a single row
    fig, axes = plt.subplots(n_cols, 2, figsize=(15, 4 * n_cols), squeeze=False)

    histogram_panels = (
        (df_original, "red", "Original"),
        (df_processed, "green", "Processed"),
    )
    for i, col in enumerate(available_key_cols):
        for panel_index, (panel_frame, panel_color, panel_label) in enumerate(histogram_panels):
            panel_ax = axes[i, panel_index]
            panel_ax.hist(panel_frame[col].dropna(), bins=50, alpha=0.7, color=panel_color, label=panel_label)
            panel_ax.set_title(f"{col} - {panel_label} Distribution")
            panel_ax.set_xlabel(col)
            panel_ax.set_ylabel("Frequency")
            panel_ax.legend()

    plt.tight_layout()
    plt.savefig("eda_comparison_outputs/distribution_comparison.png", dpi=300, bbox_inches="tight")
    plt.show()

## 7. Target Variable Analysis


In [None]:
# Candidate target columns, filtered to those present in the processed dataset
target_candidates = ["any_delay", "delay_15min", "delay_30min", "ARR_DELAY", "DEP_DELAY"]
available_targets = [col for col in target_candidates if col in df_processed.columns]

if not available_targets:
    print("No target variables found in the processed dataset.")
    print("Available columns:", list(df_processed.columns))
else:
    print("TARGET VARIABLE ANALYSIS")
    print("=" * 50)

    for target in available_targets:
        target_series = df_processed[target]
        print(f"\n{target}:")

        if target_series.dtype in ["int64", "float64"]:
            # Numeric target: summary statistics, then histogram and box plot
            print("  - Type: Numeric")
            for stat_label, stat_method in (
                ("Min", target_series.min),
                ("Max", target_series.max),
                ("Mean", target_series.mean),
                ("Std", target_series.std),
                ("Median", target_series.median),
            ):
                print(f"  - {stat_label}: {stat_method():.3f}")

            plt.figure(figsize=(10, 4))

            plt.subplot(1, 2, 1)
            plt.hist(target_series.dropna(), bins=50, alpha=0.7)
            plt.title(f"{target} - Distribution")
            plt.xlabel(target)
            plt.ylabel("Frequency")

            plt.subplot(1, 2, 2)
            plt.boxplot(target_series.dropna())
            plt.title(f"{target} - Box Plot")
            plt.ylabel(target)
        else:
            # Any other dtype: per-value counts and a bar chart
            print("  - Type: Categorical")
            value_counts = target_series.value_counts()
            print("  - Value distribution:")
            for val, count in value_counts.items():
                # Share of all rows in the processed dataset
                pct = count / len(df_processed) * 100
                print(f"    {val}: {count:,} ({pct:.2f}%)")

            plt.figure(figsize=(8, 5))
            value_counts.plot(kind="bar")
            plt.title(f"{target} - Value Distribution")
            plt.xlabel(target)
            plt.ylabel("Count")
            plt.xticks(rotation=45)

        # Both branches finish the same way: one PNG per target, then display
        plt.tight_layout()
        plt.savefig(f"eda_comparison_outputs/target_{target}_analysis.png", dpi=300, bbox_inches="tight")
        plt.show()

## 8. Feature Engineering Analysis


In [None]:
# Describe the columns that exist only in the processed dataset (new features)
# and those that exist only in the original dataset (removed features).
print("FEATURE ENGINEERING ANALYSIS")
print("=" * 50)

if added_cols:
    print(f"New features added ({len(added_cols)}):")

    for col in sorted(added_cols):
        feature = df_processed[col]
        dtype = feature.dtype
        n_unique = feature.nunique()
        missing_count = feature.isnull().sum()

        print(f"\n{col}:")
        print(f"  - Data type: {dtype}")
        print(f"  - Unique values: {n_unique}")
        print(f"  - Missing values: {missing_count}")

        # Only genuinely numeric columns get the ":.3f" min/max/mean/std lines.
        # Booleans are excluded explicitly because pandas counts them as
        # numeric; they, strings, categoricals and datetimes get a top-values
        # listing instead, since their min/max cannot be formatted as floats.
        has_numeric_summary = pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

        if has_numeric_summary:
            print(f"  - Min: {feature.min():.3f}")
            print(f"  - Max: {feature.max():.3f}")
            print(f"  - Mean: {feature.mean():.3f}")
            print(f"  - Std: {feature.std():.3f}")
        else:
            print("  - Top values:")
            top_values = feature.value_counts().head(5)
            for val, count in top_values.items():
                print(f"    {val}: {count}")
else:
    print("No new features were added to the processed dataset.")

# Removed features are profiled on the original dataset, where they still exist
if removed_cols:
    print(f"\n\nRemoved features ({len(removed_cols)}):")

    for col in sorted(removed_cols):
        feature = df_original[col]
        dtype = feature.dtype
        n_unique = feature.nunique()
        missing_count = feature.isnull().sum()

        print(f"\n{col}:")
        print(f"  - Data type: {dtype}")
        print(f"  - Unique values: {n_unique}")
        print(f"  - Missing values: {missing_count}")
        print(f"  - Missing percentage: {missing_count/len(df_original)*100:.2f}%")

## 9. Summary Report


In [None]:
# Print the overall comparison summary, built from values computed by the
# earlier cells (column sets, missing-value totals, available targets).
print("COMPREHENSIVE EDA COMPARISON SUMMARY")
print("=" * 60)

# 1. Shapes and how they changed
print("\n1. DATASET SIZES:")
print(f"   Original:  {df_original.shape[0]:,} rows × {df_original.shape[1]} columns")
print(f"   Processed: {df_processed.shape[0]:,} rows × {df_processed.shape[1]} columns")
print(f"   Row reduction: {df_original.shape[0] - df_processed.shape[0]:,} rows")
print(f"   Column change: {df_processed.shape[1] - df_original.shape[1]} columns")

# 2. Missing cells, absolute and as a share of all cells (rows × columns)
print("\n2. DATA QUALITY IMPROVEMENTS:")
print(f"   Missing values reduction: {total_missing_orig - total_missing_proc:,}")
print(f"   Missing percentage - Original: {total_missing_orig/(df_original.shape[0]*df_original.shape[1])*100:.2f}%")
print(f"   Missing percentage - Processed: {total_missing_proc/(df_processed.shape[0]*df_processed.shape[1])*100:.2f}%")

# 3. Column-level changes
print("\n3. FEATURE CHANGES:")
print(f"   Removed columns: {len(removed_cols)}")
print(f"   Added columns: {len(added_cols)}")
print(f"   Common columns: {len(common_cols)}")

# 4. Dtype changes among the shared columns.
# A pre-built `comparison_df` is honoured when one exists; otherwise the
# changes are derived directly from the two frames, so the count is no longer
# a constant 0 when that table is missing.
print("\n4. DATA TYPE CHANGES:")
if "comparison_df" in locals():
    type_changes = comparison_df[comparison_df["Type_Changed"] == "Yes"]
else:
    type_changes = pd.DataFrame(
        [
            {
                "Column": col,
                "Original_Type": str(df_original[col].dtype),
                "Processed_Type": str(df_processed[col].dtype),
            }
            for col in sorted(common_cols)
            if df_original[col].dtype != df_processed[col].dtype
        ],
        columns=["Column", "Original_Type", "Processed_Type"],
    )
print(f"   Columns with type changes: {len(type_changes)}")

# 5. Positive rate of the delay-flag targets (the mean of the column)
if available_targets:
    print("\n5. TARGET VARIABLES:")
    for target in available_targets:
        if target in ["any_delay", "delay_15min", "delay_30min"]:
            positive_rate = df_processed[target].mean()
            print(f"   {target}: {positive_rate:.3f} positive rate")

# 6. In-memory footprint; deep=True also measures object (string) payloads
orig_memory = df_original.memory_usage(deep=True).sum() / (1024**2)
proc_memory = df_processed.memory_usage(deep=True).sum() / (1024**2)
print("\n6. MEMORY USAGE:")
print(f"   Original: {orig_memory:.2f} MB")
print(f"   Processed: {proc_memory:.2f} MB")
print(f"   Memory reduction: {orig_memory - proc_memory:.2f} MB ({(orig_memory - proc_memory)/orig_memory*100:.1f}%)")

print("\n\nSummary report and visualizations saved to: eda_comparison_outputs/")
print("=" * 60)
print("EDA COMPARISON COMPLETED SUCCESSFULLY!")
print("=" * 60)

## 10. Original Dataset Deep Dive - Exploratory Charts


In [None]:
# Original dataset: sample it, then draw a 2x2 overview of missingness and dtypes
print("ORIGINAL DATASET EXPLORATORY ANALYSIS")
print("=" * 60)

# At most 100,000 rows are plotted; the fixed seed makes the sample repeatable
sample_size = min(100000, len(df_original))
df_orig_sample = df_original.sample(n=sample_size, random_state=42)
print(f"Using sample of {sample_size:,} rows for visualization")

# Quantities shown in the four panels
missing_data = df_orig_sample.isnull()
missing_counts = missing_data.sum().sort_values(ascending=True)
missing_pct = (missing_counts / len(df_orig_sample) * 100).round(2)
dtype_counts = df_orig_sample.dtypes.value_counts()

plt.figure(figsize=(15, 8))

# Panel 1: cell-level missingness, one heatmap row per column
pattern_ax = plt.subplot(2, 2, 1)
sns.heatmap(missing_data.T, cbar=True, yticklabels=True, cmap="viridis", ax=pattern_ax)
pattern_ax.set_title("Missing Values Pattern (Sample)")
pattern_ax.set_xlabel("Row Index (Sample)")
pattern_ax.set_ylabel("Columns")

# Panel 2: missing count per column
count_ax = plt.subplot(2, 2, 2)
missing_counts.plot(kind="barh", color="coral", ax=count_ax)
count_ax.set_title("Missing Values Count by Column")
count_ax.set_xlabel("Missing Count")

# Panel 3: the same counts as a percentage of the sample
pct_ax = plt.subplot(2, 2, 3)
missing_pct.plot(kind="barh", color="lightcoral", ax=pct_ax)
pct_ax.set_title("Missing Values Percentage by Column")
pct_ax.set_xlabel("Missing Percentage (%)")

# Panel 4: share of columns per dtype
dtype_ax = plt.subplot(2, 2, 4)
dtype_ax.pie(dtype_counts.values, labels=dtype_counts.index, autopct="%1.1f%%")
dtype_ax.set_title("Data Types Distribution")

plt.tight_layout()
plt.savefig("eda_comparison_outputs/original_dataset_overview.png", dpi=300, bbox_inches="tight")
plt.show()

print("Missing values analysis completed and saved.")

In [None]:
# 2. Key numeric variables: trimmed histograms, then box plots with outliers
print("KEY NUMERIC VARIABLES ANALYSIS")
print("=" * 40)

# Keep only the key columns that the sample actually contains
key_numeric_cols = ["DEP_DELAY", "ARR_DELAY", "DISTANCE", "AIR_TIME", "CRS_ELAPSED_TIME", "ELAPSED_TIME"]
available_numeric = [col for col in key_numeric_cols if col in df_orig_sample.columns]

if available_numeric:
    # The 2x3 grid has room for six columns at most
    plotted_numeric = available_numeric[:6]

    # Figure 1: histograms restricted to the 1.5*IQR fences
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.flatten()

    for hist_ax, col in zip(axes, plotted_numeric):
        data = df_orig_sample[col].dropna()
        Q1, Q3 = data.quantile(0.25), data.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
        # between() is inclusive on both ends
        data_clean = data[data.between(lower_bound, upper_bound)]

        hist_ax.hist(data_clean, bins=50, alpha=0.7, color="skyblue", edgecolor="black")
        hist_ax.set_title(f"{col} Distribution (Outliers Removed)")
        hist_ax.set_xlabel(col)
        hist_ax.set_ylabel("Frequency")

        # The text box reports statistics of the untrimmed data
        stats_text = f"Mean: {data.mean():.1f}\nMedian: {data.median():.1f}\nStd: {data.std():.1f}"
        hist_ax.text(
            0.7,
            0.8,
            stats_text,
            transform=hist_ax.transAxes,
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
        )

    plt.tight_layout()
    plt.savefig("eda_comparison_outputs/original_numeric_analysis.png", dpi=300, bbox_inches="tight")
    plt.show()

    # Figure 2: box plots of the same columns with nothing trimmed
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.flatten()

    for box_ax, col in zip(axes, plotted_numeric):
        df_orig_sample.boxplot(column=col, ax=box_ax)
        box_ax.set_title(f"{col} - Box Plot (Outliers Shown)")
        box_ax.tick_params(axis="x", rotation=45)

    plt.tight_layout()
    plt.savefig("eda_comparison_outputs/original_boxplots.png", dpi=300, bbox_inches="tight")
    plt.show()

print("Numeric variables analysis completed and saved.")

In [None]:
# 3. Categorical variables: top-15 horizontal bar chart per column
print("CATEGORICAL VARIABLES ANALYSIS")
print("=" * 40)

# Keep only the key categorical columns that the sample actually contains
key_cat_cols = ["AIRLINE", "AIRLINE_CODE", "ORIGIN", "DEST", "ORIGIN_CITY", "DEST_CITY"]
available_cat = [col for col in key_cat_cols if col in df_orig_sample.columns]

if available_cat:
    # Up to three charts per row; ceiling division gives the row count
    n_cols = min(3, len(available_cat))
    n_rows = (len(available_cat) + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D grid, so one flatten covers every layout
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
    axes = axes.flatten()

    for cat_ax, col in zip(axes, available_cat):
        category_counts = df_orig_sample[col].value_counts()
        top_categories = category_counts.head(15)
        bar_positions = range(len(top_categories))

        cat_ax.barh(bar_positions, top_categories.values, color="lightgreen")
        cat_ax.set_yticks(bar_positions)
        cat_ax.set_yticklabels(top_categories.index)
        cat_ax.set_title(f"Top 15 {col} Values")
        cat_ax.set_xlabel("Count")
        cat_ax.invert_yaxis()

        # Label each bar with its share of all non-missing values in the column
        total = category_counts.sum()
        for j, v in enumerate(top_categories.values):
            pct = (v / total) * 100
            cat_ax.text(v + max(top_categories.values) * 0.01, j, f"{pct:.1f}%", va="center", fontsize=8)

    # Grid cells without a column stay blank
    for unused_ax in axes[len(available_cat):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.savefig("eda_comparison_outputs/original_categorical_analysis.png", dpi=300, bbox_inches="tight")
    plt.show()

print("Categorical variables analysis completed and saved.")

In [None]:
# 4. Route analysis: busiest routes and airports plus the distance distribution
print("ROUTE ANALYSIS AND FLIGHT PATTERNS")
print("=" * 40)

if "ORIGIN" in df_orig_sample.columns and "DEST" in df_orig_sample.columns:
    # Flights per (origin, destination) pair, 20 busiest pairs kept
    routes = df_orig_sample.groupby(["ORIGIN", "DEST"]).size().reset_index(name="count")
    routes = routes.sort_values("count", ascending=False).head(20)
    route_labels = [f"{origin}-{dest}" for origin, dest in zip(routes["ORIGIN"], routes["DEST"])]

    # 15 busiest airports on each side of a flight
    origins = df_orig_sample["ORIGIN"].value_counts().head(15)
    dests = df_orig_sample["DEST"].value_counts().head(15)

    plt.figure(figsize=(15, 10))

    # Panel 1: top routes, busiest at the top
    route_positions = range(len(routes))
    plt.subplot(2, 2, 1)
    plt.barh(route_positions, routes["count"].values, color="orange")
    plt.yticks(route_positions, route_labels)
    plt.title("Top 20 Routes by Flight Count")
    plt.xlabel("Number of Flights")
    plt.gca().invert_yaxis()

    # Panels 2 and 3: origin and destination airports share one layout
    airport_panels = (
        (2, origins, "lightblue", "Top 15 Origin Airports"),
        (3, dests, "lightcoral", "Top 15 Destination Airports"),
    )
    for panel_number, airport_counts, bar_color, panel_title in airport_panels:
        plt.subplot(2, 2, panel_number)
        airport_counts.plot(kind="bar", color=bar_color)
        plt.title(panel_title)
        plt.xlabel("Airport Code")
        plt.ylabel("Number of Flights")
        plt.xticks(rotation=45)

    # Panel 4: distance histogram, drawn only when the column exists
    if "DISTANCE" in df_orig_sample.columns:
        distances = df_orig_sample["DISTANCE"]
        plt.subplot(2, 2, 4)
        distances.hist(bins=50, color="lightgreen", alpha=0.7)
        plt.title("Flight Distance Distribution")
        plt.xlabel("Distance (miles)")
        plt.ylabel("Frequency")

        # Dashed reference lines at the mean and the median
        mean_dist = distances.mean()
        median_dist = distances.median()
        plt.axvline(mean_dist, color="red", linestyle="--", label=f"Mean: {mean_dist:.0f}")
        plt.axvline(median_dist, color="blue", linestyle="--", label=f"Median: {median_dist:.0f}")
        plt.legend()

    plt.tight_layout()
    plt.savefig("eda_comparison_outputs/original_route_analysis.png", dpi=300, bbox_inches="tight")
    plt.show()

print("Route analysis completed and saved.")

In [None]:
# 5. Delay analysis: a 2x3 figure of delay distributions and breakdowns.
# NOTE: this cell adds "distance_bin" and "month" columns to df_orig_sample and
# converts its FL_DATE column to datetime, all in place.
print("DELAY ANALYSIS AND PATTERNS")
print("=" * 40)

# Delay columns that the sample actually contains
delay_cols = ["DEP_DELAY", "ARR_DELAY"]
available_delays = [col for col in delay_cols if col in df_orig_sample.columns]

if available_delays:
    # Panels 4-6 all break down the first available delay column
    primary_delay = available_delays[0]

    plt.figure(figsize=(20, 12))

    # Panels 1-2: one histogram per delay column, split by sign of the delay
    for i, col in enumerate(available_delays):
        plt.subplot(2, 3, i + 1)
        data = df_orig_sample[col].dropna()

        neg_data = data[data < 0]
        zero_data = data[data == 0]
        pos_data = data[data > 0]

        plt.hist(neg_data, bins=30, alpha=0.7, color="red", label=f"Early ({len(neg_data):,})")
        plt.hist(zero_data, bins=1, alpha=0.7, color="green", label=f"On Time ({len(zero_data):,})")
        plt.hist(pos_data, bins=50, alpha=0.7, color="orange", label=f"Delayed ({len(pos_data):,})")

        plt.title(f"{col} Distribution")
        plt.xlabel("Delay (minutes)")
        plt.ylabel("Frequency")
        plt.legend()
        plt.yscale("log")  # Log scale due to large differences

    # Panel 3: departure vs arrival delay, only when both columns exist
    if len(available_delays) == 2:
        plt.subplot(2, 3, 3)
        delay_data = df_orig_sample[available_delays].dropna()
        plt.scatter(delay_data[available_delays[0]], delay_data[available_delays[1]], alpha=0.3, s=1)
        plt.xlabel(available_delays[0])
        plt.ylabel(available_delays[1])
        plt.title(f"{available_delays[0]} vs {available_delays[1]}")

        # Annotate the scatter with the correlation of the two delay columns
        corr = delay_data[available_delays[0]].corr(delay_data[available_delays[1]])
        plt.text(
            0.05,
            0.95,
            f"Correlation: {corr:.3f}",
            transform=plt.gca().transAxes,
            bbox=dict(boxstyle="round", facecolor="white"),
        )

    # Panel 4: mean delay per airline, limited to airlines with 100+ sampled flights
    if "AIRLINE" in df_orig_sample.columns:
        plt.subplot(2, 3, 4)
        delay_by_airline = df_orig_sample.groupby("AIRLINE")[primary_delay].agg(["mean", "count"]).reset_index()
        delay_by_airline = delay_by_airline[delay_by_airline["count"] >= 100]
        delay_by_airline = delay_by_airline.sort_values("mean", ascending=False).head(15)

        plt.barh(range(len(delay_by_airline)), delay_by_airline["mean"].values, color="purple")
        plt.yticks(range(len(delay_by_airline)), delay_by_airline["AIRLINE"])
        plt.title(f"Average {primary_delay} by Airline")
        plt.xlabel("Average Delay (minutes)")
        plt.gca().invert_yaxis()

    # Panel 5: mean delay per distance band
    if "DISTANCE" in df_orig_sample.columns:
        plt.subplot(2, 3, 5)
        # The last bin is open-ended so it matches its ">2000" label; a finite
        # upper edge would leave longer flights outside every bin.
        df_orig_sample["distance_bin"] = pd.cut(
            df_orig_sample["DISTANCE"],
            bins=[0, 500, 1000, 2000, np.inf],
            labels=["Short (<500)", "Medium (500-1000)", "Long (1000-2000)", "Very Long (>2000)"],
        )

        # observed=False keeps all four bands on the axis even if one is empty
        delay_by_distance = df_orig_sample.groupby("distance_bin", observed=False)[primary_delay].mean()
        delay_by_distance.plot(kind="bar", color="teal")
        plt.title(f"Average {primary_delay} by Distance")
        plt.xlabel("Distance Category")
        plt.ylabel("Average Delay (minutes)")
        plt.xticks(rotation=45)

    # Panel 6: mean delay per calendar month; unparseable dates become NaT
    if "FL_DATE" in df_orig_sample.columns:
        plt.subplot(2, 3, 6)
        df_orig_sample["FL_DATE"] = pd.to_datetime(df_orig_sample["FL_DATE"], errors="coerce")
        df_orig_sample["month"] = df_orig_sample["FL_DATE"].dt.month

        delay_by_month = df_orig_sample.groupby("month")[primary_delay].mean()
        delay_by_month.plot(kind="line", marker="o", color="darkred")
        plt.title(f"Average {primary_delay} by Month")
        plt.xlabel("Month")
        plt.ylabel("Average Delay (minutes)")
        plt.xticks(range(1, 13))
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig("eda_comparison_outputs/original_delay_analysis.png", dpi=300, bbox_inches="tight")
    plt.show()

print("Delay analysis completed and saved.")

In [None]:
# 6. Correlation analysis: a 2x2 figure built from the sample's numeric columns
print("CORRELATION ANALYSIS")
print("=" * 40)

# Every numeric column currently present in the sample
numeric_cols = df_orig_sample.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols) > 2:
    corr_matrix = df_orig_sample[numeric_cols].corr()

    plt.figure(figsize=(15, 12))

    # Panel 1: the full correlation matrix
    plt.subplot(2, 2, 1)
    sns.heatmap(corr_matrix, annot=False, cmap="RdBu_r", center=0, square=True, cbar_kws={"shrink": 0.8})
    plt.title("Full Correlation Matrix")

    # Panel 2: only the strong pairs (|r| > 0.7)
    plt.subplot(2, 2, 2)
    # Mask the diagonal and the upper triangle so each pair appears once
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    corr_masked = corr_matrix.mask(mask)

    # Boolean indexing keeps the full shape and fills weak cells with NaN, so
    # rows/columns without any strong pair are dropped; the frame is then
    # genuinely empty when nothing exceeds the threshold.
    high_corr = corr_masked[corr_masked.abs() > 0.7]
    high_corr = high_corr.dropna(how="all").dropna(axis=1, how="all")
    if not high_corr.empty:
        sns.heatmap(high_corr, annot=True, cmap="RdBu_r", center=0, square=True, cbar_kws={"shrink": 0.8}, fmt=".2f")
    else:
        plt.text(0.5, 0.5, "No high correlations found", ha="center", va="center", transform=plt.gca().transAxes)
    plt.title("High Correlations (|r| > 0.7)")

    # Panel 3: the ten strongest pairs by absolute correlation
    plt.subplot(2, 2, 3)
    matrix_columns = corr_matrix.columns
    corr_pairs = [
        {
            "var1": matrix_columns[i],
            "var2": matrix_columns[j],
            "correlation": abs(corr_matrix.iloc[i, j]),
        }
        for i in range(len(matrix_columns))
        for j in range(i + 1, len(matrix_columns))
    ]

    corr_df = pd.DataFrame(corr_pairs)
    top_correlations = corr_df.nlargest(10, "correlation")

    y_pos = np.arange(len(top_correlations))
    plt.barh(y_pos, top_correlations["correlation"].values, color="lightblue")
    plt.yticks(y_pos, [f"{var1} - {var2}" for var1, var2 in zip(top_correlations["var1"], top_correlations["var2"])])
    plt.xlabel("Absolute Correlation")
    plt.title("Top 10 Variable Correlations")
    plt.gca().invert_yaxis()

    # Panel 4: variables most correlated with either delay column
    plt.subplot(2, 2, 4)
    target_cols = [col for col in ["DEP_DELAY", "ARR_DELAY"] if col in numeric_cols]
    if target_cols:
        # Drop the delay rows themselves, then keep each variable's strongest
        # absolute correlation with any delay column.
        target_corr = corr_matrix[target_cols].drop(target_cols, axis=0)
        # Descending order plus the inverted axis puts the strongest variable
        # at the top, matching the "Top 10" panel.
        target_corr = target_corr.abs().max(axis=1).sort_values(ascending=False).head(15)

        plt.barh(range(len(target_corr)), target_corr.values, color="lightcoral")
        plt.yticks(range(len(target_corr)), target_corr.index)
        plt.xlabel("Max Absolute Correlation with Delay Variables")
        plt.title("Variables Most Correlated with Delays")
        plt.gca().invert_yaxis()
    else:
        plt.text(0.5, 0.5, "No delay variables found", ha="center", va="center", transform=plt.gca().transAxes)
        plt.title("Correlation with Target Variables")

    plt.tight_layout()
    plt.savefig("eda_comparison_outputs/original_correlation_analysis.png", dpi=300, bbox_inches="tight")
    plt.show()

print("Correlation analysis completed and saved.")

## 11. Original Dataset Summary Statistics


In [None]:
# Generate comprehensive statistics summary for original dataset
# Everything in this part of the cell is printed from the full df_original,
# not from the plotting sample.
print("ORIGINAL DATASET SUMMARY STATISTICS")
print("=" * 60)

# Basic dataset info
print(f"Dataset Shape: {df_original.shape[0]:,} rows × {df_original.shape[1]} columns")
# deep=True also measures the payload of object (string) columns
print(f"Memory Usage: {df_original.memory_usage(deep=True).sum() / (1024**2):.2f} MB")
# The conditional expression picks the whole message: the date range when an
# FL_DATE column exists, otherwise a fixed fallback text.
# NOTE(review): min()/max() run on FL_DATE as loaded; presumably ISO-formatted
# strings so that the ordering is chronological - confirm.
print(
    f"Date Range: {df_original['FL_DATE'].min()} to {df_original['FL_DATE'].max()}"
    if "FL_DATE" in df_original.columns
    else "No date column found"
)

# Data quality summary
print(f"\nDATA QUALITY SUMMARY:")
print(f"Total Missing Values: {df_original.isnull().sum().sum():,}")
print(f"Columns with Missing Values: {(df_original.isnull().sum() > 0).sum()}")
# duplicated() flags rows that repeat an earlier row in full
print(f"Duplicate Rows: {df_original.duplicated().sum():,}")

# Numeric columns summary
# Rebinds numeric_cols to the numeric column Index of the full dataset
numeric_cols = df_original.select_dtypes(include=[np.number]).columns
print(f"\nNUMERIC COLUMNS ({len(numeric_cols)}):")
for col in numeric_cols:
    # describe() supplies min, max, mean and the 50% quantile (the median)
    stats = df_original[col].describe()
    missing_pct = (df_original[col].isnull().sum() / len(df_original)) * 100
    print(f"  {col}:")
    print(f"    Range: {stats['min']:.1f} to {stats['max']:.1f}")
    print(f"    Mean: {stats['mean']:.1f}, Median: {stats['50%']:.1f}")
    print(f"    Missing: {missing_pct:.1f}%")

# Categorical columns summary
categorical_cols = df_original.select_dtypes(include=["object", "category"]).columns
print(f"\nCATEGORICAL COLUMNS ({len(categorical_cols)}):")
for col in categorical_cols:
    unique_count = df_original[col].nunique()
    missing_pct = (df_original[col].isnull().sum() / len(df_original)) * 100
    print(f"  {col}: {unique_count:,} unique values, {missing_pct:.1f}% missing")

# Flight patterns summary
# NOTE(review): only AIRLINE is checked here, but ORIGIN and DEST are read as
# well; a dataset with AIRLINE but without those columns would raise KeyError.
if "AIRLINE" in df_original.columns:
    print(f"\nFLIGHT PATTERNS:")
    print(f"  Airlines: {df_original['AIRLINE'].nunique():,}")
    print(f"  Origin Airports: {df_original['ORIGIN'].nunique():,}")
    print(f"  Destination Airports: {df_original['DEST'].nunique():,}")
    # Number of distinct (ORIGIN, DEST) pairs
    print(f"  Unique Routes: {df_original.groupby(['ORIGIN', 'DEST']).size().shape[0]:,}")

# Delay summary
# "On-time" means delay <= 0 and "delayed" means delay > 0. Percentages are
# taken over all rows; a missing delay satisfies neither comparison, so the
# two shares need not add up to 100%.
if "DEP_DELAY" in df_original.columns:
    dep_delay_stats = df_original["DEP_DELAY"].describe()
    print(f"\nDEPARTURE DELAY SUMMARY:")
    print(f"  On-time flights: {(df_original['DEP_DELAY'] <= 0).sum():,} ({(df_original['DEP_DELAY'] <= 0).mean()*100:.1f}%)")
    print(f"  Delayed flights: {(df_original['DEP_DELAY'] > 0).sum():,} ({(df_original['DEP_DELAY'] > 0).mean()*100:.1f}%)")
    print(f"  Average delay: {dep_delay_stats['mean']:.1f} minutes")
    print(f"  Median delay: {dep_delay_stats['50%']:.1f} minutes")

# Same breakdown for arrival delays
if "ARR_DELAY" in df_original.columns:
    arr_delay_stats = df_original["ARR_DELAY"].describe()
    print(f"\nARRIVAL DELAY SUMMARY:")
    print(f"  On-time arrivals: {(df_original['ARR_DELAY'] <= 0).sum():,} ({(df_original['ARR_DELAY'] <= 0).mean()*100:.1f}%)")
    print(f"  Delayed arrivals: {(df_original['ARR_DELAY'] > 0).sum():,} ({(df_original['ARR_DELAY'] > 0).mean()*100:.1f}%)")
    print(f"  Average delay: {arr_delay_stats['mean']:.1f} minutes")
    print(f"  Median delay: {arr_delay_stats['50%']:.1f} minutes")

# Save summary statistics to file.
# Builds a JSON-serializable dict mirroring the printed report and writes it
# to eda_comparison_outputs/original_dataset_summary.json.


def _json_number(value):
    """Return ``value`` as a float, or None when it is missing or non-finite.

    ``json.dump`` would otherwise emit bare ``NaN``/``Infinity`` tokens, which
    are not valid JSON; None is written as ``null`` instead.
    """
    if pd.isna(value):
        return None
    number = float(value)
    return number if np.isfinite(number) else None


def _numeric_column_summary(series, missing_pct):
    """Return count, central tendency, spread, and range for one numeric column."""
    return {
        "count": int(series.count()),
        "mean": _json_number(series.mean()),
        "median": _json_number(series.median()),
        "std": _json_number(series.std()),
        "min": _json_number(series.min()),
        "max": _json_number(series.max()),
        "missing_pct": _json_number(missing_pct),
    }


def _delay_summary(delays):
    """Return on-time (<= 0), delayed (> 0), and missing breakdown for a delay column."""
    on_time = delays <= 0
    delayed = delays > 0
    # Missing values satisfy neither comparison; record them explicitly so
    # the percentages account for every row.
    missing = delays.isnull()
    return {
        "on_time_count": int(on_time.sum()),
        "on_time_pct": _json_number(on_time.mean() * 100),
        "delayed_count": int(delayed.sum()),
        "delayed_pct": _json_number(delayed.mean() * 100),
        "missing_count": int(missing.sum()),
        "missing_pct": _json_number(missing.mean() * 100),
        "mean_delay": _json_number(delays.mean()),
        "median_delay": _json_number(delays.median()),
    }


summary_row_count = len(df_original)
# One pass over the frame for null counts; reused for totals and per-column shares.
summary_null_counts = df_original.isnull().sum()
summary_has_date = "FL_DATE" in df_original.columns

summary_stats = {
    "dataset_info": {
        "shape": list(df_original.shape),
        "memory_mb": float(df_original.memory_usage(deep=True).sum() / (1024**2)),
        "date_range": {
            "start": str(df_original["FL_DATE"].min()) if summary_has_date else None,
            "end": str(df_original["FL_DATE"].max()) if summary_has_date else None,
        },
    },
    "data_quality": {
        "total_missing": int(summary_null_counts.sum()),
        "columns_with_missing": int((summary_null_counts > 0).sum()),
        "duplicate_rows": int(df_original.duplicated().sum()),
    },
    "numeric_columns": {
        col: _numeric_column_summary(
            df_original[col],
            (summary_null_counts[col] / summary_row_count) * 100,
        )
        for col in numeric_cols
    },
    "categorical_columns": {
        col: {
            "unique_count": int(df_original[col].nunique()),
            "missing_pct": _json_number((summary_null_counts[col] / summary_row_count) * 100),
        }
        for col in categorical_cols
    },
}

# Add flight patterns if available.
# Each entry is guarded by its own column so a missing ORIGIN/DEST column
# does not raise KeyError when AIRLINE is present.
flight_patterns = {}
if "AIRLINE" in df_original.columns:
    flight_patterns["airlines"] = int(df_original["AIRLINE"].nunique())
if "ORIGIN" in df_original.columns:
    flight_patterns["origin_airports"] = int(df_original["ORIGIN"].nunique())
if "DEST" in df_original.columns:
    flight_patterns["destination_airports"] = int(df_original["DEST"].nunique())
if "ORIGIN" in df_original.columns and "DEST" in df_original.columns:
    # observed=True: with categorical keys, count only pairs that occur in
    # the data rather than every possible category combination.
    flight_patterns["unique_routes"] = int(
        df_original.groupby(["ORIGIN", "DEST"], observed=True).size().shape[0]
    )
if flight_patterns:
    summary_stats["flight_patterns"] = flight_patterns

# Add delay statistics if available
if "DEP_DELAY" in df_original.columns:
    summary_stats["departure_delay"] = _delay_summary(df_original["DEP_DELAY"])

if "ARR_DELAY" in df_original.columns:
    summary_stats["arrival_delay"] = _delay_summary(df_original["ARR_DELAY"])

# Save to JSON file
with open("eda_comparison_outputs/original_dataset_summary.json", "w", encoding="utf-8") as f:
    json.dump(summary_stats, f, indent=2)

print("\n\nOriginal dataset summary statistics saved to: eda_comparison_outputs/original_dataset_summary.json")
print("=" * 60)