# In [None]:
# ============================================================
# PART 1 — IMPORTS, SETUP, UTILITIES, LOADING, CLEANING
# ============================================================

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

from scipy.stats import ttest_ind, f_oneway, pearsonr

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Optional XGBoost
try:
    from xgboost import XGBClassifier
    xgb_available = True
except ImportError:
    xgb_available = False

from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve

# ------------------------------------------------------------
# GLOBAL SETTINGS
# ------------------------------------------------------------
# Apply seaborn's "whitegrid" style to every plot created after this point.
sns.set(style="whitegrid")
# Default figure size; most plotting functions below override it with an
# explicit figsize= when they create their figure.
plt.rcParams["figure.figsize"] = (8, 5)

# Absolute path of the input CSV used by the script entry point.
# NOTE(review): this path is machine-specific — adjust it before running elsewhere.
CSV_FILE_PATH = "/Users/ramjeetdixit/Desktop/Customer_Satisfaction_Cleaned_Full.csv"

# Directory that receives every generated CSV report.
# `__file__` is not defined when this code runs inside a notebook or an
# interactive session, so fall back to the current working directory there
# instead of failing with a NameError.
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path.cwd()
OUTPUT_DIR = BASE_DIR / "outputs"
OUTPUT_DIR.mkdir(exist_ok=True)


# ------------------------------------------------------------
# HELPER: SECTION PRINTING
# ------------------------------------------------------------
def section(title: str):
    """Print ``title`` in upper case, framed by two 60-character ``=`` rules.

    A blank line is emitted before the top rule and after the bottom rule so
    consecutive sections stay visually separated in the console output.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(title.upper())
    print(f"{rule}\n")


# ------------------------------------------------------------
# DATA LOADING
# ------------------------------------------------------------
def load_data(csv_path: str) -> pd.DataFrame:
    """Read the CSV at ``csv_path`` into a DataFrame and report its shape.

    Raises:
        FileNotFoundError: if nothing exists at ``csv_path``.
    """
    section("Loading Data")

    source = Path(csv_path)
    if source.exists():
        frame = pd.read_csv(source)
        print(f"Data loaded successfully with shape: {frame.shape}")
        return frame

    raise FileNotFoundError(f"CSV file not found at {source}")


# ------------------------------------------------------------
# BASIC CLEANING UTILITIES
# ------------------------------------------------------------
def convert_blank_to_nan(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` where empty or whitespace-only strings are NaN."""
    section("Converting blanks to NaN")
    # Matches cells consisting of zero or more whitespace characters only.
    blank_pattern = r"^\s*$"
    return df.replace(to_replace=blank_pattern, value=np.nan, regex=True)


def convert_to_numeric(df: pd.DataFrame, numeric_cols: list) -> pd.DataFrame:
    """Coerce each column in ``numeric_cols`` to a numeric dtype.

    Values that cannot be parsed become NaN. ``df`` is modified in place and
    also returned for chaining.
    """
    section("Converting selected columns to numeric")
    for column in numeric_cols:
        coerced = pd.to_numeric(df[column], errors="coerce")
        df[column] = coerced
    return df


def fill_missing_with_median(df: pd.DataFrame, numeric_cols: list) -> pd.DataFrame:
    """Replace NaN in each listed column with that column's median.

    ``df`` is modified in place and also returned for chaining.
    """
    section("Median imputation for numeric columns")
    for column in numeric_cols:
        values = df[column]
        df[column] = values.fillna(values.median())
    return df


def clip_outliers(df: pd.DataFrame, numeric_cols: list, lower_q: float = 0.01, upper_q: float = 0.99) -> pd.DataFrame:
    """Clip each listed column to its ``[lower_q, upper_q]`` quantile range.

    The quantiles are computed per column from the data itself. ``df`` is
    modified in place and also returned for chaining.
    """
    section("Outlier clipping")
    for column in numeric_cols:
        values = df[column]
        floor = values.quantile(lower_q)
        ceiling = values.quantile(upper_q)
        df[column] = values.clip(lower=floor, upper=ceiling)
    return df


# ------------------------------------------------------------
# MAIN CLEANING PIPELINE
# ------------------------------------------------------------
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Run the basic cleaning steps and return a cleaned copy of ``df``.

    Steps, in order: drop rows that are entirely empty, turn blank strings
    into NaN, coerce the expected numeric columns, fill their gaps with the
    median and clip them to the default quantile range. Expected columns that
    are absent from ``df`` are skipped. The caller's DataFrame is not modified.
    """
    section("Cleaning Data")

    # dropna returns a new frame, so the input is never touched.
    cleaned = df.dropna(how="all")
    cleaned = convert_blank_to_nan(cleaned)

    expected_numeric = (
        "Age",
        "Flight Distance",
        "Departure Delay in Minutes",
        "Arrival Delay in Minutes",
        "Seat comfort",
        "Food and drink",
        "Inflight wifi service",
        "Cleanliness",
    )
    present = [name for name in expected_numeric if name in cleaned.columns]

    # Each step takes (frame, columns) and hands back the frame it worked on.
    for step in (convert_to_numeric, fill_missing_with_median, clip_outliers):
        cleaned = step(cleaned, present)

    print("Cleaning complete. Shape after cleaning:", cleaned.shape)
    return cleaned

# ============================================================
# PART 2 — DESCRIPTIVE STATISTICS & STATISTICAL TESTS
# ============================================================

def descriptive_statistics(df: pd.DataFrame):
    """Print structural info and summary statistics for ``df`` and save the summary.

    The summary (``describe(include="all")``) is written to
    ``summary_statistics.csv`` inside ``OUTPUT_DIR``.
    """
    section("Descriptive Statistics")

    print("Basic Info:")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would add a stray "None" line.
    df.info()

    # Compute the summary once and reuse it for both the console and the file.
    summary = df.describe(include="all")

    print("\nSummary Statistics:")
    print(summary)

    summary.to_csv(OUTPUT_DIR / "summary_statistics.csv")


# ------------------------------------------------------------
# GENDER DISTRIBUTION
# ------------------------------------------------------------
def analyze_gender_distribution(df: pd.DataFrame):
    """Print the "Gender" value counts and show them as a count plot and a pie chart."""
    section("Gender Distribution")

    counts = df["Gender"].value_counts()
    print(counts)

    # Bar view of the raw counts.
    plt.figure(figsize=(6, 4))
    bar_ax = sns.countplot(data=df, x="Gender", palette="Set2")
    bar_ax.set_title("Gender Distribution")
    plt.tight_layout()
    plt.show()

    # Pie view with percentage labels.
    plt.figure(figsize=(6, 6))
    pie_ax = counts.plot.pie(autopct="%1.1f%%", colors=sns.color_palette("Set2"))
    pie_ax.set_title("Gender Distribution (Pie Chart)")
    pie_ax.set_ylabel("")
    plt.tight_layout()
    plt.show()


# ------------------------------------------------------------
# AGE DISTRIBUTION
# ------------------------------------------------------------
def analyze_age_distribution(df: pd.DataFrame):
    """Plot the "Age" histogram and the number of passengers per age bracket.

    The age brackets are computed into a local Series; ``df`` itself is left
    unchanged. Writing a categorical ``Age_Group`` column into the caller's
    frame would carry string labels such as "19-30" into the later
    imputation step, which only accepts numeric data.
    """
    section("Age Distribution")

    plt.figure(figsize=(8, 5))
    sns.histplot(df["Age"], kde=True, bins=30, color="skyblue")
    plt.title("Age Distribution")
    plt.tight_layout()
    plt.show()

    # Age brackets (right-inclusive: 18 falls in "0-18", 19 in "19-30", ...).
    age_groups = pd.cut(
        df["Age"],
        bins=[0, 18, 30, 45, 60, 100],
        labels=["0-18", "19-30", "31-45", "46-60", "60+"]
    )

    age_group_counts = age_groups.value_counts().sort_index()
    print("\nAge Group Counts:")
    print(age_group_counts)

    plt.figure(figsize=(7, 4))
    sns.barplot(x=age_group_counts.index, y=age_group_counts.values, palette="Blues")
    plt.title("Age Groups")
    plt.tight_layout()
    plt.show()


# ------------------------------------------------------------
# TRAVEL TYPE ANALYSIS
# ------------------------------------------------------------
def analyze_travel_type(df: pd.DataFrame):
    """Print the "Type of Travel" value counts and show them as a count plot."""
    section("Travel Type Analysis")

    column = "Type of Travel"
    print(df[column].value_counts())

    plt.figure(figsize=(6, 4))
    axis = sns.countplot(data=df, x=column, palette="Set3")
    axis.set_title("Type of Travel Distribution")
    plt.tight_layout()
    plt.show()


# ------------------------------------------------------------
# CLASS DISTRIBUTION
# ------------------------------------------------------------
def analyze_class_distribution(df: pd.DataFrame):
    """Print the "Class" value counts and show them as a count plot."""
    section("Class Distribution")

    column = "Class"
    print(df[column].value_counts())

    plt.figure(figsize=(6, 4))
    axis = sns.countplot(data=df, x=column, palette="Set1")
    axis.set_title("Travel Class Distribution")
    plt.tight_layout()
    plt.show()


# ------------------------------------------------------------
# DELAY ANALYSIS
# ------------------------------------------------------------
def analyze_delays(df: pd.DataFrame):
    """Plot departure/arrival delay histograms and report delay vs. satisfaction.

    When a "satisfaction" column is present, the Pearson correlation between
    "Arrival Delay in Minutes" and satisfaction is printed. A non-numeric
    satisfaction column (e.g. text labels) is first mapped to integer codes in
    sorted label order, because ``Series.corr`` cannot operate on strings.
    """
    section("Delay Analysis")

    histograms = (
        ("Departure Delay in Minutes", "orange", "Departure Delay Distribution"),
        ("Arrival Delay in Minutes", "red", "Arrival Delay Distribution"),
    )
    for column, color, title in histograms:
        plt.figure(figsize=(8, 5))
        sns.histplot(df[column], bins=40, kde=True, color=color)
        plt.title(title)
        plt.tight_layout()
        plt.show()

    # Correlation with satisfaction
    if "satisfaction" in df.columns:
        satisfaction = df["satisfaction"]
        if not pd.api.types.is_numeric_dtype(satisfaction):
            # factorize marks missing labels with -1; turn those back into NaN
            # so they are ignored by corr() instead of counting as a category.
            codes, _ = pd.factorize(satisfaction, sort=True)
            satisfaction = pd.Series(codes, index=satisfaction.index, dtype="float64")
            satisfaction = satisfaction.where(codes >= 0)

        corr = df["Arrival Delay in Minutes"].corr(satisfaction)
        print(f"\nCorrelation between Arrival Delay and Satisfaction: {corr:.4f}")


# ------------------------------------------------------------
# SEAT COMFORT BY CLASS (BOX PLOT)
# ------------------------------------------------------------
def analyze_seat_comfort(df: pd.DataFrame):
    """Show a box plot of "Seat comfort" for each value of "Class"."""
    section("Seat Comfort by Class")

    plt.figure(figsize=(8, 5))
    axis = sns.boxplot(data=df, x="Class", y="Seat comfort", palette="coolwarm")
    axis.set_title("Seat Comfort by Travel Class")
    plt.tight_layout()
    plt.show()


# ------------------------------------------------------------
# T-TEST: SEAT COMFORT (BUSINESS vs ECONOMY)
# ------------------------------------------------------------
def ttest_seat_comfort(df: pd.DataFrame):
    """Compare "Seat comfort" between the "Business" and "Economy" classes.

    Runs an unequal-variance (``equal_var=False``) two-sample t-test and
    prints the statistic, the p-value and a verdict at the 0.05 level.
    If either class has fewer than two non-missing ratings, or the p-value
    comes out as NaN, that is reported explicitly instead of being presented
    as "no significant difference".
    """
    section("T-Test: Seat Comfort (Business vs Economy)")

    seat_comfort = df["Seat comfort"]
    # Drop missing ratings: a single NaN would turn both statistics into NaN.
    business = seat_comfort[df["Class"] == "Business"].dropna()
    economy = seat_comfort[df["Class"] == "Economy"].dropna()

    # The test is undefined for empty or single-observation groups.
    if len(business) < 2 or len(economy) < 2:
        print(
            "Result: Not enough observations to run the t-test "
            f"(Business: {len(business)}, Economy: {len(economy)})."
        )
        return

    t_stat, p_val = ttest_ind(business, economy, equal_var=False)

    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_val:.6f}")

    if np.isnan(p_val):
        # NaN compares False against 0.05, which would read as "not significant".
        print("Result: Test statistic is undefined; no conclusion can be drawn.")
    elif p_val < 0.05:
        print("Result: Significant difference in seat comfort between Business and Economy.")
    else:
        print("Result: No significant difference detected.")


# ------------------------------------------------------------
# ANOVA: SEAT COMFORT ACROSS ALL CLASSES
# ------------------------------------------------------------
def anova_seat_comfort(df: pd.DataFrame):
    """Run a one-way ANOVA of "Seat comfort" across the groups of "Class".

    Prints the F-statistic, the p-value and a verdict at the 0.05 level.
    """
    section("ANOVA: Seat Comfort Across Classes")

    # One array of ratings per travel class.
    samples = [
        ratings.values
        for _, ratings in df.groupby("Class")["Seat comfort"]
    ]
    f_stat, p_val = f_oneway(*samples)

    print(f"F-statistic: {f_stat:.4f}")
    print(f"P-value: {p_val:.6f}")

    verdict = (
        "Result: Seat comfort differs significantly across travel classes."
        if p_val < 0.05
        else "Result: No significant difference detected."
    )
    print(verdict)


# ------------------------------------------------------------
# CORRELATION MATRIX
# ------------------------------------------------------------
def correlation_matrix(df: pd.DataFrame):
    """Show a heatmap of the pairwise correlations between numeric columns of ``df``."""
    section("Correlation Matrix")

    # Non-numeric columns are excluded before computing correlations.
    correlations = df.select_dtypes(include=[np.number]).corr()

    plt.figure(figsize=(12, 8))
    heatmap_ax = sns.heatmap(correlations, annot=False, cmap="coolwarm")
    heatmap_ax.set_title("Correlation Heatmap")
    plt.tight_layout()
    plt.show()


# ------------------------------------------------------------
# MASTER FUNCTION FOR PART 2
# ------------------------------------------------------------
def run_descriptive_and_stats(df: pd.DataFrame):
    """Run every descriptive report, plot and statistical test on ``df``, in order."""
    steps = (
        descriptive_statistics,
        analyze_gender_distribution,
        analyze_age_distribution,
        analyze_travel_type,
        analyze_class_distribution,
        analyze_delays,
        analyze_seat_comfort,
        ttest_seat_comfort,
        anova_seat_comfort,
        correlation_matrix,
    )
    for step in steps:
        step(df)

# ============================================================
# PART 3 — FEATURE ENGINEERING, ENCODING, SCALING, IMPUTATION
# ============================================================

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with three derived columns appended.

    * ``Total_Delay`` — departure delay plus arrival delay.
    * ``Distance_Bin`` — "Flight Distance" bucketed into four labelled ranges.
    * ``SeatFood_Interaction`` — "Seat comfort" multiplied by "Food and drink".
    """
    section("Feature Engineering")

    engineered = df.copy()

    # Total delay
    departure_delay = engineered["Departure Delay in Minutes"]
    arrival_delay = engineered["Arrival Delay in Minutes"]
    engineered["Total_Delay"] = departure_delay + arrival_delay

    # Distance bins
    distance_edges = [0, 500, 1500, 3000, 10000]
    distance_labels = ["Short", "Medium", "Long", "Ultra-Long"]
    engineered["Distance_Bin"] = pd.cut(
        engineered["Flight Distance"],
        bins=distance_edges,
        labels=distance_labels,
    )

    # Interaction feature
    engineered["SeatFood_Interaction"] = (
        engineered["Seat comfort"] * engineered["Food and drink"]
    )

    new_columns = ["Total_Delay", "Distance_Bin", "SeatFood_Interaction"]
    print("Feature engineering complete. New columns added:")
    print(new_columns)

    return engineered


# ------------------------------------------------------------
# ENCODING CATEGORICAL VARIABLES
# ------------------------------------------------------------
def encode_categoricals(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with its categorical columns encoded numerically.

    The listed label columns that exist in ``df`` are integer-encoded with a
    fresh ``LabelEncoder`` each. Values are cast to ``str`` first, so a missing
    value is encoded as its own category rather than staying NaN.
    ``Distance_Bin``, when present, is one-hot encoded with its first level
    dropped.
    """
    section("Encoding Categorical Variables")

    encoded = df.copy()

    # Label-encode each categorical column that is actually present.
    label_candidates = ("Gender", "Customer Type", "Type of Travel", "Class", "satisfaction")
    for name in label_candidates:
        if name not in encoded.columns:
            continue
        as_text = encoded[name].astype(str)
        encoded[name] = LabelEncoder().fit_transform(as_text)

    # One-hot encode Distance_Bin
    if "Distance_Bin" in encoded.columns:
        encoded = pd.get_dummies(encoded, columns=["Distance_Bin"], drop_first=True)

    print("Encoding complete.")
    return encoded


# ------------------------------------------------------------
# SCALING NUMERIC FEATURES
# ------------------------------------------------------------
def scale_numeric(df: pd.DataFrame, numeric_cols: list) -> pd.DataFrame:
    """Return a copy of ``df`` with ``numeric_cols`` standardised by ``StandardScaler``.

    The scaler is fitted on the very data it transforms; columns outside
    ``numeric_cols`` are left as they are.
    """
    section("Scaling Numeric Features")

    scaled = df.copy()
    standardised = StandardScaler().fit_transform(scaled[numeric_cols])
    scaled[numeric_cols] = standardised

    print("Scaling complete.")
    return scaled


# ------------------------------------------------------------
# KNN IMPUTATION
# ------------------------------------------------------------
def apply_knn_imputation(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values filled by a 5-neighbour ``KNNImputer``.

    The imputer is fitted on, and applied to, every column of ``df``.
    """
    section("KNN Imputation")

    filled = df.copy()
    knn = KNNImputer(n_neighbors=5)
    imputed_values = knn.fit_transform(filled)
    filled[filled.columns] = imputed_values

    print("KNN imputation complete.")
    return filled


# ------------------------------------------------------------
# TRAIN/TEST SPLIT
# ------------------------------------------------------------
def split_data(df: pd.DataFrame):
    """Split ``df`` into stratified train and test sets (75% / 25%).

    "satisfaction" is the target; every other column is a feature.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    section("Train/Test Split")

    target = df["satisfaction"]
    features = df.drop(columns="satisfaction")

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.25,
        random_state=42,
        stratify=target,
    )

    print(f"Training set: {X_train.shape}")
    print(f"Testing set: {X_test.shape}")

    return X_train, X_test, y_train, y_test


# ------------------------------------------------------------
# MASTER FUNCTION FOR PART 3
# ------------------------------------------------------------
def run_feature_engineering_and_split(df: pd.DataFrame):
    """Engineer features, encode, split, then scale and impute without leakage.

    The data is split into train and test sets *before* any transformer is
    fitted. The scaler and the KNN imputer learn their parameters from the
    training features only and are then applied to the test features, so no
    information from the test set (or from the target column) influences
    preprocessing.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` with scaled, imputed features.
    """
    df = feature_engineering(df)
    df = encode_categoricals(df)

    X_train, X_test, y_train, y_test = split_data(df)
    # Work on private copies so the assignments below never write into `df`.
    X_train, X_test = X_train.copy(), X_test.copy()

    section("Scaling Numeric Features")
    # The target was removed by the split, so every numeric column is a feature.
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    scaler = StandardScaler()
    X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])
    print("Scaling complete.")

    section("KNN Imputation")
    imputer = KNNImputer(n_neighbors=5)
    X_train[X_train.columns] = imputer.fit_transform(X_train)
    X_test[X_test.columns] = imputer.transform(X_test)
    print("KNN imputation complete.")

    return X_train, X_test, y_train, y_test
# ============================================================
# PART 4 — MODELING, TUNING, PLOTS, EVALUATION, MAIN EXECUTION
# ============================================================

# ------------------------------------------------------------
# MODEL DEFINITIONS
# ------------------------------------------------------------
def get_models():
    """Build the classifiers to compare, keyed by display name.

    XGBoost is appended only when its import succeeded (``xgb_available``).
    Estimators that accept a seed use ``random_state=42``.
    """
    section("Initializing Models")

    seed = 42
    models = {}
    models["Decision Tree"] = DecisionTreeClassifier(random_state=seed)
    models["Random Forest"] = RandomForestClassifier(random_state=seed)
    models["Logistic Regression"] = LogisticRegression(max_iter=500)
    models["KNN"] = KNeighborsClassifier()
    models["Naive Bayes"] = GaussianNB()
    models["SVM"] = SVC(probability=True, random_state=seed)
    models["Gradient Boosting"] = GradientBoostingClassifier(random_state=seed)

    if not xgb_available:
        return models

    models["XGBoost"] = XGBClassifier(
        eval_metric="logloss",
        random_state=seed,
        use_label_encoder=False,
    )
    return models


# ------------------------------------------------------------
# FEATURE IMPORTANCE PLOTS
# ------------------------------------------------------------
def plot_feature_importance(model, X_train, model_name):
    """Plot and save the feature importances of ``model``.

    Does nothing for models without a ``feature_importances_`` attribute.
    Otherwise the 20 most important features are shown as a bar chart and the
    full ranking is written to ``<model_name>_feature_importances.csv`` inside
    ``OUTPUT_DIR``.
    """
    if not hasattr(model, "feature_importances_"):
        return

    section(f"Feature Importance — {model_name}")

    ranking = pd.DataFrame(
        {
            "Feature": X_train.columns,
            "Importance": model.feature_importances_,
        }
    )
    ranking = ranking.sort_values(by="Importance", ascending=False)

    plt.figure(figsize=(10, 6))
    axis = sns.barplot(data=ranking.head(20), x="Importance", y="Feature")
    axis.set_title(f"Top 20 Feature Importances — {model_name}")
    plt.tight_layout()
    plt.show()

    destination = OUTPUT_DIR / f"{model_name}_feature_importances.csv"
    ranking.to_csv(destination, index=False)


# ------------------------------------------------------------
# LEARNING CURVES
# ------------------------------------------------------------
def plot_learning_curve(model, X, y, model_name):
    """Plot mean training and validation accuracy of ``model`` against training size.

    Scores come from ``learning_curve`` with 5-fold cross-validation at five
    training sizes spread evenly between 10% and 100% of the data.
    """
    section(f"Learning Curve — {model_name}")

    sizes = np.linspace(0.1, 1.0, 5)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=model,
        X=X,
        y=y,
        cv=5,
        scoring="accuracy",
        n_jobs=-1,
        train_sizes=sizes,
        shuffle=True,
        random_state=42,
    )

    # Average over the cross-validation folds (axis 1) for each training size.
    curves = (
        ("Training Accuracy", train_scores.mean(axis=1)),
        ("Validation Accuracy", test_scores.mean(axis=1)),
    )

    plt.figure(figsize=(8, 5))
    for label, mean_scores in curves:
        plt.plot(train_sizes, mean_scores, label=label)
    plt.title(f"Learning Curve — {model_name}")
    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


# ------------------------------------------------------------
# MODEL TRAINING + EVALUATION
# ------------------------------------------------------------
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """Fit every model from ``get_models`` and score it on the test set.

    For each model this also draws its feature-importance plot (when the model
    exposes importances) and its learning curve. Accuracy, precision, recall
    and F1 are computed with the metric functions' default settings.

    Returns:
        A DataFrame with one row per model, also saved as
        ``model_performance.csv`` inside ``OUTPUT_DIR``.
    """
    section("Training and Evaluating Models")

    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    rows = []

    for name, model in get_models().items():
        print(f"\nTraining: {name}")
        model.fit(X_train, y_train)

        # Diagnostics on the training data: importances, then learning curve.
        plot_feature_importance(model, X_train, name)
        plot_learning_curve(model, X_train, y_train, name)

        # Held-out evaluation.
        y_pred = model.predict(X_test)
        acc, prec, rec, f1 = (scorer(y_test, y_pred) for scorer in scorers)
        rows.append((name, acc, prec, rec, f1))

        print(f"{name} — Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")

    column_names = ["Model", "Accuracy", "Precision", "Recall", "F1"]
    results_df = pd.DataFrame(rows, columns=column_names)
    results_df.to_csv(OUTPUT_DIR / "model_performance.csv", index=False)

    return results_df


# ------------------------------------------------------------
# MODEL COMPARISON PLOTS (INDIVIDUAL)
# ------------------------------------------------------------
def plot_model_comparisons(results_df):
    """Show one bar chart per metric comparing all models in ``results_df``."""
    section("Model Comparison Plots")

    for metric in ("Accuracy", "Precision", "Recall", "F1"):
        plt.figure(figsize=(10, 5))
        axis = sns.barplot(data=results_df, x="Model", y=metric, palette="viridis")
        axis.set_title(f"Model Comparison — {metric}")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()


# ------------------------------------------------------------
# COMBINED MODEL COMPARISON PLOT (2×2 SUBPLOT)
# ------------------------------------------------------------
def plot_combined_comparison(results_df):
    """Show the four metrics of ``results_df`` as a 2×2 grid of bar charts."""
    section("Combined Model Comparison Plot")

    metric_names = ("Accuracy", "Precision", "Recall", "F1")
    _, grid = plt.subplots(2, 2, figsize=(14, 10))
    model_names = results_df["Model"]

    # Pair each panel of the grid (row-major order) with one metric.
    for panel, metric in zip(grid.flat, metric_names):
        panel.bar(model_names, results_df[metric], color="steelblue")
        panel.set_title(metric)
        panel.set_xticklabels(model_names, rotation=45)

    plt.tight_layout()
    plt.show()


# ------------------------------------------------------------
# HYPERPARAMETER TUNING (GRID SEARCH; THE MENU PROMPT LIVES IN run_all)
# ------------------------------------------------------------
def tune_models_with_gridsearch(X_train, y_train):
    """Grid-search hyperparameters for the tree ensembles, SVM and (optionally) XGBoost.

    Each search uses 5-fold cross-validation scored on accuracy. XGBoost is
    included only when ``xgb_available`` is true.

    Returns:
        A DataFrame with one row per model holding its best parameters, also
        saved as ``hyperparameter_tuning_results.csv`` inside ``OUTPUT_DIR``.
    """
    section("Hyperparameter Tuning — GridSearchCV")

    # (display name, estimator, parameter grid) in the order they are searched.
    searches = [
        (
            "Random Forest",
            RandomForestClassifier(random_state=42),
            {
                "n_estimators": [100, 200, 300],
                "max_depth": [5, 10, 20, None],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 4],
                "bootstrap": [True, False],
            },
        ),
        (
            "Gradient Boosting",
            GradientBoostingClassifier(random_state=42),
            {
                "n_estimators": [100, 200, 300],
                "learning_rate": [0.01, 0.05, 0.1],
                "max_depth": [3, 5, 7],
                "subsample": [0.8, 1.0],
            },
        ),
        (
            "SVM",
            SVC(probability=True, random_state=42),
            {
                "C": [0.1, 1, 10],
                "kernel": ["rbf", "poly"],
                "gamma": ["scale", "auto"],
            },
        ),
    ]

    if xgb_available:
        searches.append(
            (
                "XGBoost",
                XGBClassifier(eval_metric="logloss", random_state=42, use_label_encoder=False),
                {
                    "n_estimators": [100, 200, 300],
                    "learning_rate": [0.01, 0.05, 0.1],
                    "max_depth": [3, 5, 7],
                    "subsample": [0.8, 1.0],
                    "colsample_bytree": [0.8, 1.0],
                },
            )
        )

    tuned_results = {}
    for name, estimator, param_grid in searches:
        search = GridSearchCV(estimator, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
        search.fit(X_train, y_train)
        tuned_results[name] = search.best_params_

    tuned_df = pd.DataFrame.from_dict(tuned_results, orient="index")
    tuned_df.to_csv(OUTPUT_DIR / "hyperparameter_tuning_results.csv")

    return tuned_df


# ------------------------------------------------------------
# MAIN EXECUTION PIPELINE
# ------------------------------------------------------------
def run_all(csv_path):
    """Run the whole pipeline on the CSV at ``csv_path``.

    Loads and cleans the data, produces the descriptive reports, prepares the
    train/test split, optionally runs hyperparameter tuning (the user is asked
    on stdin) and finally trains, evaluates and compares all models.
    """
    cleaned = clean_data(load_data(csv_path))

    run_descriptive_and_stats(cleaned)

    splits = run_feature_engineering_and_split(cleaned)
    X_train, _, y_train, _ = splits

    # Hyperparameter tuning menu
    menu_lines = (
        "\nHyperparameter Tuning Options:",
        "1. Run hyperparameter tuning",
        "2. Skip tuning",
    )
    for line in menu_lines:
        print(line)

    # Only the exact answer "1" triggers tuning; anything else skips it.
    if input("Select an option (1 or 2): ") == "1":
        tune_models_with_gridsearch(X_train, y_train)

    results_df = train_and_evaluate_models(*splits)

    plot_model_comparisons(results_df)
    plot_combined_comparison(results_df)


# ------------------------------------------------------------
# SCRIPT ENTRY POINT
# ------------------------------------------------------------
if __name__ == "__main__":
    # Run the full pipeline only when the configured dataset is present.
    csv_file_path = CSV_FILE_PATH

    if Path(csv_file_path).exists():
        run_all(csv_file_path)
    else:
        print(f"CSV file not found at: {csv_file_path}")