<a href="https://colab.research.google.com/github/OCE1984/MScDataAnaltyicsPrinciples/blob/main/StreamlitApp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Streamlit Dashboard App Script

## Importing the tools

In [None]:
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import xgboost as xgb
import numpy as np

from xgboost import XGBClassifier
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, silhouette_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, ConfusionMatrixDisplay
)
from sklearn.cluster import KMeans

## Formatting the page

In [None]:
# Browser-tab title and wide layout for the whole dashboard.
st.set_page_config(layout="wide", page_title="Airline Customer Satisfaction")

# CSS classes used by the title banner and by the footer line.
_PAGE_CSS = """
    <style>
    .main-title {
        font-size: 40px;
        font-weight: 600;
        text-align: center;
        color: #0c4a6e;
    }
    .footer {
        font-size: 14px;
        color: #888;
        text-align: center;
        margin-top: 50px;
    }
    </style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

# === Header ===
# Title banner followed by a thin horizontal rule.
for _header_html in (
    "<div class='main-title'>✈️ Airline Customer Satisfaction</div>",
    "<hr style='border:1px solid #f0f0f0;' />",
):
    st.markdown(_header_html, unsafe_allow_html=True)

# === Navigation Buttons ===
# The selected page lives in session state; "EDA" is used until a button
# has been pressed.
if "page" not in st.session_state:
    st.session_state.page = "EDA"

col1, col2, col3 = st.columns(3)
_NAV_BUTTONS = (
    (col1, "Exploratory Data Analysis (EDA)", "EDA"),
    (col2, "Supervised Machine Learning", "Supervised"),
    (col3, "Unsupervised Machine Learning - KMeans", "Unsupervised"),
)
for _nav_column, _button_text, _page_key in _NAV_BUTTONS:
    with _nav_column:
        if st.button(_button_text):
            st.session_state.page = _page_key

st.markdown("---")

## Loading the data

In [None]:
# Folder holding the pre-processed CSV exports used by the dashboard.
DATA_DIR = "~/Desktop/AirlineDashboard/data"


@st.cache_data(show_spinner=False)
def _load_csv(filename):
    """Read one CSV file from ``DATA_DIR`` into a DataFrame.

    Streamlit re-executes the whole script on every widget interaction, so
    the parsed file is cached instead of being read from disk each time.
    ``st.cache_data`` returns a fresh copy per call, which keeps the
    in-place column additions made further down the script from leaking
    into the cached value.

    Args:
        filename: File name (with extension) inside ``DATA_DIR``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(f"{DATA_DIR}/{filename}")


df_clean = _load_csv("df_clean.csv")
df_cluster_analysis = _load_csv("df_cluster_analysis.csv")
df_feature_eng = _load_csv("df_feature_eng.csv")
df_original = _load_csv("df_original.csv")

st.success("✅ Data loaded successfully from the Desktop folder!")

## Creating the functions

In [None]:
def plot_age_distribution(data):
    """Draw a density histogram of the ``Age`` column with a KDE outline.

    Args:
        data: Table handed to seaborn as ``data``; its ``"Age"`` column is
            plotted.

    Returns:
        The newly created matplotlib figure.
    """
    column = "Age"
    histogram_style = dict(kde=True, color="skyblue", stat="density", alpha=0.5)
    outline_style = dict(color="darkblue", linewidth=2)

    figure, axes = plt.subplots()
    sns.histplot(data=data, x=column, ax=axes, **histogram_style)
    # A second, darker density curve is drawn on top of the histogram.
    sns.kdeplot(data=data, x=column, ax=axes, **outline_style)
    axes.set_title("Distribution of Age")
    return figure

def plot_flight_distance_distribution(data):
    """Draw a density histogram of ``Flight Distance`` with a KDE outline.

    Args:
        data: Table handed to seaborn as ``data``; its ``"Flight Distance"``
            column is plotted.

    Returns:
        The newly created matplotlib figure.
    """
    column = "Flight Distance"
    histogram_style = dict(kde=True, color="skyblue", stat="density", alpha=0.5)
    outline_style = dict(color="darkblue", linewidth=2)

    figure, axes = plt.subplots()
    sns.histplot(data=data, x=column, ax=axes, **histogram_style)
    # A second, darker density curve is drawn on top of the histogram.
    sns.kdeplot(data=data, x=column, ax=axes, **outline_style)
    axes.set_title("Distribution of Flight Distance")
    return figure

def placeholder_plot(title="Placeholder Graph"):
    """Return a figure with no axes decorations that only shows ``title``.

    Args:
        title: Text written in grey at the centre of the axes.

    Returns:
        The newly created matplotlib figure.
    """
    figure, axes = plt.subplots()
    text_style = dict(ha='center', va='center', fontsize=14, color='gray')
    axes.text(0.5, 0.5, title, **text_style)
    # Hide ticks, spines and labels so only the text remains.
    axes.axis('off')
    return figure

def add_count_labels(ax, fontsize=8, colour='black'):
    """Write each bar's height above it as a thousands-separated integer.

    Patches whose height is NaN or not strictly positive are skipped.

    Args:
        ax: Axes-like object exposing ``patches`` and ``annotate``.
        fontsize: Font size of the labels.
        colour: Text colour of the labels.
    """
    label_style = dict(
        ha='center',
        va='bottom',
        fontsize=fontsize,
        color=colour,
        xytext=(0, 3),  # nudge the text 3 points above the bar top
        textcoords='offset points',
    )
    for bar in ax.patches:
        bar_height = bar.get_height()
        if pd.isna(bar_height) or not bar_height > 0:
            continue
        bar_centre = bar.get_x() + bar.get_width() / 2.
        ax.annotate(f'{int(bar_height):,}', (bar_centre, bar_height), **label_style)

## Creating the Exploratory Data Analysis (EDA) page graphs

In [None]:
# Share of rows whose Satisfaction flag equals 1, as a percentage with one
# decimal place.
total_passengers = len(df_clean)
satisfied_passengers = int((df_clean["Satisfaction"] == 1).sum())
satisfaction_percent = round((satisfied_passengers / total_passengers) * 100, 1)

# Gauge chart: coloured bands for 0-50, 50-75 and 75-100 percent.
_gauge_bands = [
    {'range': [0, 50], 'color': '#ffcccc'},
    {'range': [50, 75], 'color': '#ffe680'},
    {'range': [75, 100], 'color': '#ccffcc'},
]
_satisfaction_indicator = go.Indicator(
    mode="gauge+number",
    value=satisfaction_percent,
    title={'text': "Overall Satisfaction (%)"},
    gauge={
        'axis': {'range': [0, 100]},
        'bar': {'color': "royalblue"},
        'steps': _gauge_bands,
    },
)
fig = go.Figure(_satisfaction_indicator)

# Readable names for the 0/1 satisfaction flag.
_SATISFACTION_LABELS = {0: "Neutral or Dissatisfied", 1: "Satisfied"}


def _with_satisfaction_label(frame):
    """Return a copy of ``frame`` with a ``Satisfaction_Label`` column.

    Numeric satisfaction flags are mapped to readable text; any other dtype
    is taken to hold the text labels already and is copied unchanged.
    ``is_numeric_dtype`` is used rather than comparing the dtype with
    ``'object'`` so that text stored under a non-object string dtype is not
    pushed through the 0/1 mapping (which would turn every label into NaN).
    """
    labelled = frame.copy()
    satisfaction = labelled["Satisfaction"]
    if pd.api.types.is_numeric_dtype(satisfaction):
        labelled["Satisfaction_Label"] = satisfaction.map(_SATISFACTION_LABELS)
    else:
        labelled["Satisfaction_Label"] = satisfaction
    return labelled


def _satisfaction_countplot(frame, column, title, xlabel):
    """Draw a 6x5 count plot of ``column`` split by ``Satisfaction_Label``.

    Args:
        frame: Table that already contains ``Satisfaction_Label``.
        column: Column placed on the x axis.
        title: Axes title.
        xlabel: Label of the x axis.

    Returns:
        ``(figure, axes)`` of the finished chart, with count labels added.
    """
    figure = plt.figure(figsize=(6, 5))
    axes = figure.add_subplot()
    sns.countplot(
        data=frame,
        x=column,
        hue="Satisfaction_Label",
        palette="pastel",
        ax=axes,
    )
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel("Passenger Count")
    axes.legend(title="Satisfaction")
    add_count_labels(axes)
    return figure, axes


# One labelled copy of the raw data serves all three breakdowns below; the
# plotting calls only read from it.
df_plot = _with_satisfaction_label(df_original)

# Satisfaction by Class
fig_class, ax_class = _satisfaction_countplot(
    df_plot, "Class", "Satisfaction by Travel Class", "Class")

# Satisfaction by Customer Type
fig_customer_type, ax_customer_type = _satisfaction_countplot(
    df_plot, "Customer Type", "Satisfaction by Customer Type", "Customer Type")

# Satisfaction by Type of Travel
fig_travel_type, ax_travel_type = _satisfaction_countplot(
    df_plot, "Type of Travel", "Satisfaction by Type of Travel", "Type of Travel")

# Loyal Business Customers: returning customers travelling on business.
# NOTE(review): the assignment aligns on the row index, so it assumes
# df_feature_eng and df_original share the same index -- confirm.
df_feature_eng["Loyal Business Traveller"] = (
    (df_original["Customer Type"] == "Returning") &
    (df_original["Type of Travel"] == "Business")
).astype(int)

df_loyal = _with_satisfaction_label(df_feature_eng)
df_loyal["Loyalty Group"] = df_loyal["Loyal Business Traveller"].map({
    1: "Loyal Business Traveller",
    0: "Other Passengers"
})

fig_loyalty, ax_loyalty = _satisfaction_countplot(
    df_loyal, "Loyalty Group", "Satisfaction by Loyalty Group", "Passenger Type")


## Supervised Machine Learning

### Model training

In [None]:
# Features are every column except the target; 20% of the rows are held
# out for evaluation with a fixed seed.
y = df_clean['Satisfaction']
X = df_clean.drop(columns='Satisfaction')
_split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

### === Evaluation Function ===
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Args:
        model: Estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature rows to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        Dict with the keys ``Accuracy``, ``Precision``, ``Recall``,
        ``F1-Score`` and ``ROC AUC``, plus the raw predictions under
        ``y_pred`` and the second ``predict_proba`` column under
        ``y_proba``.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is what ROC AUC is computed from.
    positive_scores = model.predict_proba(X_test)[:, 1]

    label_scorers = {
        'Accuracy': accuracy_score,
        'Precision': precision_score,
        'Recall': recall_score,
        'F1-Score': f1_score,
    }
    report = {
        metric_name: scorer(y_test, predicted_labels)
        for metric_name, scorer in label_scorers.items()
    }
    report['ROC AUC'] = roc_auc_score(y_test, positive_scores)
    report['y_pred'] = predicted_labels
    report['y_proba'] = positive_scores
    return report

## === GridSearchCV: hyper-parameter grids ===
param_grid_rf = {
    'n_estimators': [100, 150],
    'max_depth': [None, 10]
}
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}


@st.cache_resource
def _fit_grid_searches(features, target):
    """Fit the Random Forest and XGBoost grid searches once per input.

    Streamlit re-runs the script on every button click; without caching
    both searches (3-fold CV over every grid combination) would be refitted
    each time. The fitted search objects are cached and reused as long as
    the training data passed in is unchanged.

    Args:
        features: Training feature table.
        target: Training labels.

    Returns:
        ``(rf_search, xgb_search)``, both already fitted.
    """
    rf_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid_rf,
        cv=3,
        scoring='accuracy',
        n_jobs=-1)
    rf_search.fit(features, target)

    xgb_search = GridSearchCV(
        estimator=XGBClassifier(random_state=42),
        param_grid=param_grid_xgb,
        scoring='accuracy',
        cv=3,
        n_jobs=-1)
    xgb_search.fit(features, target)
    return rf_search, xgb_search


grid_rf, grid_xgb = _fit_grid_searches(X_train, y_train)

## === Random Forest: best estimator and held-out metrics ===
best_rf = grid_rf.best_estimator_
rf_metrics = evaluate_model(best_rf, X_test, y_test)

## === XGBoost: best estimator and held-out metrics ===
xgb_classifier = grid_xgb.estimator  # unfitted base estimator of the search
best_xgb = grid_xgb.best_estimator_
xgb_metrics = evaluate_model(best_xgb, X_test, y_test)

### Graphs

In [None]:
## === Accuracy Gauge (XGBoost)
# Test accuracy as a percentage with one decimal place; bands at 60 and 80.
_accuracy_percent = round(xgb_metrics['Accuracy'] * 100, 1)
_accuracy_bands = [
    {'range': [0, 60], 'color': '#ffcccc'},
    {'range': [60, 80], 'color': '#ffe680'},
    {'range': [80, 100], 'color': '#ccffcc'},
]
_accuracy_indicator = go.Indicator(
    mode="gauge+number",
    value=_accuracy_percent,
    title={'text': "XGBoost Accuracy (%)"},
    gauge={
        'axis': {'range': [0, 100]},
        'bar': {'color': "royalblue"},
        'steps': _accuracy_bands,
    },
)
fig_accuracy_gauge = go.Figure(_accuracy_indicator)

# Fixed figure size and margins for the gauge.
fig_accuracy_gauge.update_layout(
    width=200,
    height=150,
    margin={'t': 50, 'b': 20, 'l': 20, 'r': 20},
)

## === Performance Table (both models)
# One row per model, one column per metric, rounded to three decimals.
_performance_columns = {"Model": ["Random Forest", "XGBoost"]}
for _metric_name in ("Accuracy", "Precision", "Recall", "F1-Score", "ROC AUC"):
    _performance_columns[_metric_name] = [
        rf_metrics[_metric_name],
        xgb_metrics[_metric_name],
    ]
performance_df = pd.DataFrame(_performance_columns).round(3)

## === ROC Curve
# Curves for both models from their stored positive-class scores.
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_metrics['y_proba'])
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, xgb_metrics['y_proba'])

_rf_curve_label = f"Random Forest (AUC = {rf_metrics['ROC AUC']:.2f})"
_xgb_curve_label = f"XGBoost (AUC = {xgb_metrics['ROC AUC']:.2f})"

fig_roc, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr_rf, tpr_rf, label=_rf_curve_label, linestyle='--')
ax.plot(fpr_xgb, tpr_xgb, label=_xgb_curve_label, color='darkorange')
# Dotted diagonal as a visual reference line.
ax.plot([0, 1], [0, 1], linestyle=':', color='gray')
ax.set_title("ROC Curve Comparison")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
fig_roc.tight_layout()

## === Confusion Matrix
# Built from the tuned XGBoost estimator on the held-out split.
fig_conf_matrix, ax_cm = plt.subplots(figsize=(6, 5))
_class_names = ["Not Satisfied", "Satisfied"]
ConfusionMatrixDisplay.from_estimator(
    best_xgb, X_test, y_test,
    display_labels=_class_names, cmap='Blues', ax=ax_cm)
ax_cm.grid(False)
ax_cm.set_title("XGBoost Confusion Matrix")
fig_conf_matrix.tight_layout()

## === Top 5 Feature Importances (XGB)
importances = best_xgb.feature_importances_
features = X_train.columns
_all_importances = pd.DataFrame({
    "Feature": features,
    "Importance": importances
})
_ranked_importances = _all_importances.sort_values(
    by="Importance", ascending=False)
importance_df = _ranked_importances.head(5)

fig_top_features, ax_feat = plt.subplots(figsize=(6, 5))
sns.barplot(
    data=importance_df,
    x="Importance",
    y="Feature",
    palette="Blues_r",
    ax=ax_feat)
ax_feat.set_title("Top 5 Feature Importances (XGBoost)")
fig_top_features.tight_layout()


def _labelled_copy(frame):
    """Return a copy of ``frame`` with a readable ``Satisfaction_Label``.

    Numeric 0/1 flags are mapped to text; any other dtype is taken to hold
    the labels already. ``is_numeric_dtype`` is used instead of a
    ``dtype != 'object'`` comparison so text stored under a non-object
    string dtype is not mapped to NaN.
    """
    labelled = frame.copy()
    satisfaction = labelled['Satisfaction']
    if pd.api.types.is_numeric_dtype(satisfaction):
        labelled['Satisfaction_Label'] = satisfaction.map({
            0: "Neutral or Dissatisfied",
            1: "Satisfied"
        })
    else:
        labelled['Satisfaction_Label'] = satisfaction
    return labelled


def _draw_satisfaction_counts(frame, column, title, xlabel):
    """Draw a 6x5 count plot of ``column`` split by ``Satisfaction_Label``.

    Bar labels come from the shared ``add_count_labels`` helper rather than
    a hand-copied loop.

    Returns:
        ``(figure, axes)`` of the finished, tight-laid-out chart.
    """
    figure = plt.figure(figsize=(6, 5))
    axes = figure.add_subplot()
    sns.countplot(
        data=frame,
        x=column,
        hue="Satisfaction_Label",
        palette="pastel",
        ax=axes
    )
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel("Passenger Count")
    axes.legend(title="Satisfaction")
    add_count_labels(axes, fontsize=8, colour='black')
    figure.tight_layout()
    return figure, axes


## === Satisfaction by Online Boarding Rating
df_plot_online = _labelled_copy(df_original)
fig_online_boarding, ax_online = _draw_satisfaction_counts(
    df_plot_online,
    "Online Boarding",
    "Satisfaction by Online Boarding Rating",
    "Online Boarding Rating (0 = N/A, 5 = Excellent)")

## === Satisfaction by Class
df_plot_class = _labelled_copy(df_original)
fig_class_ml, ax_class = _draw_satisfaction_counts(
    df_plot_class,
    "Class",
    "Satisfaction by Class",
    "Class")

## === Satisfaction by In-flight Wifi Service
df_wifi = _labelled_copy(df_original)
fig_wifi_service, ax_wifi = _draw_satisfaction_counts(
    df_wifi,
    "In-flight Wifi Service",
    "Satisfaction by In-flight Wifi Service",
    "In-flight Wifi Rating (0 = N/A, 5 = Excellent)")


## Unsupervised Machine Learning

### Define the clusters variables

In [None]:
# Columns fed to KMeans for the elbow analysis.
_elbow_feature_names = [
    'Gender',
    'Age',
    'Class_Business',
    'Class_Economy',
    'Class_Economy Plus',
    'Online Boarding_5',
    'In-flight Wifi Service_5',
]
X_cluster = df_clean[_elbow_feature_names]

# Candidate cluster counts (2 to 9) and per-k result accumulators.
k_values = [*range(2, 10)]
inertia_values, fit_times = [], []

### Create the graphs

In [None]:
# Fit one KMeans model per candidate k, recording its inertia and the
# wall-clock seconds spent inside fit().
for k in k_values:
    model = KMeans(n_clusters=k, random_state=42, n_init='auto')

    start_time = datetime.now()
    model.fit(X_cluster)
    end_time = datetime.now()
    elapsed = end_time - start_time

    inertia_values.append(model.inertia_)
    fit_times.append(elapsed.total_seconds())

# --- Find elbow point (based on max curvature heuristic) ---
def find_elbow(k_vals, inertias):
    """Pick the elbow as the point with the largest second difference.

    The second difference ``I[i] - 2*I[i+1] + I[i+2]`` measures the bend of
    the curve at position ``i + 1``, so the index of its maximum is shifted
    by one (not two) to land on the bend itself.

    Args:
        k_vals: Candidate cluster counts, parallel to ``inertias``.
        inertias: Inertia recorded for each entry of ``k_vals``.

    Returns:
        ``(k, inertia)`` at the elbow.

    Raises:
        ValueError: If fewer than three inertia values are supplied, since
            no second difference exists.
    """
    if len(inertias) < 3:
        raise ValueError("find_elbow needs at least three inertia values")
    curvature = np.diff(inertias, n=2)
    elbow_index = int(np.argmax(curvature)) + 1
    return k_vals[elbow_index], inertias[elbow_index]

elbow_k, elbow_score = find_elbow(k_values, inertia_values)
_elbow_text = f'elbow at k = {elbow_k}'

# Left axis: inertia ("distortion") per k, with the elbow marked.
fig_elbow, ax1 = plt.subplots(figsize=(8, 5))
sns.lineplot(x=k_values, y=inertia_values, marker='o', label='Distortion', ax=ax1)
ax1.axvline(x=elbow_k, color='black', linestyle='--', label=_elbow_text)
ax1.set_title('Distortion Score Elbow for KMeans Clustering')
ax1.set_xlabel('k')
ax1.set_ylabel('distortion score')

# Arrow annotation placed slightly right of and above the elbow point.
_text_lift = 0.05 * max(inertia_values)
ax1.annotate(
    f'{_elbow_text}, score = {int(elbow_score):,}',
    xy=(elbow_k, elbow_score),
    xytext=(elbow_k + 0.5, elbow_score + _text_lift),
    arrowprops={'facecolor': 'black', 'shrink': 0.05},
    fontsize=10,
    color='black')

# Right axis: seconds spent fitting each model.
ax2 = ax1.twinx()
sns.lineplot(
    x=k_values, y=fit_times, marker='o', color='olive',
    label='Fit time (sec)', ax=ax2)
ax2.set_ylabel('fit time (seconds)', color='olive')
ax2.tick_params(axis='y', labelcolor='olive')

fig_elbow.tight_layout()

## === Silhouette Score ===

silhouette_scores = []
k_range = range(2, 10)

# The exact silhouette needs all pairwise distances; scoring a fixed random
# sample keeps the cost bounded on large tables while staying reproducible.
_silhouette_sample = min(10_000, len(X_cluster))

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    # Cluster the same feature subset as the elbow analysis (X_cluster),
    # not the whole of df_clean, which also carries the Satisfaction target.
    labels = kmeans.fit_predict(X_cluster)
    score = silhouette_score(
        X_cluster,
        labels,
        sample_size=_silhouette_sample,
        random_state=42)
    silhouette_scores.append(score)

# === Find the optimal K based on max score ===
optimal_k_silhouette = k_range[int(np.argmax(silhouette_scores))]
optimal_score = max(silhouette_scores)

# === Plot Silhouette Scores ===
fig_silhouette, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=list(k_range), y=silhouette_scores, ax=ax, palette='Blues_d')
# Dashed line marks the best score across all k.
ax.axhline(
    y=optimal_score,
    linestyle='--',
    color='red',
    label=f'Best Score: {optimal_score:.3f}')
ax.set_title("Silhouette Scores for KMeans Clustering")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Silhouette Score")
ax.legend()
fig_silhouette.tight_layout()

# === KMeans fit on the selected clustering features ===

# Columns used for clustering (based on prior selection).
features_for_clustering = [
    'Gender',
    'Age',
    'Class_Business',
    'Class_Economy',
    'Class_Economy Plus',
    'Online Boarding_5',
    'In-flight Wifi Service_5',
]
X_cluster = df_clean[features_for_clustering]

# Five clusters, fixed seed; the row labels are stored on df_clean.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(X_cluster)
df_clean["Cluster"] = kmeans.labels_

# Cluster centres as a table with one column per clustering feature.
cluster_centers = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=features_for_clustering,
)

# === Clusters Age v. Flight Distance ===

features_for_clustering = ['Gender',
                           'Age',
                           'Class_Business',
                           'Online Boarding_5',
                           'In-flight Wifi Service_5']

# Fit KMeans (using 5 clusters as per elbow method)
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
df_clean['Cluster'] = kmeans.fit_predict(df_clean[features_for_clustering])

# Plot from the original table so both axes are in real-world units.
# NOTE(review): the label assignment aligns on the row index, so it assumes
# df_original and df_clean share the same index -- confirm.
df_plot_clusters = df_original.copy()
df_plot_clusters['Cluster'] = df_clean['Cluster']

# Plot Age vs Flight Distance with clusters
fig_cluster_age_distance, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df_plot_clusters,
    x='Age',
    y='Flight Distance',
    hue='Cluster',
    palette='tab10',
    alpha=0.7,
    s=20,
    ax=ax
)

# Model-space centres, kept for reference only.
centroids = kmeans.cluster_centers_

# 'Flight Distance' is not one of the clustering features, so no model
# centre exists for the y axis (the 'Class_Business' centre is not a
# distance). The markers are therefore placed at each cluster's mean Age
# and mean Flight Distance, computed in the plotted units.
_centroid_positions = (
    df_plot_clusters
    .groupby('Cluster')[['Age', 'Flight Distance']]
    .mean()
)
centroids_age = _centroid_positions['Age']
centroids_flight = _centroid_positions['Flight Distance']

ax.scatter(
    centroids_age,
    centroids_flight,
    s=250, c='black', marker='X', label='Centroids')
ax.set_title('KMeans Clusters (Age vs. Flight Distance)')
ax.legend(title='Cluster')

# === Cluster Size Breakdown ===
# Number of rows per cluster label, ordered by label.
cluster_counts = df_clean["Cluster"].value_counts().sort_index()

fig_cluster_size, ax_size = plt.subplots(figsize=(8, 5))
sns.barplot(
    x=cluster_counts.index,
    y=cluster_counts.values,
    palette="pastel",
    ax=ax_size
)
ax_size.set_title("Cluster Size Breakdown")
ax_size.set_xlabel("Cluster")
ax_size.set_ylabel("Number of Passengers")

# Label the bars through the shared helper. The hand-written loop passed a
# misspelt keyword ("ontsize"), which matplotlib rejects, and had no guard
# against NaN or zero-height patches.
add_count_labels(ax_size, fontsize=9)

fig_cluster_size.tight_layout()

# === Satisfaction by Cluster ===
df_cluster_sat = df_clean.copy()

# Convert satisfaction to readable labels
df_cluster_sat['Satisfaction_Label'] = df_cluster_sat['Satisfaction'].map({
    0: "Neutral or Dissatisfied",
    1: "Satisfied"
})

fig_cluster_sat, ax_cluster_sat = plt.subplots(figsize=(8, 5))

sns.countplot(
    data=df_cluster_sat,
    x='Cluster',
    hue='Satisfaction_Label',
    palette="pastel",
    ax=ax_cluster_sat
)

ax_cluster_sat.set_title('Satisfaction by Cluster')
ax_cluster_sat.set_xlabel('Cluster')
ax_cluster_sat.set_ylabel('Passenger Count')
ax_cluster_sat.legend(title='Satisfaction')

# Use the shared helper so patches with NaN or zero height are skipped:
# int(NaN) raises, and zero-height patches would get a stray "0" label.
add_count_labels(ax_cluster_sat, fontsize=8)

fig_cluster_sat.tight_layout()

def _boxplot_by_cluster(value_column, title, ylabel):
    """Draw an 8x5 box plot of ``df_clean[value_column]`` per cluster.

    Returns the ``(figure, axes)`` pair after applying a tight layout.
    """
    figure, axes = plt.subplots(figsize=(8, 5))
    sns.boxplot(
        data=df_clean,
        x='Cluster',
        y=value_column,
        palette="pastel",
        ax=axes,
    )
    axes.set_title(title)
    axes.set_xlabel('Cluster')
    axes.set_ylabel(ylabel)
    plt.tight_layout()
    return figure, axes


# === Flight Distance by Cluster ===
# Copy the flight distances from df_original next to the cluster labels.
df_clean['Original Flight Distance'] = df_original['Flight Distance']
fig_distance_cluster, ax_distance_cluster = _boxplot_by_cluster(
    'Original Flight Distance',
    'Flight Distance Distribution by Cluster',
    'Flight Distance (Original Values)')

# === Age Distribution by Cluster (Original Values) ===
# Copy the ages from df_original next to the cluster labels.
df_clean['Original Age'] = df_original['Age']
fig_age_cluster, ax_age_cluster = _boxplot_by_cluster(
    'Original Age',
    'Age Distribution by Cluster',
    'Age (Original Values)')

# Page layout

In [None]:
# Heading shows the key of the currently selected page.
st.subheader(st.session_state.page)

# === EDA Page Layout ===
# Seven panels numbered 1-7. The bottom row previously repeated the numbers
# 3 and 4, and panel 4 was captioned "Flight Age Distribution".
if st.session_state.page == "EDA":
    # Top row (2 columns)
    top1, top2 = st.columns(2)

    with top1:
        st.markdown("##### 1. Overall Satisfaction")
        st.plotly_chart(fig, use_container_width=True)

    with top2:
        st.markdown("##### 2. Loyal Business Customers")
        st.pyplot(fig_loyalty)

    # Middle row (2 columns): distributions drawn from the original data
    mid1, mid2 = st.columns(2)

    with mid1:
        st.markdown("##### 3. Flight Distance Distribution")
        st.pyplot(plot_flight_distance_distribution(df_original))
    with mid2:
        st.markdown("##### 4. Age Distribution")
        st.pyplot(plot_age_distribution(df_original))

    # Bottom row (3 columns)
    btm1, btm2, btm3 = st.columns(3)

    with btm1:
        st.markdown("##### 5. Satisfaction by Class")
        st.pyplot(fig_class)

    with btm2:
        st.markdown("##### 6. Satisfaction by Customer Type")
        st.pyplot(fig_customer_type)

    with btm3:
        st.markdown("##### 7. Satisfaction by Type of Travel")
        st.pyplot(fig_travel_type)


# --- Supervised Machine Learning Layout ---
if st.session_state.page == "Supervised":

    st.subheader("Supervised Machine Learning")

    ## === Top Row: Accuracy Gauge + Model Metrics Table ===
    top1, top2 = st.columns(2)

    with top1:
        st.markdown("##### 1. Model Accuracy (XGBoost)")
        st.plotly_chart(fig_accuracy_gauge, use_container_width=True)

    with top2:
        st.markdown("##### 2. Model Performance Metrics")
        # Three-decimal formatting plus a blue gradient on every metric
        # column (the "Model" name column is left unstyled).
        metric_columns = ["Accuracy",
                          "Precision",
                          "Recall",
                          "F1-Score",
                          "ROC AUC"]
        st.dataframe(
            performance_df.style.format(
                {column: "{:.3f}" for column in metric_columns}
            ).background_gradient(
                cmap='Blues',
                subset=metric_columns
            )
        )

    ## === Middle Row: ROC Curve, Confusion Matrix, Feature Importance ===
    mid1, mid2, mid3 = st.columns(3)

    with mid1:
        st.markdown("##### 3. ROC Curve")
        st.pyplot(fig_roc)

    with mid2:
        st.markdown("##### 4. Confusion Matrix (XGBoost)")
        st.pyplot(fig_conf_matrix)

    with mid3:
        st.markdown("##### 5. Top 5 Feature Importances")
        st.pyplot(fig_top_features)

    ## === Bottom Row: 3 Satisfaction Breakdown Graphs ===
    btm1, btm2, btm3 = st.columns(3)

    with btm1:
        st.markdown("##### 6. Satisfaction by Online Boarding")
        st.pyplot(fig_online_boarding)

    with btm2:
        st.markdown("##### 7. Satisfaction by Class")
        # Show the figure built for this page; `fig_class` is the EDA chart.
        st.pyplot(fig_class_ml)

    with btm3:
        st.markdown("##### 8. Satisfaction by In-flight Wifi Service")
        st.pyplot(fig_wifi_service)


# --- Unsupervised Machine Learning Layout ---
if st.session_state.page == "Unsupervised":
    st.subheader("Unsupervised Machine Learning - KMeans")

    # === Top Row: Cluster Metrics ===
    # Rendered explicitly because the elbow chart passes an extra argument.
    top1, top2 = st.columns(2)

    with top1:
        st.markdown("##### 1. Optimal K (Elbow Method)")
        st.pyplot(fig_elbow, use_container_width=True)

    with top2:
        st.markdown("##### 2. Silhouette Score")
        st.pyplot(fig_silhouette)

    # === Middle Row: Cluster Visuals ===
    mid1, mid2 = st.columns(2)
    _middle_panels = (
        (mid1, "##### 3. Cluster Plot (Age vs. Flight Distance)",
         fig_cluster_age_distance),
        (mid2, "##### 4. Cluster Size Breakdown", fig_cluster_size),
    )
    for _panel_column, _panel_caption, _panel_figure in _middle_panels:
        with _panel_column:
            st.markdown(_panel_caption)
            st.pyplot(_panel_figure)

    # === Bottom Row: Cluster Characteristics ===
    btm1, btm2, btm3 = st.columns(3)
    _bottom_panels = (
        (btm1, "##### 5. Satisfaction by Cluster", fig_cluster_sat),
        (btm2, "##### 6. Flight Distance by Cluster", fig_distance_cluster),
        (btm3, "##### 7. Age by Cluster", fig_age_cluster),
    )
    for _panel_column, _panel_caption, _panel_figure in _bottom_panels:
        with _panel_column:
            st.markdown(_panel_caption)
            st.pyplot(_panel_figure)

# Footer

In [None]:
# Footer line stamped with the date on which the script is being run.
today = f"{datetime.today():%B %d, %Y}"
_footer_html = f"<div class='footer'>📅 Dashboard last updated on {today}</div>"
st.markdown(_footer_html, unsafe_allow_html=True)