In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split

In [2]:
df=pd.read_csv("cleaned_data.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   CreditScore          10000 non-null  float64
 1   Geography            10000 non-null  object 
 2   Gender               10000 non-null  object 
 3   Age                  10000 non-null  float64
 4   Tenure               10000 non-null  int64  
 5   Balance              10000 non-null  float64
 6   NumOfProducts        10000 non-null  int64  
 7   HasCrCard            10000 non-null  int64  
 8   IsActiveMember       10000 non-null  int64  
 9   EstimatedSalary      10000 non-null  float64
 10  Exited               10000 non-null  int64  
 11  Balance_per_Product  10000 non-null  float64
 12  Tenure_per_Age       10000 non-null  float64
 13  HighSalary           10000 non-null  int64  
 14  AgeGroup             10000 non-null  object 
dtypes: float64(6), int64(6), object(3)
me

In [4]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Balance_per_Product,Tenure_per_Age,HighSalary,AgeGroup
0,619.0,France,Female,42.0,2,0.0,1,1,1,101348.88,1,0.0,0.047619,0,Adult
1,608.0,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0,83807.86,0.02439,0,Adult
2,502.0,France,Female,42.0,8,159660.8,3,1,0,113931.57,1,53220.266667,0.190476,0,Adult
3,699.0,France,Female,39.0,1,0.0,2,0,0,93826.63,0,0.0,0.025641,0,Adult
4,850.0,Spain,Female,43.0,2,125510.82,1,1,1,79084.1,0,125510.82,0.046512,0,Adult


In [5]:
df.columns.tolist()

['CreditScore',
 'Geography',
 'Gender',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary',
 'Exited',
 'Balance_per_Product',
 'Tenure_per_Age',
 'HighSalary',
 'AgeGroup']

# <span style="color: red;">preprocessing</span>

In [6]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


In [7]:
df.isna().mean().round(4) * 100

CreditScore            0.0
Geography              0.0
Gender                 0.0
Age                    0.0
Tenure                 0.0
Balance                0.0
NumOfProducts          0.0
HasCrCard              0.0
IsActiveMember         0.0
EstimatedSalary        0.0
Exited                 0.0
Balance_per_Product    0.0
Tenure_per_Age         0.0
HighSalary             0.0
AgeGroup               0.0
dtype: float64

In [8]:
X = df.drop('Exited', axis=1)
y = df['Exited']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("\nChurn distribution in train:", y_train.value_counts(normalize=True).round(3))
print("Churn distribution in test:", y_test.value_counts(normalize=True).round(3))
print("\nData types in X_train:")
print(X_train.dtypes.value_counts())

Training set shape: (8000, 14)
Test set shape: (2000, 14)

Churn distribution in train: Exited
0    0.796
1    0.204
Name: proportion, dtype: float64
Churn distribution in test: Exited
0    0.796
1    0.204
Name: proportion, dtype: float64

Data types in X_train:
float64    6
int64      5
object     3
Name: count, dtype: int64


In [9]:
X 

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Balance_per_Product,Tenure_per_Age,HighSalary,AgeGroup
0,619.0,France,Female,42.0,2,0.00,1,1,1,101348.88,0.000000,0.047619,0,Adult
1,608.0,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,83807.860000,0.024390,0,Adult
2,502.0,France,Female,42.0,8,159660.80,3,1,0,113931.57,53220.266667,0.190476,0,Adult
3,699.0,France,Female,39.0,1,0.00,2,0,0,93826.63,0.000000,0.025641,0,Adult
4,850.0,Spain,Female,43.0,2,125510.82,1,1,1,79084.10,125510.820000,0.046512,0,Adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771.0,France,Male,39.0,5,0.00,2,1,0,96270.64,0.000000,0.128205,0,Adult
9996,516.0,France,Male,35.0,10,57369.61,1,1,1,101699.77,57369.610000,0.285714,0,Adult
9997,709.0,France,Female,36.0,7,0.00,1,0,1,42085.58,0.000000,0.194444,0,Adult
9998,772.0,Germany,Male,42.0,3,75075.31,2,1,0,92888.52,37537.655000,0.071429,0,Adult


In [10]:
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

In [11]:
num_cols=df.select_dtypes(include=['int64', 'float64']).columns.to_list()
num_cols.remove('Exited')
num_cols

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary',
 'Balance_per_Product',
 'Tenure_per_Age',
 'HighSalary']

In [12]:
cat_cols=df.select_dtypes(include=['object', 'category']).columns.to_list()
cat_cols

['Geography', 'Gender', 'AgeGroup']

In [13]:
for col in cat_cols:
    print(f"{col}: {df[col].nunique()} unique values")
    print(f"Unique values: {df[col].unique()}")

Geography: 3 unique values
Unique values: ['France' 'Spain' 'Germany']
Gender: 2 unique values
Unique values: ['Female' 'Male']
AgeGroup: 3 unique values
Unique values: ['Adult' 'Young' 'Senior']


In [14]:
ohe_cols = ['Geography', 'Gender', 'AgeGroup']


# <span style="color: red;">Pipeline for numerical features and categorical features </span>

In [15]:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  
    ('scaler', StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])
numerical_cols = num_cols
categorical_cols = ohe_cols

Preprocessing = ColumnTransformer(transformers=[
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_cols)
])
Preprocessing


# <span style="color: red;">Pipeline for feature_engineering</span>

In [17]:
from sklearn.preprocessing import FunctionTransformer

def feature_engineering(df):
    df = df.copy()
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 30, 50, 100], labels=['Young', 'Adult', 'Senior'])
    df['Tenure_per_Age'] = df['Tenure'] / (df['Age'] + 1)
    df['Balance_per_Product'] = df['Balance'] / (df['NumOfProducts'] + 1)
    df['HighSalary'] = (df['EstimatedSalary'] > df['EstimatedSalary'].median()).astype(int)
    return df

feature_engineer = FunctionTransformer(feature_engineering)
feature_engineer


# <span style="color: red;">Pipeline for Modling</span>

In [18]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import pandas as pd

models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)),
    ('XGBoost', XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, use_label_encoder=False, eval_metric='logloss')),
    ('CatBoost', CatBoostClassifier(verbose=0, allow_writing_files=False)),
    ('LightGBM', LGBMClassifier(n_estimators=100, max_depth=7, learning_rate=0.1))
]

results = []

for name, clf in models:
    model_pipeline = Pipeline(steps=[
        ('features', feature_engineer),
        ('preprocessing', Preprocessing),
        ('smote', SMOTE(random_state=42)),
        ('classifier', clf)
    ])
    
    result = cross_validate(
        model_pipeline, X, y, scoring='f1', cv=5,
        return_train_score=True, n_jobs=-1
    )
    results.append({
        'Model': name,
        'Train F1 (%)': round(result['train_score'].mean() * 100, 2),
        'Test F1 (%)': round(result['test_score'].mean() * 100, 2)
    })

results_df = pd.DataFrame(results)
print(results_df.sort_values(by='Test F1 (%)', ascending=False))


                 Model  Train F1 (%)  Test F1 (%)
4        Random Forest         73.14        62.01
5              XGBoost         68.21        61.73
7             LightGBM         73.10        61.71
6             CatBoost         76.29        61.58
1                  KNN         72.13        51.06
0  Logistic Regression         50.21        50.13
3        Decision Tree        100.00        49.68
2          Naive Bayes         48.77        48.65


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import f1_score

base_models = [
    ('rf', RandomForestClassifier(max_depth=10, n_estimators=200, random_state=42)),
    ('xgb', XGBClassifier(max_depth=3, n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')),
    ('lgbm', LGBMClassifier(max_depth=20, n_estimators=100, random_state=42)),
    ('lr', LogisticRegression(C=10, solver='lbfgs', random_state=42))
]

meta_model = LogisticRegression(random_state=42)


stack_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1
)

stack_pipeline = Pipeline(steps=[
    ('features', feature_engineer),
    ('Preprocessing', Preprocessing),
    ('smote', SMOTE(random_state=42)),
    ('Model', stack_clf)
])


stack_pipeline.fit(X_train, y_train)
train_pred = stack_pipeline.predict(X_train)
test_pred = stack_pipeline.predict(X_test)

print("Stacking Classifier Results")
print("Train F1:", round(f1_score(y_train, train_pred) * 100, 2))
print("Test F1 :", round(f1_score(y_test, test_pred) * 100, 2))



Stacking Classifier Results
Train F1: 74.71
Test F1 : 63.67


In [None]:
from sklearn.model_selection import GridSearchCV

base_models = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('lgbm', LGBMClassifier(random_state=42)),
    ('lr', LogisticRegression(random_state=42))
]

meta_model = LogisticRegression(random_state=42)

stack_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1
)


stack_pipeline = Pipeline(steps=[\
    ('features', feature_engineer),
    ('Preprocessing', Preprocessing),
    ('smote', SMOTE(random_state=42)),
    ('Model', stack_clf)
])

params = {
    'Model__final_estimator__C': [0.1, 1, 10],
    'Model__final_estimator__solver': ['lbfgs', 'liblinear']
}

grid_search = GridSearchCV(
    stack_pipeline,
    param_grid=params,
    scoring='f1',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X, y)


Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [22]:

print("Best Params :", grid_search.best_params_)
print("Best F1 Score :", grid_search.best_score_)


Best Params : {'Model__final_estimator__C': 0.1, 'Model__final_estimator__solver': 'lbfgs'}
Best F1 Score : 0.6133142610519665


In [None]:
base_models = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('lgbm', LGBMClassifier(random_state=42)),
    ('lr', LogisticRegression(random_state=42))
]
meta_model = LogisticRegression(C=0.1, solver='lbfgs', random_state=42)
stack_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1
)
final_pipeline = Pipeline(steps=[
    ('features', feature_engineer),
    ('Preprocessing', Preprocessing),
    ('smote', SMOTE(random_state=42)),
    ('Model', stack_clf)
])
final_pipeline.fit(X, y)


In [24]:
final_pipeline.predict(X.head(1))


array([1])

In [25]:
final_pipeline.predict(X.head(1))[0]

np.int64(1)

In [26]:
import joblib

joblib.dump(final_pipeline, 'stacking_model.pkl')


['stacking_model.pkl']

# <span style="color: red;">Deployment pipeline</span>

In [29]:
%%writefile churn_app.py

import streamlit as st
import pandas as pd
import joblib
from sklearn.preprocessing import FunctionTransformer
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib.pyplot as plt

# ================= Feature Engineering =================
def feature_engineering(df):
    df = df.copy()
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0,30,50,100], labels=['Young','Adult','Senior'])
    df['Tenure_per_Age'] = df['Tenure'] / (df['Age'] + 1)
    df['Balance_per_Product'] = df['Balance'] / (df['NumOfProducts'] + 1)
    df['HighSalary'] = (df['EstimatedSalary'] > df['EstimatedSalary'].median()).astype(int)
    return df

feature_engineer = FunctionTransformer(feature_engineering)

# ================= Page Config + CSS =================
st.set_page_config(page_title="Bank Churn Dashboard", layout="wide")

page_bg = """
<style>
.stApp {
    background: linear-gradient(-45deg, #283E51, #485563);
    background-size: 400% 400%;
    animation: gradient 20s ease infinite;
    color: #fff;
    font-family: 'Segoe UI', sans-serif;
}
@keyframes gradient { 
    0%{background-position:0% 50%} 
    50%{background-position:100% 50%} 
    100%{background-position:0% 50%}
}
.block-container {
    background-color: rgba(0,0,0,0.55);
    padding: 25px;
    border-radius: 12px;
}
[data-testid="stSidebar"] {
    background: linear-gradient(180deg, #2c3e50, #4ca1af);
    color: white;
}
.big-title { font-size: 42px; text-align:center; font-weight:bold; }
.sub-title { text-align:center; font-size:18px; color:#ddd; margin-bottom:30px; }
.big-btn {
    display:inline-block;
    background:#1abc9c;
    color:white; 
    padding:16px 30px; 
    border-radius:8px;
    font-size:18px; 
    margin:10px;
    text-decoration:none;
    font-weight:bold;
}
.big-btn:hover { background:#16a085; color:#fff; }
</style>
"""
st.markdown(page_bg, unsafe_allow_html=True)

# ================= Load Data & Model =================
df = pd.read_csv("cleaned_data.csv")
model = joblib.load("stacking_model.pkl")

# ================= NAVIGATION SYSTEM =================
pages = ["🏠 Home","🔮 Prediction","ℹ️ About Data","📊 Analysis","🧩 EDA","💡 Recommendations"]

# Initialize session state if not exists
if "page" not in st.session_state:
    st.session_state["page"] = "🏠 Home"

# ==== Custom CSS for Sidebar Style ====
st.markdown("""
<style>
/* Sidebar background */
[data-testid="stSidebar"] {
    background: linear-gradient(180deg,#0F2027,#203A43,#2C5364);
    color: white;
}

/* Title + labels */
[data-testid="stSidebar"] h2, 
[data-testid="stSidebar"] label, 
[data-testid="stSidebar"] span {
    color: #f1f1f1 !important;
    font-weight: bold;
}

/* Radio button styling */
div[role='radiogroup'] > label {
    background: rgba(255,255,255,0.1);
    margin: 5px 0;
    padding: 10px;
    border-radius: 8px;
    cursor: pointer;
    transition: 0.3s;
}
div[role='radiogroup'] > label:hover {
    background: rgba(255,255,255,0.3);
}
div[role='radiogroup'] > label[data-checked="true"] {
    background: linear-gradient(90deg,#00c6ff,#0072ff);
    color: white !important;
    font-weight: bold;
    border: 1px solid #fff;
}
</style>
""", unsafe_allow_html=True)

# ==== Sidebar content ====
st.sidebar.markdown("## 📌 Navigation")
choice = st.sidebar.radio("", pages, index=pages.index(st.session_state["page"]))
st.session_state["page"] = choice  # Sync sidebar with session_state

# ==== Extra Info in Sidebar ====
st.sidebar.markdown("---")
st.sidebar.markdown("### 📊 Dataset Info")
st.sidebar.write(f"**Total Customers:** {len(df)}")
st.sidebar.write(f"**Churn Rate:** {df['Exited'].mean()*100:.2f}%")

st.sidebar.markdown("---")
st.sidebar.markdown("### 🔮 Last Prediction")
if "last_pred" in st.session_state:
    if st.session_state["last_pred"] == 1:
        st.sidebar.error(f"⚠️ Will Churn ({st.session_state['last_proba']:.0%})")
    else:
        st.sidebar.success(f"✅ Stay ({(1-st.session_state['last_proba']):.0%})")
else:
    st.sidebar.info("No prediction yet.")
# ================= HOME PAGE =================
if st.session_state["page"] == "🏠 Home":
    # ====== Titles with Gradient Color ======
    st.markdown("""
    <style>
    .big-title {
        font-size: 42px; 
        text-align:center; 
        font-weight:bold; 
        background: linear-gradient(135deg,#f12711,#f5af19);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
    }

    .sub-title {
        text-align:center; 
        font-size:18px; 
        margin-bottom:30px; 
        background: linear-gradient(135deg,#f12711,#f5af19);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
    }
    </style>
    """, unsafe_allow_html=True)

    st.markdown("<div class='big-title'>Bank Churn Dashboard</div>", unsafe_allow_html=True)
    st.markdown("<div class='sub-title'>Predict • Analyze • Explore • Recommend</div>", unsafe_allow_html=True)

    # ✅ مركز الصورة + تصغير الحجم
    st.markdown("""
    <div style='display:flex; justify-content:center; margin:20px 0;'>
        <img src="https://cdn.dribbble.com/userupload/42177612/file/original-6b80f731604a3ee61165320f9acaf931.gif" 
             width="900">
    </div>
    """, unsafe_allow_html=True)

    # ==== CSS for Colored Buttons ====
    st.markdown("""
    <style>
    .stButton > button {
        padding: 18px 30px;
        margin: 10px;
        border-radius: 12px;
        font-size: 18px;
        font-weight: bold;
        color: white;
        border: none;
        cursor: pointer;
        transition: all 0.3s ease-in-out;
        min-width: 180px;
        text-align: center;
    }
    .stButton > button:hover {
        transform: scale(1.07);
        box-shadow: 0px 8px 18px rgba(0,0,0,0.4);
    }
    /* Assign colors by order */
    div.stButton:nth-of-type(1) > button {background: linear-gradient(135deg,#f12711,#f5af19);} /* Prediction */
    div.stButton:nth-of-type(2) > button {background: linear-gradient(135deg,#00b09b,#96c93d);} /* About */
    div.stButton:nth-of-type(3) > button {background: linear-gradient(135deg,#2193b0,#6dd5ed);} /* Analysis */
    div.stButton:nth-of-type(4) > button {background: linear-gradient(135deg,#cc2b5e,#753a88);} /* EDA */
    div.stButton:nth-of-type(5) > button {background: linear-gradient(135deg,#42275a,#734b6d);} /* Recommendations */
    </style>
    """, unsafe_allow_html=True)

    # ==== Buttons Row ====
    col1, col2, col3, col4, col5 = st.columns(5)

    with col1:
        if st.button("🔮 Prediction"):
            st.session_state["page"] = "🔮 Prediction"

    with col2:
        if st.button("ℹ️ About Data"):
            st.session_state["page"] = "ℹ️ About Data"

    with col3:
        if st.button("📊 Analysis"):
            st.session_state["page"] = "📊 Analysis"

    with col4:
        if st.button("🧩 EDA"):
            st.session_state["page"] = "🧩 EDA"

    with col5:
        if st.button("💡 Recommendations"):
            st.session_state["page"] = "💡 Recommendations"
# ================= PREDICTION =================
# ================= PREDICTION =================
elif choice == "🔮 Prediction":
    st.title("🔮 Predict Customer Churn")

    with st.form("prediction_form"):
        col1, col2 = st.columns(2)
        with col1:
            CreditScore = st.number_input("Credit Score", int(df.CreditScore.min()), int(df.CreditScore.max()), int(df.CreditScore.median()))
            Age = st.number_input("Age", int(df.Age.min()), int(df.Age.max()), int(df.Age.median()))
            Tenure = st.number_input("Tenure", int(df.Tenure.min()), int(df.Tenure.max()), int(df.Tenure.median()))
            Balance = st.number_input("Balance", float(df.Balance.min()), float(df.Balance.max()), float(df.Balance.median()))
        with col2:
            NumOfProducts = st.number_input("Num Of Products", int(df.NumOfProducts.min()), int(df.NumOfProducts.max()), int(df.NumOfProducts.median()))
            HasCrCard = st.selectbox("Has Credit Card?", [0,1])
            IsActiveMember = st.selectbox("Active Member?", [0,1])
            EstimatedSalary = st.number_input("Estimated Salary", float(df.EstimatedSalary.min()), float(df.EstimatedSalary.max()), float(df.EstimatedSalary.median()))
            Geography = st.selectbox("Geography", df.Geography.unique())
            Gender = st.selectbox("Gender", df.Gender.unique())
        
        submit = st.form_submit_button("🔮 Predict Churn")

    if submit:
        # 1️⃣ Prepare input data
        input_data = pd.DataFrame([{
            "CreditScore": CreditScore, 
            "Age": Age, 
            "Tenure": Tenure,
            "Balance": Balance, 
            "NumOfProducts": NumOfProducts,
            "HasCrCard": HasCrCard, 
            "IsActiveMember": IsActiveMember,
            "EstimatedSalary": EstimatedSalary, 
            "Geography": Geography,
            "Gender": Gender
        }])

        # 2️⃣ Apply Feature Engineering (للعرض فقط)
        engineered_data = feature_engineering(input_data)

        # 3️⃣ Prediction (الموديل مدرب على الأعمدة الأصلية فقط)
        prediction = model.predict(input_data)[0]
        proba = model.predict_proba(input_data)[0][1]

        # 4️⃣ Show Results
        st.success("✅ Prediction Completed!")
        if prediction:
            st.error("😟 Prediction: Customer Will Churn")
        else:
            st.success("😃 Prediction: Customer Will Stay")

        colA, colB = st.columns(2)
        with colA:
            st.metric("Probability of Churn", f"{proba:.2%}")
        with colB:
            st.metric("Confidence", f"{(1-proba if prediction==0 else proba):.2%}")

        # 5️⃣ Show Engineered Features للعميل
        st.subheader("🧩 Engineered Features for this Customer")
        st.dataframe(engineered_data)

        # 6️⃣ Save result to session_state
        st.session_state["last_pred"] = prediction
        st.session_state["last_proba"] = proba
        st.session_state["last_input"] = engineered_data
# ================= ABOUT DATA =================
elif choice == "ℹ️ About Data":
    st.markdown("<div class='big-title'>Bank Churn Dashboard</div>", unsafe_allow_html=True)

    # ✅ مركز الصورة
    st.markdown("""
    <div style='display:flex; justify-content:center; margin:20px 0;'>
        <img src="https://hacheemaster.github.io/assets/images/churn.jpeg" 
             width="800">
    </div>
    """, unsafe_allow_html=True)

    st.title("ℹ️ Dataset Information")

    # ------------------- Column Description -------------------
    st.subheader("📋 Column Descriptions")
    desc = {
        "CreditScore":"Credit score of the customer",
        "Age":"Age of customer",
        "Tenure":"Years with bank",
        "Balance":"Account balance",
        "NumOfProducts":"Number of bank products the customer is using",
        "HasCrCard":"Whether customer has a credit card (1=yes,0=no)",
        "IsActiveMember":"Whether customer is active (1=yes,0=no)",
        "EstimatedSalary":"Estimated salary",
        "Geography":"Country of customer",
        "Gender":"Gender of customer",
        "Exited":"Target: 1=Churn, 0=Stay"
    }
    st.table(pd.DataFrame(desc.items(), columns=["Column","Description"]))

    # ------------------- Dataset Filters -------------------
    st.subheader("🔎 Data Filters")

    col1, col2 = st.columns(2)
    with col1:
        geo_filter = st.multiselect("🌍 Select Geography", options=df["Geography"].unique())
    with col2:
        gender_filter = st.multiselect("👤 Select Gender", options=df["Gender"].unique())

    filtered_df = df.copy()
    if geo_filter:
        filtered_df = filtered_df[filtered_df["Geography"].isin(geo_filter)]
    if gender_filter:
        filtered_df = filtered_df[filtered_df["Gender"].isin(gender_filter)]

    n_rows = st.slider("📌 Number of rows to display", 5, 50, 10)
    st.dataframe(filtered_df.head(n_rows))

    # ------------------- Dataset Overview -------------------
    st.subheader("📊 Dataset Overview")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Customers", len(filtered_df))
    with col2:
        st.metric("Features", filtered_df.shape[1])
    with col3:
        st.metric("Churn Rate", f"{filtered_df['Exited'].mean()*100:.2f}%")

    # ------------------- Target Distribution -------------------
    st.subheader("🎯 Churn Distribution")
    fig_target = px.pie(filtered_df, names="Exited", title="Churn vs Stay",
                        color="Exited", color_discrete_map={0:"green", 1:"red"})
    st.plotly_chart(fig_target, use_container_width=True)
# ================= ANALYSIS =================
# ================= ANALYSIS =================
elif choice == "📊 Analysis":
    st.title("📊 Customer Churn Analysis")
    st.markdown("<div class='big-title'>📊 Customer Churn Analysis</div>", unsafe_allow_html=True)

    # ✅ مركز الصورة + تصغير الحجم
    st.markdown("""
    <div style='display:flex; justify-content:center; margin:20px 0;'>
        <img src="https://ece.emory.edu/_includes/images/sections/programs/Data-Analytics-Intro.jpg" 
             width="1000">
    </div>
    """, unsafe_allow_html=True)

    col1, col2 = st.columns(2)
    with col1:
        fig = px.histogram(df, x="Geography", color="Exited", barmode="group", title="Churn by Geography")
        st.plotly_chart(fig, use_container_width=True)

        fig = px.histogram(df, x="NumOfProducts", color="Exited", barmode="group", title="Churn by Number of Products")
        st.plotly_chart(fig, use_container_width=True)

    with col2:
        fig = px.pie(df, names="Gender", title="Gender Distribution")
        st.plotly_chart(fig, use_container_width=True)

        fig = px.histogram(df, x="IsActiveMember", color="Exited", barmode="group", title="Churn vs Active Membership")
        st.plotly_chart(fig, use_container_width=True)

    st.subheader("📈 Age Groups vs Churn")
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0,30,40,50,60,100], labels=['<30','30-40','40-50','50-60','60+'])
    fig_age = px.histogram(df, x="AgeGroup", color="Exited", barmode="group", title="Churn by Age Group")
    st.plotly_chart(fig_age, use_container_width=True)
    fig = px.histogram(df, x="Age", color="Exited", barmode="group", title="Churn by Age")
    st.plotly_chart(fig, use_container_width=True)
    fig = px.histogram(df, x="Tenure", color="Exited", barmode="group", title="Churn by Tenure")
    st.plotly_chart(fig, use_container_width=True)


    st.subheader("💰 Balance Distribution by Churn")
    fig_balance = px.histogram(df, x="Balance", color="Exited", nbins=40, title="Balance Distribution by Churn")
    st.plotly_chart(fig_balance, use_container_width=True)

# ================= EDA =================
# ================= EDA =================

 
elif choice == "🧩 EDA":
    st.title("🧩 Exploratory Data Analysis")
    st.markdown("<div class='big-title'>🧩 Exploratory Data Analysis</div>", unsafe_allow_html=True)

    # ✅ مركز الصورة + تصغير الحجم
    st.markdown("""
    <div style='display:flex; justify-content:center; margin:20px 0;'>
        <img src="https://media2.giphy.com/media/v1.Y2lkPTZjMDliOTUyd3Zqa3AydDdqZzJtZTE4bjdnNXMxczF3NW9jYXF2dWVmbnF3amRnYyZlcD12MV9naWZzX3NlYXJjaCZjdD1n/xT9C25UNTwfZuk85WP/200w.gif" 
             width="1000">
    </div>
    """, unsafe_allow_html=True)

    st.subheader("🔎 Summary Statistics")
    st.write(df.describe())

    st.subheader("🛑 Missing Values")
    st.write(df.isnull().sum())

    # KDE / Distribution plots
    st.subheader("📊 Distribution of Continuous Features")
    for feature in ["Age","CreditScore","Balance","EstimatedSalary"]:
        fig = px.histogram(df, x=feature, color="Exited", nbins=30, opacity=0.7,
                           title=f"Distribution of {feature}")
        st.plotly_chart(fig, use_container_width=True)

    # Correlation Heatmap
    st.subheader("📌 Correlation Heatmap")
    corr = df.corr(numeric_only=True)
    fig_corr = px.imshow(corr, text_auto=True, aspect="auto", color_continuous_scale="RdBu_r", title="Correlation Heatmap")
    st.plotly_chart(fig_corr, use_container_width=True)

    # Boxplots for outlier detection
    st.subheader("📦 Outlier Analysis with Boxplots")
    features_box = ["Age","CreditScore","Balance","EstimatedSalary"]
    for f in features_box:
        fig_box = px.box(df, y=f, color="Exited", title=f"{f} Distribution by Churn")
        st.plotly_chart(fig_box, use_container_width=True)

    # Scatter Matrix
    st.subheader("🔗 Scatter Matrix (Sample 300 Customers)")
    fig_scatter_matrix = px.scatter_matrix(df.sample(300), dimensions=["Age","Balance","CreditScore","EstimatedSalary"], color="Exited")
    st.plotly_chart(fig_scatter_matrix, use_container_width=True)
    
# ================= RECOMMENDATIONS =================
elif choice == "💡 Recommendations":
    st.title("💡 Smart Recommendations")

    # check if prediction was made
    if "last_pred" not in st.session_state or "last_input" not in st.session_state:
        st.warning("⚠️ Please make a prediction first")
    else:
        pred = st.session_state["last_pred"]
        prob = st.session_state["last_proba"]
        input_data = st.session_state["last_input"]

        import plotly.graph_objects as go
        # ===== Gauge Visualization =====
        st.subheader("📊 Churn Probability Gauge")
        gauge = go.Figure(go.Indicator(
            mode="gauge+number",
            value=prob*100,
            title={'text': "Churn Probability (%)"},
            gauge={
                'axis': {'range': [0,100]},
                'bar': {"color": "red" if pred==1 else "green"},
                'steps': [
                    {'range':[0,40],'color':'lightgreen'},
                    {'range':[40,70],'color':'yellow'},
                    {'range':[70,100],'color':'tomato'}
                ]
            }
        ))
        st.plotly_chart(gauge, use_container_width=True)

        # ===== Progress Bar for Churn %
        st.subheader("📊 Probability Progress Bar")
        st.progress(int(prob*100))

        # ===== Customer Insights =====
        st.subheader("🔎 Customer Insights")
        avg_values = df.mean(numeric_only=True)
        st.write(f"- **Age**: {int(input_data['Age'].values[0])} years compared to the average {avg_values['Age']:.0f}")
        st.write(f"- **Balance**: {input_data['Balance'].values[0]:,.0f} compared to the average {avg_values['Balance']:.0f}")
        st.write(f"- **Products Owned**: {input_data['NumOfProducts'].values[0]} compared to the average {avg_values['NumOfProducts']:.1f}")
        st.write(f"- **Estimated Salary**: {input_data['EstimatedSalary'].values[0]:,.0f} compared to the average {avg_values['EstimatedSalary']:.0f}")

        # ===== Similar Customers Analysis =====
        st.subheader("📊 Similar Customers Behavior")
        similar = df[
            (df['Age'].between(int(input_data['Age'].values[0])-5, int(input_data['Age'].values[0])+5)) &
            (df['NumOfProducts'] == input_data['NumOfProducts'].values[0])
        ]
        if not similar.empty:
            fig_sim = px.pie(similar, names="Exited", title="Churn vs Stay among Similar Customers")
            st.plotly_chart(fig_sim, use_container_width=True)
        else:
            st.info("💡 No enough similar customers found in dataset")

           # ===== What-if Analysis =====
            st.subheader("🧪 What-if Analysis")

            new_balance = st.slider(
                "Modify Balance",
                0,
                int(df["Balance"].max()),
                int(input_data["Balance"].values[0])
            )

            # نعمل نسخة ونغير الـ Balance فيها
            whatif_data = input_data.copy()
            whatif_data["Balance"] = new_balance

            # اعادة التنبؤ
            new_proba = model.predict_proba(whatif_data)[0][1]
            new_pred = model.predict(whatif_data)[0]   # 👈 هنا عرفناه قبل الاستخدام

            st.write(f"🔮 Churn Probability after modifying balance = {new_proba:.2%}")

            
            # ===== Recommendations Based on Prediction =====
        if pred==1:
            st.error(f"🚨 High Risk of Churn ({prob:.0%})")
            st.markdown("### Suggested Retention Actions:")
            st.write("- 📞 Personal call with relationship manager")
            st.write("- 💳 Offer better credit card benefits")
            st.write("- 💰 Tailored loan/financial offers")
            st.write("- 🎁 Loyalty rewards program")
        else:
            st.success(f"✅ Customer Likely to Stay ({prob:.0%})")
            st.markdown("### Suggested Opportunities:")
            st.write("- 🛍️ Cross-sell investment plans")
            st.write("- 🤝 Promote loyalty schemes")
            st.write("- 📲 Push digital banking adoption")
            st.write("- 🎯 Personalized marketing")
        # Boxplots
        st.subheader("📊 Boxplots of Key Features vs Churn")
        features_box = ["Age","CreditScore","Balance","EstimatedSalary"]
        for f in features_box:
            fig_box = px.box(df, x="Exited", y=f, color="Exited", title=f"{f} vs Churn")
            st.plotly_chart(fig_box, use_container_width=True)

        # Bar chart for categorical features
        st.subheader("📊 Categorical Features vs Churn")
        for cat in ["Geography","Gender","NumOfProducts","HasCrCard","IsActiveMember"]:
            fig_cat = px.histogram(df, x=cat, color="Exited", barmode="group", title=f"{cat} vs Churn")
            st.plotly_chart(fig_cat, use_container_width=True)

Overwriting churn_app.py


In [28]:
! streamlit run churn_app.py

^C


In [30]:
pip freeze > requirements.txt


Note: you may need to restart the kernel to use updated packages.


In [31]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
