# **Notebook 4: Predictive Modelling**

## Objectives

* Build predictive models that identify customers who are likely to churn.
* Interpret the results to derive actionable business insights.

## Inputs

* `Data/Outputs/cleaned_bank_churn.csv` – the cleaned customer dataset, including the `Churn` target column.

## Outputs

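* `best_churn_model.pkl` – the best-performing model (selected by ROC-AUC), saved with `joblib` in the project root.
* A comparison table of accuracy, precision, recall, F1 and ROC-AUC for each trained model.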

## Additional Comments

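* The cells from `import streamlit as st` onwards contain Streamlit dashboard code (tabs, key metrics, single and batch prediction). Streamlit widgets do not render inside Jupyter, so treat those cells as a draft of an app script rather than as part of the analysis.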



In [1]:
# Record the directory the kernel is currently running in.
# (This cell only reads the working directory; it does not change it.)

import os
current_dir = os.getcwd()
current_dir

'c:\\Users\\slych\\Credit_Card_Customer_Churn_Analysis\\jupyter_notebooks'

In [2]:
# Make the project root the working directory so relative paths resolve.
# Only move up while we are still inside the notebooks folder: re-running this
# cell (or starting the kernel from the project root) must not climb further.
import os

NOTEBOOK_DIR_NAME = "jupyter_notebooks"

if os.path.basename(os.getcwd()) == NOTEBOOK_DIR_NAME:
    os.chdir(os.path.dirname(os.getcwd()))
    print("You set a new current directory")
else:
    print(f"Current directory unchanged: {os.getcwd()}")

You set a new current directory


In [3]:
# Confirm the new current directory by re-reading it.
# The bare `current_dir` on the last line makes the notebook display the value.

current_dir = os.getcwd()
current_dir

'c:\\Users\\slych\\Credit_Card_Customer_Churn_Analysis'

#### 1. Load Libraries and Data

In [7]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [5]:
import os

import pandas as pd

# Load the cleaned churn dataset.
# The path is relative to the project root (the working directory set by the
# cells above) instead of an absolute per-user path, so the notebook also
# runs on other machines and checkout locations.
DATA_PATH = os.path.join("Data", "Outputs", "cleaned_bank_churn.csv")

data = pd.read_csv(DATA_PATH)

In [8]:
# Candidate classifiers, keyed by the display name used in the results table.
# Insertion order matters: it is the order in which the models are trained.

models = {}

models["Random Forest"] = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)

models["Logistic Regression"] = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)

# Trackers filled in by the training loop: one metrics row per model, plus
# the best model seen so far and its score.
results, best_model, best_model_name = [], None, None
best_score = 0

In [None]:
from sklearn.model_selection import train_test_split

# Prepare features and target
X = data.drop('Churn', axis=1)
y = data['Churn']

# Split data into train and test sets (stratified so both sets keep the
# same churn ratio; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Reset the trackers here so the cell is idempotent: re-running it must not
# append duplicate rows to `results` or compare against a stale best score.
# Starting from -inf guarantees a model is selected even if every ROC-AUC is 0.
results = []
best_model = None
best_model_name = None
best_score = float("-inf")

# Train, evaluate, and pick best (by ROC-AUC on the held-out test set)

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Probability of the positive class, needed for the ranking-based ROC-AUC
    y_proba = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)

    results.append((name, acc, prec, rec, f1, roc))

    if roc > best_score:
        best_score = roc
        best_model = model
        best_model_name = name

In [16]:
# Tabulate the per-model metrics gathered by the training loop and print them

metric_columns = ["Model", "Accuracy", "Precision", "Recall", "F1", "ROC-AUC"]
results_df = pd.DataFrame(data=results, columns=metric_columns)
print("Model Performance Comparison:", results_df, sep="\n")

Model Performance Comparison:
                 Model  Accuracy  Precision    Recall        F1   ROC-AUC
0        Random Forest  0.951629   0.851393  0.846154  0.848765  0.986502
1  Logistic Regression  0.846989   0.514507  0.818462  0.631829  0.919486


In [17]:
# Save best model

# Fail fast if the training cell has not selected a model yet; otherwise
# joblib would silently pickle `None` and the error would only surface when
# the file is loaded for prediction.
if best_model is None:
    raise RuntimeError("No trained model to save - run the training cell first.")

joblib.dump(best_model, "best_churn_model.pkl")
print(f"\n✅ Best model '{best_model_name}' saved as best_churn_model.pkl with ROC-AUC: {best_score:.4f}")


✅ Best model 'Random Forest' saved as best_churn_model.pkl with ROC-AUC: 0.9865


In [32]:
import streamlit as st
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load model & data
import os


@st.cache_resource
def load_model():
    """Load the persisted churn classifier from ``best_churn_model.pkl``.

    Decorated with ``st.cache_resource`` so the loaded object is reused
    instead of being read from disk on every call.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only load model files produced by this project.
    return joblib.load("best_churn_model.pkl")


@st.cache_data
def load_data():
    """Read the cleaned churn dataset into a DataFrame.

    The path is relative to the project root (the working directory) rather
    than an absolute per-user path, so it also works on other machines.
    """
    return pd.read_csv(os.path.join("Data", "Outputs", "cleaned_bank_churn.csv"))


model = load_model()
df = load_data()

In [None]:
# Two top-level tabs: an overview dashboard and a prediction form

tab1, tab2 = st.tabs(["📊 Dashboard", "🔮 Prediction"])


# Dashboard tab: three headline figures shown side by side

with tab1:
    st.subheader("Key Metrics")
    col1, col2, col3 = st.columns(3)

    # Headline numbers computed from the loaded dataset
    total_customers = len(df)
    churn_rate = 100 * df["Churn"].mean()
    avg_transactions = df["Total_Trans_Amt"].mean()

    # Pair each column with its (label, formatted value) and render in order
    headline_metrics = [
        ("Total Customers", f"{total_customers:,}"),
        ("Churn Rate", f"{churn_rate:.1f}%"),
        ("Avg. Transactions", f"${avg_transactions:,.0f}"),
    ]
    for column, (label, value) in zip((col1, col2, col3), headline_metrics):
        column.metric(label, value)

    st.markdown("---")

In [38]:
# Churn Distribution
fig, ax = plt.subplots()
sns.countplot(x="Churn", data=df, ax=ax)
# Pin the tick positions before relabelling them. Calling set_xticklabels on
# its own triggers matplotlib's "set_ticklabels() should only be used with a
# fixed number of ticks" warning and can mislabel bars if the ticks move.
# The two bars of the count plot sit at positions 0 and 1.
ax.set_xticks([0, 1])
ax.set_xticklabels(["Stayed", "Churned"])
ax.set_title("Churn Distribution")
# NOTE(review): this chart is drawn outside the `with tab1:` block, so in a
# Streamlit app it would presumably render outside the Dashboard tab - confirm
# whether it should be wrapped in `with tab1:`.
st.pyplot(fig)

  ax.set_xticklabels(["Stayed", "Churned"])


DeltaGenerator()

In [36]:
# Feature importance (if available)
if hasattr(model, "feature_importances_"):
    # Take the feature names from the fitted model (set by scikit-learn when
    # the model was fitted on a DataFrame) instead of relying on `X` still
    # being in memory from the training cell. Fall back to the dataset columns.
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is None:
        feature_names = df.drop(columns="Churn").columns

    importances = pd.Series(model.feature_importances_, index=list(feature_names))
    importances = importances.sort_values(ascending=False)

    fig2, ax2 = plt.subplots()
    sns.barplot(x=importances, y=importances.index, ax=ax2)
    ax2.set_title("Feature Importance")
    st.pyplot(fig2)



In [39]:
# Prediction Tab

with tab2:
    st.subheader("Single Customer Prediction")

    # Use the columns the model was actually trained on, in training order,
    # rather than depending on `X` leaking from the training cell.
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is None:
        feature_names = df.drop(columns="Churn").columns
    feature_cols = list(feature_names)

    # Build one input widget per feature, pre-filled with a typical value.
    # pandas' dtype helpers are used because comparing a numpy dtype with the
    # Python builtins (`dtype in [int, float, bool]`) depends on platform and
    # bit width, so numeric columns such as int64 could be treated as
    # categorical.
    input_data = {}
    for col in feature_cols:
        series = df[col]
        if pd.api.types.is_bool_dtype(series):
            input_data[col] = st.checkbox(col, value=bool(series.mode()[0]))
        elif pd.api.types.is_numeric_dtype(series):
            input_data[col] = st.number_input(col, value=float(series.median()))
        else:
            options = series.unique().tolist()
            default_idx = options.index(series.mode()[0])
            input_data[col] = st.selectbox(col, options, index=default_idx)

    if st.button("Predict Churn"):
        # Explicit column order keeps the single-row frame aligned with training
        X_new = pd.DataFrame([input_data], columns=feature_cols)
        prediction = model.predict(X_new)[0]
        probability = model.predict_proba(X_new)[0][1]

        if prediction == 1:
            st.error(f"⚠️ Likely to churn. Probability: {probability:.2%}")
        else:
            st.success(f"✅ Likely to stay. Churn probability: {probability:.2%}")

    st.markdown("---")
    st.subheader("📂 Batch Prediction from CSV")
    uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
    if uploaded_file is not None:
        batch_df = pd.read_csv(uploaded_file)

        # Validate the upload before predicting: the model needs every
        # training feature, and extra columns (e.g. an existing `Churn`
        # column) or a different column order must not reach it.
        missing_cols = [c for c in feature_cols if c not in batch_df.columns]
        if missing_cols:
            st.error(
                "Uploaded file is missing required columns: "
                + ", ".join(map(str, missing_cols))
            )
        elif batch_df.empty:
            st.warning("Uploaded file contains no rows to predict.")
        else:
            batch_features = batch_df[feature_cols]
            preds = model.predict(batch_features)
            probs = model.predict_proba(batch_features)[:, 1]
            batch_df["Churn_Prediction"] = preds
            batch_df["Churn_Probability"] = probs
            st.write(batch_df)
            csv = batch_df.to_csv(index=False).encode("utf-8")
            st.download_button("Download Predictions", csv, "batch_predictions.csv", "text/csv")

