# Setup Snowpark Session and Load Data


In [None]:
from snowflake.snowpark import Session
from snowflake.snowpark.functions import col
from snowflake.ml.feature_store import FeatureStore, FeatureView, Entity, CreationMode
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay
import shap
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from snowflake.ml.registry import Registry

# Reuse the session the notebook already holds (no credentials needed).
session = Session.builder.getOrCreate()

# Point the session at the working database first, then the schema inside it.
for context_stmt in (
    "USE DATABASE churn_modeling_db",
    "USE SCHEMA churn_modeling_schema",
):
    session.sql(context_stmt).collect()

# Reference the table created earlier; later cells read from `df`.
df = session.table("customer_data")

# Print the first 10 rows as a sanity check.
df.show(10)


# Load Feature View

In [None]:
# Connection settings for the feature store. CREATE_IF_NOT_EXIST is the
# creation mode passed to the FeatureStore constructor.
feature_store_config = {
    "session": session,
    "database": "CHURN_MODELING_DB",
    "name": "CHURN_FEATURE_STORE",
    "default_warehouse": "CHURN_WAREHOUSE",
    "creation_mode": CreationMode.CREATE_IF_NOT_EXIST,
}
fs = FeatureStore(**feature_store_config)

# Fetch version "v1" of the churn feature view and take its underlying
# Snowpark DataFrame.
fv = fs.get_feature_view("customer_churn_features_v2", version="v1")
fv_df = fv.feature_df

# Preview: first 5 rows of the full feature view.
fv_df.show(5)

# Preview: a three-column projection.
preview_cols = ["CLIENTID", "CREDITSCORE", "CHURNED"]
fv_df.select(preview_cols).show(5)

# Preview: only the rows where CHURNED equals 1.
churned_rows = fv_df.filter(col("CHURNED") == 1)
churned_rows.show(5)


# Feature Engineering

In [None]:
# Columns handled by each preprocessing step. Output names are derived from
# the input names so the two lists cannot drift apart.
categorical_cols = ["GENDER", "GEOGRAPHY"]
numeric_cols = [
    "CREDITSCORE", "AGE", "GRADE", "ACCOUNTBALANCE",
    "PRODUCTCOUNT", "OWNSCREDITCARD", "ISACTIVE", "SALARYESTIMATED",
]

# Ordinal-encode the categorical columns into <NAME>_ENCODED.
encoder = OrdinalEncoder(
    input_cols=categorical_cols,
    output_cols=[f"{name}_ENCODED" for name in categorical_cols],
)

# Standard-scale the numeric columns into <NAME>_S.
scaler = StandardScaler(
    input_cols=numeric_cols,
    output_cols=[f"{name}_S" for name in numeric_cols],
)

# Keep only the model inputs plus the CHURNED label.
df_for_pipeline = df.select(categorical_cols + numeric_cols + ["CHURNED"])

# Chain encoder -> scaler and fit on the selected columns.
# NOTE(review): the pipeline is fitted on the full table, before the
# train/test split performed in the next cell - confirm this is intended.
preprocessing_steps = [("encoder", encoder), ("scaler", scaler)]
pipeline = Pipeline(steps=preprocessing_steps)
fitted_pipeline = pipeline.fit(df_for_pipeline)

# Split Train/Test

In [None]:
# Apply the fitted encoder + scaler to the modelling columns.
transformed_df = fitted_pipeline.transform(df_for_pipeline)

# Model inputs produced by the preprocessing pipeline, and the label column.
feature_cols = [
    "CREDITSCORE_S", "AGE_S", "GRADE_S", "ACCOUNTBALANCE_S",
    "PRODUCTCOUNT_S", "OWNSCREDITCARD_S", "ISACTIVE_S", "SALARYESTIMATED_S",
    "GENDER_ENCODED", "GEOGRAPHY_ENCODED"
]
target_col = "CHURNED"

# Pull features and label to pandas in ONE query. Two separate to_pandas()
# calls are two independent queries with no ORDER BY, so nothing guarantees
# that row i of the features matches row i of the label. A single fetch keeps
# every label on the same row as its features (and runs the transform once).
model_pdf = transformed_df.select(feature_cols + [target_col]).to_pandas()
X = model_pdf[feature_cols]
y = model_pdf[target_col].to_numpy().ravel()

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



# Train XGBClassifier
Requires the *snowflake-ml-python* and *xgboost* packages (the latter provides `XGBClassifier`) to be installed in the Snowflake notebook.

In [None]:
# Hyper-parameters for the gradient-boosted churn classifier.
# NOTE(review): "use_label_encoder" is deprecated in recent XGBoost releases
# and may only trigger an "unused parameter" warning - confirm against the
# installed version before removing it.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.1,
    "max_depth": 6,
    "eval_metric": "logloss",
    "use_label_encoder": False
}

# Fit on the training split produced by train_test_split.
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# The arrow was mojibake ("â†’": UTF-8 bytes decoded as cp1252); restored.
print("XGBoost trained successfully (Snowflake preprocessing → scikit-learn workflow)")


# Save Model to Model Registry

In [None]:
# Registry handle bound to the current session.
registry = Registry(session=session)

# Score the hold-out set: hard labels plus the probability of class 1.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Metrics stored alongside the model version (cast to plain Python floats).
test_accuracy = accuracy_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_prob)
metrics = {
    "accuracy": float(test_accuracy),
    "roc_auc": float(test_roc_auc),
}

# Log version "v1". A ValueError whose message contains "already existed" is
# treated as "nothing to do"; any other ValueError propagates unchanged.
try:
    model_version = registry.log_model(
        model=model,
        model_name="CHURN_XGB_MODEL",
        version_name="v1",  # optional
        metrics=metrics,
        sample_input_data=X_test.head(10),
    )
except ValueError as e:
    if "already existed" not in str(e):
        raise  # unexpected error: let it surface
    print("Model version already exists, skipping logging.")
else:
    print("Model logged to registry:", model_version)


# Inspect Feature Importance & interpret results

In [None]:
# Open the model registry for the current session.
registry = Registry(session=session)

# Look up the registered model by name, pick version "v1", and load the
# underlying model object from that version.
m = registry.get_model(model_name="CHURN_XGB_MODEL")
mv = m.version("v1")  # m.default would select the default version instead
model = mv.load()

# Feature names, listed in the order used to pair them with the importances.
feature_cols = [
    "CREDITSCORE_S",
    "AGE_S",
    "GRADE_S",
    "ACCOUNTBALANCE_S",
    "PRODUCTCOUNT_S",
    "OWNSCREDITCARD_S",
    "ISACTIVE_S",
    "SALARYESTIMATED_S",
    "GENDER_ENCODED",
    "GEOGRAPHY_ENCODED",
]

# Pair each feature with its importance, most important first.
importance_df = pd.DataFrame(
    {"Feature": feature_cols, "Importance": model.feature_importances_}
)
importance_df = importance_df.sort_values(by="Importance", ascending=False)
print(importance_df)

# Hold-out metrics; y_test and y_pred come from the earlier cells.
# (accuracy_score, classification_report and confusion_matrix are already
# imported at the top of the notebook.)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Horizontal bar chart; the y-axis is inverted so the top-ranked feature is
# drawn at the top.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.barh(importance_df["Feature"], importance_df["Importance"])
importance_ax.invert_yaxis()
importance_ax.set_title("Feature Importance")
plt.show()


In [None]:
# ---------------------------
# Model Performance Charts
# ---------------------------
# roc_curve is not part of the notebook's top-level sklearn.metrics import.
from sklearn.metrics import roc_curve

# Predictions on the hold-out set: hard labels and probability of class 1.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Metrics
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy: {accuracy:.2f}, ROC-AUC: {roc_auc:.2f}")

# Confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual No Churn", "Actual Churn"],
                     columns=["Pred No Churn", "Pred Churn"])

plt.figure(figsize=(6, 4))
sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

# Optional: ROC curve with the chance diagonal for reference
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f"ROC-AUC = {roc_auc:.2f}")
plt.plot([0, 1], [0, 1], '--', color='grey')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()


# ---------------------------
# Feature Importance
# ---------------------------

# Ascending sort: barh draws the first row at the bottom, so the most
# important feature ends up at the top of the chart.
importance_df = pd.DataFrame({
    "Feature": feature_cols,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=True)

plt.figure(figsize=(8, 6))
plt.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
plt.title("Feature Importance")
plt.xlabel("Importance")
plt.show()


# ---------------------------
# Risk Segmentation
# ---------------------------

# Predicted churn probability per test row.
# NOTE(review): "CustomerID" holds the pandas row label of X_test, not a
# business customer key - confirm before joining it to other data.
risk_df = pd.DataFrame({
    "CustomerID": X_test.index,
    "Churn_Prob": y_prob
})

# Risk bands: [0, 0.3] low, (0.3, 0.6] medium, (0.6, 1] high.
# include_lowest=True keeps a probability of exactly 0 in "Low Risk" instead
# of leaving it unbinned (NaN).
bins = [0, 0.3, 0.6, 1]
labels = ["Low Risk", "Medium Risk", "High Risk"]
risk_df['Risk_Level'] = pd.cut(
    risk_df['Churn_Prob'], bins=bins, labels=labels, include_lowest=True
)

# Plot risk distribution (reindex keeps the Low/Medium/High order)
risk_counts = risk_df['Risk_Level'].value_counts().reindex(labels)
plt.figure(figsize=(6, 4))
sns.barplot(x=risk_counts.index, y=risk_counts.values, palette="Reds")
plt.title("Customer Risk Segmentation")
plt.ylabel("Number of Customers")
plt.show()


# ---------------------------
# Limitations & Next Steps (Optional Visual)
# ---------------------------

# Attach the label to the features POSITIONALLY. X_test keeps its shuffled
# row labels from the split while y_test is a plain array, so an index-based
# join against pd.Series(y_test) would pair labels with the wrong rows.
churn_plot_df = X_test.assign(Churned=np.where(y_test == 1, "Yes", "No"))
churn_hue_order = ["No", "Yes"]  # fixes legend order instead of relabelling it

# Visualize churn vs key driver (e.g., IsActiveMember).
# The x values are the scaled ISACTIVE_S values, not the raw flag.
plt.figure(figsize=(6, 4))
sns.countplot(data=churn_plot_df, x='ISACTIVE_S', hue='Churned',
              hue_order=churn_hue_order, palette="Set2")
plt.title("Churn by Active Member Status")
plt.xlabel("Is Active Member")
plt.ylabel("Count")
plt.show()

# Another view: Churn by Geography (x values are the ordinal-encoded codes)
plt.figure(figsize=(6, 4))
sns.countplot(data=churn_plot_df, x='GEOGRAPHY_ENCODED', hue='Churned',
              hue_order=churn_hue_order, palette="Set1")
plt.title("Churn by Geography")
plt.xlabel("Geography")
plt.ylabel("Count")
plt.show()


In [None]:
# Prepare test_df: test features plus actual/predicted labels, a correctness
# flag, and the predicted churn probability.
test_df = X_test.copy()
test_df['Actual'] = y_test
test_df['Predicted'] = y_pred
test_df['Correct'] = test_df['Actual'] == test_df['Predicted']
test_df['Churn_Prob'] = y_prob

# Risk bands: [0, 0.3] low, (0.3, 0.6] medium, (0.6, 1] high.
# include_lowest=True keeps a probability of exactly 0 in "Low Risk" instead
# of leaving it unbinned (NaN).
bins = [0, 0.3, 0.6, 1]
labels = ["Low Risk", "Medium Risk", "High Risk"]
test_df['Risk_Level'] = pd.cut(
    test_df['Churn_Prob'], bins=bins, labels=labels, include_lowest=True
)

# Count correct/incorrect predictions per risk level. observed=False is
# spelled out so every risk level gets a row even when it has no customers,
# independent of the pandas default for categorical groupers.
risk_counts = (
    test_df.groupby(['Risk_Level', 'Correct'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Ensure both outcome columns exist, in a fixed order, with readable names
risk_counts = risk_counts.reindex(columns=[True, False], fill_value=0)
risk_counts.columns = ["Correct", "Incorrect"]

# Row-wise percentages, used only for the text labels
risk_pct = risk_counts.div(risk_counts.sum(axis=1), axis=0) * 100

# Draw on an explicit axes: DataFrame.plot opens its own figure when no `ax`
# is given, which would leave a preceding plt.figure() empty.
fig, ax = plt.subplots(figsize=(8, 5))
risk_counts.plot(kind='bar', stacked=True,
                 color=['green', 'red'], alpha=0.85, ax=ax, rot=0)

ax.set_xlabel("Risk Level")
ax.set_ylabel("Number of Predictions")
ax.set_title("Prediction Accuracy by Client Churn Risk Level")

# Add a percentage label in the middle of each non-empty bar segment
for i, (idx, row) in enumerate(risk_counts.iterrows()):
    correct = row['Correct']
    incorrect = row['Incorrect']

    if correct > 0:
        ax.text(i, correct / 2, f"{risk_pct.loc[idx, 'Correct']:.1f}%",
                ha='center', va='center', color='white', fontsize=10, fontweight='bold')

    if incorrect > 0:
        # The "Incorrect" segment is stacked on top of the "Correct" one
        ax.text(i, correct + incorrect / 2,
                f"{risk_pct.loc[idx, 'Incorrect']:.1f}%",
                ha='center', va='center', color='white', fontsize=10, fontweight='bold')

ax.legend(["Correct", "Incorrect"], title="Prediction")
plt.tight_layout()
plt.show()


In [None]:
# Risk bands: [0, 0.3] low, (0.3, 0.6] medium, (0.6, 1] high.
# include_lowest=True keeps a probability of exactly 0 in "Low Risk" instead
# of leaving it unbinned (NaN).
bins = [0, 0.3, 0.6, 1]
labels = ["Low Risk", "Medium Risk", "High Risk"]
test_df['Risk_Level'] = pd.cut(
    test_df['Churn_Prob'], bins=bins, labels=labels, include_lowest=True
)

# Human-readable names for the model's feature columns
feature_name_map = {
    "AGE_S": "Customer Age",
    "PRODUCTCOUNT_S": "Number of Bank Products",
    "GENDER_ENCODED": "Gender",
    "ISACTIVE_S": "Account Active",
    "CREDITSCORE_S": "Credit Score",
    "ACCOUNTBALANCE_S": "Account Balance",
    "GEOGRAPHY_ENCODED": "Country / Region",
    "OWNSCREDITCARD_S": "Has Credit Card",
    "SALARYESTIMATED_S": "Estimated Salary",
    "GRADE_S": "Customer Tier"
}

# Friendly names in feature_cols order; unmapped columns keep their raw name
friendly_feature_names = [feature_name_map.get(f, f) for f in feature_cols]

# SHAP explainer built with the training split as background data, then
# evaluated on every test row.
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X_test)

# For each risk level, record the position within X_test of the first test
# row that falls in it. Levels with no rows are skipped.
risk_indices = {}
for risk in labels:
    member_labels = test_df.index[test_df['Risk_Level'] == risk]
    if len(member_labels) > 0:
        risk_indices[risk] = X_test.index.get_loc(member_labels[0])

# Plot one SHAP waterfall per risk level
for risk, pos in risk_indices.items():
    print(f"SHAP Waterfall Plot for {risk} Client (Position {pos}):")

    plt.figure(figsize=(8, 5))
    plt.title(f"{risk} Client", fontsize=14, fontweight='bold')

    # Single-row explanation with friendly feature names. `data` is passed as
    # a plain array; the displayed feature values are the scaled/encoded ones.
    explanation = shap.Explanation(
        values=shap_values.values[pos],
        base_values=shap_values.base_values[pos],
        data=X_test.iloc[pos].to_numpy(),
        feature_names=friendly_feature_names
    )

    # show=False: the plot function otherwise calls plt.show() itself, so the
    # tight_layout()/show() below would act on a fresh, empty figure.
    shap.waterfall_plot(explanation, show=False)
    plt.tight_layout()
    plt.show()



In [None]:
# Risk bands on predicted churn probability: [0, 0.3] low, (0.3, 0.6] medium,
# (0.6, 1] high. include_lowest=True keeps a probability of exactly 0 in
# "Low Risk" instead of leaving it unbinned (NaN).
bins = [0, 0.3, 0.6, 1]
labels = ["Low Risk", "Medium Risk", "High Risk"]
test_df['Risk_Level'] = pd.cut(
    test_df['Churn_Prob'], bins=bins, labels=labels, include_lowest=True
)

# Filter high-risk customers
high_risk_df = test_df[test_df['Risk_Level'] == 'High Risk']

# Plot title -> model feature column for each partial dependence plot
features_to_plot = {
    "Number of Bank Products": "PRODUCTCOUNT_S",
    "Customer Age": "AGE_S",
    "Estimated Salary": "SALARYESTIMATED_S"
}

if high_risk_df.empty:
    # Partial dependence needs at least one row to average over.
    print("No high-risk customers in the test set; skipping partial dependence plots.")
else:
    # One partial dependence plot per feature, computed on high-risk rows only
    for title, feature in features_to_plot.items():
        fig, ax = plt.subplots(figsize=(10, 6))

        PartialDependenceDisplay.from_estimator(
            model,                          # trained classifier
            X=high_risk_df[feature_cols],   # only high-risk customers
            features=[feature],
            # Per the scikit-learn docs, `target` only matters for multiclass
            # or multi-output models; kept for explicitness.
            target=1,
            ax=ax
        )

        ax.set_title(f"Partial Dependence: {title} vs Churn (High-Risk Customers)", fontsize=14)
        plt.tight_layout()
        plt.show()
