In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the churn dataset and discard the three columns that are not
# used for modelling.
df = pd.read_csv("DATA/Churn_Modelling.csv").drop(
    columns=["RowNumber", "CustomerId", "Surname"]
)

# Target is the "Exited" column; every remaining column is a feature.
y = df["Exited"]
X = df.drop(columns="Exited")

In [3]:
# Feature columns, kept identical to the ones used for training.
# Columns listed here are standardised by the preprocessing step.
numerical_features = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
    "EstimatedSalary",
]

# Columns listed here are one-hot encoded by the preprocessing step.
categorical_features = ["Geography", "Gender"]

In [4]:

# Same preprocessing as in training, expressed as one (name, transformer,
# columns) triple per feature group.
numeric_step = ("num", StandardScaler(), numerical_features)
categorical_step = (
    "cat",
    # drop="first" removes one dummy column per categorical feature;
    # handle_unknown="ignore" keeps transform from raising on unseen categories.
    OneHotEncoder(drop="first", handle_unknown="ignore"),
    categorical_features,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [5]:
# Same final model as in training: preprocessing followed by a
# class-weighted logistic regression with a fixed seed.
classifier = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)

model = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("classifier", classifier),
    ]
)


In [6]:
# Cross-validation: 5 stratified folds, shuffled with a fixed seed so the
# fold assignment is repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One ROC-AUC score per fold; the whole pipeline is refit inside each fold.
roc_auc_scores = cross_val_score(model, X, y, scoring="roc_auc", cv=cv)

In [11]:
# Report the individual fold scores, then their mean ± standard deviation.
mean_auc = roc_auc_scores.mean()
std_auc = roc_auc_scores.std()
print("ROC-AUC per fold:", roc_auc_scores)
print(f"Mean ROC-AUC: {mean_auc:.3f} ± {std_auc:.3f}")

ROC-AUC per fold: [0.78589485 0.75206455 0.75303192 0.77192292 0.78366194]
Mean ROC-AUC: 0.769 ± 0.014


In [12]:
# Collect the cross-validation summary in a JSON-serialisable dict:
# NumPy scalars are converted to plain floats and the score array to a list.
results = dict(
    cv_type="StratifiedKFold",
    folds=5,
    metric="ROC-AUC",
    mean=float(roc_auc_scores.mean()),
    std=float(roc_auc_scores.std()),
    per_fold=roc_auc_scores.tolist(),
)


In [14]:
# Write the CV summary to disk as indented JSON.
results_path = Path("validation/cv_results.json")

# Create the output directory first: open(..., "w") does not create missing
# parent directories, so on a fresh checkout without "validation/" this cell
# would otherwise fail with FileNotFoundError.
results_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding keeps the file independent of the platform default.
with results_path.open("w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)