In [67]:
import os
import datetime
import pandas as pd
import joblib
import kagglehub
import mlflow

from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE

import pandas as pd
import pandas as pd
from sklearn import datasets
    
from evidently import Dataset
from evidently import DataDefinition
from evidently import Report
from evidently.presets import DataDriftPreset, DataSummaryPreset

In [68]:

# Local directory for pickled transformers and models: a `saved_models`
# folder under $ARTIFACT_DIR, or under the current directory when that
# environment variable is not set. Created on first use.
import os
SAVEDIR = f"{os.getenv('ARTIFACT_DIR', '.')}/saved_models"
os.makedirs(SAVEDIR, exist_ok=True)

def load_data():
    """Fetch the bank-loan workbook through kagglehub and load it as a DataFrame.

    Reads the 'Data' sheet of ``Bank_Personal_Loan_Modelling.xlsx`` from the
    downloaded dataset directory.

    Returns:
        pandas.DataFrame: the sheet contents without the 'ID' and 'ZIP Code'
        columns.
    """
    dataset_dir = kagglehub.dataset_download("itsmesunil/bank-loan-modelling")
    workbook = os.path.join(dataset_dir, "Bank_Personal_Loan_Modelling.xlsx")
    loans = pd.read_excel(workbook, sheet_name='Data')
    # Neither column is kept for modelling.
    return loans.drop(columns=['ID', 'ZIP Code'])


In [69]:

def split_data(df):
    """Split the dataset into stratified train / validation / test parts.

    'Personal Loan' is the target. 30% of the rows are held out as the test
    set first; 20% of the remainder then becomes the validation set. Both
    splits are stratified on the target and seeded with 42.

    Args:
        df: DataFrame containing the 'Personal Loan' column.

    Returns:
        tuple: (X_train, X_val, X_test, y_train, y_val, y_test)
    """
    target = df['Personal Loan']
    features = df.drop(columns='Personal Loan')

    # Stage 1: carve off the test set.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, target, test_size=0.3, stratify=target, random_state=42
    )
    # Stage 2: carve the validation set out of what is left.
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.2, stratify=y_rest, random_state=42
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


In [70]:
def preprocess_fit(X_train, X_val, X_test):
    """Fit the numeric transformers on the training split and apply them to all splits.

    'CCAvg' and 'Mortgage' are passed through a Yeo-Johnson PowerTransformer
    followed by a RobustScaler; 'Income', 'Experience' and 'Age' are passed
    through a StandardScaler. Every transformer is fitted on ``X_train`` only
    and then reused, unchanged, on ``X_val`` and ``X_test``.

    The fitted objects are pickled into ``SAVEDIR`` as ``pt.pkl``, ``rs.pkl``
    and ``ss.pkl``.

    The input frames are left untouched: the function works on copies, so the
    caller's (possibly sliced) DataFrames are never written to.

    Args:
        X_train: training features (DataFrame containing the five columns above).
        X_val: validation features with the same columns.
        X_test: test features with the same columns.

    Returns:
        tuple: (X_train, X_val, X_test) as transformed copies.
    """
    power_cols = ['CCAvg', 'Mortgage']
    standard_cols = ['Income', 'Experience', 'Age']

    # Copy first so column assignment below cannot leak into the caller's
    # frames (which are typically slices produced by train_test_split).
    X_train, X_val, X_test = (
        frame.copy() for frame in (X_train, X_val, X_test)
    )

    pt = PowerTransformer(method='yeo-johnson')
    rs = RobustScaler()
    ss = StandardScaler()

    # Fit on the training split only.
    X_train[power_cols] = rs.fit_transform(pt.fit_transform(X_train[power_cols]))
    X_train[standard_cols] = ss.fit_transform(X_train[standard_cols])

    # Apply the already-fitted transformers to the held-out splits.
    for frame in (X_val, X_test):
        frame[power_cols] = rs.transform(pt.transform(frame[power_cols]))
        frame[standard_cols] = ss.transform(frame[standard_cols])

    joblib.dump(pt, os.path.join(SAVEDIR, 'pt.pkl'))
    joblib.dump(rs, os.path.join(SAVEDIR, 'rs.pkl'))
    joblib.dump(ss, os.path.join(SAVEDIR, 'ss.pkl'))

    return X_train, X_val, X_test


In [71]:
def visualize_eda(df):
    """Draw the exploratory figures for the loan dataset, one after another.

    Figures, in order: target count plot; Age/Experience histograms side by
    side; individual histograms for Income, CCAvg and Mortgage; count plots of
    Education and Family split by the target; Online/CreditCard count plots
    side by side; Income box plot by target; CCAvg-vs-Mortgage scatter; a
    pair plot of the numeric columns; and a correlation heatmap of ``df``.

    Args:
        df: DataFrame holding the columns referenced above, including
            'Personal Loan'.

    Returns:
        None. Each figure is shown with ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    plt.style.use('seaborn-v0_8-darkgrid')
    sns.set(font_scale=1.1)

    target = 'Personal Loan'

    # --- 1. Target distribution ---
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x=target, palette='Set2')
    plt.title("Target Variable: Personal Loan")
    plt.xlabel("Personal Loan (0 = No, 1 = Yes)")
    plt.ylabel("Count")
    plt.show()

    # --- 2. Univariate histograms ---
    # Age and Experience share one two-panel figure.
    _, hist_axes = plt.subplots(1, 2, figsize=(12, 4))
    paired_hists = [('Age', 'skyblue'), ('Experience', 'lightgreen')]
    for axis, (column, colour) in zip(hist_axes, paired_hists):
        sns.histplot(df[column], bins=20, kde=True, ax=axis, color=colour)
        axis.set_title(f"{column} Distribution")
    plt.tight_layout()
    plt.show()

    # The remaining numeric columns each get their own figure.
    single_hists = [
        ('Income', 'salmon', "Income Distribution"),
        ('CCAvg', 'plum', "Average Credit Card Spending"),
        ('Mortgage', 'gold', "Mortgage Distribution"),
    ]
    for column, colour, heading in single_hists:
        plt.figure(figsize=(6, 4))
        sns.histplot(df[column], bins=20, kde=True, color=colour)
        plt.title(heading)
        plt.show()

    # --- 3. Categorical features against the target ---
    categorical_counts = [
        ('Education', 'pastel', "Loan Approval by Education Level",
         "Education (1:UG, 2:Graduate, 3:Advanced)"),
        ('Family', 'coolwarm', "Loan Approval by Family Size",
         "Family Members"),
    ]
    for column, colours, heading, x_label in categorical_counts:
        plt.figure(figsize=(7, 4))
        sns.countplot(data=df, x=column, hue=target, palette=colours)
        plt.title(heading)
        plt.xlabel(x_label)
        plt.ylabel("Count")
        plt.legend(title='Loan Approved')
        plt.show()

    # Online and CreditCard share one two-panel figure.
    _, flag_axes = plt.subplots(1, 2, figsize=(12, 4))
    paired_counts = [
        ('Online', 'viridis', "Loan by Online Banking"),
        ('CreditCard', 'magma', "Loan by Credit Card Ownership"),
    ]
    for axis, (column, colours, heading) in zip(flag_axes, paired_counts):
        sns.countplot(data=df, x=column, hue=target, ax=axis, palette=colours)
        axis.set_title(heading)
    plt.tight_layout()
    plt.show()

    # --- 4. Bivariate numeric views ---
    plt.figure(figsize=(7, 4))
    sns.boxplot(data=df, x=target, y='Income', palette='Set3')
    plt.title("Income vs Loan Status")
    plt.xlabel("Personal Loan")
    plt.ylabel("Income")
    plt.show()

    plt.figure(figsize=(7, 5))
    sns.scatterplot(
        data=df, x='CCAvg', y='Mortgage', hue=target,
        palette='coolwarm', alpha=0.7,
    )
    plt.title("CCAvg vs Mortgage by Loan Status")
    plt.xlabel("Avg Credit Card Spend")
    plt.ylabel("Mortgage")
    plt.show()

    # --- 5. Pair plot of the numeric columns, coloured by target ---
    pair_columns = ['Age', 'Experience', 'Income', 'CCAvg', 'Mortgage', target]
    sns.pairplot(df[pair_columns], hue=target, palette='husl', diag_kind='kde')
    plt.suptitle("Pairplot of Important Features", y=1.02)
    plt.show()

    # --- 6. Correlation heatmap over every column of df ---
    plt.figure(figsize=(12, 8))
    sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm", square=True)
    plt.title("Correlation Heatmap")
    plt.show()


In [72]:

def feature_select_fit(X_train, y_train, X_val, X_test):
    """Select 8 features with RFE and reduce all three splits to them.

    Recursive feature elimination is driven by a LogisticRegression
    (max_iter=1000) and fitted on the training split only. The fitted
    selector is pickled to ``SAVEDIR/selector.pkl``.

    Args:
        X_train: training features used to fit the selector.
        y_train: training labels.
        X_val: validation features.
        X_test: test features.

    Returns:
        tuple: the reduced (train, validation, test) feature sets, in that order.
    """
    selector = RFE(LogisticRegression(max_iter=1000), n_features_to_select=8)
    selector.fit(X_train, y_train)

    reduced = tuple(selector.transform(part) for part in (X_train, X_val, X_test))

    joblib.dump(selector, os.path.join(SAVEDIR, 'selector.pkl'))
    return reduced


In [73]:

def balance(X, y):
    """Oversample ``X``/``y`` with SMOTE (random_state=42).

    Returns:
        Whatever ``SMOTE.fit_resample`` returns for the given inputs
        (the resampled features and labels).
    """
    oversampler = SMOTE(random_state=42)
    resampled = oversampler.fit_resample(X, y)
    return resampled


In [74]:
from datetime import datetime
from evidently import Report
from evidently.presets import DataDriftPreset, DataSummaryPreset
import mlflow

def log_evidently_report(reference_data, current_data, dataset_name="train_vs_test",
                         output_dir=None):
    """Build an Evidently drift + summary report and attach it to the active MLflow run.

    The report combines ``DataDriftPreset`` and ``DataSummaryPreset``, is
    written as ``evidently_<dataset_name>_<timestamp>.html`` and then logged
    under the ``evidently`` artifact path.

    Args:
        reference_data: baseline dataset passed to ``Report.run``.
        current_data: dataset compared against the baseline.
        dataset_name: label embedded in the HTML file name.
        output_dir: directory for the local HTML file. ``None`` (default)
            writes into the current working directory, as before; any other
            value is created if missing.

    Returns:
        None
    """
    # Imported locally on purpose: this notebook binds the global name
    # `datetime` to the *module* in one cell and to the *class* in another,
    # so relying on the global makes `.now()` depend on cell execution order.
    from datetime import datetime as _datetime

    report = Report(metrics=[
        DataDriftPreset(),
        DataSummaryPreset()
    ])

    result = report.run(reference_data=reference_data, current_data=current_data)

    timestamp = _datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    html_path = f"evidently_{dataset_name}_{timestamp}.html"
    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
        html_path = os.path.join(output_dir, html_path)

    # The HTML is saved from the object returned by `report.run`.
    result.save_html(html_path)

    mlflow.log_artifact(html_path, artifact_path="evidently")
    print(f"📄 Evidently report logged: {html_path}")


In [75]:
import os
import time
import joblib
import mlflow
import mlflow.sklearn
import yaml
from mlflow.tracking import MlflowClient
from mlflow.entities import Metric
from mlflow.utils.yaml_utils import YamlSafeDumper
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# ─── 1) Patch MLflow’s YAML dumper so any object is stringified ─────────────
def _metric_as_yaml_str(dumper, metric):
    """Represent an MLflow ``Metric`` as the string ``key=value@timestamp``."""
    return dumper.represent_scalar(
        'tag:yaml.org,2002:str',
        f"{metric.key}={metric.value:.6f}@{metric.timestamp}"
    )


def _object_as_yaml_str(dumper, obj):
    """Fallback representer: write ``str(obj)`` as a YAML string scalar."""
    return dumper.represent_scalar('tag:yaml.org,2002:str', str(obj))


# Metric gets its own compact rendering.
YamlSafeDumper.add_multi_representer(Metric, _metric_as_yaml_str)
# Registering `object` is the catch-all so no other type makes the dump fail.
YamlSafeDumper.add_multi_representer(object, _object_as_yaml_str)

# ─── 2) Setup local‐save directory ─────────────────────────────────────────────
# Resolve the directory with the same rule as the earlier "Save dir" cell
# ($ARTIFACT_DIR, falling back to the working directory). Re-assigning a
# hard-coded relative path here would silently discard the ARTIFACT_DIR
# override for every function that reads SAVEDIR at call time.
SAVEDIR = os.getenv('ARTIFACT_DIR', '.') + '/saved_models'
os.makedirs(SAVEDIR, exist_ok=True)

def tune_and_save(X, y, X_val, y_val):
    """Grid-search six classifiers, log each to MLflow, and promote the best one.

    For every candidate (LogisticRegression, DecisionTree, RandomForest,
    GradientBoosting, KNN, SVM) a nested MLflow run is opened, a 5-fold
    ``GridSearchCV`` optimising F1 is fitted on ``X``/``y``, and the refitted
    best estimator is scored on ``X_val``/``y_val``. The winning
    hyper-parameters and the validation accuracy / precision / recall / F1 are
    logged to the run, the estimator is pickled to
    ``SAVEDIR/<name>_model.pkl`` and logged as an MLflow sklearn model.

    The candidate with the highest validation F1 (strictly above 0) is
    registered as ``BankLoanBestModel`` and transitioned to the ``Production``
    stage, archiving existing versions in that stage.

    Args:
        X: training features used for the grid search.
        y: training labels.
        X_val: validation features used for model comparison.
        y_val: validation labels.

    Returns:
        None
    """
    mlflow.set_experiment("Bank Loan Classification")
    client = MlflowClient()

    best_f1 = 0.0
    best_model_name = None
    best_model_uri = None

    grids = {
        'LogisticRegression': {
            'model': LogisticRegression(max_iter=1000),
            'params': {'C': [0.01, 0.1, 1, 10],
                       'penalty': ['l1', 'l2'],
                       'solver': ['liblinear']}
        },
        'DecisionTree': {
            # Seeded like the other stochastic estimators so reruns are
            # reproducible (tie-breaking between splits is random otherwise).
            'model': DecisionTreeClassifier(random_state=42),
            'params': {'max_depth': [3, 5, 7, None],
                       'min_samples_split': [2, 5, 10],
                       'min_samples_leaf': [1, 2, 4]}
        },
        'RandomForest': {
            'model': RandomForestClassifier(random_state=42),
            'params': {'n_estimators': [50, 100],
                       'max_depth': [5, 10, None]}
        },
        'GradientBoosting': {
            'model': GradientBoostingClassifier(random_state=42),
            'params': {'n_estimators': [50, 100],
                       'learning_rate': [0.01, 0.1]}
        },
        'KNN': {
            'model': KNeighborsClassifier(),
            'params': {'n_neighbors': [3, 5, 7]}
        },
        'SVM': {
            'model': SVC(probability=True, random_state=42),
            'params': {'C': [0.1, 1, 10],
                       'kernel': ['linear', 'rbf']}
        }
    }

    for name, cfg in grids.items():
        with mlflow.start_run(run_name=name, nested=True) as run:
            # Autologging is deliberately not enabled, to avoid unexpected
            # objects being logged; everything below is logged explicitly.
            gs = GridSearchCV(cfg['model'], cfg['params'], scoring='f1',
                              cv=5, n_jobs=-1)
            gs.fit(X, y)
            best_model = gs.best_estimator_

            preds = best_model.predict(X_val)
            acc = accuracy_score(y_val, preds)
            prec = precision_score(y_val, preds)
            rec = recall_score(y_val, preds)
            f1 = f1_score(y_val, preds)

            # Record which grid point won, so the run is self-describing
            # instead of relying on the printed output.
            mlflow.log_params(gs.best_params_)
            mlflow.log_metrics({
                "val_accuracy": acc,
                "val_precision": prec,
                "val_recall": rec,
                "val_f1": f1
            })

            print(f"{name} tuned → {gs.best_params_}")
            print(f"→ Accuracy: {acc:.3f}, Precision: {prec:.3f}, "
                  f"Recall: {rec:.3f}, F1: {f1:.3f}")

            # Keep a local pickle and log the model to the run.
            local_path = os.path.join(SAVEDIR, f"{name}_model.pkl")
            joblib.dump(best_model, local_path)
            mlflow.sklearn.log_model(best_model, artifact_path="model")

            if f1 > best_f1:
                best_f1 = f1
                best_model_name = name
                best_model_uri = f"runs:/{run.info.run_id}/model"

    # Nothing beat the initial threshold of 0.0: say so instead of exiting silently.
    if not (best_model_name and best_model_uri):
        print("⚠️ No candidate reached a validation F1 above 0; nothing registered.")
        return

    # Register & promote to Production
    print(f"\n🏆 Registering best model: {best_model_name} (F1={best_f1:.3f})")
    mv = mlflow.register_model(
        model_uri=best_model_uri,
        name="BankLoanBestModel"
    )
    print(f"✅ Registered: {mv.name}, version {mv.version}")

    # Poll (up to ~15 s) until the registry reports the version as READY;
    # the transition below is attempted either way.
    for _ in range(15):
        info = client.get_model_version(name=mv.name, version=mv.version)
        if info.status == "READY":
            break
        time.sleep(1)

    # NOTE(review): this call emitted a warning in the recorded run; newer
    # MLflow releases appear to favour aliases over stages. Kept as-is because
    # consumers may resolve the "Production" stage; confirm before migrating.
    client.transition_model_version_stage(
        name=mv.name,
        version=mv.version,
        stage="Production",
        archive_existing_versions=True
    )
    print(f"🚀 {mv.name} v{mv.version} → Production")


In [76]:
def main(new_batch_csv=None):
    """Run the full pipeline inside one parent MLflow run.

    Steps: load and split the data, preprocess, select features, log the
    fitted preprocessing artifacts, balance the training set, tune/register
    models, and finally log three Evidently reports (train vs test, train vs
    new batch, test vs new batch) built from the *unprocessed* feature frames.

    Args:
        new_batch_csv: optional path to a CSV holding a new, unlabeled batch
            for the drift reports. A 'Personal Loan' column, if present, is
            dropped. When ``None`` (default) the batch is simulated by drawing
            200 rows with replacement from the loaded dataset.
            NOTE(review): the CSV is assumed to carry the same feature columns
            as the training frame; this is not verified here.

    Returns:
        None
    """
    mlflow.set_experiment("Bank Loan Classification")

    with mlflow.start_run(run_name="Preprocessing and Tuning"):
        df = load_data()
        # The test labels (yt) are produced by the split but not scored here.
        Xtr, Xv, Xt, ytr, yv, yt = split_data(df)

        # Keep unprocessed copies for Evidently before any transformation.
        df_train = Xtr.copy()
        df_test = Xt.copy()

        # New batch: a real CSV when supplied, otherwise a simulated sample.
        # (Previously a hard-coded CSV was always read and then immediately
        # overwritten by the sample, so the file was required but never used.)
        if new_batch_csv is not None:
            df_new = pd.read_csv(new_batch_csv)
            # New data is unlabeled; drop the target if the file includes it.
            if "Personal Loan" in df_new.columns:
                df_new = df_new.drop(columns=["Personal Loan"])
        else:
            df_new = df.sample(n=200, replace=True, random_state=42).drop(columns=["Personal Loan"])

        # Preprocess
        Xtr, Xv, Xt = preprocess_fit(Xtr, Xv, Xt)
        Xtf, Xvf, Xsf = feature_select_fit(Xtr, ytr, Xv, Xt)

        # Log preprocessing artifacts
        for file in ['pt.pkl', 'rs.pkl', 'ss.pkl', 'selector.pkl']:
            mlflow.log_artifact(os.path.join(SAVEDIR, file))

        Xb, yb = balance(Xtf, ytr)
        tune_and_save(Xb, yb, Xvf, yv)

        # Log Evidently reports
        log_evidently_report(df_train, df_test, dataset_name="train_vs_test")
        log_evidently_report(df_train, df_new, dataset_name="train_vs_new_batch")
        log_evidently_report(df_test, df_new, dataset_name="test_vs_new_batch")



In [77]:
# Entry point: run the whole pipeline when executed directly.
if __name__ == "__main__":
    main()



LogisticRegression tuned → {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
→ Accuracy: 0.884, Precision: 0.446, Recall: 0.866, F1: 0.589




DecisionTree tuned → {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
→ Accuracy: 0.969, Precision: 0.857, Recall: 0.806, F1: 0.831




RandomForest tuned → {'max_depth': None, 'n_estimators': 100}
→ Accuracy: 0.983, Precision: 0.982, Recall: 0.836, F1: 0.903




GradientBoosting tuned → {'learning_rate': 0.1, 'n_estimators': 100}
→ Accuracy: 0.980, Precision: 0.934, Recall: 0.851, F1: 0.891




KNN tuned → {'n_neighbors': 3}
→ Accuracy: 0.971, Precision: 0.873, Recall: 0.821, F1: 0.846




SVM tuned → {'C': 10, 'kernel': 'rbf'}
→ Accuracy: 0.977, Precision: 0.918, Recall: 0.836, F1: 0.875





🏆 Registering best model: RandomForest (F1=0.903)


Registered model 'BankLoanBestModel' already exists. Creating a new version of this model...
Created version '3' of model 'BankLoanBestModel'.
  client.transition_model_version_stage(


✅ Registered: BankLoanBestModel, version 3
🚀 BankLoanBestModel v3 → Production
📄 Evidently report logged: evidently_train_vs_test_2025-07-02_17-12-43.html
📄 Evidently report logged: evidently_train_vs_new_batch_2025-07-02_17-12-46.html
📄 Evidently report logged: evidently_test_vs_new_batch_2025-07-02_17-12-49.html
