In [13]:
import os
import datetime
import pandas as pd
import joblib
import kagglehub
import mlflow

from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import SMOTE

import pandas as pd
import pandas as pd
from sklearn import datasets
    
from evidently import Dataset
from evidently import DataDefinition
from evidently import Report
from evidently.presets import DataDriftPreset, DataSummaryPreset

In [14]:

# Save dir
import os
SAVEDIR = os.getenv('ARTIFACT_DIR', '.') + '/saved_models'
os.makedirs(SAVEDIR, exist_ok=True)

def load_data():
    df = pd.read_excel(
        "Bank_Personal_Loan_Modelling.xlsx",
        sheet_name='Data'
    )
    # DROP via keyword axis=
    return df.drop(['ID', 'ZIP Code'], axis=1)


In [15]:
import json
from pathlib import Path
from datetime import datetime

import mlflow
from evidently import Report
from evidently.presets import DataDriftPreset, DataSummaryPreset

def log_evidently_report(reference_data, current_data, dataset_name="train_vs_test"):
    
    #  Align columns: use only the intersection to avoid partial-column errors
    common_cols = set(reference_data.columns).intersection(current_data.columns)
    if not common_cols:
        print(f"⚠️ No common columns between reference and {dataset_name}; skipping Evidently report.")
        return
    ref = reference_data[sorted(common_cols)]
    cur = current_data[sorted(common_cols)]

    #  Run the Evidently report (drift + summary)
    report = Report(metrics=[DataDriftPreset(), DataSummaryPreset()])
    result = report.run(reference_data=ref, current_data=cur)

    #  Ensure local save directory exists
    save_dir = Path.cwd() / "evidently_reports"
    save_dir.mkdir(parents=True, exist_ok=True)

    #  Save HTML and JSON
    ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    html_path = save_dir / f"evidently_{dataset_name}_{ts}.html"
    json_path = save_dir / f"evidently_{dataset_name}_{ts}.json"
    result.save_html(str(html_path))
    with open(json_path, "w", encoding="utf-8") as fp:
        fp.write(result.json())

    #  Log artifacts to MLflow
    mlflow.log_artifact(str(html_path), artifact_path="evidently")
    mlflow.log_artifact(str(json_path), artifact_path="evidently")
    print(f"📄 Logged HTML: {html_path.name}")
    print(f"🗄️  Logged JSON: {json_path.name}")

    #  Load JSON and extract metrics list
    with open(json_path, "r", encoding="utf-8") as fp:
        report_json = json.load(fp)
    metrics_list = report_json.get("metrics", [])

    #  Overall drifted columns metrics
    drift_entry = next((m for m in metrics_list if m.get("metric_id", "").startswith("DriftedColumnsCount")), None)
    if drift_entry:
        count = drift_entry["value"]["count"]
        share = drift_entry["value"]["share"]
        mlflow.log_metric("drifted_columns_count", float(count))
        mlflow.log_metric("drifted_columns_share", float(share))
        print(f"🔢 drifted_columns_count = {count}")
        print(f"🔢 drifted_columns_share = {share}")
    else:
        print("⚠️ No DriftedColumnsCount entry found.")

    #  Row and column counts
    rowcount = next((m["value"] for m in metrics_list if m.get("metric_id") == "RowCount()"), None)
    colcount = next((m["value"] for m in metrics_list if m.get("metric_id") == "ColumnCount()"), None)
    if rowcount is not None:
        mlflow.log_metric("dataset_row_count", float(rowcount))
        print(f"🔢 dataset_row_count = {rowcount}")
    if colcount is not None:
        mlflow.log_metric("dataset_column_count", float(colcount))
        print(f"🔢 dataset_column_count = {colcount}")

    #  Per-feature value drift metrics
    for m in metrics_list:
        mid = m.get("metric_id", "")
        if mid.startswith("ValueDrift(column="):
            # extract column name
            col = mid.split("=")[1].rstrip(")")
            val = m.get("value")
            if isinstance(val, (int, float)):
                mlflow.log_metric(f"drift_{col}", float(val))
                print(f"🔢 drift_{col} = {val}")
    
    print("✅ All requested drift & dataset metrics logged to MLflow.")


In [16]:
import os
import pandas as pd
import mlflow
from mlflow.exceptions import MlflowException
from mlflow.tracking import MlflowClient


EXPERIMENT_NAME = "Risk Classification Evidently"

def main():
    client = MlflowClient()

    # ─── 1️⃣ Ensure the MLflow experiment exists and is active ───
    exp = client.get_experiment_by_name(EXPERIMENT_NAME)
    if exp is None:
        exp_id = client.create_experiment(EXPERIMENT_NAME)
        print(f"✅ Created new experiment '{EXPERIMENT_NAME}' (ID={exp_id})")
    elif exp.lifecycle_stage == "deleted":
        client.restore_experiment(exp.experiment_id)
        print(f"🔄 Restored deleted experiment '{EXPERIMENT_NAME}' (ID={exp.experiment_id})")
    else:
        print(f"ℹ️ Using existing experiment '{EXPERIMENT_NAME}' (ID={exp.experiment_id})")

    mlflow.set_experiment(EXPERIMENT_NAME)

    # ─── 2️⃣ Start your MLflow run ───
    with mlflow.start_run(run_name="Preprocessing and Tuning"):
        # Load and split
        df = load_data()
        Xtr, Xv, Xt, ytr, yv, yt = split_data(df)

        # Keep raw for Evidently
        df_train = Xtr.copy()
        df_test  = Xt.copy()

        # Load or simulate new batch
        csv_path = "New_Customer_Bank_Personal_Loan.csv"
        df_new = pd.read_csv(csv_path)
        if "Defaulter" in df_new.columns:
            df_new = df_new.drop(columns=["Defaulter"])

        # Log Evidently reports
        log_evidently_report(df_train, df_test,      dataset_name="train_vs_test")
        log_evidently_report(df_train, df_new,        dataset_name="train_vs_new_batch")
        log_evidently_report(df_test,  df_new,        dataset_name="test_vs_new_batch")



In [17]:
if __name__=='__main__':
    main()

ℹ️ Using existing experiment 'Risk Classification Evidently' (ID=458900522193773038)
📄 Logged HTML: evidently_train_vs_test_2025-07-03_10-30-41.html
🗄️  Logged JSON: evidently_train_vs_test_2025-07-03_10-30-41.json
🔢 drifted_columns_count = 0.0
🔢 drifted_columns_share = 0.0
🔢 dataset_row_count = 1500.0
🔢 dataset_column_count = 11.0
🔢 drift_Age = 0.028061626698313084
🔢 drift_CCAvg = 0.03250447331201192
🔢 drift_Experience = 0.026506967833842664
🔢 drift_Income = 0.029321736822988533
🔢 drift_Mortgage = 0.02517476499730154
🔢 drift_CD Account = 0.00032384174356389915
🔢 drift_CreditCard = 0.01668874408751289
🔢 drift_Education = 0.012499112211892531
🔢 drift_Family = 0.018132241949840956
🔢 drift_Online = 0.0005154147823947248
🔢 drift_Securities Account = 0.003982768135332864
✅ All requested drift & dataset metrics logged to MLflow.
📄 Logged HTML: evidently_train_vs_new_batch_2025-07-03_10-30-44.html
🗄️  Logged JSON: evidently_train_vs_new_batch_2025-07-03_10-30-44.json
🔢 drifted_columns_count =