In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import mlflow
import pickle
import evidently
import urllib
import gc

from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from dotenv import load_dotenv

# EVIDENTLY :
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently.metric_preset import ClassificationPreset
# /EVIDENTLY

# Add the parent directory (relative to the working directory) to the module search path
sys.path.append("../")

# Load variables from a .env file into the environment; they are read below with os.getenv
load_dotenv()
# NOTE(review): sns.color_palette() only returns a palette and the result is discarded here,
# so this line does not change the default colours — sns.set_palette() may have been intended.
sns.color_palette('colorblind')
# Apply matplotlib's 'Solarize_Light2' style sheet
plt.style.use('Solarize_Light2')

# Default figure DPI: taken from the DPI environment variable (.env) when it holds a
# valid integer, 100 otherwise.

def dpi_from_env(var_name: str = "DPI", default: int = 100) -> int:
    """Return the integer stored in environment variable `var_name`, or `default`.

    The fallback is used when the variable is unset (int(None) raises TypeError)
    and also when it is set to something that is not an integer, such as "" or
    "high" (int() raises ValueError, which would otherwise abort the cell).

    Args:
    - var_name : Name of the environment variable to read.
    - default : Value returned when the variable is missing or not an integer.

    Returns:
    - The parsed integer, or `default`.
    """
    try:
        return int(os.getenv(var_name))
    except (TypeError, ValueError):
        return default


pc_dpi = dpi_from_env()

# MLflow client pointed at the local ../mlruns directory (resolved to an absolute path).
# NOTE(review): `client` is not referenced by the cells visible here — confirm it is still needed.
client = mlflow.MlflowClient(tracking_uri=os.path.abspath("../mlruns/"))

# Tracking URI used by the module-level mlflow.* calls below, read from the
# MLFLOW_TRACKING_URI environment variable (os.getenv returns None when it is not set).
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))


# This notebook focuses on the evaluation of the potential data drift of the retained models
- The retained models are XGBoost and CatBoost (the Keras DNN is less accurate and requires more data preprocessing; see the previous notebook).
- Evidently will be used to simulate the addition of data over time
- This will be done under the assumption that `applications_train` is known data and `applications_test` is unseen data.
- The aim is to detect potential drift and propose ways to counter it, either by retraining the model or through other options that Evidently offers.

# 1 : Recovering the models from MLFlow and restoring the splits
- Both CatBoost and XGBoost have been trained, cross-validated and assessed on a hold-out set. These models have been logged through MLflow along with their hyperparameters. We can avoid recomputing the models by loading the serialized files from MLflow.
- The data used for training will be restored as it was done in the previous notebook (random seed of `123` and `train/test/validation` split)
- The variable order is important to evidently, so we will also load the training columns, also saved in MLFlow.

In [2]:
# XGBoost: cross-validated run logged in MLflow
xgb_run_id = "f62231946a8f412d87a1198a731827b8"
xgb_run = mlflow.get_run(run_id=xgb_run_id)
xgb_artifact_uri = xgb_run.info.artifact_uri

# Fitted estimator, loaded from the run's "xGboost-model" artifact
xgb_mlflow_model = f"runs:/{xgb_run_id}/xGboost-model"
xgb_classifier = mlflow.xgboost.load_model(model_uri=xgb_mlflow_model)

# Training columns, pickled under the same run: keep only the path component of the
# artifact URI so the file can be opened directly
columns_file = "columns_train/columns_train.pkl"
columns_path = urllib.parse.urlparse(os.path.join(xgb_artifact_uri, columns_file)).path

with open(columns_path, "rb") as col_pkl:
    columns_train = pickle.load(col_pkl)


In [3]:
# CatBoost: cross-validated run logged in MLflow
cb_run_id = "c21201ee795547eda136f395bf3ec8a6"
cb_mlflow_model = f"runs:/{cb_run_id}/catboost-model"

# Run metadata and the location of its artifacts
cb_run = mlflow.get_run(run_id=cb_run_id)
cb_artifact_uri = cb_run.info.artifact_uri

# Fitted estimator, loaded from the run's "catboost-model" artifact
cb_classifier = mlflow.catboost.load_model(model_uri=cb_mlflow_model)


In [4]:
# Labelled training data; the original split is rebuilt from it in the cells below
df_train = pd.read_pickle("../data/df_hc_nm.pkl")


In [5]:
target_col = "Payment_difficulties"

# Give the label column a self-explanatory name (renamed in place on the loaded frame)
df_train.rename(columns={"TARGET": target_col}, inplace=True)


In [6]:
# Two-stage split, both stages seeded with 123:
#   1) hold out 30 % of the rows as the test set
#   2) take 25 % of the remaining 70 % as the validation set
#      -> 52.5 % train / 17.5 % validation / 30 % test overall
X_train_val, X_test, y_train_val, y_test = train_test_split(
    df_train.drop(target_col, axis=1), df_train[target_col], test_size=0.3, random_state=123
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=123
)


In [7]:
# Data treated as "unknown" (unseen) for the drift analysis
df_unknown = pd.read_pickle("../data/home_credit_data_test.pkl")


In [8]:
# Keep only the columns the estimators were trained on (loaded above from the MLflow run).
# Presumably `columns_train` is a list-like of column labels — verify against the pickled artifact.

df_unknown = df_unknown[columns_train]


In [9]:
# Run a full garbage-collection pass; the displayed value is the number of unreachable objects found
gc.collect()


0

# 2 : General purpose functions :
- `generate_report` : takes a classifier, known and unknown data to generate a drift report

In [10]:
def generate_report(clf, known_data: pd.DataFrame, unknown_data: pd.DataFrame, target_col: str):
    """
    Generate a data drift report using the Evidently library.

    The input dataframes are never modified: predictions are added to new
    frames built inside the function. This makes it safe to call the function
    several times on the same dataframes (e.g. once per classifier). A
    "prediction" column left over from an earlier run is never passed to
    `clf.predict` as if it were a feature.

    Args:
    - clf : Fitted classifier object (xgboost or catboost) exposing `predict`.
    - known_data : Dataframe containing the known data (labelled).
    - unknown_data : Dataframe containing the unknown data (no label col.).
    - target_col : Name of the target column in the dataframes.

    Returns:
    - drift_report : Data drift report object.

    Raises:
    - KeyError : If `target_col` is not a column of `known_data`.
    """

    # Feature frames only: the label is required in the known data (KeyError otherwise),
    # while a stale "prediction" column is dropped when present.
    known_features = known_data.drop(columns=target_col).drop(
        columns="prediction", errors="ignore"
    )
    # The unknown data normally has neither column; drop them if they are there.
    unknown_features = unknown_data.drop(
        columns=[target_col, "prediction"], errors="ignore"
    )

    # Attach each set's predictions to a new frame (assign returns a copy),
    # leaving the caller's dataframes untouched.
    reference = known_features.assign(prediction=clf.predict(known_features))
    current = unknown_features.assign(prediction=clf.predict(unknown_features))

    # Generate data drift report: known data is the reference, unknown data the current set
    drift_report = Report(metrics=[DataDriftPreset()])
    drift_report.run(reference_data=reference, current_data=current)

    return drift_report


# 3 : Reports

In [11]:
# Known (labelled) data: training features and their labels side by side
df_known = pd.concat((X_train, y_train), axis="columns")


In [12]:
# The report is slow on the full data, so work on 10,000 rows of each frame
# (drawn without replacement, seeded with 123).
df_known_sample, df_unknown_sample = (
    frame.sample(n=10_000, replace=False, random_state=123)
    for frame in (df_known, df_unknown)
)


In [13]:
# Class counts of the label in the known sample
df_known_sample[target_col].value_counts()


1    5060
0    4940
Name: Payment_difficulties, dtype: int64

## 3.1 : xGboost

In [14]:
# Drift report for XGBoost: known sample as reference, unknown sample as current data
xgb_drift = generate_report(
    xgb_classifier, df_known_sample, df_unknown_sample, target_col="Payment_difficulties"
)


In [15]:
# Write the XGBoost drift report to an HTML file under ../data
xgb_drift.save_html(filename="../data/xgb_drift.html")


## 3.2 : Catboost :

In [16]:
# Drift report for CatBoost, built from the same known / unknown samples
cb_drift = generate_report(
    cb_classifier, df_known_sample, df_unknown_sample, target_col="Payment_difficulties"
)


In [17]:
# Write the CatBoost drift report to an HTML file under ../data
cb_drift.save_html(filename="../data/catboost_drift.html")
