In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import mlflow
import warnings
import time
import json
import shap

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

sys.path.append("../")

from models.scorer import home_credit_scorer
from scripts.patch_shap import patched_beeswarm
from scripts.mlflow_functions import plot_auc_conf, describe_run, get_shap_summary


load_dotenv()
sns.color_palette('colorblind')
plt.style.use('Solarize_Light2')

# Setting default DPI, pulling it from dotenv if it exists, setting it on 100 if not

try:
    pc_dpi = int(os.getenv('DPI'))
except TypeError:
    pc_dpi = 100
if pc_dpi is None:
    pc_dpi = 100

warnings.filterwarnings("ignore", message="Setuptools is replacing distutils.")
client = mlflow.MlflowClient(tracking_uri=os.path.abspath("../mlruns/"))

mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))


# Mlflow definitions :

In [2]:
def load_penguins(keep_na: bool = False) -> pd.DataFrame:
    """
    Function : Returns the pandas DataFrame of the Penguins dataset, let's face it, penguins are better than Iris

    Args :
    - keep_na : bool = False, whether or not the returned dataset contains the NA values of the OG dataset

    Returns :
    - penguin dataset as a pandas.DataFrame object
    """

    filepath = """https://github.com/allisonhorst/palmerpenguins/\
raw/5b5891f01b52ae26ad8cb9755ec93672f49328a8/data/penguins_size.csv"""
    penguins = pd.read_csv(filepath)
    if not keep_na:
        return penguins.dropna()
    elif keep_na:
        return penguins


In [3]:
df = load_penguins()


In [4]:
df.head()


Unnamed: 0,species_short,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


In [5]:
df = df[df["sex"] != "."]

df["sex"].value_counts()


MALE      168
FEMALE    165
Name: sex, dtype: int64

In [6]:
df.loc[:, "is_male"] = df["sex"].apply(lambda x: 1 if x == "MALE" else 0)


In [7]:
df.drop(columns=["sex"], inplace=True)


In [8]:
df.head()


Unnamed: 0,species_short,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,is_male
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,1


In [9]:
to_encode = ["species_short", "island"]
encoded = pd.get_dummies(df[to_encode])
df[encoded.columns] = encoded

df_model = df.drop(columns=to_encode)


In [10]:
df_model.head()


Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,is_male,species_short_Adelie,species_short_Chinstrap,species_short_Gentoo,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,1,1,0,0,0,0,1
1,39.5,17.4,186.0,3800.0,0,1,0,0,0,0,1
2,40.3,18.0,195.0,3250.0,0,1,0,0,0,0,1
4,36.7,19.3,193.0,3450.0,0,1,0,0,0,0,1
5,39.3,20.6,190.0,3650.0,1,1,0,0,0,0,1


In [11]:
df_model.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,is_male,species_short_Adelie,species_short_Chinstrap,species_short_Gentoo,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,1,1,0,0,0,0,1
1,39.5,17.4,186.0,3800.0,0,1,0,0,0,0,1
2,40.3,18.0,195.0,3250.0,0,1,0,0,0,0,1
4,36.7,19.3,193.0,3450.0,0,1,0,0,0,0,1
5,39.3,20.6,190.0,3650.0,1,1,0,0,0,0,1


In [12]:
columns_pred = [col for col in df_model.columns if col != "is_male"]


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    df_model[columns_pred],
    df_model["is_male"],
    test_size=0.3,
    random_state=123
    )


In [17]:
def train_and_log(
        estimator, X_train, X_test, y_train, y_test, model_name="classifier",
        dataset_version="default", imb_method="None", na_thresh=0, params: dict = None,
        ):
    """
    Trains and predict the model based on X_train/test and y_train/test + classifier
    Logs the model name (default is classifier) using mlflow as well as the params and
    the metrics (accuracy, f1, recall) using mlflow. Handles description creation with json template

    Args :
    - estimator : classifier (supported are : sklearn, xgboost and catboost)
    - X_train : the training data
    - X_test : the test data
    - y_train : the training labels
    - y_test : the test labels
    - model_name : default = classifier, the name of the model as will appear on mlflow's logs
    - id_dict : dictionnary of ids to log as train/test split for replication, optionnal
    - dataset_version : default = default, the version of the dataset
    - imb_method : default = None, the class imbalance method used
    - na_thresh : default = 0, float between 0 and 1, informs that variables
    containing more than this thresh have been dropped
    - params : the parameters to pass to the estimator

    Returns :
    - metrics : dictionnary of the evaluated metrics
    - model : the classifier fitted on X_train/y_train and params
    """

    mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))

    description = describe_run(
        template_path="../templates/description_mlflow.json",
        model_name=model_name,
        data_version=dataset_version,
        imb_learn_method=imb_method,
        column_drop_na_threshold=na_thresh,
        )

    with mlflow.start_run(run_name=model_name, description=description):
        start = time.perf_counter()
        mlflow.log_params(params)

        if estimator.__module__.startswith("xgboost"):
            classifier = estimator(**params, eval_metric=home_credit_scorer)
        elif estimator.__module__.startswith("lightgbm") or \
                estimator.__module__.startswith("catboost") or estimator.__module__.startswith("sklearn"):
            classifier = estimator(**params)
        classifier.fit(X=X_train, y=y_train)

        # Metrics :
        home_credit_score = home_credit_scorer(estimator=classifier, X=X_test, y_true=y_test)
        accuracy = accuracy_score(y_test, classifier.predict(X_test))
        f1 = f1_score(y_test, classifier.predict(X_test), average="macro")
        recall = recall_score(y_test, classifier.predict(X_test), average="macro")
        auroc, auroc_conf_fig = plot_auc_conf(estimator=classifier, X_test=X_test, y_test=y_test, display=False)

        metrics = {
            "home_credit_score": home_credit_score, "accuracy": accuracy,
            "f1": f1, "recall": recall, "auroc": auroc
            }

        # MLFlow log :
        mlflow.log_metrics(metrics)

        mlflow.log_figure(auroc_conf_fig, "AUROC_Conf_matrix.png")

        # waterfall = get_shap_waterfall(clf=classifier, X_train=X_train)
        # mlflow.log_figure(waterfall, "waterfall_shap.png")

        summary = get_shap_summary(clf=classifier, X_train=X_train)
        mlflow.log_figure(summary, "summary_shap.png")

        model_type = type(classifier)

        if model_type.__module__.startswith("sklearn"):
            artifact_path = "sklearn-model"
            mlflow.sklearn.log_model(classifier, artifact_path=artifact_path, registered_model_name=model_name)
        elif model_type.__module__.startswith("xgboost"):
            artifact_path = "xGboost-model"
            mlflow.xgboost.log_model(classifier, artifact_path=artifact_path, registered_model_name=model_name)
        elif model_type.__module__.startswith("catboost"):
            artifact_path = "catboost-model"
            mlflow.catboost.log_model(classifier, artifact_path=artifact_path, registered_model_name=model_name)
        elif model_type.__module__.startswith("lightgbm"):
            artifact_path = "lightgbm-model"
            mlflow.lightgbm.log_model(classifier, artifact_path=artifact_path, registered_model_name=model_name)

        run_id = mlflow.active_run().info.run_id

        model_uri = f"runs:/{run_id}/{artifact_path}"

        end = time.perf_counter()

        mlflow.log_param(key="processing time", value=end - start)

        mlflow.register_model(model_uri=model_uri, name=model_name)

        return metrics, classifier


In [18]:
params = {
    "n_estimators": 100
}

In [20]:
mlflow.set_experiment(experiment_name="penguins_classifications")

metrics, clf = train_and_log(
    estimator=XGBClassifier,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    dataset_version="v1",
    imb_method="xyz",
    na_thresh=0,
    params=params,
    model_name="test_metric",
)


ntree_limit is deprecated, use `iteration_range` or model slicing instead.
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
eval_metric is not saved in Scikit-Learn meta.
Registered model 'test_metric' already exists. Creating a new version of this model...
2023/03/05 15:28:06 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: test_metric, version 3
Created version '3' of model 'test_metric'.
Registered model 'test_metric' already exists. Creating a new version of this model...
2023/03/05 15:28:06 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: test_metric, version 4
Created version '4' of model 'test_metric'.


# Hyperopt test :

Will time the difference of processing time between gridsearchcv, increasing the range of possibilities.

# SHAP

In [None]:
clf = XGBClassifier()

clf.fit(X_train, y_train)

X_all = df_model.drop(columns=["is_male"]).to_numpy()

features = X_train.columns

explainer = shap.TreeExplainer(clf)
shap_values = explainer(X_train)


In [None]:
my_waterfall = shap.plots.waterfall(shap_values[0], max_display=20, show=False)
my_waterfall.figure.set_size_inches(8, 8)
my_waterfall.figure.set_dpi(150)

my_waterfall.suptitle("Impact des differentes variables sur le modele, classées par ordre d'importance")

plt.show()


In [None]:

plt.show()