In [1]:
#imports
import os
import random
import mlflow
import json
import logging
import time
import copy
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopy.distance
import statsmodels.api as sm
import pickle as pkl
import optuna

from sqlalchemy import create_engine
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from matplotlib.colors import LinearSegmentedColormap
from phik import resources, report
from phik.report import plot_correlation_matrix
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.base import clone
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import quantile_transform, robust_scale, scale, power_transform
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from autofeat import AutoFeatRegressor
from catboost import CatBoostClassifier, Pool
from sklearn.multioutput import MultiOutputClassifier
from category_encoders import CatBoostEncoder
from autofeat import AutoFeatRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from statistics import median
from optuna.samplers import CmaEsSampler, RandomSampler
from optuna.integration.mlflow import MLflowCallback
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import skew, kurtosis
from sklearn.metrics import precision_score, recall_score, roc_curve, roc_auc_score





* 'schema_extra' has been renamed to 'json_schema_extra'
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#experiment_id = mlflow.create_experiment("diploma")

In [3]:
def log_to_mlflow(name, model, metrics,X_test, EXPERIMENT_NAME="diploma"):
    RUN_NAME = f"{name}_step"

    experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

    with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:

        mlflow.sklearn.log_model(sk_model=model, 
            artifact_path='models', 
            registered_model_name=name, 
            await_registration_for=60, 
            pip_requirements='../requirements.txt'
            )
        mlflow.log_metrics(metrics)

In [4]:
def get_metrics(model, x_train, y_train, x_val, y_val, need_fit=False):
    start_time = time.time()

    if isinstance(y_train, pd.DataFrame):
        y_train = y_train.values
    if isinstance(y_val, pd.DataFrame):
        y_val = y_val.values

    if need_fit:
        model = clone(model)
        model.fit(x_train, y_train)
    elapsed_fit_time = time.time() - start_time

    start_time = time.time()
    y_proba = model.predict_proba(x_val) 
    threshold = y_proba[:,1].mean()
    print(threshold)
    y_pred = (y_proba[:,1] >= threshold).astype(int)
    elapsed_predict_time = time.time() - start_time

    metrics = {
        'precision': precision_score(y_val, y_pred, average='weighted'),
        'recall': recall_score(y_val, y_pred, average='weighted'),
        'auc': roc_auc_score(y_val, y_proba[:, 1]),
        'fit_time': elapsed_fit_time,
        'predict_time': elapsed_predict_time
    }

    print(f"Fit Time: {metrics['fit_time']:.4f} seconds")
    print(f"Predict Time: {metrics['predict_time']:.4f} seconds")
    print(f"Precision: {metrics['precision']}")
    print(f"Recall: {metrics['recall']}")
    print(f"AUC: {metrics['auc']}")

    return metrics

In [5]:
# connections
load_dotenv()

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("S3_ACCESS_KEY")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("S3_SECRET_KEY")

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [6]:
# Variables
DATA = os.getenv('DATA_DIR', '../data')
EXPERIMENT_NAME = 'diploma'
RANDOM_STATE = 42
NOTEBOOK = 'modeling.ipynb'
MODELS_DIR = 'models'

In [7]:
train_df = pd.read_csv(os.path.join(DATA, "processed", "train_prep.csv"))
test_df =  pd.read_csv(os.path.join(DATA, "processed", "test.csv"))

  test_df =  pd.read_csv(os.path.join(DATA, "processed", "test.csv"))


In [8]:
features = [ 'ind_empleado', 'sexo', 'age',
       'ind_nuevo', 'antiguedad', 'indrel', 'tiprel_1mes',
       'indresi', 'indext', 'canal_entrada', 'indfall', 'tipodom', 'nomprov',
       'ind_actividad_cliente', 'renta', 'segmento', 'ind_ahor_fin_ult1']
target = ['ind_ahor_fin_ult1',
       'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1',
       'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1',
       'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1',
       'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 'ind_ecue_fin_ult1',
       'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1',
       'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1',
       'ind_valo_fin_ult1', 'ind_viv_fin_ult1', 'ind_nomina_ult1',
       'ind_nom_pens_ult1', 'ind_recibo_ult1']

In [9]:
X_train = train_df[features]
X_test = test_df[features]
y_train = train_df[target]
y_test = test_df[target]

In [10]:
cat_columns = ['ind_empleado', 'sexo', 'ind_nuevo',
                'indrel',  'tiprel_1mes',
                'indresi', 'indext', 'indfall', 'tipodom',
                'ind_actividad_cliente', 'segmento',
                'canal_entrada', 'ind_ahor_fin_ult1', 'nomprov']
num_columns = ['age', 'antiguedad', 'renta']

In [11]:
print(len(X_train.columns), len(num_columns), len(cat_columns))

17 3 14


In [12]:
del train_df
del test_df

In [13]:
print(X_train.shape, X_test.shape)

(8652061, 17) (4621976, 17)


In [14]:
X_test.head()

Unnamed: 0,ind_empleado,sexo,age,ind_nuevo,antiguedad,indrel,tiprel_1mes,indresi,indext,canal_entrada,indfall,tipodom,nomprov,ind_actividad_cliente,renta,segmento,ind_ahor_fin_ult1
0,N,V,20.0,1.0,5.0,1.0,I,1.0,1.0,KHQ,0.0,1.0,ALICANTE,0.0,34745.28,03 - UNIVERSITARIO,0
1,N,H,25.0,1.0,5.0,1.0,I,1.0,0.0,KHQ,0.0,1.0,CASTELLON,1.0,184449.27,03 - UNIVERSITARIO,0
2,N,H,20.0,1.0,5.0,1.0,I,1.0,0.0,KHQ,0.0,1.0,BARCELONA,0.0,67044.99,03 - UNIVERSITARIO,0
3,N,V,23.0,1.0,5.0,1.0,I,1.0,1.0,KHQ,0.0,1.0,ALICANTE,0.0,42478.02,03 - UNIVERSITARIO,0
4,N,H,20.0,1.0,5.0,1.0,I,1.0,0.0,KHQ,0.0,1.0,TOLEDO,0.0,89482.35,03 - UNIVERSITARIO,0


In [15]:
X_train.head()

Unnamed: 0,ind_empleado,sexo,age,ind_nuevo,antiguedad,indrel,tiprel_1mes,indresi,indext,canal_entrada,indfall,tipodom,nomprov,ind_actividad_cliente,renta,segmento,ind_ahor_fin_ult1
0,N,H,35.0,0.0,6.0,1.0,A,1.0,0.0,KHL,0.0,1.0,MALAGA,1.0,87218.1,02 - PARTICULARES,0
1,N,V,23.0,0.0,35.0,1.0,I,1.0,1.0,KHE,0.0,1.0,CIUDAD REAL,0.0,35548.74,03 - UNIVERSITARIO,0
2,N,V,23.0,0.0,35.0,1.0,I,1.0,0.0,KHE,0.0,1.0,CIUDAD REAL,0.0,122179.11,03 - UNIVERSITARIO,0
3,N,H,22.0,0.0,35.0,1.0,I,1.0,0.0,KHD,0.0,1.0,ZARAGOZA,0.0,119775.54,03 - UNIVERSITARIO,0
4,N,V,23.0,0.0,35.0,1.0,A,1.0,0.0,KHE,0.0,1.0,ZARAGOZA,1.0,110793.284291,03 - UNIVERSITARIO,0


In [16]:
y_train.head()

Unnamed: 0,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0.0,0.0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,0


In [15]:

models = {}
predictions = {}
metrics = {}
for product in target:
    print(product)
    preprocessor = ColumnTransformer(
        transformers=[
            ('scaler', StandardScaler(), num_columns),
            ('cat', CatBoostEncoder(), cat_columns)], 
            remainder='passthrough',
            verbose_feature_names_out=True,
            )
    preprocessor.set_output(transform='pandas')
    pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', CatBoostClassifier(iterations=500,
                                            learning_rate=0.1,
                                            depth=6,
                                            eval_metric='AUC',
                                            verbose=500))
                    ])
    pipeline.fit(X_train, y_train[product])
    with open(f'../models/{product}_model.pkl', 'wb') as f:
        pkl.dump(pipeline, f)
    models[product] = pipeline
    predictions[product] = pipeline.predict(X_test)
    metrics[product] = get_metrics(pipeline, X_train, y_train[product], X_test, y_test[product], need_fit=False)
    log_to_mlflow(product, pipeline, metrics[product], X_test)



ind_ahor_fin_ult1
0:	total: 891ms	remaining: 7m 24s
499:	total: 5m 16s	remaining: 0us
8.429539374476561e-05
Fit Time: 0.0000 seconds
Predict Time: 4.8868 seconds
Precision: 0.9999917984877948
Recall: 0.9999909129774798
AUC: 1.0


2024-09-11 22:15:30,518 INFO: Found credentials in environment variables.
Registered model 'ind_ahor_fin_ult1' already exists. Creating a new version of this model...
2024/09/11 22:15:30 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_ahor_fin_ult1, version 3
Created version '3' of model 'ind_ahor_fin_ult1'.


ind_aval_fin_ult1
0:	total: 830ms	remaining: 6m 54s
499:	total: 6m 12s	remaining: 0us
1.3819830523950796e-05
Fit Time: 0.0000 seconds
Predict Time: 5.0164 seconds
Precision: 0.9999798327659655
Recall: 0.9764005697995836
AUC: 0.9520846310846677


Successfully registered model 'ind_aval_fin_ult1'.
2024/09/11 22:22:26 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_aval_fin_ult1, version 1
Created version '1' of model 'ind_aval_fin_ult1'.


ind_cco_fin_ult1
0:	total: 800ms	remaining: 6m 39s
499:	total: 6m 12s	remaining: 0us
0.589626748399629
Fit Time: 0.0000 seconds
Predict Time: 5.1751 seconds
Precision: 0.7361809070460277
Recall: 0.7314029756969747
AUC: 0.794568917993528


Successfully registered model 'ind_cco_fin_ult1'.
2024/09/11 22:29:16 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_cco_fin_ult1, version 1
Created version '1' of model 'ind_cco_fin_ult1'.


ind_cder_fin_ult1
0:	total: 836ms	remaining: 6m 57s
499:	total: 6m 6s	remaining: 0us
0.00032958681628767784
Fit Time: 0.0000 seconds
Predict Time: 5.1363 seconds
Precision: 0.9995930731192951
Recall: 0.8521190936517196
AUC: 0.9383041820487124


Successfully registered model 'ind_cder_fin_ult1'.
2024/09/11 22:36:07 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_cder_fin_ult1, version 1
Created version '1' of model 'ind_cder_fin_ult1'.


ind_cno_fin_ult1
0:	total: 855ms	remaining: 7m 6s
499:	total: 6m 20s	remaining: 0us
0.08257231170171517
Fit Time: 0.0000 seconds
Predict Time: 5.1609 seconds
Precision: 0.9361994325642782
Recall: 0.753065139239148
AUC: 0.8964409638525302


Successfully registered model 'ind_cno_fin_ult1'.
2024/09/11 22:43:10 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_cno_fin_ult1, version 1
Created version '1' of model 'ind_cno_fin_ult1'.


ind_ctju_fin_ult1
0:	total: 804ms	remaining: 6m 41s
499:	total: 6m 4s	remaining: 0us
0.008252700417986465
Fit Time: 0.0000 seconds
Predict Time: 5.0681 seconds
Precision: 0.9984629827716912
Recall: 0.9981207171997432
AUC: 0.9998853782127332


Successfully registered model 'ind_ctju_fin_ult1'.
2024/09/11 22:49:57 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_ctju_fin_ult1, version 1
Created version '1' of model 'ind_ctju_fin_ult1'.


ind_ctma_fin_ult1
0:	total: 898ms	remaining: 7m 28s
499:	total: 6m 7s	remaining: 0us
0.0083508780917676
Fit Time: 0.0000 seconds
Predict Time: 5.1595 seconds
Precision: 0.990379704659613
Recall: 0.9401870541950023
AUC: 0.9579745668436807


Successfully registered model 'ind_ctma_fin_ult1'.
2024/09/11 22:56:49 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_ctma_fin_ult1, version 1
Created version '1' of model 'ind_ctma_fin_ult1'.


ind_ctop_fin_ult1
0:	total: 781ms	remaining: 6m 29s
499:	total: 5m 56s	remaining: 0us
0.11292600710219064
Fit Time: 0.0000 seconds
Predict Time: 5.0928 seconds
Precision: 0.9247296474961401
Recall: 0.8145803872629369
AUC: 0.9303822727204764


Successfully registered model 'ind_ctop_fin_ult1'.
2024/09/11 23:03:29 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_ctop_fin_ult1, version 1
Created version '1' of model 'ind_ctop_fin_ult1'.


ind_ctpp_fin_ult1
0:	total: 886ms	remaining: 7m 22s
499:	total: 6m 17s	remaining: 0us
0.041006029105590244
Fit Time: 0.0000 seconds
Predict Time: 5.1247 seconds
Precision: 0.96462651609989
Recall: 0.8069027186640519
AUC: 0.9136699231716984


Successfully registered model 'ind_ctpp_fin_ult1'.
2024/09/11 23:10:31 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_ctpp_fin_ult1, version 1
Created version '1' of model 'ind_ctpp_fin_ult1'.


ind_deco_fin_ult1
0:	total: 829ms	remaining: 6m 53s
499:	total: 5m 57s	remaining: 0us
0.0009298778942038279
Fit Time: 0.0000 seconds
Predict Time: 5.1567 seconds
Precision: 0.9995500718593059
Recall: 0.8121625901995164
AUC: 0.9114212834064453


Successfully registered model 'ind_deco_fin_ult1'.
2024/09/11 23:17:12 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_deco_fin_ult1, version 1
Created version '1' of model 'ind_deco_fin_ult1'.


ind_deme_fin_ult1
0:	total: 833ms	remaining: 6m 55s
499:	total: 6m 6s	remaining: 0us
0.0012056071386965714
Fit Time: 0.0000 seconds
Predict Time: 5.1506 seconds
Precision: 0.9987055365208878
Recall: 0.8257784116576979
AUC: 0.9373986524649629


Successfully registered model 'ind_deme_fin_ult1'.
2024/09/11 23:24:03 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_deme_fin_ult1, version 1
Created version '1' of model 'ind_deme_fin_ult1'.


ind_dela_fin_ult1
0:	total: 847ms	remaining: 7m 2s
499:	total: 6m 17s	remaining: 0us
0.044789161513677865
Fit Time: 0.0000 seconds
Predict Time: 5.1551 seconds
Precision: 0.9676445057754461
Recall: 0.7946516814453385
AUC: 0.943738235440145


Successfully registered model 'ind_dela_fin_ult1'.
2024/09/11 23:31:05 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_dela_fin_ult1, version 1
Created version '1' of model 'ind_dela_fin_ult1'.


ind_ecue_fin_ult1
0:	total: 840ms	remaining: 6m 59s
499:	total: 6m 17s	remaining: 0us
0.07753964966906146
Fit Time: 0.0000 seconds
Predict Time: 5.1635 seconds
Precision: 0.9315045953082369
Recall: 0.7802933204326461
AUC: 0.9024906773784815


Successfully registered model 'ind_ecue_fin_ult1'.
2024/09/11 23:38:06 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_ecue_fin_ult1, version 1
Created version '1' of model 'ind_ecue_fin_ult1'.


ind_fond_fin_ult1
0:	total: 820ms	remaining: 6m 49s
499:	total: 6m 9s	remaining: 0us
0.015774698859360673
Fit Time: 0.0000 seconds
Predict Time: 5.0917 seconds
Precision: 0.9836453255906631
Recall: 0.81091853354496
AUC: 0.9360349835363113


Successfully registered model 'ind_fond_fin_ult1'.
2024/09/11 23:45:00 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_fond_fin_ult1, version 1
Created version '1' of model 'ind_fond_fin_ult1'.


ind_hip_fin_ult1
0:	total: 855ms	remaining: 7m 6s
499:	total: 6m 10s	remaining: 0us
0.004804895706015536
Fit Time: 0.0000 seconds
Predict Time: 5.1321 seconds
Precision: 0.9951130362183833
Recall: 0.8775887196298726
AUC: 0.9634837014119898


Successfully registered model 'ind_hip_fin_ult1'.
2024/09/11 23:51:55 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_hip_fin_ult1, version 1
Created version '1' of model 'ind_hip_fin_ult1'.


ind_plan_fin_ult1
0:	total: 833ms	remaining: 6m 55s
499:	total: 6m 7s	remaining: 0us
0.008279438042661591
Fit Time: 0.0000 seconds
Predict Time: 5.1124 seconds
Precision: 0.9917750378636684
Recall: 0.8272803666656858
AUC: 0.9434324923125037


Successfully registered model 'ind_plan_fin_ult1'.
2024/09/11 23:58:46 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_plan_fin_ult1, version 1
Created version '1' of model 'ind_plan_fin_ult1'.


ind_pres_fin_ult1
0:	total: 821ms	remaining: 6m 49s
499:	total: 6m 9s	remaining: 0us
0.0026785051844099962
Fit Time: 0.0000 seconds
Predict Time: 5.1229 seconds
Precision: 0.9977078169156726
Recall: 0.889052431254511
AUC: 0.9749577837507154


Successfully registered model 'ind_pres_fin_ult1'.
2024/09/12 00:05:39 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_pres_fin_ult1, version 1
Created version '1' of model 'ind_pres_fin_ult1'.


ind_reca_fin_ult1
0:	total: 834ms	remaining: 6m 56s
499:	total: 6m 18s	remaining: 0us
0.05218262311601845
Fit Time: 0.0000 seconds
Predict Time: 5.1774 seconds
Precision: 0.9522127088913537
Recall: 0.770710838827376
AUC: 0.8883578128647763


Successfully registered model 'ind_reca_fin_ult1'.
2024/09/12 00:12:42 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_reca_fin_ult1, version 1
Created version '1' of model 'ind_reca_fin_ult1'.


ind_tjcr_fin_ult1
0:	total: 821ms	remaining: 6m 49s
499:	total: 6m 19s	remaining: 0us
0.04732797786008566
Fit Time: 0.0000 seconds
Predict Time: 5.1010 seconds
Precision: 0.9659305404837227
Recall: 0.7981363382241707
AUC: 0.9187972390599933


Successfully registered model 'ind_tjcr_fin_ult1'.
2024/09/12 00:19:47 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_tjcr_fin_ult1, version 1
Created version '1' of model 'ind_tjcr_fin_ult1'.


ind_valo_fin_ult1
0:	total: 817ms	remaining: 6m 47s
499:	total: 6m 12s	remaining: 0us
0.02299063802547178
Fit Time: 0.0000 seconds
Predict Time: 5.1343 seconds
Precision: 0.976866729201058
Recall: 0.8112153762806211
AUC: 0.9281212293290494


Successfully registered model 'ind_valo_fin_ult1'.
2024/09/12 00:26:45 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_valo_fin_ult1, version 1
Created version '1' of model 'ind_valo_fin_ult1'.


ind_viv_fin_ult1
0:	total: 842ms	remaining: 7m
499:	total: 6m 12s	remaining: 0us
0.0032525179748777912
Fit Time: 0.0000 seconds
Predict Time: 5.1408 seconds
Precision: 0.996541108241312
Recall: 0.7992572873593459
AUC: 0.9292205205441828


Successfully registered model 'ind_viv_fin_ult1'.
2024/09/12 00:33:43 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_viv_fin_ult1, version 1
Created version '1' of model 'ind_viv_fin_ult1'.


ind_nomina_ult1
0:	total: 829ms	remaining: 6m 53s
499:	total: 6m 22s	remaining: 0us
0.06266981740822232
Fit Time: 0.0000 seconds
Predict Time: 5.2212 seconds
Precision: 0.9561056153693931
Recall: 0.7457565768407278
AUC: 0.9053539413477245


Successfully registered model 'ind_nomina_ult1'.
2024/09/12 00:40:50 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: ind_nomina_ult1, version 1
Created version '1' of model 'ind_nomina_ult1'.


ind_nom_pens_ult1


: 

In [16]:

models = {}
predictions = {}
metrics = {}
for product in ['ind_nom_pens_ult1', 'ind_recibo_ult1']:
    print(product)
    preprocessor = ColumnTransformer(
        transformers=[
            ('scaler', StandardScaler(), num_columns),
            ('cat', CatBoostEncoder(), cat_columns)], 
            remainder='passthrough',
            verbose_feature_names_out=True,
            )
    preprocessor.set_output(transform='pandas')
    pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', CatBoostClassifier(iterations=500,
                                            learning_rate=0.1,
                                            depth=6,
                                            eval_metric='AUC',
                                            verbose=500))
                    ])
    pipeline.fit(X_train, y_train[product])
    with open(f'../models/{product}_model.pkl', 'wb') as f:
        pkl.dump(pipeline, f)
    models[product] = pipeline
    predictions[product] = pipeline.predict(X_test)
    metrics[product] = get_metrics(pipeline, X_train, y_train[product], X_test, y_test[product], need_fit=False)
    log_to_mlflow(product, pipeline, metrics[product], X_test)



ind_nom_pens_ult1
0:	total: 840ms	remaining: 6m 59s


In [None]:
metrics = {}
for product in target:
    with open(f'../models/{product}_model.pkl', 'rb') as file:
        model = pkl.load(file)
    metrics[product] = get_metrics(model, X_train, y_train[product], X_test, y_test[product], need_fit=False)

In [22]:
metrics_df = pd.DataFrame.from_dict(metrics, orient='index')
metrics_df

Unnamed: 0,precision,recall,auc,fit_time,predict_time
ind_ahor_fin_ult1,0.999992,0.999991,1.0,3e-06,5.117586
ind_aval_fin_ult1,0.99998,0.976401,0.952085,2e-06,5.204835
ind_cco_fin_ult1,0.736181,0.731403,0.794569,2e-06,5.381997
ind_cder_fin_ult1,0.999593,0.852119,0.938304,2e-06,5.734502
ind_cno_fin_ult1,0.936199,0.753065,0.896441,2e-06,5.304681
ind_ctju_fin_ult1,0.998463,0.998121,0.999885,2e-06,5.32306
ind_ctma_fin_ult1,0.99038,0.940187,0.957975,2e-06,6.007132
ind_ctop_fin_ult1,0.92473,0.81458,0.930382,2e-06,5.675728
ind_ctpp_fin_ult1,0.964627,0.806903,0.91367,2e-06,5.289916
ind_deco_fin_ult1,0.99955,0.812163,0.911421,1e-06,5.304837


In [23]:
metrics_df.to_csv("../metrics/metrics.csv")

#### Интерпретация для каждой модели:
- Модели с высоким precision, recall и AUC (например, ind_ahor_fin_ult1, ind_ctju_fin_ult1, ind_plan_fin_ult1): Эти модели почти идеально предсказывают как положительные, так и отрицательные примеры. Вероятно, в данных был либо хороший баланс классов, либо модель смогла эффективно справиться с задачей.
- Модели с умеренными значениями AUC и recall (например, ind_cco_fin_ult1 с AUC 0.794569 и recall 0.731403): Такие модели могли испытывать трудности с распознаванием всех положительных примеров или с балансом классов. Возможно, стоит поработать с гиперпараметрами или использовать методы балансировки классов 