# Imports

In [1]:
import sys
import pandas as pd
import joblib
sys.path.insert(0, '../src/')

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser


from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from data.data_preprocess import DataPreprocess
from utils.utils import load_config_file
from train.train_model import TrainModel
from evaluation.classifiers_eval import ModelEvaluation

# 1.0 Data Load

In [2]:
# Instantiate the project's data loader and read the dataset identified by
# the 'train_dataset_name' key into `df`.
# NOTE(review): the key is presumably resolved through the project config
# file — confirm in data/data_load.py.
dl = DataLoad()
df = dl.load_data('train_dataset_name')

2023-12-09 07:10:03 [info     ] Iniciando o carregamento


# 2.0 Data Validation

In [3]:
# Run the project's validation step on the loaded frame and keep its result.
# NOTE(review): `is_valid` is never checked in the visible cells, so a failed
# validation would not stop the notebook — confirm what `run` returns and
# consider guarding on it before training.
dv = DataValidation()
is_valid = dv.run(df)

2023-12-09 07:10:03 [info     ] Validação iniciou..
2023-12-09 07:10:03 [info     ] Validation columns passed...
2023-12-09 07:10:03 [info     ] Validacao com sucesso.


# 3.0 Data Transformation

In [4]:
# Wrap the loaded frame in the project's DataTransformation helper; the next
# cell uses it to produce the train/validation split.
dt = DataTransformation(df)

In [5]:
# Split into training and validation partitions (features and target).
# NOTE(review): split ratio and random seed are decided inside
# DataTransformation — confirm they are fixed so the split is reproducible.
X_train, X_valid, y_train, y_valid = dt.train_test_split()

# 4.0 Experimentation

In [6]:
import mlflow
from mlflow.tracking import MlflowClient


* 'schema_extra' has been renamed to 'json_schema_extra'


In [7]:
# Point the MLflow client at the tracking server on the loopback address and
# select the experiment under which the runs below are recorded.
mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')
mlflow.set_experiment(experiment_name='prob_loan')

2023/12/09 07:10:04 INFO mlflow.tracking.fluent: Experiment with name 'prob_loan' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1702116604955, experiment_id='1', last_update_time=1702116604955, lifecycle_stage='active', name='prob_loan', tags={}>

In [9]:
with mlflow.start_run(run_name='baseline'):
    # Baseline experiment: imputation + standard scaling feeding a default
    # LogisticRegression. The `with` block closes the run on exit (and marks
    # it as failed if an exception escapes), so no explicit end_run() call
    # is needed.
    mlflow.set_tag('model_name', 'lr_baseline')

    # Build and fit the preprocessing pipeline on the training split only.
    pipe = Pipeline([
        ('imputer', MeanMedianImputer(variables=load_config_file().get('vars_imputer'))),
        ('scaler', SklearnTransformerWrapper(StandardScaler()))
    ])

    preprocessador = DataPreprocess(pipe)
    preprocessador.train(X_train)

    X_train_processed = preprocessador.transform(X_train)
    X_valid_processed = preprocessador.transform(X_valid)

    # Persist the fitted preprocessor locally, then attach it to the run as
    # an artifact.
    joblib.dump(preprocessador, '../models/preprocess.joblib')
    mlflow.log_artifact('../models/preprocess.joblib')

    # Log the preprocessing steps as run parameters (MLflow stores the
    # string form of each estimator).
    mlflow.log_params(params={'imputer': pipe['imputer'], 'scaler': pipe['scaler']})

    # Cross-validated estimate on the training split.
    model = LogisticRegression()
    model_eval = ModelEvaluation(model, X_train_processed, y_train, n_splits=5)

    roc_auc_scores = model_eval.cross_val_evaluate()
    mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())

    # Fit the final model on the whole training split.
    model.fit(X_train_processed, y_train)

    # Score the validation split with the estimator that was just fitted and
    # is logged below, rather than reaching for it through `model_eval`.
    y_val_preds = model.predict_proba(X_valid_processed)[:, 1]
    val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)

    mlflow.log_metric('valid_roc_auc', val_roc_auc)

    # Log the model; its pyfunc flavor serves class probabilities.
    mlflow.sklearn.log_model(model, 'lr_model', pyfunc_predict_fn='predict_proba')


2023-12-09 07:11:13 [info     ] Iniciando o processamento
2023-12-09 07:11:13 [info     ] Iniciando a Transformação
2023-12-09 07:11:13 [info     ] Iniciando a Transformação
2023-12-09 07:11:13 [info     ] Iniciando a Cross Validation...




## 4.1 Experiment 01

In [10]:
from mlflow.models import MetricThreshold
from sklearn.dummy import DummyClassifier
from mlflow.models import infer_signature

In [12]:
with mlflow.start_run(run_name='with_discretizer'):
    # Experiment 01: same flow as the baseline run plus an equal-frequency
    # discretizer, followed by MLflow model validation against a dummy
    # baseline and a SHAP explanation. The `with` block closes the run on
    # exit, so no explicit end_run() call is needed.
    mlflow.set_tag('model_name', 'lr_discretizer')

    # Read the project config once; both pipeline steps take their variable
    # lists from it.
    config = load_config_file()

    # Build and fit the preprocessing pipeline on the training split only.
    pipe = Pipeline([
        ('imputer', MeanMedianImputer(variables=config.get('vars_imputer'))),
        ('discretizer', EqualFrequencyDiscretiser(variables=config.get('vars_discretizer'))),
        ('scaler', SklearnTransformerWrapper(StandardScaler()))
    ])

    preprocessador = DataPreprocess(pipe)
    preprocessador.train(X_train)

    X_train_processed = preprocessador.transform(X_train)
    X_valid_processed = preprocessador.transform(X_valid)

    # Persist the fitted preprocessor locally, then attach it to the run as
    # an artifact.
    joblib.dump(preprocessador, '../models/preprocess.joblib')
    mlflow.log_artifact('../models/preprocess.joblib')

    # Log the preprocessing steps as run parameters (MLflow stores the
    # string form of each estimator).
    mlflow.log_params(params={'imputer': pipe['imputer'],
                              'discretizer': pipe['discretizer'],
                              'scaler': pipe['scaler']})

    # Cross-validated estimate on the training split.
    model = LogisticRegression()
    model_eval = ModelEvaluation(model, X_train_processed, y_train, n_splits=5)

    roc_auc_scores = model_eval.cross_val_evaluate()
    mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())

    # Fit the final model on the whole training split.
    model.fit(X_train_processed, y_train)

    # Score the validation split with the estimator that was just fitted and
    # is logged below, rather than reaching for it through `model_eval`.
    y_val_preds = model.predict_proba(X_valid_processed)[:, 1]
    val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)

    mlflow.log_metric('valid_roc_auc', val_roc_auc)

    # Log the candidate model and keep its URI for the validation step.
    candidate_model_uri = mlflow.sklearn.log_model(model, 'lr_model').model_uri

    # --- MLflow model validation -------------------------------------------
    signature = infer_signature(X_valid_processed, y_valid)

    # Build the evaluation frame as a copy with the target appended, so the
    # processed validation features are not mutated and can be reused as-is
    # for the SHAP explanation below.
    eval_data = X_valid_processed.assign(label=y_valid)

    thresholds = {
        "accuracy_score": MetricThreshold(
            threshold=0.7,  # accuracy must be above 0.7 for the candidate to pass
            min_absolute_change=0.05,  # required absolute gain over the baseline model
            min_relative_change=0.05,  # required relative gain over the baseline model
            greater_is_better=True
        )
    }

    # Seed the uniform-random baseline so the comparison against the
    # thresholds gives the same verdict on every re-run.
    baseline_model = DummyClassifier(strategy='uniform', random_state=42).fit(X_train_processed, y_train)
    baseline_model_uri = mlflow.sklearn.log_model(baseline_model, 'baseline_model', signature=signature).model_uri

    # Evaluate the candidate and validate it against thresholds and baseline.
    mlflow.evaluate(
        candidate_model_uri,
        eval_data,
        targets='label',
        model_type='classifier',
        validation_thresholds=thresholds,
        baseline_model=baseline_model_uri,
    )

    # Model explainability with SHAP on the validation features.
    # NOTE(review): this explains the full validation frame, which may be
    # slow — consider passing a sample if run time becomes a problem.
    mlflow.shap.log_explanation(model.predict, X_valid_processed)


2023-12-09 07:27:47 [info     ] Iniciando o processamento
2023-12-09 07:27:47 [info     ] Iniciando a Transformação
2023-12-09 07:27:47 [info     ] Iniciando a Transformação
2023-12-09 07:27:47 [info     ] Iniciando a Cross Validation...


  outputs = _infer_schema(model_output) if model_output is not None else None
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 165.78it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 79.71it/s]
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  data = data.applymap(_hash_array_like_element_as_bytes)
  data = data.applymap(_hash_array_like_element_as_bytes)
  return _infer_schema(self._df)
2023/12/09 07:27:58 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/12/09 07:27:58 INFO mlflow.models.evaluation.default_evaluator: Evaluating candidate model:
2023/12/09 07:27:58 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2023/12/09 07:27:58 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/12/09 07:27:58 INFO mlflow.models.evaluat

KeyboardInterrupt: 