# Imports

In [1]:
import sys
import pandas as pd
import joblib
sys.path.insert(0, '../src/')

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser


from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from data.data_preprocess import DataPreprocess
from utils.utils import load_config_file
from train.train_model import TrainModel
from evaluation.classifiers_eval import ModelEvaluation

# 1.0 Data Load

In [2]:
# Load the training data through the project's DataLoad helper.
# 'train_dataset_name' is presumably a key that DataLoad resolves to a dataset — TODO confirm.
dl = DataLoad()
df = dl.load_data('train_dataset_name')

2023-12-09 07:47:32 [info     ] Iniciando o carregamento


# 2.0 Data Validation

In [3]:
# Run the project's validation checks on the loaded frame.
# NOTE(review): `is_valid` is not inspected by any later cell visible here, so a
# failed validation would not stop the notebook — confirm whether that is intended.
dv = DataValidation()
is_valid = dv.run(df)

2023-12-09 07:47:32 [info     ] Validação iniciou..
2023-12-09 07:47:32 [info     ] Validation columns passed...
2023-12-09 07:47:32 [info     ] Validacao com sucesso.


# 3.0 Data Transformation

In [4]:
# Wrap the loaded frame in the project's DataTransformation helper (used below for the split).
dt = DataTransformation(df)

In [5]:
# Split into training/validation features and targets.
# No split ratio or random seed is passed here; whatever DataTransformation uses applies.
X_train, X_valid, y_train, y_valid = dt.train_test_split()

# 4.0 Experiments

In [6]:
import mlflow
from mlflow.tracking import MlflowClient


* 'schema_extra' has been renamed to 'json_schema_extra'


In [7]:
# Point MLflow at the tracking server on localhost:5000 and select the experiment
# that all runs below are recorded under. The tracking server must already be reachable
# at this address for the later logging calls to succeed.
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('prob_loan')

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1702116604955, experiment_id='1', last_update_time=1702116604955, lifecycle_stage='active', name='prob_loan', tags={}>

## 4.1 Hyperparameters

In [8]:
from mlflow.models import MetricThreshold
from sklearn.dummy import DummyClassifier
from mlflow.models import infer_signature

In [9]:
from hyperopt import fmin, tpe, hp, STATUS_OK

In [12]:
# Preprocessing pipeline: impute -> discretise -> scale.
# The variable lists for the imputer and the discretiser are read via load_config_file().
imputer_step = MeanMedianImputer(variables=load_config_file().get('vars_imputer'))
discretizer_step = EqualFrequencyDiscretiser(variables=load_config_file().get('vars_discretizer'))
scaler_step = SklearnTransformerWrapper(StandardScaler())

pipe = Pipeline(steps=[
    ('imputer', imputer_step),
    ('discretizer', discretizer_step),
    ('scaler', scaler_step),
])

In [18]:
def objective(params):
    """Hyperopt objective: train, evaluate and log one LogisticRegression trial.

    Each call opens its own MLflow run, trains the ``DataPreprocess`` wrapper
    around the notebook-level ``pipe`` on ``X_train``, scores the candidate
    model through ``ModelEvaluation.cross_val_evaluate``, measures it on the
    validation split and compares it against a ``DummyClassifier`` baseline
    with ``mlflow.evaluate``.

    Parameters
    ----------
    params : dict
        Keyword arguments sampled from the search space. They are logged as
        run parameters and forwarded unchanged to ``LogisticRegression(**params)``.

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}`` where ``loss`` is the negated
        mean of the cross-validation scores (negated because ``fmin`` minimises).
    """
    # The context manager closes the run on exit, so no explicit end_run() is needed.
    with mlflow.start_run(run_name='with_discretizer_hyperopt'):
        mlflow.set_tag('model_name', 'lr_hyperopt')
        mlflow.log_params(params)

        # Train the preprocessor on the training split only, then apply it to both splits.
        preprocessador = DataPreprocess(pipe)
        preprocessador.train(X_train)

        X_train_processed = preprocessador.transform(X_train)
        X_valid_processed = preprocessador.transform(X_valid)
        joblib.dump(preprocessador, '../models/preprocess.joblib')

        # Log the serialized preprocessor as a run artifact.
        mlflow.log_artifact('../models/preprocess.joblib')

        # Log the preprocessing steps as run parameters.
        mlflow.log_params(params={'imputer': pipe['imputer'],
                                  'discretizer': pipe['discretizer'],
                                  'scaler': pipe['scaler']})

        # Cross-validate the candidate model on the processed training data.
        model = LogisticRegression(**params)
        model_eval = ModelEvaluation(model, X_train_processed, y_train, n_splits=5)

        roc_auc_scores = model_eval.cross_val_evaluate()
        mean_cv_score = roc_auc_scores.mean()

        # Log the cross-validation performance.
        mlflow.log_metric('train_roc_auc', mean_cv_score)

        # Fit the final model on the whole training split.
        model.fit(X_train_processed, y_train)

        # Score the validation split with the estimator that was just fitted and
        # that is logged below, rather than relying on ModelEvaluation holding
        # the very same instance.
        y_val_preds = model.predict_proba(X_valid_processed)[:, 1]
        val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)

        mlflow.log_metric('valid_roc_auc', val_roc_auc)

        # Log the candidate model.
        candidate_model_uri = mlflow.sklearn.log_model(model, 'lr_model').model_uri

        signature = infer_signature(X_valid_processed, y_valid)

        # Build the evaluation frame as a copy so the processed validation
        # features are not modified in place by adding the label column.
        eval_data = X_valid_processed.assign(label=y_valid)
        thresholds = {
            "accuracy_score": MetricThreshold(
                # NOTE(review): the original comment said the accuracy must exceed 0.7,
                # but the configured threshold is 0.1 — confirm the intended value.
                threshold=0.1,
                min_absolute_change=0.05,
                min_relative_change=0.05,
                greater_is_better=True
            )
        }

        # Baseline the candidate has to beat: a classifier that predicts uniformly at random.
        baseline_model = DummyClassifier(strategy='uniform').fit(X_train_processed, y_train)
        baseline_model_uri = mlflow.sklearn.log_model(baseline_model, 'baseline_model', signature=signature).model_uri

        # MLflow evaluation: candidate vs. baseline, checked against the thresholds above.
        mlflow.evaluate(candidate_model_uri, eval_data, targets='label', model_type='classifier', validation_thresholds=thresholds, baseline_model=baseline_model_uri)

        return {'loss': -mean_cv_score, 'status': STATUS_OK}


In [19]:
# Hyperopt search space; each entry becomes a LogisticRegression keyword argument.
# Every hp.* label matches its key; 'multi_class' is a fixed value rather than a sampled one.
search_space = dict(
    warm_start=hp.choice('warm_start', [True, False]),
    fit_intercept=hp.choice('fit_intercept', [True, False]),
    tol=hp.uniform('tol', 0.00001, 0.0001),
    C=hp.uniform('C', 0.05, 3),
    solver=hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear']),
    max_iter=hp.choice('max_iter', range(100, 1000)),
    multi_class='auto',
    class_weight=hp.choice('class_weight', [None, 'balanced']),
)

In [20]:
# Run the TPE search: `objective` is called once per trial, 5 trials in total.
# NOTE(review): no `rstate` is passed, so the sampled trials are presumably not
# reproducible between runs — confirm whether a fixed seed is wanted.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=5,
)

2023-12-09 08:08:50                                   [info     ] Iniciando o processamento
2023-12-09 08:08:50                                   [info     ] Iniciando a Transformação
2023-12-09 08:08:50                                   [info     ] Iniciando a Transformação
2023-12-09 08:08:50                                   [info     ] Iniciando a Cross Validation...
  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

2023-12-09 08:09:08                                                              [info     ] Iniciando o processamento
2023-12-09 08:09:08                                                              [info     ] Iniciando a Transformação
2023-12-09 08:09:08                                                              [info     ] Iniciando a Transformação
2023-12-09 08:09:09                                                              [info     ] Iniciando a Cross Validation...
 20%|██        | 1/5 [00:18<01:12, 18.14s/trial, best loss: -0.8245986009834112]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

2023-12-09 08:09:24                                                              [info     ] Iniciando o processamento
2023-12-09 08:09:25                                                              [info     ] Iniciando a Transformação
2023-12-09 08:09:25                                                              [info     ] Iniciando a Transformação
2023-12-09 08:09:25                                                              [info     ] Iniciando a Cross Validation...
 40%|████      | 2/5 [00:34<00:50, 16.75s/trial, best loss: -0.8245986009834112]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

2023-12-09 08:09:41                                                              [info     ] Iniciando o processamento
2023-12-09 08:09:41                                                              [info     ] Iniciando a Transformação
2023-12-09 08:09:41                                                              [info     ] Iniciando a Transformação
2023-12-09 08:09:41                                                              [info     ] Iniciando a Cross Validation...
 60%|██████    | 3/5 [00:51<00:33, 16.91s/trial, best loss: -0.8245986009834112]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

2023-12-09 08:10:00                                                              [info     ] Iniciando o processamento
2023-12-09 08:10:00                                                              [info     ] Iniciando a Transformação
2023-12-09 08:10:00                                                              [info     ] Iniciando a Transformação
2023-12-09 08:10:00                                                              [info     ] Iniciando a Cross Validation...
 80%|████████  | 4/5 [01:10<00:17, 17.59s/trial, best loss: -0.8245986009834112]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

100%|██████████| 5/5 [01:25<00:00, 17.02s/trial, best loss: -0.8349507962534523]
