# Imports

In [15]:
from sklearn.pipeline import Pipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler

import sys
import joblib
sys.path.insert(0,'../src/')
from utils.utils import load_config_file

from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from data.data_preprocess import DataPreprocess
from train.train import TrainModels
from sklearn.linear_model import LogisticRegression
from evaluation.classifier_eval import ModelEvaluation

# Data Load

In [16]:
# Instantiate the project's data loader and load the dataset registered
# under the name 'train_dataset_name'.
# NOTE(review): `df` is used below as a tabular dataset; the concrete return
# type of `load_data` is defined in data/data_load.py — confirm there.
dl = DataLoad()
df = dl.load_data('train_dataset_name')

2024-01-18 07:26:40 [info     ] Initiating data load with name: train_dataset_name


# Data Validation

In [17]:
# Run the project's validation checks against the loaded data.
# NOTE(review): `is_valid` is not consulted by any later cell visible in this
# notebook, so a failed validation would not stop the run — consider guarding
# on it once the return contract of `DataValidation.run` is confirmed.
dv = DataValidation()
is_valid = dv.run(df)

2024-01-18 07:26:41 [info     ] Initiating validation...      
2024-01-18 07:26:41 [info     ] Validation columns passed...  
2024-01-18 07:26:41 [info     ] Success on validate data      


# Data Transformation

In [18]:
# Wrap the loaded data in the project's transformation helper, which performs
# the train/validation split in the next cell.
dt = DataTransformation(df)

In [19]:
# Split into training/validation features and targets.
# (`train_test_spliting` is the method name as spelled in the project API.)
# NOTE(review): split ratio and random seed are set inside DataTransformation
# and are not visible here — confirm the split is seeded for reproducibility.
X_train, X_val, y_train, y_val = dt.train_test_spliting()

# Experiments

In [20]:
import mlflow
from mlflow.tracking import MlflowClient

In [21]:
# Target the locally running MLflow tracking server, then select the experiment.
# `set_experiment` stays as the last expression so its return value remains the
# cell's displayed output.
mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')
mlflow.set_experiment(experiment_name='prob_loan')

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1705570296224, experiment_id='1', last_update_time=1705570296224, lifecycle_stage='active', name='prob_loan', tags={}>

## Hyperparameters

In [22]:
from mlflow.models import MetricThreshold, infer_signature
from sklearn.dummy import DummyClassifier

In [23]:
from hyperopt import fmin, tpe, hp, STATUS_OK

In [24]:
# Load the project configuration once and reuse it for every step, instead of
# re-reading it for each transformer.
preprocess_config = load_config_file()

# Preprocessing pipeline applied before the classifier:
#   1. impute the variables listed under 'vars_imputer'
#   2. equal-frequency discretise the variables listed under 'vars_discretizer'
#   3. standard-scale the result through the feature_engine sklearn wrapper
pipe = Pipeline(
    [
        ('imputer', MeanMedianImputer(variables=preprocess_config.get('vars_imputer'))),
        ('discretizer', EqualFrequencyDiscretiser(variables=preprocess_config.get('vars_discretizer'))),
        ('scaler', SklearnTransformerWrapper(StandardScaler())),
    ]
)

In [25]:
def objective(params):
    """Hyperopt objective: train and evaluate one LogisticRegression candidate.

    Each call opens its own MLflow run, preprocesses the train/validation
    splits with the shared `pipe`, cross-validates and fits a
    ``LogisticRegression(**params)``, logs metrics and models, and validates
    the candidate against a uniform ``DummyClassifier`` baseline through
    ``mlflow.evaluate``.

    Args:
        params: dict of keyword arguments forwarded to ``LogisticRegression``
            (one sample drawn from the hyperopt search space).

    Returns:
        dict with ``'loss'`` set to the negated mean of the cross-validation
        scores (the same value logged as ``train_roc_auc``) and ``'status'``
        set to ``STATUS_OK``.
    """
    # The context manager closes the run on exit (including on exceptions),
    # so no explicit mlflow.end_run() is needed inside the block.
    with mlflow.start_run(run_name='with_discretizer_hyperopt'):
        mlflow.set_tag('model_name', 'lr_hyperopt')
        mlflow.log_params(params)

        # Train the preprocessor on the training split, then transform both splits.
        preprocessor = DataPreprocess(pipe)
        preprocessor.train(X_train)

        X_train_processed = preprocessor.transform(X_train)
        X_val_processed = preprocessor.transform(X_val)

        # Persist the preprocessor locally and attach it to the run.
        joblib.dump(preprocessor, '../models/preprocessor.pkl')
        mlflow.log_artifact('../models/preprocessor.pkl')
        mlflow.log_params(params={'imputer': pipe['imputer'],
                                  'discretizer': pipe['discretizer'],
                                  'scaler': pipe['scaler']})

        model = LogisticRegression(**params)

        # Cross-validate on the processed training data; the mean score is both
        # logged and used (negated) as the optimisation loss.
        model_eval = ModelEvaluation(model, X_train_processed, y_train, n_splits=5)
        roc_auc_scores = model_eval.cross_val_evaluate()
        mean_cv_roc_auc = roc_auc_scores.mean()
        mlflow.log_metric('train_roc_auc', mean_cv_roc_auc)

        model.fit(X_train_processed, y_train)

        # Positive-class probabilities on the validation split.
        y_val_preds = model_eval.model.predict_proba(X_val_processed)[:, 1]
        val_roc_auc = model_eval.evaluate_predictions(y_val, y_val_preds)
        mlflow.log_metric('val_roc_auc', val_roc_auc)

        # Infer the signature from the feature-only frame and attach it to both
        # logged models so candidate and baseline carry the same schema.
        signature = infer_signature(X_val_processed, y_val)
        candidate_model_uri = mlflow.sklearn.log_model(
            model, 'lr_model', signature=signature
        ).model_uri

        # Build the evaluation frame as a new object: assigning the label
        # column directly would also add it to X_val_processed (same object).
        eval_data = X_val_processed.assign(label=y_val)

        thresholds = {
            # Candidate accuracy must exceed this absolute threshold and beat
            # the baseline by the given absolute/relative margins.
            # NOTE(review): the original comment said the accuracy must be
            # > 0.7, but the coded threshold is 0.1 — confirm the intended value.
            'accuracy_score': MetricThreshold(threshold=0.1,
                                              min_absolute_change=0.05,
                                              min_relative_change=0.05,
                                              greater_is_better=True)
        }
        baseline_model = DummyClassifier(strategy='uniform').fit(X_train_processed, y_train)
        baseline_model_uri = mlflow.sklearn.log_model(
            baseline_model, 'baseline_model', signature=signature
        ).model_uri

        mlflow.evaluate(candidate_model_uri, eval_data,
                        targets='label', model_type='classifier',
                        validation_thresholds=thresholds,
                        baseline_model=baseline_model_uri)

        # SHAP explainability (disabled):
        # mlflow.shap.log_explanation(model.predict, X_val_processed)

        return {'loss': -1 * mean_cv_roc_auc, 'status': STATUS_OK}

In [26]:
# Hyperopt search space; each sampled dict is passed to the objective as
# LogisticRegression keyword arguments.
search_space = dict(
    # Boolean switches
    warm_start=hp.choice('warm_start', [True, False]),
    fit_intercept=hp.choice('fit_intercept', [True, False]),
    # Continuous ranges
    tol=hp.uniform('tol', 0.00001, 0.0001),
    C=hp.uniform('C', 0.05, 3),
    # Categorical / integer choices
    solver=hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear']),
    max_iter=hp.choice('max_iter', range(100,1000)),
    # Fixed (not searched)
    multi_class='auto',
    class_weight=hp.choice('class_weight', [None, 'balanced']),
)

In [27]:
import numpy as np  # local to this cell: only needed to seed the search

# Run the TPE search over `search_space`, minimising the loss returned by
# `objective`. The search is seeded so a Restart & Run All draws the same
# hyperparameter samples.
# NOTE(review): passing a numpy Generator as `rstate` assumes hyperopt >= 0.2.7;
# older releases expect np.random.RandomState — confirm the installed version.
# NOTE(review): for hp.choice parameters fmin reports the chosen index, not the
# value; decode with hyperopt.space_eval(search_space, best_result) — verify.
best_result = fmin(fn=objective,
                   space=search_space,
                   algo=tpe.suggest,
                   max_evals=5,
                   rstate=np.random.default_rng(42))

  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

2024-01-18 07:26:41                                   [info     ] Starting preprocessing...     
2024-01-18 07:26:41                                   [info     ] Initiating preprocessor data transformation...
2024-01-18 07:26:41                                   [info     ] Initiating preprocessor data transformation...
2024-01-18 07:26:41                                   [info     ] Initiating cross validation...
2024-01-18 07:26:42                                   [info     ] Initiating model validation...
  0%|          | 0/5 [00:01<?, ?trial/s, best loss=?]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

2024-01-18 07:26:51                                                              [info     ] Starting preprocessing...     
2024-01-18 07:26:52                                                              [info     ] Initiating preprocessor data transformation...
2024-01-18 07:26:52                                                              [info     ] Initiating preprocessor data transformation...
2024-01-18 07:26:52                                                              [info     ] Initiating cross validation...
2024-01-18 07:26:53                                                              [info     ] Initiating model validation...
 20%|██        | 1/5 [00:12<00:41, 10.31s/trial, best loss: -0.7919636161700507]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

2024-01-18 07:27:02                                                              [info     ] Starting preprocessing...     
2024-01-18 07:27:02                                                              [info     ] Initiating preprocessor data transformation...
2024-01-18 07:27:02                                                              [info     ] Initiating preprocessor data transformation...
2024-01-18 07:27:02                                                              [info     ] Initiating cross validation...
2024-01-18 07:27:03                                                              [info     ] Initiating model validation...
 40%|████      | 2/5 [00:22<00:30, 10.28s/trial, best loss: -0.8230082262376218]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

2024-01-18 07:27:12                                                              [info     ] Starting preprocessing...     
2024-01-18 07:27:12                                                              [info     ] Initiating preprocessor data transformation...
2024-01-18 07:27:12                                                              [info     ] Initiating preprocessor data transformation...
2024-01-18 07:27:12                                                              [info     ] Initiating cross validation...
2024-01-18 07:27:13                                                              [info     ] Initiating model validation...
 60%|██████    | 3/5 [00:31<00:20, 10.24s/trial, best loss: -0.8334516351645499]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

2024-01-18 07:27:22                                                              [info     ] Starting preprocessing...     
2024-01-18 07:27:22                                                              [info     ] Initiating preprocessor data transformation...
2024-01-18 07:27:22                                                              [info     ] Initiating preprocessor data transformation...
2024-01-18 07:27:22                                                              [info     ] Initiating cross validation...
2024-01-18 07:27:24                                                              [info     ] Initiating model validation...
 80%|████████  | 4/5 [00:42<00:10, 10.09s/trial, best loss: -0.8334714654361722]

Distutils was imported before Setuptools, but importing Setuptools also replaces the `distutils` module in `sys.modules`. This may lead to undesirable behaviors or errors. To avoid these issues, avoid using distutils directly, ensure that setuptools is installed in the traditional way (e.g. not an editable install), and/or make sure that setuptools is always imported before distutils.

Setuptools is replacing distutils.

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/doc

100%|██████████| 5/5 [00:52<00:00, 10.47s/trial, best loss: -0.8334714654361722]
