# Imports

In [1]:
from sklearn.pipeline import Pipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler

import sys
import joblib
sys.path.insert(0,'../src/')
from utils.utils import load_config_file

from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from data.data_preprocess import DataPreprocess
from train.train import TrainModels
from sklearn.linear_model import LogisticRegression
from evaluation.classifier_eval import ModelEvaluation

# Data Load

In [2]:
# Instantiate the project's loader and read the training dataset that is
# registered under the name 'train_dataset_name'.
# NOTE(review): how that name maps to a file/table is decided inside
# DataLoad (presumably via the project config) — confirm.
dl = DataLoad()
df = dl.load_data('train_dataset_name')

2024-01-21 16:46:02 [info     ] Initiating data load with name: train_dataset_name


# Data Validation

In [3]:
# Run the project's validation checks against the loaded frame.
# NOTE(review): `is_valid` is stored but never inspected in the cells shown
# here, so a failed validation would not stop the notebook — confirm whether
# `dv.run` raises on failure or whether an explicit check is needed.
dv = DataValidation()
is_valid = dv.run(df)

2024-01-21 16:46:02 [info     ] Initiating validation...      
2024-01-21 16:46:02 [info     ] Validation columns passed...  
2024-01-21 16:46:02 [info     ] Success on validate data      


# Data Transformation

In [4]:
# Wrap the validated frame in the project's transformation helper; the
# train/validation split below is produced from this object.
dt = DataTransformation(df)

In [5]:
# Split into training and validation features/targets. These four globals are
# read directly by `objective` further down.
# NOTE(review): split ratio, stratification and random seed are decided inside
# DataTransformation — confirm the split is seeded so runs are comparable.
X_train, X_val, y_train, y_val = dt.train_test_spliting()

# Experimentation

In [6]:
import mlflow
from mlflow.tracking import MlflowClient

In [7]:
# NOTE(review): `import os` would normally live in the imports cell at the top
# of the notebook.
import os
# Point the Google Cloud client libraries at a service-account key file. The
# path is relative, so it is resolved against the kernel's working directory.
# The experiment's artifact location is a gs:// bucket, so this must be set
# before any artifact is logged.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'credentials.json'
# Use the MLflow tracking server on localhost:5000 for all runs below.
mlflow.set_tracking_uri('http://localhost:5000')
# Select the experiment that subsequent runs are recorded under. Kept as the
# last expression so the returned Experiment is displayed as the cell output.
mlflow.set_experiment('prob_loan_gcp')

<Experiment: artifact_location='gs://cds_mlflow_pedro/1', creation_time=1705864558757, experiment_id='1', last_update_time=1705864558757, lifecycle_stage='active', name='prob_loan_gcp', tags={}>

## Hyperparameters

In [8]:
from mlflow.models import MetricThreshold, infer_signature
from sklearn.dummy import DummyClassifier

In [9]:
from hyperopt import fmin, tpe, hp, STATUS_OK

In [10]:
# Preprocessing steps shared by every hyperopt trial, in execution order:
#   1. 'imputer'     - MeanMedianImputer on the columns listed under
#                      'vars_imputer' in the project config file.
#   2. 'discretizer' - EqualFrequencyDiscretiser on the columns listed under
#                      'vars_discretizer' in the project config file.
#   3. 'scaler'      - StandardScaler applied through feature_engine's
#                      SklearnTransformerWrapper.
pipe = Pipeline(
    steps=[
        (
            'imputer',
            MeanMedianImputer(
                variables=load_config_file().get('vars_imputer'),
            ),
        ),
        (
            'discretizer',
            EqualFrequencyDiscretiser(
                variables=load_config_file().get('vars_discretizer'),
            ),
        ),
        (
            'scaler',
            SklearnTransformerWrapper(StandardScaler()),
        ),
    ]
)

In [11]:
def objective(params):
    """Hyperopt objective: train, evaluate and log one LogisticRegression trial.

    Each call opens its own MLflow run, fits the shared preprocessing
    pipeline, cross-validates and fits a LogisticRegression built from
    ``params``, logs both the candidate and a DummyClassifier baseline, and
    runs ``mlflow.evaluate`` to compare them.

    Reads the notebook globals ``pipe``, ``X_train``, ``X_val``, ``y_train``
    and ``y_val``. Writes the fitted preprocessor to
    ``../models/preprocessor.pkl`` (overwritten on every trial).

    Parameters
    ----------
    params : dict
        Keyword arguments for ``LogisticRegression``, as sampled by hyperopt
        from the search space.

    Returns
    -------
    dict
        ``{'loss': <negated mean of the cross-validation scores>,
        'status': STATUS_OK}``. The scores are negated because hyperopt
        minimises the loss.
    """
    # The context manager closes the run on exit (and marks it failed if an
    # exception escapes), so no explicit mlflow.end_run() is needed.
    with mlflow.start_run(run_name='with_discretizer_hyperopt'):
        mlflow.set_tag('model_name','lr_hyperopt')
        mlflow.log_params(params)

        # Fit the preprocessing on the training split only, then apply it to
        # both splits.
        # NOTE(review): this work does not depend on `params`, so it is
        # repeated identically on every trial.
        preprocessor = DataPreprocess(pipe)
        preprocessor.train(X_train)

        X_train_processed = preprocessor.transform(X_train)
        X_val_processed = preprocessor.transform(X_val)

        # Persist the fitted preprocessor locally and attach it to the run.
        joblib.dump(preprocessor,'../models/preprocessor.pkl')
        mlflow.log_artifact('../models/preprocessor.pkl')
        mlflow.log_params(params={'imputer':pipe['imputer'], 'discretizer': pipe['discretizer'], 'scaler': pipe['scaler']})

        model = LogisticRegression(**params)

        # Cross-validate on the training split; the mean score is both logged
        # and used as the optimisation target.
        model_eval = ModelEvaluation(model, X_train_processed, y_train, n_splits=5)
        roc_auc_scores = model_eval.cross_val_evaluate()
        mean_cv_roc_auc = roc_auc_scores.mean()
        mlflow.log_metric('train_roc_auc', mean_cv_roc_auc)

        model.fit(X_train_processed, y_train)

        # NOTE(review): presumably `model_eval.model` is the same object as
        # `model` (fitted just above) — confirm against ModelEvaluation.
        y_val_preds = model_eval.model.predict_proba(X_val_processed)[:,1]
        val_roc_auc = model_eval.evaluate_predictions(y_val,y_val_preds)
        mlflow.log_metric('val_roc_auc',val_roc_auc)

        # Infer the signature from the feature frame before any label column
        # is attached, and log the candidate with the same signature as the
        # baseline so both models carry an input schema.
        signature = infer_signature(X_val_processed, y_val)
        candidate_model_uri = mlflow.sklearn.log_model(model, 'lr_model', signature=signature).model_uri

        # Work on a copy so that adding the target column does not mutate the
        # processed validation features.
        eval_data = X_val_processed.copy()
        eval_data['label'] = y_val

        # NOTE(review): the original comment said the accuracy score must be
        # > 0.7, but the configured threshold is 0.1 — confirm which value is
        # intended.
        thresholds = {
            'accuracy_score': MetricThreshold(threshold=0.1,
                            min_absolute_change=0.05, min_relative_change=0.05, greater_is_better=True)
        }
        baseline_model = DummyClassifier(strategy='uniform').fit(X_train_processed, y_train)
        baseline_model_uri = mlflow.sklearn.log_model(baseline_model, 'baseline_model',signature=signature).model_uri

        # NOTE(review): the recorded run failed during this call with a 404 on
        # /api/2.0/mlflow/runs/log-inputs; that looks like a tracking server
        # older than the client library — confirm the server/client versions.
        mlflow.evaluate(candidate_model_uri,eval_data,
                        targets='label',model_type='classifier',validation_thresholds=thresholds,
                        baseline_model = baseline_model_uri)

        # SHAP explainability (disabled)
        # mlflow.shap.log_explanation(model.predict, X_val_processed)

        return {'loss': -1*mean_cv_roc_auc, 'status': STATUS_OK}

In [12]:
# Hyperopt search space for LogisticRegression. Each key is passed straight
# through as a constructor keyword argument by `objective`.
# Key order and hyperopt labels are kept as-is so sampling is unchanged.
search_space = {
    # Boolean switches
    'warm_start': hp.choice('warm_start', [True, False]),
    'fit_intercept': hp.choice('fit_intercept', [True, False]),

    # Continuous ranges (sampled uniformly)
    'tol': hp.uniform('tol', 0.00001, 0.0001),
    'C': hp.uniform('C', 0.05, 3),

    # Categorical / integer choices
    'solver': hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear']),
    'max_iter': hp.choice('max_iter', range(100,1000)),

    # Fixed value, not tuned
    'multi_class': 'auto',

    'class_weight': hp.choice('class_weight', [None, 'balanced']),
}

In [13]:
# Run the TPE search: five trials, each one a full `objective` call that
# opens its own MLflow run.
# NOTE(review): no `rstate` is passed, so the sampled trials are presumably
# not reproducible between kernel restarts — confirm whether a seed is wanted.
# NOTE(review): for `hp.choice` parameters hyperopt reports the chosen index
# in the result, not the value — verify before reusing `best_result` as
# model kwargs.
best_result = fmin(
    objective,
    search_space,
    algo=tpe.suggest,
    max_evals=5,
)

2024-01-21 16:46:04                                   [info     ] Starting preprocessing...     
2024-01-21 16:46:04                                   [info     ] Initiating preprocessor data transformation...
2024-01-21 16:46:04                                   [info     ] Initiating preprocessor data transformation...
2024-01-21 16:46:07                                   [info     ] Initiating cross validation...
2024-01-21 16:46:09                                   [info     ] Initiating model validation...
  0%|          | 0/5 [00:05<?, ?trial/s, best loss=?]

  outputs = _infer_schema(model_output) if model_output is not None else None



* 'schema_extra' has been renamed to 'json_schema_extra'

  from .autonotebook import tqdm as notebook_tqdm

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:  20%|##        | 1/5 [00:00<00:00,  6.85it/s]

Downloading artifacts:  20%|##        | 1/5 [00:00<00:00,  6.85it/s]

Downloading artifacts:  40%|####      | 2/5 [00:00<00:01,  2.73it/s]

Downloading artifacts:  40%|####      | 2/5 [00:00<00:01,  2.73it/s]

Downloading artifacts:  60%|######    | 3/5 [00:00<00:00,  2.73it/s]

Downloading artifacts:  80%|########  | 4/5 [00:00<00:00,  2.73it/s]

Downloading artifacts: 100%|##########| 5/5 [00:00<00:00,  2.73it/s]

Downloading artifacts: 100%|##########| 5/5 [00:00<00:00,  7.19it/s]
Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:  20%|##        | 1/5 [00:00<00:00,  6.49it/s]

Downloading artifacts:  20%|##        | 1/5 [00:00<00:00,

  0%|          | 0/5 [00:22<?, ?trial/s, best loss=?]


MlflowException: API request to endpoint /api/2.0/mlflow/runs/log-inputs failed with error code 404 != 200. Response body: '<!doctype html>
<html lang=en>
<title>404 Not Found</title>
<h1>Not Found</h1>
<p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>
'