# Imports

In [5]:
from sklearn.pipeline import Pipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler

import sys
import joblib
sys.path.insert(0,'../src/')
from utils.utils import load_config_file

from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from data.data_preprocess import DataPreprocess
from train.train import TrainModels
from sklearn.linear_model import LogisticRegression
from evaluation.classifier_eval import ModelEvaluation

# Data Load

In [6]:
# Load the training data identified by the name 'train_dataset_name'.
# NOTE(review): `df` is presumably a pandas DataFrame (it is passed straight to
# DataValidation.run and DataTransformation in later cells) - confirm against
# data.data_load.
dl = DataLoad()
df = dl.load_data('train_dataset_name')

2024-01-18 06:43:25 [info     ] Initiating data load with name: train_dataset_name


# Data Validation

In [7]:
# Run the project's validation checks against the loaded data.
# NOTE(review): `is_valid` is not inspected by any of the cells visible in this
# notebook, so a failed validation would not stop the experiments below.
# Consider checking it once the return type of DataValidation.run is confirmed.
dv = DataValidation()
is_valid = dv.run(df)

2024-01-18 06:43:25 [info     ] Initiating validation...      
2024-01-18 06:43:25 [info     ] Validation columns passed...  
2024-01-18 06:43:25 [info     ] Success on validate data      


# Data Transformation

In [8]:
# Wrap the loaded data in the project's transformation helper; the next cell
# uses it to produce the train/validation split.
dt = DataTransformation(df)

In [9]:
# Split the data into training and validation features/targets.
# NOTE(review): split ratio, stratification and random seed are decided inside
# DataTransformation.train_test_spliting (not visible here) - confirm the split
# is seeded so MLflow runs are comparable across kernel restarts.
X_train, X_val, y_train, y_val = dt.train_test_spliting()

# Experiments

In [10]:
import mlflow
from mlflow.tracking import MlflowClient

In [11]:
# MLflow tracking configuration: the runs started below are recorded on this
# tracking server, under this experiment.
TRACKING_URI = 'http://127.0.0.1:5000'
EXPERIMENT_NAME = 'prob_loan'

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1705570296224, experiment_id='1', last_update_time=1705570296224, lifecycle_stage='active', name='prob_loan', tags={}>

In [14]:
# Baseline experiment: imputation + standard scaling feeding a default
# LogisticRegression. Everything is tracked in a single MLflow run; the `with`
# block closes the run on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'lr_baseline')

    # Preprocessing pipeline: impute the variables listed under 'vars_imputer'
    # in the project config, then standard-scale.
    pipe = Pipeline(
        [
            ('imputer', MeanMedianImputer(variables=load_config_file().get('vars_imputer'))),
            ('scaler', SklearnTransformerWrapper(StandardScaler())),
        ]
    )

    # Train the preprocessor on the training split only, then apply it to both
    # splits so the validation data does not influence the preprocessing.
    preprocessor = DataPreprocess(pipe)
    preprocessor.train(X_train)

    X_train_processed = preprocessor.transform(X_train)
    X_val_processed = preprocessor.transform(X_val)

    # Persist the trained preprocessor and attach it to the run.
    joblib.dump(preprocessor, '../models/preprocessor.pkl')
    mlflow.log_artifact('../models/preprocessor.pkl')
    # MLflow stores parameter values as strings, so the steps are recorded by
    # their repr.
    mlflow.log_params(params={'imputer': pipe['imputer'], 'scaler': pipe['scaler']})

    model = LogisticRegression()

    # Cross-validated ROC AUC on the training split.
    model_eval = ModelEvaluation(model, X_train_processed, y_train, n_splits=5)
    roc_auc_scores = model_eval.cross_val_evaluate()
    mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())

    # Fit on the full training split and score the held-out validation split.
    model.fit(X_train_processed, y_train)

    # Predict with the estimator that was just fitted rather than going through
    # model_eval.model, so this does not depend on ModelEvaluation keeping a
    # reference to the same object.
    y_val_preds = model.predict_proba(X_val_processed)[:, 1]
    val_roc_auc = model_eval.evaluate_predictions(y_val, y_val_preds)
    mlflow.log_metric('val_roc_auc', val_roc_auc)

    # Log the model so that its pyfunc flavour serves predict_proba.
    mlflow.sklearn.log_model(model, 'lr_model', pyfunc_predict_fn='predict_proba')

2024-01-18 06:44:57 [info     ] Starting preprocessing...     


2024-01-18 06:44:57 [info     ] Initiating preprocessor data transformation...
2024-01-18 06:44:57 [info     ] Initiating preprocessor data transformation...
2024-01-18 06:44:58 [info     ] Initiating cross validation...
2024-01-18 06:44:59 [info     ] Initiating model validation...


## Experiment 01

In [16]:
from mlflow.models import MetricThreshold, infer_signature
from sklearn.dummy import DummyClassifier

In [17]:
# Experiment 01: same setup as the baseline plus an equal-frequency
# discretiser. The candidate is then validated against a random-guess baseline
# with mlflow.evaluate, and SHAP explanations are logged. The `with` block
# closes the run on exit, so no explicit mlflow.end_run() is needed.
with mlflow.start_run(run_name='with_discretizer'):
    mlflow.set_tag('model_name', 'lr_discretizer')

    # Read the project configuration once and reuse it for every step.
    preprocess_config = load_config_file()

    pipe = Pipeline(
        [
            ('imputer', MeanMedianImputer(variables=preprocess_config.get('vars_imputer'))),
            ('discretizer', EqualFrequencyDiscretiser(variables=preprocess_config.get('vars_discretizer'))),
            ('scaler', SklearnTransformerWrapper(StandardScaler())),
        ]
    )

    # Train the preprocessor on the training split only, then apply it to both
    # splits.
    preprocessor = DataPreprocess(pipe)
    preprocessor.train(X_train)

    X_train_processed = preprocessor.transform(X_train)
    X_val_processed = preprocessor.transform(X_val)

    # Persist the trained preprocessor and attach it to the run.
    joblib.dump(preprocessor, '../models/preprocessor.pkl')
    mlflow.log_artifact('../models/preprocessor.pkl')
    # MLflow stores parameter values as strings, so the steps are recorded by
    # their repr.
    mlflow.log_params(
        params={
            'imputer': pipe['imputer'],
            'discretizer': pipe['discretizer'],
            'scaler': pipe['scaler'],
        }
    )

    model = LogisticRegression()

    # Cross-validated ROC AUC on the training split.
    model_eval = ModelEvaluation(model, X_train_processed, y_train, n_splits=5)
    roc_auc_scores = model_eval.cross_val_evaluate()
    mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())

    # Fit on the full training split and score the held-out validation split.
    model.fit(X_train_processed, y_train)

    # Predict with the estimator that was just fitted rather than going through
    # model_eval.model.
    y_val_preds = model.predict_proba(X_val_processed)[:, 1]
    val_roc_auc = model_eval.evaluate_predictions(y_val, y_val_preds)
    mlflow.log_metric('val_roc_auc', val_roc_auc)

    candidate_model_uri = mlflow.sklearn.log_model(model, 'lr_model').model_uri

    signature = infer_signature(X_val_processed, y_val)

    # Build the evaluation frame as a copy: assigning the label column directly
    # on X_val_processed would mutate the feature matrix and leak the target
    # into anything that uses it afterwards.
    eval_data = X_val_processed.assign(label=y_val)

    thresholds = {
        # Accuracy must exceed 0.7 and beat the baseline model by at least
        # 0.05 in both absolute and relative terms.
        'accuracy_score': MetricThreshold(
            threshold=0.7,
            min_absolute_change=0.05,
            min_relative_change=0.05,
            greater_is_better=True,
        )
    }

    # Random-guess reference model; seeded so the comparison is reproducible
    # between executions of this cell.
    baseline_model = DummyClassifier(strategy='uniform', random_state=42).fit(X_train_processed, y_train)
    baseline_model_uri = mlflow.sklearn.log_model(
        baseline_model, 'baseline_model', signature=signature
    ).model_uri

    mlflow.evaluate(
        candidate_model_uri,
        eval_data,
        targets='label',
        model_type='classifier',
        validation_thresholds=thresholds,
        baseline_model=baseline_model_uri,
    )

    # SHAP explainability, computed on the untouched validation features.
    # NOTE(review): this explains every validation row and may be slow on a
    # large split - consider passing a sample if the cell takes too long.
    mlflow.shap.log_explanation(model.predict, X_val_processed)

2024-01-18 07:00:36 [info     ] Starting preprocessing...     
2024-01-18 07:00:36 [info     ] Initiating preprocessor data transformation...
2024-01-18 07:00:36 [info     ] Initiating preprocessor data transformation...
2024-01-18 07:00:36 [info     ] Initiating cross validation...
2024-01-18 07:00:37 [info     ] Initiating model validation...


  outputs = _infer_schema(model_output) if model_output is not None else None
* 'schema_extra' has been renamed to 'json_schema_extra'
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 714.26it/s] 
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 106.38it/s]
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  data = data.applymap(_hash_array_like_element_as_bytes)
  data = data.applymap(_hash_array_like_element_as_bytes)
  return _infer_schema(self._df)
2024/01/18 07:00:45 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2024/01/18 07:00:45 INFO mlflow.models.evaluation.default_evaluator: Evaluating candidate model:
2024/01/18 07:00:45 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/01/18 07:00:45 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative

KeyboardInterrupt: 