# Imports

In [14]:
import sys
import pandas as pd
import joblib
sys.path.insert(0, '../src/')

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser


from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from data.data_preprocess import DataPreprocess
from utils.utils import load_config_file
from train.train_model import TrainModel
from evaluation.classifiers_eval import ModelEvaluation

# 1.0 Data Load

In [15]:
# Load the training data through the project's DataLoad helper.
# 'train_dataset_name' is passed to load_data() verbatim — presumably a
# config key that resolves to the actual file, not a path; TODO confirm.
dl = DataLoad()
df = dl.load_data('train_dataset_name')

2023-12-06 10:47:06 [info     ] Iniciando o carregamento


# 2.0 Data Validation

In [16]:
# Run the project's validation checks against the loaded frame.
# NOTE(review): is_valid is not inspected by any of the following cells, so a
# failed validation would not stop the notebook. Consider asserting on it once
# the return type of DataValidation.run() is confirmed.
dv = DataValidation()
is_valid = dv.run(df)

2023-12-06 10:47:06 [info     ] Validação iniciou..
2023-12-06 10:47:06 [info     ] Validation columns passed...
2023-12-06 10:47:06 [info     ] Validacao com sucesso.


# 3.0 Data Transformation

In [17]:
# Hand the loaded frame to the project's DataTransformation helper; the split
# in the next cell is produced from this object.
dt = DataTransformation(df)

In [18]:
# Produce the train/validation features and targets used by the experiment.
# NOTE(review): split ratio, random seed and target column are decided inside
# DataTransformation.train_test_split() and are not visible here — confirm the
# split is seeded so the run is reproducible.
X_train, X_valid, y_train, y_valid = dt.train_test_split()

# 4.0 Experiments

In [19]:
import mlflow
from mlflow.tracking import MlflowClient

In [20]:
# Point MLflow at the tracking server on 127.0.0.1:5000 and make 'prob_loan'
# the active experiment for the runs below.
# NOTE(review): the tracking URI is hard-coded, so this cell needs that server
# to be reachable; consider reading it from configuration or the environment.
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('prob_loan')

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1701870074069, experiment_id='1', last_update_time=1701870074069, lifecycle_stage='active', name='prob_loan', tags={}>

In [23]:
# Baseline experiment, tracked as a single MLflow run named 'baseline':
#   1. build and train the preprocessing pipeline, then transform both splits;
#   2. persist the preprocessor and log it as a run artifact;
#   3. cross-validate a LogisticRegression on the processed training data;
#   4. fit the model on the training split and score it on the validation split;
#   5. log the metrics and the fitted model.
# The run is closed by the `with` block on exit, so no explicit end_run() call
# is needed.
with mlflow.start_run(run_name='baseline'):
    mlflow.set_tag('model_name', 'lr_baseline')

    # Preprocessing: impute the variables listed under 'vars_imputer' in the
    # project config, then apply StandardScaler through the feature_engine wrapper.
    pipe = Pipeline([
        ('imputer', MeanMedianImputer(variables=load_config_file().get('vars_imputer'))),
        ('scaler', SklearnTransformerWrapper(StandardScaler()))
    ])

    preprocessador = DataPreprocess(pipe)
    # train() only ever sees the training split; the validation split is just
    # transformed below.
    preprocessador.train(X_train)

    X_train_processed = preprocessador.transform(X_train)
    X_valid_processed = preprocessador.transform(X_valid)

    # Persist the preprocessor once and log that same file as a run artifact.
    preprocess_path = '../models/preprocess.joblib'
    joblib.dump(preprocessador, preprocess_path)
    mlflow.log_artifact(preprocess_path)

    # Log the preprocessing steps as run parameters (MLflow stores the string
    # form of each estimator).
    mlflow.log_params(params={'imputer': pipe['imputer'], 'scaler': pipe['scaler']})

    # Cross-validated evaluation on the processed training data.
    model = LogisticRegression()
    model_eval = ModelEvaluation(model, X_train_processed, y_train, n_splits=5)

    roc_auc_scores = model_eval.cross_val_evaluate()

    # Logged value is the mean of the scores returned by cross_val_evaluate()
    # (presumably one ROC AUC per fold — confirm against ModelEvaluation).
    mlflow.log_metric('train_roc_auc', roc_auc_scores.mean())

    # Fit the final model on the training split.
    model.fit(X_train_processed, y_train)

    # Score the validation split with the estimator that was just fitted and
    # that is logged below. Predicting through `model` directly (rather than
    # through model_eval.model) keeps the metric tied to the logged model even
    # if ModelEvaluation holds its own copy of the estimator.
    y_val_preds = model.predict_proba(X_valid_processed)[:, 1]
    val_roc_auc = model_eval.evaluate_predictions(y_valid, y_val_preds)

    mlflow.log_metric('valid_roc_auc', val_roc_auc)

    # Log the fitted model; the pyfunc flavour will call predict_proba.
    mlflow.sklearn.log_model(model, 'lr_model', pyfunc_predict_fn='predict_proba')




    

2023-12-06 10:53:42 [info     ] Iniciando o processamento


2023-12-06 10:53:43 [info     ] Iniciando a Transformação
2023-12-06 10:53:43 [info     ] Iniciando a Transformação
2023-12-06 10:53:43 [info     ] Iniciando a Cross Validation...
