# Imports

In [1]:
import sys
import pandas as pd
import joblib
sys.path.insert(0, '../src/')

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser


from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from data.data_preprocess import DataPreprocess
from utils.utils import load_config_file
from train.train_model import TrainModel
from evaluation.classifiers_eval import ModelEvaluation

# 1.0 Data Load

In [2]:
# Load the training data through the project's DataLoad helper.
# NOTE(review): 'train_dataset_name' is passed verbatim to load_data; it is
# presumably a key resolved by the loader (e.g. from config) — confirm in
# data.data_load.
dl = DataLoad()
df = dl.load_data('train_dataset_name')

2023-12-09 08:19:45 [info     ] Iniciando o carregamento


# 2.0 Data Validation

In [3]:
# Run the project's validation step on the loaded data and keep its result.
# NOTE(review): is_valid is not checked anywhere in the visible cells —
# consider asserting on it before continuing, once run()'s return type is
# confirmed.
dv = DataValidation()
is_valid = dv.run(df)

2023-12-09 08:19:46 [info     ] Validação iniciou..
2023-12-09 08:19:46 [info     ] Validation columns passed...
2023-12-09 08:19:46 [info     ] Validacao com sucesso.


# 3.0 Data Transformation

In [4]:
# Wrap the loaded data in the project's DataTransformation helper.
dt = DataTransformation(df)

In [5]:
# Unpack the four objects returned by the project's split helper; they are
# named here as train/validation features and targets.
# NOTE(review): split ratio and random seed are decided inside
# DataTransformation.train_test_split and are not visible in this notebook.
X_train, X_valid, y_train, y_valid = dt.train_test_split()

# 4.0 Experiments

In [6]:
import mlflow
from mlflow.tracking import MlflowClient


* 'schema_extra' has been renamed to 'json_schema_extra'


In [7]:
# Point the MLflow client at the tracking server and select the experiment.
# set_experiment stays as the last expression so the cell still displays the
# Experiment object it returns.
TRACKING_URI = 'http://127.0.0.1:5000'
EXPERIMENT_NAME = 'prob_loan'

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1702116604955, experiment_id='1', last_update_time=1702116604955, lifecycle_stage='active', name='prob_loan', tags={}>

## 4.1 Select Best Model

In [8]:
# Look the experiment up by name, then turn the returned object into a plain
# dict of its fields.
experiment_entity = mlflow.get_experiment_by_name('prob_loan')
current_experiment = dict(experiment_entity)

In [10]:
# Pull the experiment id out of the experiment dict built in the previous cell.
experiment_id = current_experiment['experiment_id']

In [11]:
# Fetch the runs whose validation ROC AUC is strictly below 1 and order them
# best-first. The experiment id is passed explicitly so the query does not
# depend on whichever experiment happens to be active in the kernel when this
# cell is (re-)run.
df_mlflow = (
    mlflow.search_runs(
        experiment_ids=[experiment_id],
        filter_string='metrics.valid_roc_auc < 1',
    )
    .sort_values('metrics.valid_roc_auc', ascending=False)
)

In [18]:
# Run ID of the best model, i.e. the run with the highest validation roc_auc.
best_row_label = df_mlflow['metrics.valid_roc_auc'].idxmax()
run_id = df_mlflow.loc[best_row_label, 'run_id']
run_id

'9fe118c17bc84499aa8c65578cca5e46'

'9fe118c17bc84499aa8c65578cca5e46'