# Imports

In [3]:
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.linear_model import LogisticRegression

import sys
import joblib
sys.path.insert(0,'../src/')
from utils.utils import load_config_file

from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from data.data_preprocess import DataPreprocess
from train.train import TrainModels
from evaluation.classifier_eval import ModelEvaluation

pd.set_option('display.max_column',None)

# Data Load

In [2]:
dl = DataLoad()
df = dl.load_data('train_dataset_name')

2024-01-18 07:33:24 [info     ] Initiating data load with name: train_dataset_name


# Data Validation

In [3]:
dv = DataValidation()
is_valid = dv.run(df)

2024-01-18 07:33:24 [info     ] Initiating validation...      
2024-01-18 07:33:25 [info     ] Validation columns passed...  
2024-01-18 07:33:25 [info     ] Success on validate data      


# Data Transformation

In [4]:
dt = DataTransformation(df)

In [5]:
X_train, X_val, y_train, y_val = dt.train_test_spliting()

# Experimentations

In [6]:
import mlflow
from mlflow.tracking import MlflowClient

In [7]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('prob_loan')

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1705570296224, experiment_id='1', last_update_time=1705570296224, lifecycle_stage='active', name='prob_loan', tags={}>

## Select Best Model

In [10]:
current_experiment = dict(mlflow.get_experiment_by_name('prob_loan'))

In [11]:
experiment_id = current_experiment['experiment_id']

In [19]:
df_mlflow = mlflow.search_runs(filter_string='metrics.val_roc_auc < 1').sort_values('metrics.val_roc_auc',ascending = False)

In [20]:
df_mlflow

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.false_negatives,metrics.roc_auc,metrics.example_count,metrics.val_roc_auc,metrics.precision_score,metrics.recall_score,metrics.log_loss,metrics.true_negatives,metrics.true_positives,metrics.precision_recall_auc,metrics.false_positives,metrics.train_roc_auc,metrics.f1_score,metrics.accuracy_score,metrics.score,params.warm_start,params.scaler,params.C,params.imputer,params.discretizer,params.solver,params.tol,params.class_weight,params.fit_intercept,params.multi_class,params.max_iter,tags.mlflow.user,tags.mlflow.source.type,tags.mlflow.source.name,tags.model_name,tags.mlflow.datasets,tags.mlflow.log-model.history,tags.mlflow.runName
1,126210dda4d445fdb7730040e4519711,1,FINISHED,mlflow-artifacts:/1/126210dda4d445fdb7730040e4...,2024-01-18 10:27:12.499000+00:00,2024-01-18 10:27:22.164000+00:00,321.0,0.840613,37500.0,0.840613,0.135002,0.871907,0.677543,20994.0,2185.0,0.358467,14000.0,0.833471,0.233802,0.618107,0.618107,False,SklearnTransformerWrapper(transformer=Standard...,0.9518330371360946,"MeanMedianImputer(variables=['RendaMensal', 'N...",EqualFrequencyDiscretiser(variables=['TaxaDeUt...,lbfgs,2.802148842823204e-05,balanced,False,auto,803.0,Pedro,LOCAL,c:\Users\Pedro\.conda\envs\mlflow\lib\site-pac...,lr_hyperopt,"[{""name"":""cfc78614469742a54251023c73a82207"",""h...","[{""run_id"": ""126210dda4d445fdb7730040e4519711""...",with_discretizer_hyperopt
2,2ebd13775aa044c280219d77905ee696,1,FINISHED,mlflow-artifacts:/1/2ebd13775aa044c280219d7790...,2024-01-18 10:27:02.160000+00:00,2024-01-18 10:27:12.306000+00:00,321.0,0.840595,37500.0,0.840595,0.13496,0.871907,0.677613,20989.0,2185.0,0.358438,14005.0,0.833452,0.23374,0.617973,0.617973,True,SklearnTransformerWrapper(transformer=Standard...,0.5212475342689141,"MeanMedianImputer(variables=['RendaMensal', 'N...",EqualFrequencyDiscretiser(variables=['TaxaDeUt...,newton-cg,3.339600849088671e-05,balanced,False,auto,327.0,Pedro,LOCAL,c:\Users\Pedro\.conda\envs\mlflow\lib\site-pac...,lr_hyperopt,"[{""name"":""cfc78614469742a54251023c73a82207"",""h...","[{""run_id"": ""2ebd13775aa044c280219d77905ee696""...",with_discretizer_hyperopt
3,55b4f479ef954094b8afed894458002a,1,FINISHED,mlflow-artifacts:/1/55b4f479ef954094b8afed8944...,2024-01-18 10:26:51.900000+00:00,2024-01-18 10:27:02.106000+00:00,530.0,0.831089,37500.0,0.831089,0.169862,0.788508,0.53859,25337.0,1976.0,0.339271,9657.0,0.823008,0.279511,0.728347,0.728347,False,SklearnTransformerWrapper(transformer=Standard...,1.492768901018179,"MeanMedianImputer(variables=['RendaMensal', 'N...",EqualFrequencyDiscretiser(variables=['TaxaDeUt...,liblinear,5.052929735259461e-05,balanced,True,auto,125.0,Pedro,LOCAL,c:\Users\Pedro\.conda\envs\mlflow\lib\site-pac...,lr_hyperopt,"[{""name"":""cfc78614469742a54251023c73a82207"",""h...","[{""run_id"": ""55b4f479ef954094b8afed894458002a""...",with_discretizer_hyperopt
5,ace6b20f011f468ba68ddf6b8b202d85,1,FAILED,mlflow-artifacts:/1/ace6b20f011f468ba68ddf6b8b...,2024-01-18 10:00:36.330000+00:00,2024-01-18 10:03:03.689000+00:00,2398.0,0.800393,37500.0,0.800393,0.6,0.043097,0.203741,34922.0,108.0,0.289661,72.0,0.792034,0.080417,0.934133,0.934133,,SklearnTransformerWrapper(transformer=Standard...,,"MeanMedianImputer(variables=['RendaMensal', 'N...",EqualFrequencyDiscretiser(variables=['TaxaDeUt...,,,,,,,Pedro,LOCAL,c:\Users\Pedro\.conda\envs\mlflow\lib\site-pac...,lr_discretizer,"[{""name"":""cfc78614469742a54251023c73a82207"",""h...","[{""run_id"": ""ace6b20f011f468ba68ddf6b8b202d85""...",with_discretizer
0,792661fa949a47618ef7362bee4ca11b,1,FINISHED,mlflow-artifacts:/1/792661fa949a47618ef7362bee...,2024-01-18 10:27:22.221000+00:00,2024-01-18 10:27:33.883000+00:00,2398.0,0.800385,37500.0,0.800385,0.6,0.043097,0.20374,34922.0,108.0,0.289702,72.0,0.792015,0.080417,0.934133,0.934133,True,SklearnTransformerWrapper(transformer=Standard...,1.1691205548657253,"MeanMedianImputer(variables=['RendaMensal', 'N...",EqualFrequencyDiscretiser(variables=['TaxaDeUt...,newton-cg,6.063510703680488e-05,,True,auto,236.0,Pedro,LOCAL,c:\Users\Pedro\.conda\envs\mlflow\lib\site-pac...,lr_hyperopt,"[{""name"":""cfc78614469742a54251023c73a82207"",""h...","[{""run_id"": ""792661fa949a47618ef7362bee4ca11b""...",with_discretizer_hyperopt
4,a3ec8c5e01dc489a9d14278706a480d2,1,FINISHED,mlflow-artifacts:/1/a3ec8c5e01dc489a9d14278706...,2024-01-18 10:26:41.597000+00:00,2024-01-18 10:26:51.849000+00:00,2398.0,0.800354,37500.0,0.800354,0.6,0.043097,0.203736,34922.0,108.0,0.289784,72.0,0.791964,0.080417,0.934133,0.934133,True,SklearnTransformerWrapper(transformer=Standard...,2.244401289727354,"MeanMedianImputer(variables=['RendaMensal', 'N...",EqualFrequencyDiscretiser(variables=['TaxaDeUt...,lbfgs,2.146759033312009e-05,,True,auto,760.0,Pedro,LOCAL,c:\Users\Pedro\.conda\envs\mlflow\lib\site-pac...,lr_hyperopt,"[{""name"":""cfc78614469742a54251023c73a82207"",""h...","[{""run_id"": ""a3ec8c5e01dc489a9d14278706a480d2""...",with_discretizer_hyperopt
6,473187f27f9b4b8bb363f0915cf2e1e4,1,FINISHED,mlflow-artifacts:/1/473187f27f9b4b8bb363f0915c...,2024-01-18 09:44:57.813000+00:00,2024-01-18 09:45:03.919000+00:00,,,,0.70964,,,,,,,,0.69362,,,,,SklearnTransformerWrapper(transformer=Standard...,,"MeanMedianImputer(variables=['RendaMensal', 'N...",,,,,,,,Pedro,LOCAL,c:\Users\Pedro\.conda\envs\mlflow\lib\site-pac...,lr_baseline,,"[{""run_id"": ""473187f27f9b4b8bb363f0915cf2e1e4""...",baseline


In [22]:
run_id = df_mlflow.loc[df_mlflow['metrics.val_roc_auc'].idxmax()]['run_id']
run_id

'126210dda4d445fdb7730040e4519711'