In [1]:
import mlflow
from mlflow import MlflowClient
import pprint
import os
import shutil
import pickle
import sys

# Drop a stale project directory from the import search path before importing
# the local `utils` package (presumably so another project's `utils` cannot
# shadow it -- TODO confirm). The membership check keeps the cell runnable on a
# fresh kernel or another machine, where the entry is absent and an
# unconditional `sys.path.remove` would raise ValueError.
_STALE_PROJECT_DIR = 'c:\\users\\user\\desktop\\python project\\mlflow\\real loan application'
if _STALE_PROJECT_DIR in sys.path:
    sys.path.remove(_STALE_PROJECT_DIR)

from utils import utils as ut

In [2]:
# import basic
import pandas as pd
import numpy as np

# import mlflow
import mlflow
from mlflow import MlflowClient
from mlflow.models import infer_signature

# import hyper-parameter tuning tool
import optuna

# import compile steps
from dsteps import data_ingestion as di
from dsteps import data_transformation as dt
from dsteps import model_training as mt
import mlflow_exp as mle


# model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

# model validation include cross validation score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score


# os
import os

In [3]:
# Set the tracking URI (an MLflow server on the loopback address, port 5000)

mlflow.set_tracking_uri("http://127.0.0.1:5000") # -> used by everything other than the client: the high-level API
client = MlflowClient() # -> low-level API for CRUD on experiments, runs, model versions and registered models

In [4]:
# Set up an experiment

# Name of the MLflow experiment that is created and then selected for all runs
experiment_name = "EX_3_Loan_Application_Classification"

# Prefix used to build the nested-trial run names, the best-params run name
# and the model artifact path
run_name = 'base_experiment'

# Number of Optuna trials to run for each model
n_trials = 3

In [6]:
# create experiment

# Free-text note attached to the experiment through "mlflow.note.content"
experiment_description = (
    "This is the third experiment of loan prediction project \n"
    "This experiment contains the base experiments made"
)

# Experiment tag values are plain strings, so the model names are joined into
# a single comma-separated value. Adjacent string literals without a separator
# are silently fused by Python ("Logistic RegressionAdaBoost"), which is what
# the previous parenthesised form produced.
model_involved = ", ".join(
    [
        "Logistic Regression",
        "AdaBoost",
    ]
)

# Parentheses around a single string do not create a tuple; this is a plain str
other_tag = "Optuna"

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "loan_application_project",
    "team": "syamil",
    "project_quarter": "Q1-2024",
    "mlflow.note.content": experiment_description,
    # NOTE(review): "mode_involved" looks like a typo for "model_involved"; the
    # key is kept unchanged so it stays consistent with any experiments that
    # were already tagged with it -- confirm before renaming.
    "mode_involved": model_involved,
    "other_tag": other_tag,
}

# Create the Experiment only when no experiment with this name exists yet.
# An explicit lookup replaces the former bare `except:`, which printed
# "already_exist" for *any* failure (for example an unreachable tracking
# server) and so hid real errors. Genuine errors now propagate to the reader.
if client.get_experiment_by_name(experiment_name) is None:
    client.create_experiment(name=experiment_name, tags=experiment_tags)
    print(f'Experiment {experiment_name} created')
else:
    print(f'Experiment {experiment_name} already_exist')

Experiment EX_3_Loan_Application_Classification created


In [5]:
# Data ingestion, cleaning and preprocessing


# Path to the loan CSV, relative to the notebook's working directory
file_path = '../data/Loan_Data.csv'

# Load the raw data with the project's ingestion step
df = di.data_ingest(file_path)
# Run the project's cleaning/training pipeline on the loaded data. The six
# return values are presumably the train/test features and labels plus the
# fitted encoder and scaler -- TODO confirm in dsteps.data_transformation.
X_train, y_train, X_test, y_test, encoder, scaler = dt.cleaning_train_pipeline(df)

In [8]:
# Optuna Experiment

# Finished Optuna studies, keyed by model name (filled in by the tuning loop)
optuna_study = {}


# Optuna code

def objective(trial, name, model):
    """Optuna objective: mean 5-fold cross-validated accuracy for one trial.

    Samples a hyper-parameter set for the model identified by ``name``,
    applies it to ``model`` and scores it on the notebook-level ``X_train`` /
    ``y_train`` with ``cross_val_score``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    name : str
        Which search space to use: ``'AdaBoost'`` or ``'Logistic_Regression'``.
    model : estimator
        Estimator exposing ``set_params``. It is updated in place with the
        sampled parameters.

    Returns
    -------
    float
        Mean accuracy over the 5 cross-validation folds.

    Raises
    ------
    ValueError
        If ``name`` has no search space defined here.
    """
    # Adaboost classification
    if name == 'AdaBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            # AdaBoost requires learning_rate > 0, so the range starts just
            # above zero instead of at the invalid value 0.0.
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0),
            # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn
            # releases -- confirm against the installed version.
            'algorithm': trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R']),
        }

    # Logistic Regression
    elif name == 'Logistic_Regression':
        # The penalty is left at the estimator's default: not every solver
        # listed below supports every penalty, so it is not tuned here.
        params = {
            'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
            'solver': trial.suggest_categorical(
                'solver',
                ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
            ),
            'max_iter': trial.suggest_int('max_iter', 100, 1000),
        }

    else:
        # Fail loudly rather than hitting an unbound `params` further down
        raise ValueError(
            f"No search space defined for model name {name!r}; "
            "expected 'AdaBoost' or 'Logistic_Regression'"
        )

    model = model.set_params(**params)

    score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()

    return score

In [9]:
# Tune both classifiers with Optuna and log every result to MLflow

names = [
    'Logistic_Regression',
    'AdaBoost'
]

models = [
    LogisticRegression(),
    AdaBoostClassifier()
]

# Fixed seed so Optuna's random search proposes the same trials on every
# Restart & Run All
SEED = 42


# All runs below are recorded under this experiment
mlflow.set_experiment(experiment_name)

for name, model in zip(names, models):

    # One parent run per model. The name now interpolates `run_name` (it used
    # to contain the literal text "run_name_"), matching the nested trial runs
    # and the best-params run below.
    with mlflow.start_run(run_name=f'{run_name}_{name}'):

        # Random search over the search space defined in `objective`
        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.RandomSampler(seed=SEED),
        )
        objective_fn = lambda trial: objective(trial, name, model)
        study.optimize(objective_fn, n_trials=n_trials)


        ##  ---- Trial ------ ##

        # Log each trial as a nested run: sampled params + cross-validated accuracy
        for i, trial in enumerate(study.trials):
            with mlflow.start_run(run_name=f"{run_name}_{name}_trial_{i+1}", nested=True):
                mlflow.log_params(trial.params)
                mlflow.log_metric('accuracy', trial.value)


        ##  ---- Champion Model ------ ##

        # Train with the best parameters found and score on the held-out split
        model, score_train, score_valid, signature = mt.train_model(model, X_train, y_train, X_test, y_test, study.best_params)

        # Settings handed to the project's MLflow logging helper
        exp_run_param = {
            'name' : f'{experiment_name}',
            'run_name': f'{run_name}_{name}_best_params',
            'artifact_path': f'loan_application_model/{run_name}',
            'model_name': f'{name}_best_params',
            'signature': signature
        }

        model_params = study.best_params

        metrics = {
            'Accuracy Training' : score_train,
            'Accuracy Test' : score_valid
        }

        # Presumably logs the model, params and metrics and registers the model
        # under `model_name` (the captured output shows new registered-model
        # versions being created) -- confirm in mlflow_exp.
        mle.mlflow_logging(exp_run_param, model, model_params, metrics)

        # Keep the study so later cells can inspect it
        optuna_study[name] = study


print('Done')

[I 2024-06-03 11:24:12,381] A new study created in memory with name: no-name-2774a0f1-3a5c-423d-acbc-ad778ac331d0
[I 2024-06-03 11:24:12,428] Trial 0 finished with value: 0.7543972199946538 and parameters: {'class_weight': 'balanced', 'solver': 'newton-cg', 'max_iter': 252}. Best is trial 0 with value: 0.7543972199946538.
[I 2024-06-03 11:24:12,452] Trial 1 finished with value: 0.7543972199946538 and parameters: {'class_weight': 'balanced', 'solver': 'liblinear', 'max_iter': 253}. Best is trial 0 with value: 0.7543972199946538.
[I 2024-06-03 11:24:12,478] Trial 2 finished with value: 0.7986367281475542 and parameters: {'class_weight': None, 'solver': 'sag', 'max_iter': 364}. Best is trial 2 with value: 0.7986367281475542.
Registered model 'Logistic_Regression_best_params' already exists. Creating a new version of this model...
2024/06/03 11:24:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Logistic_Regress

Done


Created version '5' of model 'AdaBoost_best_params'.


In [None]:
# TODO: after updating and testing everything, validate the registered models in MLflow


In [None]:
# TODO: consider retrieving the registered model version that is either
# 1. the latest, or
# 2. the best-performing


# update the file