In [94]:
import mlflow
from mlflow import MlflowClient
import pandas as pd
import numpy as np
from mlflow.models import infer_signature

In [63]:
import optuna
import os


from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [9]:
# get data

file_path = '../data/Loan_Data.csv'
df = pd.read_csv(file_path)

In [10]:
# cleaning a bit

# 1. drop id col
df.drop(columns = ['Loan_ID'], inplace=True)

# 2. drop dupe
df.drop_duplicates(inplace=True)

# 2.1 drop missing values
df.dropna(inplace=True)


# 3. classify num and cat columns

cat_columns = [column for column in df.columns if df[column].dtype == 'object']
num_columns = [column for column in df.columns if df[column].dtype != 'object']

cat_columns.remove('Loan_Status')



# 4. split to training and testing set

X = df.drop(columns = 'Loan_Status').copy()
y = df['Loan_Status'].copy()
        
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                    random_state=42,  
                                    test_size=0.1,
                                    shuffle=True)


# 5. encoder

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(X_train[cat_columns])

X_train[cat_columns] = encoder.transform(X_train[cat_columns])    
X_test[cat_columns] = encoder.transform(X_test[cat_columns])


# 6. scale

scaler = StandardScaler()
scaler.fit(X_train)

X_train= scaler.transform(X_train)
X_test= scaler.transform(X_test)


# Optuna Part + MLFlow

In [108]:
def mlflow_logging(exp_param, model, params:dict, metrics:dict):

    experiment_name = exp_param['name']
    run_name = exp_param['run_name']
    artifact_path = exp_param['artifact_path']
    model_name = exp_param['model_name']
    signature = exp_param['signature']


    # set experiment
    mlflow.set_experiment(experiment_name)



    # initiate the logging
    with mlflow.start_run(run_name = run_name, nested=True) as run:
        
        mlflow.log_params(params)

        mlflow.log_metrics(metrics)


        # Log an instance of the trained model for later use
        mlflow.sklearn.log_model(
            sk_model=model,  
            artifact_path=artifact_path,
            registered_model_name= model_name,
            signature=signature
        )


In [97]:
# set up mlflow client

client = MlflowClient()

In [99]:
# create experiment

experiment_description = (
    "This is the application loan prediction project"
    "This experiment contains the base experiments made"
)

data_clean_made = (
    "remove id col, remove duplicate, [remove missing], categorical encoder, standard scaler"
)

model_involved = (
    "Logistic Regression"
    "AdaBoost"
)

other_tag = (
    "Optuna"
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "loan_application_project",
    "team": "syamil",
    "project_quarter": "Q1-2024",
    "mlflow.note.content": experiment_description,
    "data_cleaning" : data_clean_made,
    "mode_involved" : model_involved,
    "other_tag" : other_tag
}

# Create the Experiment, providing a unique name
create_experiment = client.create_experiment(
    name="Loan_Application_Classification", tags=experiment_tags
)


In [None]:
experiment_name = 'first_experiment'
run_name = "first_run"
artifact_path = 'loan_application_model'
model_name = 


In [104]:
def train_model(model_ex, X_train, y_train, X_test, y_test, params:dict):

    #logging.info("Running up Logistic Regression...")

    # run logistic regression model
    model = model_ex.set_params(**params)
    model.fit(X_train, y_train)

    score_train = model.score(X_train, y_train)
    score_valid = model.score(X_test, y_test)


    # trying to get the signature 
    # -> (responsible to save the schema of model input and output)
    predictions = model.predict(X_test)
    signature = infer_signature(X_test, predictions)
    
    
    return model, score_train, score_valid, signature



In [91]:
# Optuna code

def objective(trial, name, model):
    
    #with mlflow.start_run(nested=True):
    
    
        # Adaboost classification
        if name == 'AdaBoost':
            params = {
                'n_estimators' : trial.suggest_int('n_estimators',100, 1000),
                'learning_rate': trial.suggest_float('learning_rate',0.0, 1.0),
                'algorithm' : trial.suggest_categorical('algorithm',['SAMME','SAMME.R'])
            }
            
        # Logistic Regression
        elif name == 'Logistic_Regression':    

            penalty_choices = ['l1', 'l2', 'elasticnet']

            params = {
                #'penalty' : trial.suggest_categorical('penalty', [None, 'l2', 'l1', 'elasticnet']),
                'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
                'solver': trial.suggest_categorical('solver', ['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']),
                'max_iter': trial.suggest_int('max_iter', 100, 1000)

            }


        model = model.set_params(**params)

        score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()

        score
        
        return score


#study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler())

#study.optimize(objective, n_trials=5)

In [105]:
run_name = 'base_experiment'

In [109]:
# trying to combine two classifier

names = [
    'Logistic_Regression',
    #'AdaBoost'
]

models = [
    LogisticRegression(),
    #AdaBoostClassifier()
]

optuna_study = {}

def model_training(model, X_train, y_train, X_test, y_test):
    model_fitted = model.fit(X_train, y_train)
    y_pred = model_fitted.predict(X_test)
    score = accuracy_score(y_pred, y_test)
    return model, round(score,3)


for name, model in zip(names, models):
    
    
    with mlflow.start_run(run_name=run_name) as parent_run:
        
        # get mlflow id
        run_id = parent_run.info.run_id
        
        #score = model_training(model,X_train, y_train, X_test, y_test)
        study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler())
        objective_fn = lambda trial: objective(trial, name, model)
        study.optimize(objective_fn, n_trials=5)
        
        for i, trial in enumerate(study.trials):
            with mlflow.start_run(run_name=f"{run_name}_{name}_trial_{i+1}",nested=True):
                mlflow.log_params(trial.params)
                mlflow.log_metric('accuracy', trial.value)    
            
                
        
        # update the best model run with unseen data
        
        model, score_train, score_valid, signature = train_model(model, X_train, y_train, X_test, y_test, study.best_params)

        # update the run in experiemnt
        exp_param = {
            'name' : 'Loan_Application_Classification',
            'run_name': f'{run_name}_{name}_best_params',
            'artifact_path': f'loan_application_model/{run_name}',
            'model_name': f'{name}_best_params', 
            'signature': signature
        }
        
        model_params = study.best_params
        
        metrics = {
            'Accuracy Training' : score_train,
            'Accuracy Test' : score_valid 
        }
        
        mlflow_logging(exp_param, model, model_params, metrics)

        optuna_study[name] = study

        #print(f'{name}: {score}')

[I 2024-04-14 11:42:49,436] A new study created in memory with name: no-name-c6987f2b-87d8-4fbb-8ace-d93b8ff815d9
[I 2024-04-14 11:42:49,467] Trial 0 finished with value: 75.43972199946538 and parameters: {'class_weight': 'balanced', 'solver': 'liblinear', 'max_iter': 734}. Best is trial 0 with value: 75.43972199946538.
[I 2024-04-14 11:42:49,492] Trial 1 finished with value: 79.86367281475542 and parameters: {'class_weight': None, 'solver': 'newton-cholesky', 'max_iter': 631}. Best is trial 1 with value: 79.86367281475542.
[I 2024-04-14 11:42:49,509] Trial 2 finished with value: 79.86367281475542 and parameters: {'class_weight': None, 'solver': 'liblinear', 'max_iter': 604}. Best is trial 1 with value: 79.86367281475542.
[I 2024-04-14 11:42:49,554] Trial 3 finished with value: 75.43972199946538 and parameters: {'class_weight': 'balanced', 'solver': 'newton-cg', 'max_iter': 240}. Best is trial 1 with value: 79.86367281475542.
[I 2024-04-14 11:42:49,579] Trial 4 finished with value: 75.

{'class_weight': 'balanced', 'solver': 'liblinear', 'max_iter': 734}
{'class_weight': None, 'solver': 'newton-cholesky', 'max_iter': 631}
{'class_weight': None, 'solver': 'liblinear', 'max_iter': 604}
{'class_weight': 'balanced', 'solver': 'newton-cg', 'max_iter': 240}
{'class_weight': 'balanced', 'solver': 'newton-cholesky', 'max_iter': 732}


Successfully registered model 'Logistic_Regression_best_params'.
2024/04/14 11:42:52 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: Logistic_Regression_best_params, version 1
Created version '1' of model 'Logistic_Regression_best_params'.


In [112]:
# test get experiment info

mlflow.search_experiments()

[<Experiment: artifact_location='file:///C:/Users/USER/Desktop/MLOPS%20NEW/E2E%20Loan%20Application%20Classification/E2E_Loan_Application/experiment/mlruns/615822914642047264', creation_time=1713065435867, experiment_id='615822914642047264', last_update_time=1713065435867, lifecycle_stage='active', name='Loan_Application_Classification', tags={'data_cleaning': 'remove id col, remove duplicate, [remove missing], '
                   'categorical encoder, standard scaler',
  'mlflow.note.content': 'This is the application loan prediction project This '
                         'experiment contains the base experiments made',
  'mode_involved': 'Logistic RegressionAdaBoost',
  'other_tag': 'Optuna',
  'project_name': 'loan_application_project',
  'project_quarter': 'Q1-2024',
  'team': 'syamil'}>,
 <Experiment: artifact_location='file:///C:/Users/USER/Desktop/MLOPS%20NEW/E2E%20Loan%20Application%20Classification/E2E_Loan_Application/experiment/mlruns/0', creation_time=1713012293169, exper