In [1]:
# import basic
import pandas as pd
import numpy as np

# import mlflow
import mlflow
from mlflow import MlflowClient
from mlflow.models import infer_signature

# import hyper-parameter tuning tool
import optuna

# import compile steps
from dsteps import data_ingestion as di
from dsteps import data_transformation as dt
from dsteps import model_training as mt
import mlflow_exp as mle


# model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

# model validation include cross validation score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score


# os
import os

In [2]:
import dagshub 

# Access token for DagsHub, obtained through the dagshub client instead of being
# written into the notebook.
# NOTE(review): get_token() presumably starts an interactive login when no cached
# token is available — confirm before running this notebook unattended.
DAGSHUB_TOKEN = dagshub.auth.get_token()

In [3]:
import os

# MLFLOW_TRACKING_URI='https://dagshub.com/Syamil-Ali/E2E_Loan_Application.mlflow'
# MLFLOW_TRACKING_USERNAME='Syamil-Ali'
# MLFLOW_TRACKING_PASSWORD='<redacted - never commit access tokens>'

# # It's recommended to define this within the code, because it's project specific (but this works too)
# os.environ['MLFLOW_TRACKING_URI'] = MLFLOW_TRACKING_URI

# # Recommended to define as environment variables
# os.environ['MLFLOW_TRACKING_USERNAME'] = MLFLOW_TRACKING_USERNAME
# os.environ['MLFLOW_TRACKING_PASSWORD'] = DAGSHUB_TOKEN

# Define Dataset Path, Mlflow Client Server and So on.

In [4]:
# Point MLflow at the DagsHub-hosted tracking server.

MLFLOW_TRACKING_URI = 'https://dagshub.com/Syamil-Ali/E2E_Loan_Application.mlflow'
MLFLOW_TRACKING_USERNAME = 'Syamil-Ali'

# The access token must never be written into the notebook. Use the value that is
# already exported in the environment when there is one, otherwise fall back to the
# token fetched through dagshub.auth.get_token() (DAGSHUB_TOKEN).
# NOTE(review): the token that used to be hardcoded here should be treated as leaked
# and revoked on DagsHub.
MLFLOW_TRACKING_PASSWORD = os.environ.get('MLFLOW_TRACKING_PASSWORD') or DAGSHUB_TOKEN

# MLflow reads its tracking URI and basic-auth credentials from these variables.
os.environ['MLFLOW_TRACKING_URI'] = MLFLOW_TRACKING_URI
os.environ['MLFLOW_TRACKING_USERNAME'] = MLFLOW_TRACKING_USERNAME
os.environ['MLFLOW_TRACKING_PASSWORD'] = MLFLOW_TRACKING_PASSWORD

In [5]:
os.environ['MLFLOW_TRACKING_URI']

'https://dagshub.com/Syamil-Ali/E2E_Loan_Application.mlflow'

In [6]:
# --- Notebook configuration -------------------------------------------------

# Input dataset, relative to the notebook location.
file_path = "../data/Loan_Data.csv"

# Names under which the experiment and its runs are recorded.
experiment_name = "EX_2_Loan_Application_Classification_3"
run_name = "titi-base_experiment"

# Number of Optuna trials per model.
n_trials = 5

# MLflow client bound to the tracking URI exported in the environment.
client = MlflowClient(tracking_uri=os.environ["MLFLOW_TRACKING_URI"])

# Define MLflow Experiment

In [7]:
# Metadata describing the MLflow experiment.

# Adjacent string literals are joined verbatim by Python, so the separator between
# the two sentences has to be part of the first literal.
experiment_description = (
    "This is the second experiment of loan prediction project. "
    "This experiment contains the base experiments made"
)

# Tag values are plain strings; several entries are stored comma-separated.
model_involved = ", ".join(["Logistic Regression", "AdaBoost"])

other_tag = "Optuna"

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "loan_application_project",
    "team": "syamil",
    "project_quarter": "Q1-2024",
    "mlflow.note.content": experiment_description,
    "model_involved": model_involved,
    "other_tag": other_tag,
}

# Create the Experiment, providing a unique name
#try:
#    create_experiment = client.create_experiment(
#        name=experiment_name, tags=experiment_tags
#    )
#    print(f'Experiment {experiment_name} created')#

#except:
#    print(f'Experiment {experiment_name} already_exist')


In [8]:
# Experiment names must be unique on the tracking server, so an unconditional
# create_experiment() call fails as soon as this cell is run a second time.
# Look the experiment up first and only create it when it is missing; either way
# `create_experiment` ends up holding the experiment id.
# NOTE(review): the BAD_REQUEST recorded for this cell may have a different cause
# (e.g. credentials rejected by the server) — confirm after fixing authentication.
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    create_experiment = client.create_experiment(experiment_name)  # , tags=experiment_tags
else:
    create_experiment = existing_experiment.experiment_id

RestException: BAD_REQUEST: Response: {'error_code': 'BAD_REQUEST'}

# Start Your Experiment Here

#### Maybe Data Preprocessing / Anything Here

In [9]:
# cleaning and transform
# Ingest the file at `file_path` with di.data_ingest, then pass the result through
# dt.cleaning_train_pipeline and unpack its four return values.
# NOTE(review): the unpacking order (X_train, y_train, X_test, y_test) is assumed to
# match what cleaning_train_pipeline returns — confirm against the dsteps module.

df = di.data_ingest(file_path)
X_train, y_train, X_test, y_test = dt.cleaning_train_pipeline(df)

#print(X_train.shape)
#print(y_train.shape)
#print(X_test.shape)
#print(y_test.shape)

In [10]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


#### Model Experiment Here

In [11]:
optuna_study = {}

In [12]:
# Optuna code

def objective(trial, name, model):
    """Optuna objective: mean 5-fold cross-validated accuracy of one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    name : str
        Search space to sample from: 'AdaBoost' or 'Logistic_Regression'.
    model : estimator
        Estimator accepting the sampled parameters through ``set_params``. It is
        cloned first, so the instance passed by the caller is not modified.

    Returns
    -------
    float
        Mean accuracy returned by ``cross_val_score`` (cv=5) on the notebook-level
        ``X_train`` / ``y_train``.

    Raises
    ------
    ValueError
        If ``name`` does not match a known search space.
    """
    supported_names = ('AdaBoost', 'Logistic_Regression')
    if name not in supported_names:
        # Without this check `params` would stay unbound and the failure would
        # surface as an unrelated UnboundLocalError further down.
        raise ValueError(
            f"Unknown model name {name!r}; expected one of {supported_names}"
        )

    if name == 'AdaBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            # learning_rate has to be strictly positive, so the range starts above 0.
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0),
            # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases —
            # confirm it is still accepted by the installed version.
            'algorithm': trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R']),
        }
    else:  # 'Logistic_Regression'
        # `penalty` is deliberately not tuned: not every solver below supports every
        # penalty, so it is left at the estimator's default.
        params = {
            'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
            'solver': trial.suggest_categorical(
                'solver',
                ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
            ),
            'max_iter': trial.suggest_int('max_iter', 100, 1000),
        }

    # Imported here so the helper stays self-contained within this cell.
    from sklearn.base import clone

    # Configure a fresh copy: trials must not leak parameters into each other or
    # into the estimator owned by the caller.
    candidate = clone(model).set_params(**params)

    return cross_val_score(candidate, X_train, y_train, cv=5, scoring='accuracy').mean()

In [13]:
# Tune each classifier with Optuna, log every trial as a nested MLflow run and
# then log the champion model trained with the best parameters.

SEED = 42  # fixed sampler seed so the random search draws the same trials on re-run

names = [
    'Logistic_Regression',
    'AdaBoost'
]

models = [
    LogisticRegression(),
    AdaBoostClassifier()
]


for name, model in zip(names, models):

    # One parent run per model; named from the configured `run_name` like the
    # trial and champion runs below.
    with mlflow.start_run(run_name=f'{run_name}_{name}') as parent_run:

        # get mlflow id
        run_id = parent_run.info.run_id

        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.RandomSampler(seed=SEED),
        )
        objective_fn = lambda trial: objective(trial, name, model)
        study.optimize(objective_fn, n_trials=n_trials)

        ##  ---- Trial ------ ##

        # log the experiment result into mlflow
        for i, trial in enumerate(study.trials):
            # Trials that did not complete carry no objective value, and a metric
            # cannot be logged without one.
            if trial.state != optuna.trial.TrialState.COMPLETE or trial.value is None:
                continue
            with mlflow.start_run(run_name=f"{run_name}_{name}_trial_{i+1}", nested=True):
                mlflow.log_params(trial.params)
                mlflow.log_metric('accuracy', trial.value)

        ##  ---- Champion Model ------ ##

        # retrain with the best parameters and score on the held-out split
        model, score_train, score_valid, signature = mt.train_model(
            model, X_train, y_train, X_test, y_test, study.best_params
        )

        # settings handed to the project's MLflow logging helper
        exp_run_param = {
            'name': f'{experiment_name}',
            'run_name': f'{run_name}_{name}_best_params',
            'artifact_path': f'loan_application_model/{run_name}',
            'model_name': f'{name}_best_params',
            'signature': signature
        }

        model_params = study.best_params

        metrics = {
            'Accuracy Training': score_train,
            'Accuracy Test': score_valid
        }

        mle.mlflow_logging(exp_run_param, model, model_params, metrics)

        # keep the study around for later inspection
        optuna_study[name] = study

print('Done')

RestException: INVALID_PARAMETER_VALUE: Response: {'error_code': 'INVALID_PARAMETER_VALUE'}

In [6]:
# Smoke test: log two dummy parameters to check the connection to the tracking server.
# The logging happens inside an explicit run context so the run is closed again;
# calling mlflow.log_params() with no active run implicitly starts one that stays
# active in the kernel and gets in the way of later mlflow.start_run() calls.

params = {
    'val1': 'gg',
    'val2': 'no gg'
}
with mlflow.start_run(run_name='connection_smoke_test'):
    mlflow.log_params(params)

RestException: INVALID_PARAMETER_VALUE: Response: {'error_code': 'INVALID_PARAMETER_VALUE'}

In [None]:
# Miscellaneous


def model_training(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and whose fitted result exposes ``predict(X)``.
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Features and labels used for scoring.

    Returns
    -------
    tuple
        ``(model, score)`` where ``score`` is the test accuracy rounded to 3 decimals.
    """
    model_fitted = model.fit(X_train, y_train)
    y_pred = model_fitted.predict(X_test)
    # scikit-learn metrics take the ground truth first: (y_true, y_pred).
    score = accuracy_score(y_test, y_pred)
    return model, round(score, 3)
