In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle
# import umap
from sklearn.preprocessing import StandardScaler, RobustScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import mlflow
import optuna


ModuleNotFoundError: No module named 'mlflow'

In [None]:
# Initialise the DagsHub integration for this notebook.
# NOTE(review): presumably `mlflow=True` points MLflow tracking at the DagsHub
# repo; later cells call `mlflow.set_tracking_uri(...)` with a localhost URL,
# which would take precedence — confirm which backend is intended.
import dagshub

dagshub.init(
    repo_owner='SanyamVv3',
    repo_name='mlflow_dags_hub',
    mlflow=True,
)

In [None]:
# Load the raw dataset from the CSV file located next to the notebook.
df = pd.read_csv(
    'diabetes_prediction_dataset.csv',
)

In [None]:
# Target variable
target = 'diabetes'

# Categorical features (ordinal-encoded by the column transformer)
cat_features = ['gender', 'smoking_history']

# Binary features
bin_features = ['hypertension', 'heart_disease']

# Numerical features: every remaining column that is neither categorical,
# binary, nor the target.
non_numeric_columns = cat_features + bin_features + [target]
num_features = df.drop(columns=non_numeric_columns).columns.tolist()

In [None]:
# Target
y = df['diabetes']

# Feature matrix: every column except the target
X = df.drop(columns='diabetes')

In [None]:
# Stratified 5-fold splitter used for cross-validation; shuffling is seeded
# so the fold assignment is reproducible across runs.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=101,
)

In [None]:
# Column-wise preprocessing: numerical and binary columns are passed through
# unchanged, categorical columns are ordinal-encoded, and any column not
# listed is dropped.
column_transformer = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', num_features),
        ('cat', OrdinalEncoder(), cat_features),
        ('bin', 'passthrough', bin_features),
    ],
    remainder='drop',
    verbose_feature_names_out=True,
)

In [None]:
def create_classification_pipeline(classifier_model):
    """Build a pipeline that preprocesses the features and then classifies.

    Parameters
    ----------
    classifier_model
        Estimator placed in the final ``'classifier'`` step.

    Returns
    -------
    Pipeline
        Two-step pipeline: the module-level ``column_transformer`` under the
        name ``'transformer'`` followed by ``classifier_model``.
    """
    steps = [
        ('transformer', column_transformer),
        ('classifier', classifier_model),
    ]
    return Pipeline(steps)

In [None]:
def flatten_array(predictions):
    """Flatten one level of nesting into a single list.

    Parameters
    ----------
    predictions
        Iterable of iterables (e.g. one array of predictions per fold).

    Returns
    -------
    list
        All inner items, in order of appearance.
    """
    return [value for chunk in predictions for value in chunk]

In [None]:
def objective(trial):
    """Optuna objective: pooled out-of-fold ROC AUC of an XGBoost pipeline.

    Samples one set of XGBoost hyperparameters from ``trial``, fits the
    classification pipeline on every fold of the module-level ``kfold``
    splitter, pools the out-of-fold predictions and scores them. Everything
    is recorded in one MLflow run per trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC computed on the out-of-fold predictions of all folds pooled
        together (the value the study optimises).
    """
    with mlflow.start_run():
        mlflow.xgboost.autolog()
        param = {
            "verbosity": 0,

            "objective": "binary:logistic",

            # defines booster, gblinear for linear functions.
            "booster": "gbtree",

            # L2 regularization weight.
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-4, 1.0, log=True),

            # L1 regularization weight.
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-4, 1.0, log=True),

            # sampling ratio for training data.
            "subsample": trial.suggest_float("subsample", 0.5, 1.0, step=0.1),

            # sampling according to each tree.
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),

            "n_estimators": trial.suggest_int("n_estimators", 100, 800, step=50),

            # maximum depth of the tree, signifies complexity of the tree.
            "max_depth": trial.suggest_int("max_depth", 2, 8, step=2),

            # minimum child weight, larger the term more conservative the tree.
            "min_child_weight": trial.suggest_int("min_child_weight", 2, 30),

            "eta": trial.suggest_float("eta", 1e-3, 1.0, log=True),

            # defines how selective algorithm is.
            "gamma": trial.suggest_float("gamma", 1e-3, 1.0, log=True),

            "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),

            "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 10)

        }

        # Out-of-fold predictions and labels, one array per fold.
        predictions_bin = []
        predictions_proba = []
        y_true = []

        for i, (train_index, test_index) in enumerate(kfold.split(X, y)):

            # StratifiedKFold.split yields *positional* indices, so select
            # rows with .iloc; .loc is label-based and only happens to work
            # when the frame has a default 0..n-1 index.
            X_train, y_train = X.iloc[train_index], y.iloc[train_index]
            X_test, y_test = X.iloc[test_index], y.iloc[test_index]

            classifer_model = XGBClassifier(**param, random_state=101)
            pipeline_classifier = create_classification_pipeline(classifer_model)

            pipeline_classifier.fit(X_train, y_train)
            # Store each fold's fitted booster under its own artifact path so
            # later folds do not overwrite earlier ones.
            mlflow.xgboost.log_model(classifer_model, f"model_fold_{i}")

            binary_predictions = pipeline_classifier.predict(X_test)
            proba_predictions = pipeline_classifier.predict_proba(X_test)

            predictions_bin.append(binary_predictions)
            # Keep only the probability of the positive class.
            predictions_proba.append(proba_predictions[:, 1])
            y_true.append(y_test.to_numpy())

        # Pool the folds so each metric is computed once over all samples.
        predictions_bin = flatten_array(predictions_bin)
        predictions_proba = flatten_array(predictions_proba)
        y_true = flatten_array(y_true)

        accuracy = accuracy_score(y_true=y_true, y_pred=predictions_bin)
        recall = recall_score(y_true=y_true, y_pred=predictions_bin)
        precision = precision_score(y_true=y_true, y_pred=predictions_bin)
        roc_auc = roc_auc_score(y_true=y_true, y_score=predictions_proba)

        mlflow.log_metric('accuracy', accuracy)
        mlflow.log_metric('recall', recall)
        mlflow.log_metric('precision', precision)
        mlflow.log_metric('roc_auc', roc_auc)
        mlflow.log_param('params', param)
        return roc_auc
        

In [None]:
import os

study = None


def run_experiments():
    """Run the Optuna hyperparameter search and print the best trial.

    Points MLflow at the local tracking server, selects the tuning
    experiment, maximises ``objective`` over 10 trials and prints a summary.

    Returns
    -------
    optuna.study.Study
        The finished study, so callers can inspect trials afterwards.
    """
    # The tracking URI must be set before the experiment is selected:
    # set_experiment resolves the experiment against whichever tracking
    # backend is active at the time of the call.
    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    mlflow.set_experiment("XGBoost_tuning_final_version_3")

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10)

    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    return study


# Keep the finished study at module level; assigning inside the function
# alone would only bind a local name and leave this one as None.
study = run_experiments()

In [None]:
# Fixed XGBoost hyperparameters used to train the final model.
param = {
    "verbosity": 0,
    "objective": "binary:logistic",
    # defines booster, gblinear for linear functions.
    "booster": "gbtree",
    # L2 regularization weight.
    "reg_lambda": 0.22701201370251145,
    # L1 regularization weight.
    "reg_alpha": 0.0001517102271587763,
    # sampling ratio for training data.
    "subsample": 0.9,
    # sampling according to each tree.
    "colsample_bytree": 0.42226493114808444,
    "n_estimators": 750,
    # maximum depth of the tree, signifies complexity of the tree.
    "max_depth": 6,
    # minimum child weight, larger the term more conservative the tree.
    "min_child_weight": 8,
    "eta": 0.010571498549826279,
    # defines how selective algorithm is.
    "gamma": 0.0012338402450844555,
    "grow_policy": "lossguide",
    "scale_pos_weight": 9.98131994138987,
}

In [None]:
# Train the final model on the full dataset and log it to MLflow.
# The tracking URI is set before the experiment is selected, because
# set_experiment resolves the experiment against the active tracking backend.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Final_model")

classifer_model = XGBClassifier(**param, random_state=101)
pipeline_classifier = create_classification_pipeline(classifer_model)

pipeline_classifier.fit(X, y)
# Only the fitted XGBoost step is stored; the preprocessing step of the
# pipeline is not part of the logged artifact.
mlflow.xgboost.log_model(xgb_model=classifer_model, artifact_path="model")

binary_predictions = pipeline_classifier.predict(X)
proba_predictions = pipeline_classifier.predict_proba(X)

# NOTE: these metrics are computed on the same rows the model was fitted on,
# so they are in-sample scores rather than an estimate of generalisation.
accuracy = accuracy_score(y_true=y, y_pred=binary_predictions)
recall = recall_score(y_true=y, y_pred=binary_predictions)
precision = precision_score(y_true=y, y_pred=binary_predictions)
# For a binary target roc_auc_score needs a 1-D score per sample, so pass
# the positive-class column rather than the two-column probability matrix.
roc_auc = roc_auc_score(y_true=y, y_score=proba_predictions[:, 1])



In [None]:
# Record the final model's metrics and its hyperparameters in MLflow.
final_metrics = (
    ('accuracy', accuracy),
    ('recall', recall),
    ('precision', precision),
    ('roc_auc', roc_auc),
)
for metric_name, metric_value in final_metrics:
    mlflow.log_metric(metric_name, metric_value)

mlflow.log_param('params', param)