In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import xgboost as xgb
import catboost
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB



from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from PIL import Image
import mlflow
import mlflow.sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, roc_curve
import optuna

import joblib
import json

import matplotlib.pyplot as plt
import seaborn as sns

from mlflow import MlflowClient
import requests

## Table of content
- 1. [COVID-19](#1-covid-19-dataset-)
- 2. [23_end](#pipeline)

<a id=COVID-19 Dataset></a>

## 1. COVID-19 Dataset 



#### COVID-19 patient's symptoms, status, and medical history.

#### About Dataset

This dataset is taken from Kaggle platform (https://www.kaggle.com/datasets/meirnizri/covid19-dataset)  


##### Context  

Coronavirus disease (COVID-19) is an infectious disease caused by a newly discovered coronavirus. Most people infected with COVID-19 virus will experience mild to moderate respiratory illness and recover without requiring special treatment. Older people, and those with underlying medical problems like cardiovascular disease, diabetes, chronic respiratory disease, and cancer are more likely to develop serious illness.
During the entire course of the pandemic, one of the main problems that healthcare providers have faced is the shortage of medical resources and a proper plan to efficiently distribute them. In these tough times, being able to predict what kind of resource an individual might require at the time of being tested positive or even before that will be of immense help to the authorities as they would be able to procure and arrange for the resources necessary to save the life of that patient.

The main goal of this project is to build a machine learning model that, given a Covid-19 patient's current symptom, status, and medical history, will predict whether the patient is in high risk or not.

##### content
The dataset was provided by the Mexican government (link). This dataset contains an enormous number of anonymized patient-related information including pre-conditions. The raw dataset consists of 21 unique features and 1,048,576 unique patients. In the Boolean features, 1 means "yes" and 2 means "no". values as 97 and 99 are missing data.

- sex: 1 for female and 2 for male.
- age: of the patient.
- classification: covid test findings. Values 1-3 mean that the patient was diagnosed with covid in different
degrees. 4 or higher means that the patient is not a carrier of covid or that the test is inconclusive.
- patient type: type of care the patient received in the unit. 1 for returned home and 2 for hospitalization.
- pneumonia: whether the patient already have air sacs inflammation or not.
- pregnancy: whether the patient is pregnant or not.
- diabetes: whether the patient has diabetes or not.
- copd: Indicates whether the patient has Chronic obstructive pulmonary disease or not.
- asthma: whether the patient has asthma or not.
- inmsupr: whether the patient is immunosuppressed or not.
- hypertension: whether the patient has hypertension or not.
- cardiovascular: whether the patient has heart or blood vessels related disease.
- renal chronic: whether the patient has chronic renal disease or not.
- other disease: whether the patient has other disease or not.
- obesity: whether the patient is obese or not.
- tobacco: whether the patient is a tobacco user.
- usmr: Indicates whether the patient treated medical units of the first, second or third level.
- medical unit: type of institution of the National Health System that provided the care.
- intubed: whether the patient was connected to the ventilator.
- icu: Indicates whether the patient had been admitted to an Intensive Care Unit.
- date died: If the patient died indicate the date of death, and 9999-99-99 otherwise.

In [17]:
data = pd.read_csv('data/Covid_data.csv')
data

Unnamed: 0.1,Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,0,2,1,1,1,03/05/2020,97,1,65,2,...,2,2,1,2,2,2,2,2,3,97
1,1,2,1,2,1,03/06/2020,97,1,72,97,...,2,2,1,2,2,1,1,2,5,97
2,2,2,1,2,2,09/06/2020,1,2,55,97,...,2,2,2,2,2,2,2,2,3,2
3,3,2,1,1,1,12/06/2020,97,2,53,2,...,2,2,2,2,2,2,2,2,7,97
4,4,2,1,2,1,21/06/2020,97,2,68,97,...,2,2,1,2,2,2,2,2,3,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899995,899995,2,12,1,1,9999-99-99,97,2,52,2,...,2,2,1,2,2,2,2,2,7,97
899996,899996,2,12,2,1,9999-99-99,97,2,4,97,...,2,2,2,2,2,2,2,2,7,97
899997,899997,2,12,1,1,9999-99-99,97,2,24,2,...,2,2,2,2,2,2,2,2,7,97
899998,899998,2,12,1,1,9999-99-99,97,2,22,2,...,2,2,2,2,2,2,2,2,7,97


In [24]:
# temp - DELETE
data = data.iloc[0:1000]
data

Unnamed: 0.1,Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,0,2,1,1,1,03/05/2020,97,1,65,2,...,2,2,1,2,2,2,2,2,3,97
1,1,2,1,2,1,03/06/2020,97,1,72,97,...,2,2,1,2,2,1,1,2,5,97
2,2,2,1,2,2,09/06/2020,1,2,55,97,...,2,2,2,2,2,2,2,2,3,2
3,3,2,1,1,1,12/06/2020,97,2,53,2,...,2,2,2,2,2,2,2,2,7,97
4,4,2,1,2,1,21/06/2020,97,2,68,97,...,2,2,1,2,2,2,2,2,3,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2,3,2,2,14/06/2020,2,1,52,97,...,2,2,1,2,2,2,2,2,3,2
996,996,1,3,2,2,14/06/2020,1,1,70,97,...,2,2,1,2,2,1,2,1,3,1
997,997,1,3,2,2,14/06/2020,2,1,52,97,...,2,2,2,2,2,2,2,2,3,2
998,998,1,3,1,2,14/06/2020,1,1,50,2,...,2,2,1,2,2,2,2,2,6,1


In [25]:
missing_values_count = data.apply(lambda col: col.isin([97, 99]).sum())
missing_values_count

Unnamed: 0                2
USMER                     0
MEDICAL_UNIT              0
SEX                       0
PATIENT_TYPE              0
DATE_DIED                 0
INTUBED                 334
PNEUMONIA                20
AGE                       0
PREGNANT                567
DIABETES                  0
COPD                      0
ASTHMA                    0
INMSUPR                   0
HIPERTENSION              0
OTHER_DISEASE             0
CARDIOVASCULAR            0
OBESITY                   0
RENAL_CHRONIC             0
TOBACCO                   0
CLASIFFICATION_FINAL      0
ICU                     334
dtype: int64

In [26]:
data.columns

Index(['Unnamed: 0', 'USMER', 'MEDICAL_UNIT', 'SEX', 'PATIENT_TYPE',
       'DATE_DIED', 'INTUBED', 'PNEUMONIA', 'AGE', 'PREGNANT', 'DIABETES',
       'COPD', 'ASTHMA', 'INMSUPR', 'HIPERTENSION', 'OTHER_DISEASE',
       'CARDIOVASCULAR', 'OBESITY', 'RENAL_CHRONIC', 'TOBACCO',
       'CLASIFFICATION_FINAL', 'ICU'],
      dtype='object')

In [32]:
data.hist(figsize=(20, 20));

AttributeError: module 'matplotlib.cbook' has no attribute '_safe_first_finite'

<Figure size 2000x2000 with 0 Axes>

## 2. Pipeline

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            1000 non-null   int64 
 1   USMER                 1000 non-null   int64 
 2   MEDICAL_UNIT          1000 non-null   int64 
 3   SEX                   1000 non-null   int64 
 4   PATIENT_TYPE          1000 non-null   int64 
 5   DATE_DIED             1000 non-null   object
 6   INTUBED               1000 non-null   int64 
 7   PNEUMONIA             1000 non-null   int64 
 8   AGE                   1000 non-null   int64 
 9   PREGNANT              1000 non-null   int64 
 10  DIABETES              1000 non-null   int64 
 11  COPD                  1000 non-null   int64 
 12  ASTHMA                1000 non-null   int64 
 13  INMSUPR               1000 non-null   int64 
 14  HIPERTENSION          1000 non-null   int64 
 15  OTHER_DISEASE         1000 non-null   i

In [35]:
# let's make this classification task

data.loc[~data['DATE_DIED'].isin(['9999-99-99']), 'DATE_DIED'] = 1 #checks if the  column is not in the list ['9999-99-99'].
data.loc[data['DATE_DIED'] == '9999-99-99', 'DATE_DIED'] = 0


In [36]:
data

Unnamed: 0.1,Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,0,2,1,1,1,1,97,1,65,2,...,2,2,1,2,2,2,2,2,3,97
1,1,2,1,2,1,1,97,1,72,97,...,2,2,1,2,2,1,1,2,5,97
2,2,2,1,2,2,1,1,2,55,97,...,2,2,2,2,2,2,2,2,3,2
3,3,2,1,1,1,1,97,2,53,2,...,2,2,2,2,2,2,2,2,7,97
4,4,2,1,2,1,1,97,2,68,97,...,2,2,1,2,2,2,2,2,3,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,2,3,2,2,1,2,1,52,97,...,2,2,1,2,2,2,2,2,3,2
996,996,1,3,2,2,1,1,1,70,97,...,2,2,1,2,2,1,2,1,3,1
997,997,1,3,2,2,1,2,1,52,97,...,2,2,2,2,2,2,2,2,3,2
998,998,1,3,1,2,1,1,1,50,2,...,2,2,1,2,2,2,2,2,6,1


In [37]:
X = data.drop('DATE_DIED',axis=1)
y = data['DATE_DIED']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y,train_size=0.7, random_state=13)


In [39]:
X_train

Unnamed: 0.1,Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
667,667,2,3,1,1,97,2,48,2,1,...,2,2,1,2,2,2,2,2,3,97
350,350,2,3,1,2,1,1,63,2,1,...,2,2,1,2,1,2,1,2,7,2
283,283,2,2,2,1,97,2,95,97,2,...,2,2,2,2,2,2,2,2,7,97
608,608,1,3,2,2,2,1,40,97,2,...,2,2,2,2,2,2,2,2,5,2
347,347,2,3,1,2,2,1,72,2,2,...,2,2,2,2,2,1,2,2,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,164,1,2,1,2,2,2,12,2,2,...,2,1,2,2,2,2,2,2,3,2
871,871,1,3,1,2,1,1,43,2,1,...,2,2,2,2,2,2,2,2,3,1
569,569,1,3,2,2,1,1,77,97,1,...,2,2,1,2,2,2,2,2,3,1
54,54,2,1,1,1,97,2,62,2,2,...,2,2,1,2,2,1,2,1,7,97


### Pipeline

In [None]:
numerical_features = ['AGE']

categorical_features = [i for i in X_train.columns if i not in numerical_features]

# numeric data pipeline
pipe_num = Pipeline([
    # ('imputer', SimpleImputer(strategy='median')), # not necessary because we have already clean data 
    ('power_tr', PowerTransformer()),
    ('scaler', StandardScaler()) ])

# cat data pipeline
pipe_cat = Pipeline([
    # ('imputer', SimpleImputer(strategy='constant',fill_value='unknown')), # not necessary because we have already clean data 
    ('encoding', OneHotEncoder(sparse_output=False, handle_unknown='ignore')) ])

# column transformer
ct = ColumnTransformer([
    ('pipe_num', pipe_num, numerical_features),
    ('pipe_cat', pipe_cat, categorical_features) ])

## 3. Model training

### 3.1 XGBoost model
XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides a parallel tree boosting (also known as GBDT, GBM) that solve many data science problems in a fast and accurate way. The same code runs on major distributed environment (Hadoop, SGE, MPI) and can solve problems beyond billions of examples.

In [9]:
#full pipeline with XGBoost
pipe = Pipeline([
    ('column_transformer', ct),
    ('model', xgb.XGBClassifier()),  # It denotes the fraction of observations to be randomly samples for each tree. 
])


NameError: name 'ct' is not defined

In [10]:
mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')
client = MlflowClient()
local_dir = ('C:/Users/Peter/DataspellProjects/pollock')


In [11]:
# optimize hyperparameters with Optuna
def objective(trial):

    # params for optimization
    params = {
        'model__eta': trial.suggest_float('model__eta', 0.01, 0.2),
        'model__gamma': trial.suggest_float('model__gamma', 0.01, 5),
        'model__max_depth': trial.suggest_int('model__max_depth', 3, 10),
        'model__min_child_weight': trial.suggest_float('model__min_child_weight', 1, 10),
        'model__max_delta_step': trial.suggest_float('model__max_delta_step', 0.001, 10),
        'model__subsample': trial.suggest_float('model__subsample', 0.1, 1),
        'model__colsample_bytree': trial.suggest_float('model__colsample_bytree', 0.5, 1),
        'model__colsample_bylevel': trial.suggest_float('model__colsample_bylevel', 0.5, 1),
        'model__colsample_bynode': trial.suggest_float('model__colsample_bynode', 0.5, 1),
        'model__reg_lambda': trial.suggest_float('model__reg_lambda', 1e-5, 1e2),
        'model__reg_alpha': trial.suggest_float('model__reg_alpha', 1e-5, 1e2),
        'model__scale_pos_weight': trial.suggest_float('model__scale_pos_weight', 1, 10),
        'model__max_leaves': trial.suggest_int('model__max_leaves', 1, 100),
    }

    pipe.set_params(**params)

    # model fitting
    cross_val_f1 = cross_val_score(pipe, X_train, list(y_train), scoring='f1').mean()

    return cross_val_f1  # Use F1 score for binary classification


mlflow.set_experiment("MLflow_hype_optuna_XGB")

with mlflow.start_run(run_name='xgb_optuna_run'):
    study = optuna.create_study(direction='maximize')  # Maximize F1 score
    study.optimize(objective, n_trials=10)

    best_trial = study.best_trial
    best_params = best_trial.params

    # Log parameters
    mlflow.log_params(best_params)

    # Set the best parameters to your model
    pipe.set_params(**best_params)

    # Train the model with the best parameters on the entire training set
    pipe.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = pipe.predict(X_test)


# Metrics
    accuracy = accuracy_score(list(y_test), y_pred)
    recall = recall_score(list(y_test), y_pred)
    f1 = f1_score(list(y_test), y_pred)

    mlflow.sklearn.log_model(pipe, 'xgb_model_optuna')

    # Log metrics
    mlflow.log_metrics({
        'accuracy': accuracy,
        'recall': recall,
        'f1_score': f1,
    })

    # Calculate ROC AUC (if needed for binary classification)
    fpr, tpr, thresholds = roc_curve(list(y_test), y_pred)
    roc_auc = roc_auc_score(list(y_test), y_pred)

    # Plot ROC curve (if needed for binary classification)
    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')

    # Save the plot to a local file
    roc_auc_plot_path = "mlruns/roc_auc_plot.png"
    plt.savefig(roc_auc_plot_path)
    plt.close()

    # Log ROC AUC plot as an artifact (if needed for binary classification)
    mlflow.log_artifact(roc_auc_plot_path, "roc_auc_plot")


[I 2024-01-11 00:59:07,974] A new study created in memory with name: no-name-57b30afa-96de-4a5a-8a54-db093dd29f6d
[W 2024-01-11 00:59:07,978] Trial 0 failed with parameters: {'model__eta': 0.1786362185217251, 'model__gamma': 1.0539854463513472, 'model__max_depth': 8, 'model__min_child_weight': 7.4760088278700145, 'model__max_delta_step': 4.221996819410605, 'model__subsample': 0.8349318851436973, 'model__colsample_bytree': 0.6110668553134777, 'model__colsample_bylevel': 0.6402830439645868, 'model__colsample_bynode': 0.5378435785888132, 'model__reg_lambda': 10.67028593140285, 'model__reg_alpha': 87.74754936249204, 'model__scale_pos_weight': 3.1905012085621443, 'model__max_leaves': 60} because of the following error: NameError("name 'pipe' is not defined").
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\envs\kanagawa\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Peter\AppData\Local\Temp\ipykern

NameError: name 'pipe' is not defined

In [None]:
logged_model_xgb = 'runs:/38d9fc155e7f4dc3b5e1cdce89a95577/xgb_model_optuna'

# Load model as a PyFuncModel.
loaded_model_xgb = mlflow.pyfunc.load_model(logged_model_xgb)

In [None]:
joblib.dump(loaded_model_xgb,'xgb_optuna_model.joblib')

### 3.2 CatBoost model
Catboost is a boosted decision tree machine learning algorithm developed by Yandex. It works in the same way as other gradient boosted algorithms such as XGBoost but provides support out of the box for categorical variables, has a higher level of accuracy without tuning parameters and also offers GPU support to speed up training.


In [12]:
# Full pipeline with CatBoost
cat_pipe = Pipeline([
    ('column_transformer', ct),
    ('model', CatBoostClassifier()),  # Use CatBoostClassifier
])

NameError: name 'ct' is not defined

In [13]:
# Optimize hyperparameters with Optuna
def objective(trial):

    # Parameters for optimization
    cat_params = {
        'model__learning_rate': trial.suggest_float('model__learning_rate', 0.01, 0.2),
        'model__depth': trial.suggest_int('model__depth', 3, 10),
        'model__l2_leaf_reg': trial.suggest_float('model__l2_leaf_reg', 1, 10),
        'model__subsample': trial.suggest_float('model__subsample', 0.1, 1),
        'model__colsample_bylevel': trial.suggest_float('model__colsample_bylevel', 0.5, 1),
        'model__scale_pos_weight': trial.suggest_float('model__scale_pos_weight', 1, 10),
    }

    cat_pipe.set_params(**cat_params)

    # Model fitting
    cross_val_f1 = cross_val_score(cat_pipe, X_train, list(y_train), scoring='f1').mean()

    return cross_val_f1  # Use F1 score for binary classification

mlflow.set_experiment("MLflow_hype_optuna_CatBoost")

with mlflow.start_run(run_name='catboost_optuna_run'):
    cat_study = optuna.create_study(direction='maximize')  # Maximize F1 score
    cat_study.optimize(objective, n_trials=10)

    best_cat_trial = cat_study.best_trial
    best_cat_params = best_cat_trial.params

    # Log parameters
    mlflow.log_params(best_cat_params)

    # Set the best parameters to your CatBoost model
    cat_pipe.set_params(**best_cat_params)

    # Train the model with the best parameters on the entire training set
    cat_pipe.fit(X_train, y_train)

    # Make predictions on the test set
    y_cat_pred = cat_pipe.predict(X_test)

    # Metrics
    accuracy_cat = accuracy_score(list(y_test), y_cat_pred)
    recall_cat = recall_score(list(y_test), y_cat_pred)
    f1_cat = f1_score(list(y_test), y_cat_pred)

    mlflow.sklearn.log_model(cat_pipe, 'catboost_model_optuna')

    # Log metrics
    mlflow.log_metrics({
        'accuracy': accuracy_cat,
        'recall': recall_cat,
        'f1_score': f1_cat,
    })

    # Calculate ROC AUC (if needed for binary classification)
    fpr_cat, tpr_cat, thresholds_cat = roc_curve(list(y_test), y_cat_pred)
    roc_auc_cat = roc_auc_score(list(y_test), y_cat_pred)

    # Plot ROC curve (if needed for binary classification)
    plt.figure(figsize=(8, 8))
    plt.plot(fpr_cat, tpr_cat, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc_cat))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')

    # Save the plot to a local file
    roc_auc_cat_plot_path = "mlruns/roc_auc_cat_plot.png"
    plt.savefig(roc_auc_cat_plot_path)
    plt.close()

    # Log ROC AUC plot as an artifact (if needed for binary classification)
    mlflow.log_artifact(roc_auc_cat_plot_path, "roc_auc_cat_plot")

[I 2024-01-11 00:59:09,068] A new study created in memory with name: no-name-b489c625-c557-4ae1-be77-975c9e11ba00
[W 2024-01-11 00:59:09,071] Trial 0 failed with parameters: {'model__learning_rate': 0.14259961582120448, 'model__depth': 6, 'model__l2_leaf_reg': 6.729640716001769, 'model__subsample': 0.6548678026271023, 'model__colsample_bylevel': 0.6795087980007523, 'model__scale_pos_weight': 4.238174214234416} because of the following error: NameError("name 'cat_pipe' is not defined").
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\envs\kanagawa\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Peter\AppData\Local\Temp\ipykernel_27000\643228215.py", line 14, in objective
    cat_pipe.set_params(**cat_params)
NameError: name 'cat_pipe' is not defined
[W 2024-01-11 00:59:09,078] Trial 0 failed with value None.


NameError: name 'cat_pipe' is not defined

In [None]:
logged_model_catb = 'runs:/99549db16c9b4884b06402a72fc1809f/catboost_model_optuna'

# Load model as a PyFuncModel.
loaded_model_catb = mlflow.pyfunc.load_model(logged_model_catb)

In [14]:
joblib.dump(loaded_model_catb,'catb_optuna_model.joblib')

NameError: name 'loaded_model_catb' is not defined

### 3.3 LightGBM model
Light GBM is a gradient boosting framework that uses tree based learning algorithm.

Light GBM grows tree vertically while other algorithm grows trees horizontally meaning that Light GBM grows tree leaf-wise while other algorithm grows level-wise. It will choose the leaf with max delta loss to grow. When growing the same leaf, Leaf-wise algorithm can reduce more loss than a level-wise algorithm.

In [15]:
model_trained = lgbm_pipe.fit(X_train, list(y_train))


NameError: name 'lgbm_pipe' is not defined

In [None]:
# Full pipeline with LightGBM
lgbm_pipe = Pipeline([
    ('column_transformer', ct),
    ('model', LGBMClassifier()),  # Use LGBMClassifier
])

# Optimize hyperparameters with Optuna
def objective(trial):

    # Parameters for optimization
    lgbm_params = {
        'model__learning_rate': trial.suggest_float('model__learning_rate', 0.01, 0.2),
        'model__num_leaves': trial.suggest_int('model__num_leaves', 10, 200),
        'model__max_depth': trial.suggest_int('model__max_depth', 3, 20),
        'model__min_child_samples': trial.suggest_int('model__min_child_samples', 1, 20),
        'model__subsample': trial.suggest_float('model__subsample', 0.1, 1),
        'model__colsample_bytree': trial.suggest_float('model__colsample_bytree', 0.5, 1),
        'model__reg_lambda': trial.suggest_float('model__reg_lambda', 1e-5, 1e2),
        'model__reg_alpha': trial.suggest_float('model__reg_alpha', 1e-5, 1e2),
        'model__scale_pos_weight': trial.suggest_float('model__scale_pos_weight', 1, 10),
    }

    lgbm_pipe.set_params(**lgbm_params)

    # Model fitting
    cross_val_f1 = cross_val_score(lgbm_pipe, X_train, list(y_train), scoring='f1').mean()

    return cross_val_f1  # Use F1 score for binary classification

mlflow.set_experiment("MLflow_hype_optuna_LightGBM")

with mlflow.start_run(run_name='lgbm_optuna_run'):
    lgbm_study = optuna.create_study(direction='maximize')  # Maximize F1 score
    lgbm_study.optimize(objective, n_trials=10)

    best_lgbm_trial = lgbm_study.best_trial
    best_lgbm_params = best_lgbm_trial.params

    # Log parameters
    mlflow.log_params(best_lgbm_params)

    # Set the best parameters to your LightGBM model
    lgbm_pipe.set_params(**best_lgbm_params)

    # Train the model with the best parameters on the entire training set
    lgbm_pipe.fit(X_train, list(y_train))

    # Make predictions on the test set
    y_lgbm_pred = lgbm_pipe.predict(X_test)

    # Metrics
    accuracy_lgbm = accuracy_score(list(y_test), y_lgbm_pred)
    recall_lgbm = recall_score(list(y_test), y_lgbm_pred)
    f1_lgbm = f1_score(list(y_test), y_lgbm_pred)

    mlflow.sklearn.log_model(lgbm_pipe, 'lgbm_model_optuna')

    # Log metrics
    mlflow.log_metrics({
        'accuracy': accuracy_lgbm,
        'recall': recall_lgbm,
        'f1_score': f1_lgbm,
    })

    # Calculate ROC AUC (if needed for binary classification)
    fpr_lgbm, tpr_lgbm, thresholds_lgbm = roc_curve(list(y_test), y_lgbm_pred)
    roc_auc_lgbm = roc_auc_score(list(y_test), y_lgbm_pred)

    # Plot ROC curve (if needed for binary classification)
    plt.figure(figsize=(8, 8))
    plt.plot(fpr_lgbm, tpr_lgbm, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc_lgbm))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')

    # Save the plot to a local file
    roc_auc_lgbm_plot_path = "mlruns/roc_auc_lgbm_plot.png"
    plt.savefig(roc_auc_lgbm_plot_path)
    plt.close()

    # Log ROC AUC plot as an artifact (if needed for binary classification)
    mlflow.log_artifact(roc_auc_lgbm_plot_path, "roc_auc_lgbm_plot")

In [None]:
logged_model_lgbm = logged_model = 'runs:/ed7a2bb27af84058a418a7846ad0283a/lgbm_model_optuna'

# Load model as a PyFuncModel.
loaded_model_lgbm = mlflow.pyfunc.load_model(logged_model_lgbm)

In [None]:
joblib.dump(loaded_model_lgbm,'loaded_model_lgbm.joblib')


### 3.4 K-Nearest Neighbors (K-NN) model

K-Nearest Neighbors is a simple and intuitive algorithm used for both classification and regression tasks. The basic idea is to classify or predict a new data point based on the majority class or average of its k-nearest neighbors in the feature space. The algorithm calculates the distance between data points to determine their proximity. The value of k, representing the number of neighbors, is a crucial parameter that influences the model's performance. K-NN is non-parametric, meaning it doesn't make assumptions about the underlying data distribution, making it versatile for different types of datasets.

In [None]:
# Full pipeline with k-Nearest Neighbors (KNN)
knn_pipe = Pipeline([
    ('column_transformer', ct),
    ('model', KNeighborsClassifier()),  # Use KNeighborsClassifier
])


In [None]:
# Optimize hyperparameters with Optuna
def objective(trial):

    # Parameters for optimization
    knn_params = {
        'model__n_neighbors': trial.suggest_int('model__n_neighbors', 1, 20),
        'model__weights': trial.suggest_categorical('model__weights', ['uniform', 'distance']),
        'model__p': trial.suggest_int('model__p', 1, 2),  # For Minkowski distance
    }

    knn_pipe.set_params(**knn_params)

    # Model fitting
    cross_val_f1 = cross_val_score(knn_pipe, X_train, list(y_train), scoring='f1').mean()

    return cross_val_f1  # Use F1 score for binary classification

mlflow.set_experiment("MLflow_hype_optuna_KNN")

with mlflow.start_run(run_name='knn_optuna_run'):
    knn_study = optuna.create_study(direction='maximize')  # Maximize F1 score
    knn_study.optimize(objective, n_trials=10)

    best_knn_trial = knn_study.best_trial
    best_knn_params = best_knn_trial.params

    # Log parameters
    mlflow.log_params(best_knn_params)

    # Set the best parameters to your KNN model
    knn_pipe.set_params(**best_knn_params)

    # Train the model with the best parameters on the entire training set
    knn_pipe.fit(X_train, list(y_train))

    # Make predictions on the test set
    y_knn_pred = knn_pipe.predict(X_test)

    # Metrics
    accuracy_knn = accuracy_score(list(y_test), y_knn_pred)
    recall_knn = recall_score(list(y_test), y_knn_pred)
    f1_knn = f1_score(list(y_test), y_knn_pred)

    mlflow.sklearn.log_model(knn_pipe, 'knn_model_optuna')

    # Log metrics
    mlflow.log_metrics({
        'accuracy': accuracy_knn,
        'recall': recall_knn,
        'f1_score': f1_knn,
    })

    # Calculate ROC AUC (if needed for binary classification)
    fpr_knn, tpr_knn, thresholds_knn = roc_curve(list(y_test), y_knn_pred)
    roc_auc_knn = roc_auc_score(list(y_test), y_knn_pred)

    # Plot ROC curve (if needed for binary classification)
    plt.figure(figsize=(8, 8))
    plt.plot(fpr_knn, tpr_knn, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc_knn))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')

    # Save the plot to a local file
    roc_auc_knn_plot_path = "mlruns/roc_auc_knn_plot.png"
    plt.savefig(roc_auc_knn_plot_path)
    plt.close()

    # Log ROC AUC plot as an artifact (if needed for binary classification)
    mlflow.log_artifact(roc_auc_knn_plot_path, "roc_auc_knn_plot")

In [None]:
logged_model_knn = 'runs:/2cb937454c6849959dd3f11300981261/knn_model_optuna'

# Load model as a PyFuncModel.
loaded_model_knn = mlflow.pyfunc.load_model(logged_model_knn)

In [None]:
joblib.dump(loaded_model_knn,'knn_optuna_model.joblib')


### 3.5 Logistic Regression model

Logistic Regression is a statistical method used for binary and multi-class classification problems. Despite its name, it is a classification algorithm rather than a regression one. It predicts the probability of an instance belonging to a particular class, and then makes a discrete prediction based on a threshold.

In [None]:

# Full pipeline with Logistic Regression
logreg_pipe = Pipeline([
    ('column_transformer', ct),
    ('model', LogisticRegression()),  # Use Logistic Regression
])

# Optimize hyperparameters with Optuna
def objective(trial):

    # Parameters for optimization
    logreg_params = {
        'model__penalty': trial.suggest_categorical('model__penalty', ['none', 'l2']),
        'model__C': trial.suggest_float('model__C', 1e-5, 1e2),
        'model__fit_intercept': trial.suggest_categorical('model__fit_intercept', [True, False]),
    }

    logreg_pipe.set_params(**logreg_params)

    # Model fitting
    cross_val_f1 = cross_val_score(logreg_pipe, X_train, list(y_train), scoring='f1').mean()

    return cross_val_f1  # Use F1 score for binary classification

mlflow.set_experiment("MLflow_hype_optuna_LogisticRegression")

with mlflow.start_run(run_name='logreg_optuna_run'):
    logreg_study = optuna.create_study(direction='maximize')  # Maximize F1 score
    logreg_study.optimize(objective, n_trials=10)

    best_logreg_trial = logreg_study.best_trial
    best_logreg_params = best_logreg_trial.params

    # Log parameters
    mlflow.log_params(best_logreg_params)

    # Set the best parameters to your Logistic Regression model
    logreg_pipe.set_params(**best_logreg_params)

    # Train the model with the best parameters on the entire training set
    logreg_pipe.fit(X_train, list(y_train))

    # Make predictions on the test set
    y_logreg_pred = logreg_pipe.predict(X_test)

    # Metrics
    accuracy_logreg = accuracy_score(list(y_test), y_logreg_pred)
    recall_logreg = recall_score(list(y_test), y_logreg_pred)
    f1_logreg = f1_score(list(y_test), y_logreg_pred)

    mlflow.sklearn.log_model(logreg_pipe, 'logreg_model_optuna')

    # Log metrics
    mlflow.log_metrics({
        'accuracy': accuracy_logreg,
        'recall': recall_logreg,
        'f1_score': f1_logreg,
    })

    # Calculate ROC AUC (if needed for binary classification)
    fpr_logreg, tpr_logreg, thresholds_logreg = roc_curve(list(y_test), y_logreg_pred)
    roc_auc_logreg = roc_auc_score(list(y_test), y_logreg_pred)

    # Plot ROC curve (if needed for binary classification)
    plt.figure(figsize=(8, 8))
    plt.plot(fpr_logreg, tpr_logreg, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc_logreg))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')

    # Save the plot to a local file
    roc_auc_logreg_plot_path = "mlruns/roc_auc_logreg_plot.png"
    plt.savefig(roc_auc_logreg_plot_path)
    plt.close()

    # Log ROC AUC plot as an artifact (if needed for binary classification)
    mlflow.log_artifact(roc_auc_logreg_plot_path, "roc_auc_logreg_plot")


In [None]:
logged_model_logreg = 'runs:/b2a4f5cce8d94a8f993cd5d4acae3ddd/logreg_model_optuna'

# Load model as a PyFuncModel.
loaded_model_logreg = mlflow.pyfunc.load_model(logged_model_logreg)

In [None]:
joblib.dump(loaded_model_logreg,'logreg_optuna_model.joblib')


### 3.7 Naive Bayes model

Naive Bayes is a probabilistic classification algorithm based on Bayes' theorem, which calculates the probability of a hypothesis (class) given the observed evidence (features). The "naive" assumption in Naive Bayes is that all features are conditionally independent given the class. This simplifying assumption significantly reduces computational complexity, making it computationally efficient. Naive Bayes is computationally efficient due to the independence assumption, making it particularly useful for large datasets.

In [None]:


# Full pipeline with Gaussian Naive Bayes
nb_pipe = Pipeline([
    ('column_transformer', ct),
    ('model', GaussianNB()),  # Use Gaussian Naive Bayes
])

# Optimize hyperparameters with Optuna
def objective(trial):


    # Model fitting
    cross_val_f1 = cross_val_score(nb_pipe, X_train, list(y_train), scoring='f1').mean()

    return cross_val_f1  # Use F1 score for binary classification

mlflow.set_experiment("MLflow_hype_optuna_GaussianNB")

with mlflow.start_run(run_name='nb_optuna_run'):
    nb_study = optuna.create_study(direction='maximize')  # Maximize F1 score
    nb_study.optimize(objective, n_trials=10)

    best_nb_trial = nb_study.best_trial
    best_nb_params = best_nb_trial.params

    # Log parameters
    mlflow.log_params(best_nb_params)

    # Set the best parameters to your Naive Bayes model
    nb_pipe.set_params(**best_nb_params)

    # Train the model with the best parameters on the entire training set
    nb_pipe.fit(X_train, list(y_train))

    # Make predictions on the test set
    y_nb_pred = nb_pipe.predict(X_test)

    # Metrics
    accuracy_nb = accuracy_score(list(y_test), y_nb_pred)
    recall_nb = recall_score(list(y_test), y_nb_pred)
    f1_nb = f1_score(list(y_test), y_nb_pred)

    mlflow.sklearn.log_model(nb_pipe, 'nb_model_optuna')

    # Log metrics
    mlflow.log_metrics({
        'accuracy': accuracy_nb,
        'recall': recall_nb,
        'f1_score': f1_nb,
    })

    # Calculate ROC AUC (if needed for binary classification)
    fpr_nb, tpr_nb, thresholds_nb = roc_curve(list(y_test), y_nb_pred)
    roc_auc_nb = roc_auc_score(list(y_test), y_nb_pred)

    # Plot ROC curve (if needed for binary classification)
    plt.figure(figsize=(8, 8))
    plt.plot(fpr_nb, tpr_nb, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc_nb))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')

    # Save the plot to a local file
    roc_auc_nb_plot_path = "mlruns/roc_auc_nb_plot.png"
    plt.savefig(roc_auc_nb_plot_path)
    plt.close()

    # Log ROC AUC plot as an artifact (if needed for binary classification)
    mlflow.log_artifact(roc_auc_nb_plot_path, "roc_auc_nb_plot")


In [None]:
logged_model_nb = 'runs:/c20bce1fd6ba4040bc023d2732097985/nb_model_optuna'

# Load model as a PyFuncModel.
loaded_model_nb = mlflow.pyfunc.load_model(logged_model_nb)

In [None]:
joblib.dump(loaded_model_nb,'nb_optuna_model.joblib')
