**Get Datasets**

In [46]:
import pandas as pd
import numpy as np

df_raw = pd.read_csv("D:\School\ITC\Y3\Semet 1\Intro DS\Projects\Diabetes-Classification\selected_dataset\DiaHealth A Bangladeshi Dataset for Type 2 Diabetes Prediction\Cleaned_Datasets\Diabetes_Final_Data_Cleaned_v3.csv")

In [47]:
df = df_raw.copy()
df.tail()

Unnamed: 0,age,gender,systolic_bp,diastolic_bp,glucose,bmi,family_diabetes,hypertensive,diabetic
4725,70,1,146,77,9.42,18.35,0,1,0
4726,74,1,164,89,6.47,24.99,0,1,0
4727,75,1,141,104,8.31,22.75,0,0,1
4728,36,0,139,80,4.9,17.87,0,0,0
4729,26,0,134,93,5.15,30.92,0,0,0


# **1. Linear-Based Models**

## **1.1. Logistic Regression**

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer

# get data
X = df.drop(['diabetic'], axis=1)
y = df['diabetic']

# resample data
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# split resampled data
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42)

# specify features to be scaled
features_to_scale = ['age', 'systolic_bp', 'diastolic_bp', 'glucose', 'bmi']

# Define the feature categories for scaling
skewed_features = ['glucose', 'systolic_bp']
outlier_features = ['diastolic_bp']
normal_features = ['bmi', 'age']

# Define the scaling transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('power', PowerTransformer(method='yeo-johnson'), skewed_features),
        ('robust', RobustScaler(), outlier_features),
        ('standard', StandardScaler(), normal_features)
    ],
    remainder='passthrough'
)

# Scale the data using the preprocessor
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# train data
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# predict
y_pred = model.predict(X_test_scaled)

# evaluation
print("Confustion Metrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))

Confustion Metrix: 
[[1136  239]
 [ 362  992]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.76      0.83      0.79      1375
           1       0.81      0.73      0.77      1354

    accuracy                           0.78      2729
   macro avg       0.78      0.78      0.78      2729
weighted avg       0.78      0.78      0.78      2729



**Save the model and scaler obj**

In [49]:
# import os
# import joblib

# # Specify the directory path
# save_dir = r"D:\School\ITC\Y3\Semet 1\Intro DS\Projects\Diabetes-Classification\models"

# # Save the model and preprocessor
# print(joblib.dump(model, os.path.join(save_dir, 'logistic_model.pkl')))
# print(joblib.dump(preprocessor, os.path.join(save_dir, 'logistic_preprocessor.pkl')))

**Load model to use**

In [50]:
import joblib, os
import pandas as pd

# Specify the directory path
save_dir = r"D:\School\ITC\Y3\Semet 1\Intro DS\Projects\Diabetes-Classification\models"

# Load the model and preprocessor
model = joblib.load(os.path.join(save_dir, 'logistic_model.pkl'))
preprocessor = joblib.load(os.path.join(save_dir, 'logistic_preprocessor.pkl'))

# New data to make predictions on
X_new = pd.DataFrame(
    {
        'age': [70.00],
        'gender': [0],
        'systolic_bp': [146.00],
        'diastolic_bp': [84.00],
        'glucose': [9.51],
        'bmi': [20.04],
        'family_diabetes': [0],
        'hypertensive': [0],
    }
)

# Transform the new data using the preprocessor
X_new_transformed = preprocessor.transform(X_new)

# Make predictions
predictions = model.predict(X_new_transformed)

# print the result
if predictions == 1:
    print("Yes, he or she has diabetes")
else: 
    print("No, he or she doesn't has diabetes")

Yes, he or she has diabetes


## **1.2. LDA (LiDA: Linear Discriminant Analysis)**

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer

# get data
X = df.drop(['diabetic'], axis=1)
y = df['diabetic']

# resample data
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# split resampled data
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42)

# specify features to be scaled
features_to_scale = ['age', 'systolic_bp', 'diastolic_bp', 'glucose', 'bmi']

# Define the feature categories for scaling
skewed_features = ['glucose', 'systolic_bp']
outlier_features = ['diastolic_bp']
normal_features = ['bmi', 'age']

# Define the scaling transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('power', PowerTransformer(method='yeo-johnson'), skewed_features),
        ('robust', RobustScaler(), outlier_features),
        ('standard', StandardScaler(), normal_features)
    ],
    remainder='passthrough'
)

# Scale the data using the preprocessor
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# train data
model = LinearDiscriminantAnalysis()
model.fit(X_train_scaled, y_train)

# predict
y_pred = model.predict(X_test_scaled)

# evaluation
print("Confustion Metrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))

Confustion Metrix: 
[[1171  204]
 [ 404  950]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.74      0.85      0.79      1375
           1       0.82      0.70      0.76      1354

    accuracy                           0.78      2729
   macro avg       0.78      0.78      0.78      2729
weighted avg       0.78      0.78      0.78      2729



## **1.3. SVM**

In [52]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer

# get data
X = df.drop(['diabetic'], axis=1)
y = df['diabetic']

# resample data
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# split resampled data
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42)

# specify features to be scaled
features_to_scale = ['age', 'systolic_bp', 'diastolic_bp', 'glucose', 'bmi']

# Define the feature categories for scaling
skewed_features = ['glucose', 'systolic_bp']
outlier_features = ['diastolic_bp']
normal_features = ['bmi', 'age']

# Define the scaling transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('power', PowerTransformer(method='yeo-johnson'), skewed_features),
        ('robust', RobustScaler(), outlier_features),
        ('standard', StandardScaler(), normal_features)
    ],
    remainder='passthrough'
)

# Scale the data using the preprocessor
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# train data
model = SVC()
model.fit(X_train_scaled, y_train)

# predict
y_pred = model.predict(X_test_scaled)

# evaluation
print("Confustion Metrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))

Confustion Metrix: 
[[1172  203]
 [ 222 1132]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.84      0.85      0.85      1375
           1       0.85      0.84      0.84      1354

    accuracy                           0.84      2729
   macro avg       0.84      0.84      0.84      2729
weighted avg       0.84      0.84      0.84      2729



# **2. Tree-Based Models**

## **2.1. Decision Trees**

**Oversampling: `imblearn.over_sampling.RandomOverSampler`**

In [53]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Resample the data
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42)

# Train the model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Confustion Metrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report( y_test, y_pred))

Confustion Metrix: 
[[1304   71]
 [   0 1354]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.95      0.97      1375
           1       0.95      1.00      0.97      1354

    accuracy                           0.97      2729
   macro avg       0.98      0.97      0.97      2729
weighted avg       0.98      0.97      0.97      2729



**Hybrid Method: `imblearn.combine.SMOTEENN`**

In [54]:
from imblearn.combine import SMOTEENN
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Combine SMOTE with ENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train the model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Confustion Metrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report( y_test, y_pred))

Confustion Metrix: 
[[ 945  120]
 [  63 1220]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.94      0.89      0.91      1065
           1       0.91      0.95      0.93      1283

    accuracy                           0.92      2348
   macro avg       0.92      0.92      0.92      2348
weighted avg       0.92      0.92      0.92      2348



**Hybrid Method: `imblearn.combine.SMOTETomek`**

In [55]:
from imblearn.combine import SMOTETomek
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Combine SMOTE with Tomek
smote_tomek = SMOTETomek(random_state=0)
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train the model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Confustion Metrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report( y_test, y_pred))

Confustion Metrix: 
[[1187  197]
 [  92 1238]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.93      0.86      0.89      1384
           1       0.86      0.93      0.90      1330

    accuracy                           0.89      2714
   macro avg       0.90      0.89      0.89      2714
weighted avg       0.90      0.89      0.89      2714



## **2.2. Random Forest (ព្រៃអាគមន៍)**

**Oversampling: `imblearn.over_sampling.RandomOverSampler` (YK)**

In [56]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Resample the data
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Model: ", type(model).__name__)
print("Confustion Metrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report( y_test, y_pred))

Model:  RandomForestClassifier
Confustion Metrix: 
[[1356   19]
 [   0 1354]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1375
           1       0.99      1.00      0.99      1354

    accuracy                           0.99      2729
   macro avg       0.99      0.99      0.99      2729
weighted avg       0.99      0.99      0.99      2729



**Oversampling: `imblearn.combine.SMOTEENN`**

In [57]:
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Combine SMOTE with ENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Confustion Metrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report( y_test, y_pred))

Confustion Metrix: 
[[ 997   68]
 [  21 1262]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      1065
           1       0.95      0.98      0.97      1283

    accuracy                           0.96      2348
   macro avg       0.96      0.96      0.96      2348
weighted avg       0.96      0.96      0.96      2348



**Oversampling: `imblearn.combine.SMOTETomek`**

In [58]:
from imblearn.combine import SMOTETomek
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Combine SMOTE with Tomek
smote_tomek = SMOTETomek(random_state=0)
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Confustion Metrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report( y_test, y_pred))

Confustion Metrix: 
[[1242  142]
 [  42 1288]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.97      0.90      0.93      1384
           1       0.90      0.97      0.93      1330

    accuracy                           0.93      2714
   macro avg       0.93      0.93      0.93      2714
weighted avg       0.93      0.93      0.93      2714



## **2.3. Gradient Boosting Machines (GBM)**

**Oversampling: `imblearn.over_sampling.RandomOverSampler`**

In [59]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Combine SMOTE with Tomek
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train the model
model = GradientBoostingClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))

Confusion Matrix: 
[[1197  178]
 [ 135 1219]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      1375
           1       0.87      0.90      0.89      1354

    accuracy                           0.89      2729
   macro avg       0.89      0.89      0.89      2729
weighted avg       0.89      0.89      0.89      2729



## **2.4. XGBoost**

**Oversampling: `imblearn.over_sampling.RandomOverSampler`**

In [60]:
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Combine SMOTE with Tomek
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train the model
model = XGBClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))

Confusion Matrix: 
[[1339   36]
 [   0 1354]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.97      0.99      1375
           1       0.97      1.00      0.99      1354

    accuracy                           0.99      2729
   macro avg       0.99      0.99      0.99      2729
weighted avg       0.99      0.99      0.99      2729



## **2.5. LightGBM**

**Oversampling: `imblearn.over_sampling.RandomOverSampler`**

In [61]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Combine SMOTE with Tomek
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train the model
train_data = lgb.Dataset(X_train, label=y_train)
params = {'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt', 'num_leaves': 31, 'learning_rate': 0.05}
model = lgb.train(
    params, 
    train_data, 100)

# Evaluate the model
y_pred = model.predict(X_test)

[LightGBM] [Info] Number of positive: 3194, number of negative: 3173
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000559 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 735
[LightGBM] [Info] Number of data points in the train set: 6367, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501649 -> initscore=0.006597
[LightGBM] [Info] Start training from score 0.006597


In [62]:
# Print evaluation metrics
print("Confusion Matrix: ")
print(confusion_matrix(y_test, (y_pred > 0.5).astype(int)))
print("Classification Report: ")
print(classification_report(y_test, (y_pred > 0.5).astype(int)))

Confusion Matrix: 
[[1283   92]
 [   0 1354]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.93      0.97      1375
           1       0.94      1.00      0.97      1354

    accuracy                           0.97      2729
   macro avg       0.97      0.97      0.97      2729
weighted avg       0.97      0.97      0.97      2729



## **2.5. CatBoost**

**Oversampling: `imblearn.over_sampling.RandomOverSampler`**

In [68]:
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Resample the data
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train the CatBoost model
model = CatBoostClassifier(iterations=100,
                        #    learning_rate=0.1, 
                        #    depth=10, 
                           random_state=42)
model.fit(X_train, y_train)

Learning rate set to 0.187592
0:	learn: 0.6307334	total: 1.67ms	remaining: 165ms
1:	learn: 0.5824462	total: 3.04ms	remaining: 149ms
2:	learn: 0.5476780	total: 4.63ms	remaining: 150ms
3:	learn: 0.5194200	total: 6.15ms	remaining: 148ms
4:	learn: 0.5005798	total: 7.49ms	remaining: 142ms
5:	learn: 0.4809404	total: 8.93ms	remaining: 140ms
6:	learn: 0.4624360	total: 10.5ms	remaining: 139ms
7:	learn: 0.4523346	total: 12ms	remaining: 138ms
8:	learn: 0.4402018	total: 13.4ms	remaining: 136ms
9:	learn: 0.4310513	total: 14.9ms	remaining: 134ms
10:	learn: 0.4213062	total: 16.3ms	remaining: 132ms
11:	learn: 0.4140643	total: 17.9ms	remaining: 132ms
12:	learn: 0.4076326	total: 19.7ms	remaining: 132ms
13:	learn: 0.4006168	total: 21.3ms	remaining: 131ms
14:	learn: 0.3917853	total: 22.7ms	remaining: 129ms
15:	learn: 0.3867005	total: 24.5ms	remaining: 129ms
16:	learn: 0.3829175	total: 26.1ms	remaining: 128ms
17:	learn: 0.3780145	total: 27.7ms	remaining: 126ms
18:	learn: 0.3705604	total: 29.6ms	remaining: 

<catboost.core.CatBoostClassifier at 0x137e3ac0e90>

In [70]:
# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Model: ", type(model).__name__)
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))

Model:  CatBoostClassifier
Confusion Matrix: 
[[1262  113]
 [   8 1346]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.99      0.92      0.95      1375
           1       0.92      0.99      0.96      1354

    accuracy                           0.96      2729
   macro avg       0.96      0.96      0.96      2729
weighted avg       0.96      0.96      0.96      2729



# **3. Probabilistic Models**

## **3.1. Naive Bayes: Gaussian Naive Bayes (Gusion)**

In [65]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Resample the data
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train the model
model = GaussianNB()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))

Confusion Matrix: 
[[1068  307]
 [ 317 1037]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.77      0.78      0.77      1375
           1       0.77      0.77      0.77      1354

    accuracy                           0.77      2729
   macro avg       0.77      0.77      0.77      2729
weighted avg       0.77      0.77      0.77      2729



## **3.2. Naive Bayes: Multinomial Naive Bayes (Milk Tea No Moew)**

In [66]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Resample the data
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Train the model
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))

Confusion Matrix: 
[[1015  360]
 [ 567  787]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.64      0.74      0.69      1375
           1       0.69      0.58      0.63      1354

    accuracy                           0.66      2729
   macro avg       0.66      0.66      0.66      2729
weighted avg       0.66      0.66      0.66      2729



## **3.3. Naive Bayes: Bernoulli Naive Bayes (Burn Luo Yi)**

In [67]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# data
X = df.drop('diabetic', axis=1)
y = df['diabetic']

# Resample the data
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train the model
model = BernoulliNB()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))

Confusion Matrix: 
[[840  78]
 [467 435]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.64      0.92      0.76       918
           1       0.85      0.48      0.61       902

    accuracy                           0.70      1820
   macro avg       0.75      0.70      0.68      1820
weighted avg       0.74      0.70      0.69      1820

