In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('diabetes.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [3]:
dataset.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:
dataset.duplicated().sum()

0

In [5]:
zero_columns = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
dataset[zero_columns] = dataset[zero_columns].replace(0, np.nan)

In [6]:
dataset.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [7]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
dataset[zero_columns] = imputer.fit_transform(dataset[zero_columns])

In [8]:
dataset.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train,y_train)

In [12]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = classifier.predict(X_test)
y_pred = np.round(y_pred).astype(int)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

[[89 10]
 [24 31]]
              precision    recall  f1-score   support

           0       0.79      0.90      0.84        99
           1       0.76      0.56      0.65        55

    accuracy                           0.78       154
   macro avg       0.77      0.73      0.74       154
weighted avg       0.78      0.78      0.77       154



In [13]:
from sklearn.metrics import roc_auc_score,average_precision_score
y_pred = classifier.predict_proba(X_test)[:, 1]

pr_auc = average_precision_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score:", roc_auc)
print("PR-AUC Score:", pr_auc)

ROC-AUC Score: 0.8422405876951332
PR-AUC Score: 0.7917108732727778


## K-Nearest-Neighbors

In [86]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

params = {
    "n_neighbors": range(1, 21),
    "weights": ["uniform", "distance"],
    "p": [1, 2]
}

grid = GridSearchCV(
    KNeighborsClassifier(metric="minkowski"),
    params,
    scoring="f1",
    cv=10,
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
print("Best CV F1 Score:", grid.best_score_)

Best Params: {'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
Best CV F1 Score: 0.5858970054318892


In [87]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2,weights='uniform')
classifier.fit(X_train, y_train)

In [88]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = classifier.predict(X_test)
y_pred = np.round(y_pred).astype(int)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

[[90  9]
 [22 33]]
              precision    recall  f1-score   support

           0       0.80      0.91      0.85        99
           1       0.79      0.60      0.68        55

    accuracy                           0.80       154
   macro avg       0.79      0.75      0.77       154
weighted avg       0.80      0.80      0.79       154



In [89]:
from sklearn.metrics import roc_auc_score,average_precision_score
y_pred = classifier.predict_proba(X_test)[:, 1]

pr_auc = average_precision_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score:", roc_auc)
print("PR-AUC Score:", pr_auc)

ROC-AUC Score: 0.8142332415059688
PR-AUC Score: 0.668898558817017


## XG Boost

In [90]:
from xgboost import XGBClassifier
classifier = XGBClassifier(scale_pos_weight=4.5)
classifier.fit(X_train, y_train)

In [91]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = classifier.predict(X_test)
y_pred = np.round(y_pred).astype(int)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

[[77 22]
 [15 40]]
              precision    recall  f1-score   support

           0       0.84      0.78      0.81        99
           1       0.65      0.73      0.68        55

    accuracy                           0.76       154
   macro avg       0.74      0.75      0.75       154
weighted avg       0.77      0.76      0.76       154



In [92]:
from sklearn.metrics import roc_auc_score,average_precision_score
y_pred = classifier.predict_proba(X_test)[:, 1]

pr_auc = average_precision_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score:", roc_auc)
print("PR-AUC Score:", pr_auc)

ROC-AUC Score: 0.8359963269054178
PR-AUC Score: 0.7781436894467473


## Random Forest

In [93]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

params = {
    "n_estimators": randint(150, 500),
    "max_depth": [None, 8, 10, 12, 15],
    "min_samples_split": randint(2, 30),
    "min_samples_leaf": randint(1, 10),
    "max_features": ["sqrt", "log2"],
    "class_weight": ["balanced"]
}

rand = RandomizedSearchCV(
    RandomForestClassifier(),
    params,
    n_iter=40,
    scoring="f1",
    cv=10,
    n_jobs=-1
)

rand.fit(X_train, y_train)

print("Best Params:", rand.best_params_)
print("Best PR-AUC:", rand.best_score_)

Best Params: {'class_weight': 'balanced', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 9, 'min_samples_split': 2, 'n_estimators': 419}
Best PR-AUC: 0.6839396639658302


In [94]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(class_weight="balanced",
                                    max_depth=8,
                                    max_features="log2",
                                    min_samples_leaf=8,
                                    min_samples_split=28,
                                    n_estimators = 437, 
                                    criterion = 'entropy')
classifier.fit(X_train, y_train)

In [95]:
from sklearn.metrics import confusion_matrix, classification_report
y_pred = classifier.predict(X_test)
y_pred = np.round(y_pred).astype(int)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))

[[81 18]
 [10 45]]
              precision    recall  f1-score   support

           0       0.89      0.82      0.85        99
           1       0.71      0.82      0.76        55

    accuracy                           0.82       154
   macro avg       0.80      0.82      0.81       154
weighted avg       0.83      0.82      0.82       154



In [96]:
from sklearn.metrics import roc_auc_score,average_precision_score
y_pred = classifier.predict_proba(X_test)[:, 1]

pr_auc = average_precision_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score:", roc_auc)
print("PR-AUC Score:", pr_auc)

ROC-AUC Score: 0.8664830119375573
PR-AUC Score: 0.8058498335717981
