In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

In [1]:
from tabulate import tabulate

In [3]:
data = pd.read_csv('diabetes.csv')

In [4]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
data.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [6]:
data.shape

(768, 9)

In [7]:
X = data.drop(columns='Outcome')
y = data['Outcome']

In [11]:
columns_to_replace_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
X[columns_to_replace_zeros] = X[columns_to_replace_zeros].replace(0, X[columns_to_replace_zeros].median())


In [12]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [19]:
#linear regression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)


In [15]:
y_pred_lr = lr_model.predict(X_test)

In [17]:
from sklearn.metrics import classification_report, accuracy_score

In [50]:
lr_report = classification_report(y_test, y_pred_lr,output_dict=True)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Report:\n", lr_report)
print("Accuracy:", lr_accuracy)

Logistic Regression Report:
 {'0': {'precision': 0.8118811881188119, 'recall': 0.8282828282828283, 'f1-score': 0.82, 'support': 99.0}, '1': {'precision': 0.6792452830188679, 'recall': 0.6545454545454545, 'f1-score': 0.6666666666666666, 'support': 55.0}, 'accuracy': 0.7662337662337663, 'macro avg': {'precision': 0.74556323556884, 'recall': 0.7414141414141414, 'f1-score': 0.7433333333333333, 'support': 154.0}, 'weighted avg': {'precision': 0.764511222011689, 'recall': 0.7662337662337663, 'f1-score': 0.7652380952380953, 'support': 154.0}}
Accuracy: 0.7662337662337663


In [21]:
#naive bias classification

In [22]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)


In [23]:
y_pred_nb = nb_model.predict(X_test)

In [49]:
nb_report = classification_report(y_test, y_pred_nb,output_dict=True)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Naive Bayes Report:\n", nb_report)
print("Accuracy:", nb_accuracy)

Naive Bayes Report:
 {'0': {'precision': 0.8210526315789474, 'recall': 0.7878787878787878, 'f1-score': 0.8041237113402062, 'support': 99.0}, '1': {'precision': 0.6440677966101694, 'recall': 0.6909090909090909, 'f1-score': 0.6666666666666666, 'support': 55.0}, 'accuracy': 0.7532467532467533, 'macro avg': {'precision': 0.7325602140945584, 'recall': 0.7393939393939394, 'f1-score': 0.7353951890034365, 'support': 154.0}, 'weighted avg': {'precision': 0.757843761947241, 'recall': 0.7532467532467533, 'f1-score': 0.7550319096710849, 'support': 154.0}}
Accuracy: 0.7532467532467533


In [25]:
#KNN CLASSSIFICATION

In [26]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [27]:
y_pred_knn = knn_model.predict(X_test)

In [51]:
knn_report = classification_report(y_test, y_pred_knn,output_dict=True)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("KNN Report:\n", knn_report)
print("Accuracy:", knn_accuracy)

KNN Report:
 {'0': {'precision': 0.8061224489795918, 'recall': 0.797979797979798, 'f1-score': 0.8020304568527918, 'support': 99.0}, '1': {'precision': 0.6428571428571429, 'recall': 0.6545454545454545, 'f1-score': 0.6486486486486487, 'support': 55.0}, 'accuracy': 0.7467532467532467, 'macro avg': {'precision': 0.7244897959183674, 'recall': 0.7262626262626263, 'f1-score': 0.7253395527507203, 'support': 154.0}, 'weighted avg': {'precision': 0.7478134110787172, 'recall': 0.7467532467532467, 'f1-score': 0.7472512396370264, 'support': 154.0}}
Accuracy: 0.7467532467532467


In [29]:
#DECISION TREE CLASSSIFICATION

In [30]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

In [31]:
y_pred_dt = dt_model.predict(X_test)

In [52]:
dt_report = classification_report(y_test, y_pred_dt,output_dict=True)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Report:\n", dt_report)
print("Accuracy:", dt_accuracy)

Decision Tree Report:
 {'0': {'precision': 0.8131868131868132, 'recall': 0.7474747474747475, 'f1-score': 0.7789473684210526, 'support': 99.0}, '1': {'precision': 0.6031746031746031, 'recall': 0.6909090909090909, 'f1-score': 0.6440677966101694, 'support': 55.0}, 'accuracy': 0.7272727272727273, 'macro avg': {'precision': 0.7081807081807081, 'recall': 0.7191919191919192, 'f1-score': 0.711507582515611, 'support': 154.0}, 'weighted avg': {'precision': 0.7381824524681667, 'recall': 0.7272727272727273, 'f1-score': 0.7307760927743085, 'support': 154.0}}
Accuracy: 0.7272727272727273


In [35]:
#RANDOM FOREST

In [36]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [37]:
y_pred_rf = rf_model.predict(X_test)

In [53]:
rf_report = classification_report(y_test, y_pred_rf,output_dict=True)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Report:\n", rf_report)
print("Accuracy:", rf_accuracy)

Random Forest Report:
 {'0': {'precision': 0.8163265306122449, 'recall': 0.8080808080808081, 'f1-score': 0.8121827411167513, 'support': 99.0}, '1': {'precision': 0.6607142857142857, 'recall': 0.6727272727272727, 'f1-score': 0.6666666666666666, 'support': 55.0}, 'accuracy': 0.7597402597402597, 'macro avg': {'precision': 0.7385204081632653, 'recall': 0.7404040404040404, 'f1-score': 0.739424703891709, 'support': 154.0}, 'weighted avg': {'precision': 0.7607507288629737, 'recall': 0.7597402597402597, 'f1-score': 0.7602127145274353, 'support': 154.0}}
Accuracy: 0.7597402597402597


In [39]:
# k-means cluster for classification

In [40]:
from sklearn.metrics import adjusted_rand_score

In [41]:
kmeans_model = KMeans(n_clusters=2, random_state=42)
kmeans_model.fit(X_train)

In [42]:
y_pred_kmeans = kmeans_model.predict(X_test)


In [57]:
ari_score = adjusted_rand_score(y_test, y_pred_kmeans)
print("K-Means ARI Score:", ari_score)

K-Means ARI Score: 0.1366630734219667


In [44]:
# comparision table ( basically comparing all the)

In [45]:
results = {
    "Model": ["Logistic Regression", "Naive Bayes", "KNN", "Decision Tree", "Random Forest"],
    "Accuracy": [lr_accuracy, nb_accuracy, knn_accuracy, dt_accuracy, rf_accuracy],
}

comparison_df = pd.DataFrame(results)
print(comparison_df)

                 Model  Accuracy
0  Logistic Regression  0.766234
1          Naive Bayes  0.753247
2                  KNN  0.746753
3        Decision Tree  0.727273
4        Random Forest  0.759740


In [46]:
# this is the normal overall comparision

In [47]:
# detaild comparision table

In [58]:
models_performance = {
    "Model": ["Logistic Regression", "Naive Bayes", "KNN", "Decision Tree", "Random Forest"],
    "Precision (Class 1)": [
        lr_report['1']['precision'], nb_report['1']['precision'],
        knn_report['1']['precision'], dt_report['1']['precision'],
        rf_report['1']['precision']
    ],
    "Recall (Class 1)": [
        lr_report['1']['recall'], nb_report['1']['recall'],
        knn_report['1']['recall'], dt_report['1']['recall'],
        rf_report['1']['recall']
    ],
    "F1-Score (Class 1)": [
        lr_report['1']['f1-score'], nb_report['1']['f1-score'],
        knn_report['1']['f1-score'], dt_report['1']['f1-score'],
        rf_report['1']['f1-score']
    ],
    "Accuracy": [
        lr_accuracy, nb_accuracy, knn_accuracy,
        dt_accuracy, rf_accuracy
    ]
}

comparison_df = pd.DataFrame(models_performance)
print(comparison_df)


                 Model  Precision (Class 1)  Recall (Class 1)  \
0  Logistic Regression             0.679245          0.654545   
1          Naive Bayes             0.644068          0.690909   
2                  KNN             0.642857          0.654545   
3        Decision Tree             0.603175          0.690909   
4        Random Forest             0.660714          0.672727   

   F1-Score (Class 1)  Accuracy  
0            0.666667  0.766234  
1            0.666667  0.753247  
2            0.648649  0.746753  
3            0.644068  0.727273  
4            0.666667  0.759740  


In [None]:
#RESULT:- THE  BEST IS RANDOM FOREST AFTER COMPARING