# DIABETES CLASSIFICATION 

# Import Libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

# Load Data 

In [2]:
df = pd.read_csv("/kaggle/input/multiclass-diabetes-dataset/Multiclass Diabetes Dataset/Multiclass Diabetes Dataset.csv")
df.head()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,Class
0,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
1,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,0
2,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,0
3,0,45,2.3,24,4.0,2.9,1.0,1.0,1.5,0.4,21.0,0
4,0,50,2.0,50,4.0,3.6,1.3,0.9,2.1,0.6,24.0,0


# Data Analysis 

In [3]:
df.size

3168

In [4]:
df.dtypes

Gender      int64
AGE         int64
Urea      float64
Cr          int64
HbA1c     float64
Chol      float64
TG        float64
HDL       float64
LDL       float64
VLDL      float64
BMI       float64
Class       int64
dtype: object

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Gender  264 non-null    int64  
 1   AGE     264 non-null    int64  
 2   Urea    264 non-null    float64
 3   Cr      264 non-null    int64  
 4   HbA1c   264 non-null    float64
 5   Chol    264 non-null    float64
 6   TG      264 non-null    float64
 7   HDL     264 non-null    float64
 8   LDL     264 non-null    float64
 9   VLDL    264 non-null    float64
 10  BMI     264 non-null    float64
 11  Class   264 non-null    int64  
dtypes: float64(8), int64(4)
memory usage: 24.9 KB


In [6]:
df.describe()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,Class
count,264.0,264.0,264.0,264.0,264.0,264.0,264.0,264.0,264.0,264.0,264.0,264.0
mean,0.545455,49.522727,5.671515,85.806818,6.862727,4.594394,2.151894,1.182879,2.530871,1.479167,26.626856,1.121212
std,0.498875,10.127301,4.002837,99.400047,2.544604,1.289062,1.265841,0.455591,1.000173,3.099856,5.093652,0.914857
min,0.0,25.0,1.1,6.0,0.9,0.0,0.6,0.4,0.3,0.2,19.0,0.0
25%,0.0,43.0,3.6,46.0,5.0,3.875,1.3,0.9,1.8,0.675,23.0,0.0
50%,1.0,50.0,4.7,61.0,6.1,4.5,1.8,1.1,2.5,0.9,25.0,1.0
75%,1.0,55.25,6.1,82.25,8.2,5.3,2.725,1.325,3.2,1.3,30.0,2.0
max,1.0,77.0,26.4,800.0,14.6,9.5,8.7,4.0,5.6,31.8,43.25,2.0


In [7]:
df.isnull().sum()

Gender    0
AGE       0
Urea      0
Cr        0
HbA1c     0
Chol      0
TG        0
HDL       0
LDL       0
VLDL      0
BMI       0
Class     0
dtype: int64

In [8]:
df.columns

Index(['Gender', 'AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL',
       'VLDL', 'BMI', 'Class'],
      dtype='object')

# Data Preprocessing and splitting 

In [9]:
X = df.drop("Class",axis=1)
y = df["Class"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Implementation 

In [12]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

In [13]:
if len(np.shape(y_test)) > 1 and y_test.shape[1] > 1:
    y_test = np.argmax(y_test, axis=1)
if len(np.shape(y_pred)) > 1 and y_pred.shape[1] > 1:
    y_pred = np.argmax(y_pred, axis=1)

# 📊 Confusion Matrix
print("✅ Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 📋 Classification Report
print("\n🔍 Classification Report:")
print(classification_report(y_test, y_pred))

# 🔢 Individual Metrics
print("\n📈 Evaluation Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
print("Recall (macro):", recall_score(y_test, y_pred, average='macro'))
print("F1 Score (macro):", f1_score(y_test, y_pred, average='macro'))

print("Precision (weighted):", precision_score(y_test, y_pred, average='weighted'))
print("Recall (weighted):", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score (weighted):", f1_score(y_test, y_pred, average='weighted'))

✅ Confusion Matrix:
[[14  0  0]
 [ 0 11  1]
 [ 1  0 26]]

🔍 Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        14
           1       1.00      0.92      0.96        12
           2       0.96      0.96      0.96        27

    accuracy                           0.96        53
   macro avg       0.97      0.96      0.96        53
weighted avg       0.96      0.96      0.96        53


📈 Evaluation Metrics:
Accuracy: 0.9622641509433962
Precision (macro): 0.9654320987654321
Recall (macro): 0.9598765432098765
F1 Score (macro): 0.9616673144909026
Precision (weighted): 0.9635220125786162
Recall (weighted): 0.9622641509433962
F1 Score (weighted): 0.9621792877146333
