In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [2]:
# Read the diabetes dataset from a CSV file located in the working directory.
csv_path = "diabetes (1).csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows (the default for head()) to check the loaded columns.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
def BMI_category(BMI):
    """Map a BMI value to a weight-status label.

    Bands (lower bound inclusive, upper bound exclusive):
        BMI < 18.5        -> 'Underweight'
        18.5 <= BMI < 25  -> 'Normal'
        25 <= BMI < 30    -> 'Overweight'
        BMI >= 30         -> 'Obese'

    Parameters
    ----------
    BMI : float
        Body-mass index value for one record.

    Returns
    -------
    str or None
        The category label, or None when BMI is missing (None / NaN).
    """
    # Every comparison against NaN is False, so without this guard a missing
    # BMI would fall through to the last branch and be labelled 'Obese'.
    if pd.isna(BMI):
        return None

    # NOTE(review): a BMI of 0 is labelled 'Underweight' here; confirm whether
    # 0 is used in this dataset as a placeholder for a missing measurement.
    if BMI < 18.5:
        return 'Underweight'
    if BMI < 25:
        return 'Normal'
    if BMI < 30:
        return 'Overweight'
    return 'Obese'

In [5]:
# Derive a categorical weight-status column from the numeric BMI column.
bmi_labels = df['BMI'].apply(BMI_category)
df['BMI_category'] = bmi_labels

In [8]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train, val = train_test_split(
    df,
    test_size=0.2,
    random_state=20,
)

In [9]:
# Report (rows, columns) for the training split, then the validation split.
for split_frame in (train, val):
    print(split_frame.shape)

(614, 10)
(154, 10)


In [12]:
# One-hot encoder returning a dense array; the first category of each feature
# is dropped so the remaining indicator columns are not redundant.
encode = OneHotEncoder(drop='first', sparse_output=False)

# Standardizer for the numeric columns.
scale = StandardScaler()

In [13]:
# Columns that are standardized with `scale`.
numeric_features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Columns that are one-hot encoded with `encode`.
categorical_features = ['BMI_category']

In [14]:
# Fit the scaler on the training rows only, then apply the same fitted
# statistics to the validation rows so no validation data informs the fit.
scaled_train_values = scale.fit_transform(train[numeric_features])
scaled_val_values = scale.transform(val[numeric_features])

# Write the standardized values back over the original numeric columns.
train[numeric_features] = scaled_train_values
val[numeric_features] = scaled_val_values

In [15]:
# Display the training frame after the numeric columns were standardized.
train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI_category
446,-0.844670,-0.652172,0.159016,-0.551331,-0.088078,-0.858281,0.536640,-0.431718,0,Overweight
260,-0.250595,2.233438,-0.055568,-0.360366,0.440046,-0.138690,-0.533988,0.082657,0,Obese
570,-0.250595,-1.349793,0.051724,-1.315192,-0.704223,0.066907,-0.620473,0.511303,0,Obese
590,2.125704,-0.303362,0.802767,1.231009,-0.704223,1.904434,1.332900,1.025678,1,Obese
30,0.343480,-0.366782,0.319953,0.339839,-0.704223,0.516651,0.202628,2.311616,0,Obese
...,...,...,...,...,...,...,...,...,...,...
218,0.343480,-1.127823,0.266307,0.085219,-0.704223,-0.382837,2.224592,-0.088801,1,Overweight
223,0.937554,0.679648,-0.484735,0.785424,0.968170,-0.408537,0.623125,2.397345,0,Overweight
271,-0.547632,-0.398492,-0.377444,0.721769,-0.211307,-0.871131,-1.043952,-1.031822,0,Overweight
474,0.046442,-0.208232,-0.270152,-1.315192,-0.704223,-0.395687,-1.049917,-0.774634,0,Overweight


In [16]:
# Learn the category set from the training rows, then encode both splits with it.
train_categoricals = train[categorical_features]
val_categoricals = val[categorical_features]

encoded_train = encode.fit_transform(train_categoricals)
encoded_val = encode.transform(val_categoricals)

In [17]:
# Wrap the encoded arrays in DataFrames that carry the encoder's output column
# names and the original row index, so they line up with `train` / `val`.
encoded_columns = encode.get_feature_names_out(categorical_features)

encoded_train_df = pd.DataFrame(
    encoded_train,
    index=train.index,
    columns=encoded_columns,
)
encoded_val_df = pd.DataFrame(
    encoded_val,
    index=val.index,
    columns=encoded_columns,
)

In [18]:
# Swap the raw categorical columns for their one-hot encoded counterparts.
train_without_categoricals = train.drop(columns=categorical_features)
val_without_categoricals = val.drop(columns=categorical_features)

train = pd.concat([train_without_categoricals, encoded_train_df], axis="columns")
val = pd.concat([val_without_categoricals, encoded_val_df], axis="columns")

In [19]:
from sklearn.neighbors import KNeighborsClassifier 

In [20]:
# k-nearest-neighbours classifier; n_neighbors=5 is the library default, spelled out here.
knn = KNeighborsClassifier(n_neighbors=5)

In [21]:
# Separate the label column from the feature columns for each split.
target_column = 'Outcome'

X_train, y_train = train.drop(columns=[target_column]), train[target_column]
X_val, y_val = val.drop(columns=[target_column]), val[target_column]

In [22]:
# Train the k-NN classifier on the training features and labels.
knn.fit(X=X_train, y=y_train)

In [23]:
# Predict labels for the validation features with the fitted k-NN classifier.
y_pred = knn.predict(X=X_val)

In [24]:
from sklearn.metrics import accuracy_score

# scikit-learn metrics take the ground truth first: accuracy_score(y_true, y_pred).
# Accuracy is symmetric, so the printed value is unchanged; the order now matches
# the library signature and the f1_score call used for the decision tree.
print(accuracy_score(y_val, y_pred))

0.7727272727272727


In [25]:
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

# Candidate tree depths to compare on the validation split.
max_depth_values = [3, 5, 7]
best_f1 = 0
best_depth = None
best_model = None

for depth in max_depth_values:
    # Fixed random_state keeps the fitted tree reproducible between runs.
    model = DecisionTreeClassifier(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)

    # NOTE: this reassigns `y_pred`, replacing the k-NN predictions stored
    # under the same name by an earlier cell.
    y_pred = model.predict(X_val)

    f1 = f1_score(y_val, y_pred)
    print(f"Max Depth: {depth}, F1 Score: {f1:.4f}")

    # Always keep the first candidate: with only `f1 > best_f1`, a run in which
    # every depth scores F1 == 0 would leave `best_model` as None and the
    # predict/export calls below would raise AttributeError.
    if best_model is None or f1 > best_f1:
        best_f1 = f1
        best_depth = depth
        best_model = model

# The depth is chosen on the same validation split that is scored here, so the
# reported best F1 is an optimistic estimate of performance on unseen data.
print(f"\nBest Max Depth: {best_depth}, Best F1 Score: {best_f1:.4f}")

# Validation predictions from the selected tree.
y_best_pred = best_model.predict(X_val)

# Print the selected tree's split rules using the training column names.
print("\nDecision Tree Structure:\n")
print(export_text(best_model, feature_names=list(X_train.columns)))


Max Depth: 3, F1 Score: 0.6598
Max Depth: 5, F1 Score: 0.4490
Max Depth: 7, F1 Score: 0.6154

Best Max Depth: 3, Best F1 Score: 0.6598

Decision Tree Structure:

|--- Glucose <= 0.22
|   |--- Age <= -0.39
|   |   |--- BMI <= 1.72
|   |   |   |--- class: 0
|   |   |--- BMI >  1.72
|   |   |   |--- class: 1
|   |--- Age >  -0.39
|   |   |--- BMI <= -0.65
|   |   |   |--- class: 0
|   |   |--- BMI >  -0.65
|   |   |   |--- class: 0
|--- Glucose >  0.22
|   |--- BMI_category_Obese <= 0.50
|   |   |--- Glucose <= 0.79
|   |   |   |--- class: 0
|   |   |--- Glucose >  0.79
|   |   |   |--- class: 0
|   |--- BMI_category_Obese >  0.50
|   |   |--- Glucose <= 1.17
|   |   |   |--- class: 1
|   |   |--- Glucose >  1.17
|   |   |   |--- class: 1

