In [1]:
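# Expected folder structure (all paths in this notebook are relative to it):
#--Data                      (input CSV: diabetes.csv)
#--Saved_objects             (fitted scaler, encoder and best model are written here as .pkl)
#--Inference_A1_J013.ipynb
#--J013_A1.ipynb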

In [2]:
import os

import joblib
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier


In [3]:
# Load the raw dataset from the Data/ folder next to this notebook.
csv_path = "Data/diabetes.csv"
data = pd.read_csv(csv_path)

# Note: head() is not the last expression of the cell, so Jupyter does not
# render its result; only the info() summary below is shown.
data.head()

# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
def categorize_bmi(bmi):
    """Map a numeric BMI value to a weight-category label.

    Bands (upper bounds are exclusive):
        bmi < 18.5        -> "Underweight"
        18.5 <= bmi < 25  -> "Normal weight"
        25 <= bmi < 30    -> "Overweight"
        otherwise         -> "Obese"

    Args:
        bmi: Body-mass index as a number.

    Returns:
        One of the four label strings listed above.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    bands = (
        (18.5, "Underweight"),
        (25, "Normal weight"),
        (30, "Overweight"),
    )
    for upper_bound, label in bands:
        if bmi < upper_bound:
            return label
    # Reached for bmi >= 30, and also for NaN because every `<` test is False.
    return "Obese"

# Derive a categorical feature from the numeric BMI column, one label per row.
bmi_labels = data["BMI"].map(categorize_bmi)
data["BMI_category"] = bmi_labels

In [5]:
# Stratified 80/20 hold-out split on the target column.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data["Outcome"])
# Work on independent copies: the column assignments below would otherwise
# write into slices of `data`, which can trigger pandas' SettingWithCopyWarning.
train_data = train_data.copy()
val_data = val_data.copy()

# Standardise the numeric columns. The scaler is fitted on the training part
# only and then re-used (transform only) on the validation part.
numeric_features = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
scaler = StandardScaler()
train_data[numeric_features] = scaler.fit_transform(train_data[numeric_features])
val_data[numeric_features] = scaler.transform(val_data[numeric_features])

# One-hot encode the categorical column, again fitting on the training part only.
categorical_features = ["BMI_category"]
try:
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
    # keyword was removed in 1.4, where it raises TypeError.
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn (< 1.2) only understands the original keyword.
    encoder = OneHotEncoder(sparse=False)
train_encoded = encoder.fit_transform(train_data[categorical_features])
val_encoded = encoder.transform(val_data[categorical_features])

# Wrap the dense encoder output in DataFrames with readable column names.
encoded_columns = encoder.get_feature_names_out(categorical_features)
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_columns)
val_encoded_df = pd.DataFrame(val_encoded, columns=encoded_columns)

# The encoded frames carry a fresh 0..n-1 index, so the split frames are
# re-indexed the same way before the column-wise concat; the original
# categorical column is dropped afterwards.
train_data = pd.concat([train_data.reset_index(drop=True), train_encoded_df], axis=1).drop(columns=categorical_features)
val_data = pd.concat([val_data.reset_index(drop=True), val_encoded_df], axis=1).drop(columns=categorical_features)

# Separate the features from the target.
X_train = train_data.drop(columns=["Outcome"])
y_train = train_data["Outcome"]
X_val = val_data.drop(columns=["Outcome"])
y_val = val_data["Outcome"]



In [6]:
def _select_by_validation_f1(make_model, candidates):
    """Fit one model per candidate value and keep the best by validation F1.

    Each model is trained on (X_train, y_train) and scored with f1_score on
    (X_val, y_val). Ties keep the earliest candidate (strict ``>``).

    Args:
        make_model: Callable taking one candidate value and returning an
            unfitted estimator.
        candidates: Iterable of hyper-parameter values to try.

    Returns:
        Tuple ``(best_value, best_fitted_model, best_f1)``.
    """
    # Start below the minimum possible F1 (0) so the first candidate is always
    # accepted and the selected value can never remain None.
    best_value, best_fitted, best_f1 = None, None, -1.0
    for value in candidates:
        model = make_model(value)
        model.fit(X_train, y_train)
        f1 = f1_score(y_val, model.predict(X_val))
        if f1 > best_f1:
            best_value, best_fitted, best_f1 = value, model, f1
    return best_value, best_fitted, best_f1


# Search the number of neighbours for KNN.
best_k, best_knn, best_knn_f1 = _select_by_validation_f1(
    lambda k: KNeighborsClassifier(n_neighbors=k), [3, 5, 7]
)

# Search the maximum depth for the decision tree (seeded for reproducibility).
best_depth, best_dt, best_dt_f1 = _select_by_validation_f1(
    lambda depth: DecisionTreeClassifier(max_depth=depth, random_state=42), [3, 5, 7]
)

# Keep the family with the strictly higher validation F1; on a tie the
# decision tree wins. The already-fitted estimator is re-used, not refit.
if best_knn_f1 > best_dt_f1:
    best_model = best_knn
    print(f"Best Model: KNN (k={best_k}, F1 Score={best_knn_f1})")
else:
    best_model = best_dt
    print(f"Best Model: Decision Tree (max_depth={best_depth}, F1 Score={best_dt_f1})")

# Persist the fitted preprocessing objects and the chosen model. The output
# folder is created first because joblib.dump does not create directories.
os.makedirs("Saved_objects", exist_ok=True)
joblib.dump(scaler, "Saved_objects/scaler.pkl")
joblib.dump(encoder, "Saved_objects/encoder.pkl")
joblib.dump(best_model, "Saved_objects/best_model.pkl")

Best Model: Decision Tree (max_depth=5, F1 Score=0.7155963302752293)


['Saved_objects/best_model.pkl']