In [253]:
import pandas as pd

# Read the dataset from 'diabetes.csv' (path is relative to the notebook's
# working directory) into `df`, which the following cells transform.
df=pd.read_csv('diabetes.csv')
# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [246]:
def categorize_bmi(bmi):
    """Map a numeric BMI value to a weight-status label.

    Each class is the half-open interval ``[lower, upper)``; the upper bound
    of one class is the lower bound of the next, so no value can fall between
    two classes:

        bmi < 16.0          -> 'Severely_Underweight'
        16.0 <= bmi < 18.5  -> 'Underweight'
        18.5 <= bmi < 25.0  -> 'Normal'
        25.0 <= bmi < 30.0  -> 'Overweight'
        30.0 <= bmi < 35.0  -> 'Moderately_Obese'
        35.0 <= bmi < 40.0  -> 'Severely_Obese'
        bmi >= 40.0         -> 'Morbidly_Obese'

    Parameters
    ----------
    bmi : float
        Body-mass index value.

    Returns
    -------
    str
        One of the seven labels listed above.

    Notes
    -----
    Earlier revisions compared against 18.4 / 24.9 / 29.9 / 34.9 with ``<``,
    which pushed exactly those values into the next-higher class. Any other
    copy of this function (e.g. one used at prediction time) should use the
    same bounds so training and inference agree.

    NaN compares False against every bound and therefore ends up in
    'Morbidly_Obese'; clean missing values before calling.
    """
    # (exclusive upper bound, label), ordered from lowest to highest class.
    upper_bounds = (
        (16.0, 'Severely_Underweight'),
        (18.5, 'Underweight'),
        (25.0, 'Normal'),
        (30.0, 'Overweight'),
        (35.0, 'Moderately_Obese'),
        (40.0, 'Severely_Obese'),
    )
    for upper, label in upper_bounds:
        if bmi < upper:
            return label
    return 'Morbidly_Obese'


# Replace the numeric BMI column with its categorical label.
# The guard keeps this cell re-runnable: once 'BMI' has been dropped, running
# the cell again would otherwise fail with a KeyError on df['BMI'].
if 'BMI' in df.columns:
    df['BMI_Category'] = df['BMI'].apply(categorize_bmi)
    df = df.drop(columns=['BMI'])
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,Age,Outcome,BMI_Category
0,6,148,72,35,0,0.627,50,1,Moderately_Obese
1,1,85,66,29,0,0.351,31,0,Overweight
2,8,183,64,0,0,0.672,32,1,Normal
3,1,89,66,23,94,0.167,21,0,Overweight
4,0,137,40,35,168,2.288,33,1,Morbidly_Obese
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,0.171,63,0,Moderately_Obese
764,2,122,70,27,0,0.340,27,0,Severely_Obese
765,5,121,72,23,112,0.245,30,0,Overweight
766,1,126,60,0,0,0.349,47,1,Moderately_Obese


In [247]:
# Show the column index after the BMI -> BMI_Category replacement.
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'DiabetesPedigreeFunction', 'Age', 'Outcome', 'BMI_Category'],
      dtype='object')

In [248]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Columns that are standardised here, and the label column that stays as text.
numeric_features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'DiabetesPedigreeFunction', 'Age',
]
categorical_features = ['BMI_Category']

# Predictors (explicit column order) and the target as a one-column frame.
X = df[[
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI_Category', 'DiabetesPedigreeFunction', 'Age',
]]
y = df[['Outcome']]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Fit the scaler on the training rows only, then apply it to both splits so
# the test rows do not influence the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train[numeric_features])
X_train[numeric_features] = scaler.transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI_Category,DiabetesPedigreeFunction,Age
663,1.516591,0.750524,0.564756,1.652015,0.437496,Severely_Obese,0.529526,0.567932
712,1.812018,0.244752,-0.347904,1.020973,-0.678474,Morbidly_Obese,-0.069689,0.398450
161,0.925736,-0.608739,0.260536,1.273390,0.222886,Severely_Obese,-0.794249,0.991638
509,1.221164,-0.039745,0.463350,-1.250779,-0.678474,Overweight,-0.167519,2.601722
305,-0.551400,-0.039745,0.361943,1.084077,0.222886,Severely_Obese,-0.760619,-0.364222
...,...,...,...,...,...,...,...,...
645,-0.551400,1.129853,0.260536,0.957869,3.098656,Severely_Obese,-1.008254,-0.279481
715,0.925736,2.078175,-0.956345,0.831660,2.686605,Moderately_Obese,1.107339,0.059484
72,2.698300,0.149919,1.071790,-1.250779,-0.678474,Morbidly_Obese,0.364436,0.737415
235,0.039454,1.572403,0.159129,-1.250779,-0.678474,Morbidly_Obese,0.046486,-0.618446


In [249]:

import pandas as pd
from sklearn.preprocessing import OneHotEncoder


# One-hot encode BMI_Category. The encoder is fitted on the training split
# only and then reused, unchanged, on the test split.
encoder = OneHotEncoder(sparse_output=False)
X_train_encoded = encoder.fit_transform(X_train[['BMI_Category']])
X_test_encoded = encoder.transform(X_test[['BMI_Category']])

# Both splits share the same generated indicator-column names; the original
# row index is kept so the frames line up when concatenated below.
encoded_columns = encoder.get_feature_names_out(['BMI_Category'])
X_train_encoded_df = pd.DataFrame(
    X_train_encoded, index=X_train.index, columns=encoded_columns
)
X_test_encoded_df = pd.DataFrame(
    X_test_encoded, index=X_test.index, columns=encoded_columns
)

# Swap the text label column for its indicator columns.
X_train = pd.concat(
    [X_train.drop(columns='BMI_Category'), X_train_encoded_df], axis=1
)
X_test = pd.concat(
    [X_test.drop(columns='BMI_Category'), X_test_encoded_df], axis=1
)

X_test




Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,Age,BMI_Category_Moderately_Obese,BMI_Category_Morbidly_Obese,BMI_Category_Normal,BMI_Category_Overweight,BMI_Category_Severely_Obese,BMI_Category_Severely_Underweight,BMI_Category_Underweight
285,0.925736,0.466027,0.260536,0.389931,0.480418,0.560098,1.500086,0.0,0.0,0.0,1.0,0.0,0.0,0.0
101,-0.846827,0.940188,-0.449311,-1.250779,-0.678474,-0.870679,-0.957411,0.0,0.0,0.0,1.0,0.0,0.0,0.0
581,0.630309,-0.387463,-0.449311,0.453035,-0.678474,-0.788134,-0.533704,0.0,0.0,0.0,1.0,0.0,0.0,0.0
352,-0.255973,-1.904779,0.666163,0.516139,-0.678474,-0.675017,1.076380,1.0,0.0,0.0,0.0,0.0,0.0,0.0
726,-0.846827,-0.166188,0.463350,0.579243,0.866715,0.098458,-0.703187,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,0.630309,-0.703571,-0.449311,-0.051799,-0.214917,0.101515,-0.109998,0.0,0.0,0.0,1.0,0.0,0.0,0.0
318,-0.255973,-0.197799,-0.145091,1.210286,0.523340,-0.959338,-0.448963,0.0,0.0,0.0,0.0,1.0,0.0,0.0
154,1.221164,2.109786,0.463350,-1.250779,-0.678474,-0.999082,0.822156,0.0,1.0,0.0,0.0,0.0,0.0,0.0
684,0.334882,0.466027,0.666163,-1.250779,-0.678474,0.538697,3.025428,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [250]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

import numpy as np

# Flatten the one-column target frames into 1-D label arrays.
# np.ravel accepts both a DataFrame and an ndarray, so this cell can be
# re-run: the previous `.values.ravel()` raised AttributeError on the second
# run because y_train / y_test were already ndarrays.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

def evaluate_knn(X_train, X_test, y_train, y_test, k_values=(3, 5, 7)):
    """Fit one KNN classifier per candidate ``k`` and return the best one by F1.

    Parameters
    ----------
    X_train, y_train
        Features and labels passed to ``fit``.
    X_test, y_test
        Features and labels on which every candidate is scored.
    k_values : iterable of int, default (3, 5, 7)
        Candidate ``n_neighbors`` values, tried in the given order.

    Returns
    -------
    tuple
        ``(best_k, best_f1)``: the first candidate that reached the highest
        F1 score, and that score.

    Raises
    ------
    ValueError
        If ``k_values`` is empty.

    Notes
    -----
    The split passed as ``X_test`` / ``y_test`` is used both to choose ``k``
    and to report its score, so the returned F1 is an optimistic estimate.
    """
    best_k = None
    best_f1 = None

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        print(f"KNN (k={k}) - F1 Score: {f1}")

        # The None sentinel lets the first candidate win even when every score
        # is 0.0 (a 0 start value would have returned the invalid k=0).
        # Strict `>` keeps the earliest candidate on ties.
        if best_f1 is None or f1 > best_f1:
            best_f1 = f1
            best_k = k

    if best_k is None:
        raise ValueError("k_values must contain at least one candidate")

    return best_k, best_f1

# Score the KNN candidates on the hold-out split and keep the winner.
# Note: the split used for choosing k is the same one the score is reported
# on, so this number is an optimistic estimate.
best_k, best_f1_knn = evaluate_knn(X_train, X_test, y_train, y_test)
print(f"Best k for KNN: {best_k} with F1 Score: {best_f1_knn}")

def evaluate_decision_tree(X_train, X_test, y_train, y_test,
                           depths=(3, 5, 7), random_state=42):
    """Fit one decision tree per candidate depth and return the best one by F1.

    Parameters
    ----------
    X_train, y_train
        Features and labels passed to ``fit``.
    X_test, y_test
        Features and labels on which every candidate is scored.
    depths : iterable of int, default (3, 5, 7)
        Candidate ``max_depth`` values, tried in the given order.
    random_state : int, default 42
        Seed forwarded to every ``DecisionTreeClassifier``.

    Returns
    -------
    tuple
        ``(best_depth, best_f1)``: the first candidate that reached the
        highest F1 score, and that score.

    Raises
    ------
    ValueError
        If ``depths`` is empty.

    Notes
    -----
    The split passed as ``X_test`` / ``y_test`` is used both to choose the
    depth and to report its score, so the returned F1 is an optimistic
    estimate.
    """
    best_depth = None
    best_f1 = None

    for depth in depths:
        dt = DecisionTreeClassifier(max_depth=depth, random_state=random_state)
        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        print(f"Decision Tree (max_depth={depth}) - F1 Score: {f1}")

        # The None sentinel lets the first candidate win even when every score
        # is 0.0 (a 0 start value would have returned the invalid depth 0).
        # Strict `>` keeps the earliest candidate on ties.
        if best_f1 is None or f1 > best_f1:
            best_f1 = f1
            best_depth = depth

    if best_depth is None:
        raise ValueError("depths must contain at least one candidate")

    return best_depth, best_f1

# Score the decision-tree candidates on the hold-out split and keep the winner.
# Note: the split used for choosing max_depth is the same one the score is
# reported on, so this number is an optimistic estimate.
best_depth, best_f1_dt = evaluate_decision_tree(X_train, X_test, y_train, y_test)
print(f"Best max_depth for Decision Tree: {best_depth} with F1 Score: {best_f1_dt}")


KNN (k=3) - F1 Score: 0.5961538461538461
KNN (k=5) - F1 Score: 0.6415094339622641
KNN (k=7) - F1 Score: 0.6666666666666666
Best k for KNN: 7 with F1 Score: 0.6666666666666666
Decision Tree (max_depth=3) - F1 Score: 0.6415094339622641
Decision Tree (max_depth=5) - F1 Score: 0.6935483870967742
Decision Tree (max_depth=7) - F1 Score: 0.6041666666666666
Best max_depth for Decision Tree: 5 with F1 Score: 0.6935483870967742


In [251]:
# Display the final training matrix (scaled numeric columns followed by the
# BMI_Category indicator columns) that the models are fitted on.
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,Age,BMI_Category_Moderately_Obese,BMI_Category_Morbidly_Obese,BMI_Category_Normal,BMI_Category_Overweight,BMI_Category_Severely_Obese,BMI_Category_Severely_Underweight,BMI_Category_Underweight
663,1.516591,0.750524,0.564756,1.652015,0.437496,0.529526,0.567932,0.0,0.0,0.0,0.0,1.0,0.0,0.0
712,1.812018,0.244752,-0.347904,1.020973,-0.678474,-0.069689,0.398450,0.0,1.0,0.0,0.0,0.0,0.0,0.0
161,0.925736,-0.608739,0.260536,1.273390,0.222886,-0.794249,0.991638,0.0,0.0,0.0,0.0,1.0,0.0,0.0
509,1.221164,-0.039745,0.463350,-1.250779,-0.678474,-0.167519,2.601722,0.0,0.0,0.0,1.0,0.0,0.0,0.0
305,-0.551400,-0.039745,0.361943,1.084077,0.222886,-0.760619,-0.364222,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
645,-0.551400,1.129853,0.260536,0.957869,3.098656,-1.008254,-0.279481,0.0,0.0,0.0,0.0,1.0,0.0,0.0
715,0.925736,2.078175,-0.956345,0.831660,2.686605,1.107339,0.059484,1.0,0.0,0.0,0.0,0.0,0.0,0.0
72,2.698300,0.149919,1.071790,-1.250779,-0.678474,0.364436,0.737415,0.0,1.0,0.0,0.0,0.0,0.0,0.0
235,0.039454,1.572403,0.159129,-1.250779,-0.678474,0.046486,-0.618446,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [252]:
import joblib

# Persist the fitted preprocessing objects so the same scaling and encoding
# can be re-applied outside this notebook.
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(encoder, 'encoder.pkl')

# Refit each model with the hyperparameter chosen by the evaluation cell
# (best_k / best_depth) instead of repeating the numbers by hand, so the saved
# models cannot drift from the selection if the data or split changes.
best_model_knn = KNeighborsClassifier(n_neighbors=best_k)
best_model_knn.fit(X_train, y_train)
joblib.dump(best_model_knn, 'best_knn_model.pkl')

# random_state matches the default seed used during depth selection.
best_model_dt = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
best_model_dt.fit(X_train, y_train)
joblib.dump(best_model_dt, 'best_dt_model.pkl')


['best_dt_model.pkl']

In [3]:
from inference import predict_outcomes
import pandas as pd

df = pd.read_csv('diabetes.csv')
# Pass an independent copy of the first five rows. The SettingWithCopyWarning
# this cell used to emit points at a column assignment made on the frame that
# predict_outcomes receives; `df.head(5)` alone is a slice of `df`.
classes = predict_outcomes(df.head(5).copy())
print(classes)

[1 0 1 0 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'BMI_Category'] = df['BMI'].apply(categorize_bmi)
