In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import joblib

In [2]:
df = pd.read_csv('diabetes.csv')

In [3]:
df['BMI_category'] = None 
df['BMI_category'] = df['BMI_category'].fillna(
    df['BMI'].apply(
        lambda bmi: 'Underweight' if bmi < 18.5 else 
        'Healthy weight' if 18.5 <= bmi < 24.9 else 
        'Overweight' if 25 <= bmi < 29.9 else 
        'Obese'
    )
)

In [4]:
X = df.drop(columns=['Outcome'])
y = df['Outcome']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


num = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
cat = ['BMI_category']


scaler = StandardScaler()
X_trainnum = scaler.fit_transform(X_train[num])
X_valnum = scaler.transform(X_val[num])

encoder = OneHotEncoder(sparse=False)
X_traincat = encoder.fit_transform(X_train[cat])
X_valcat = encoder.transform(X_val[cat])

X_train = np.hstack((X_trainnum, X_traincat))
X_val = np.hstack((X_valnum, X_valcat))


In [5]:
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(encoder, 'encoder.pkl')

['encoder.pkl']

In [6]:
bestknnmodel = None
bestknnf1 = 0

for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_predknn = knn.predict(X_val)
    f1knn = f1_score(y_val, y_predknn)
    
    if f1knn > bestknnf1:
        bestknnf1 = f1knn
        bestknnmodel = knn

print(bestknnf1)

0.6181818181818182


In [7]:
besttreemodel = None
besttreef1 = 0

for depth in [3, 5, 7]:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
    tree.fit(X_train, y_train)
    y_predtree = tree.predict(X_val)
    f1tree = f1_score(y_val, y_predtree)
    
    if f1tree > besttreef1:
        besttreef1 = f1tree
        besttreemodel = tree

print(besttreef1)

0.693069306930693


In [8]:
joblib.dump(bestknnmodel, 'bestknnmodel.pkl')
joblib.dump(besttreemodel, 'besttreemodel.pkl')

['besttreemodel.pkl']

In [9]:
def inference_pipeline(sample_data):
    loaded_scaler = joblib.load('scaler.pkl')
    loaded_encoder = joblib.load('encoder.pkl')
    loaded_model = joblib.load('besttreemodel.pkl') 
    sample_numeric = loaded_scaler.transform(sample_data[num])
    sample_categorical = loaded_encoder.transform(sample_data[cat])
    sample_processed = np.hstack((sample_numeric, sample_categorical))
    predictions = loaded_model.predict(sample_processed)
    return predictions


In [10]:
sample_data = 6,148,72,35,0,33.6,0.627,50
predictions = inference_pipeline(sample_data)
print("Predictions for first five samples:", predictions)

TypeError: tuple indices must be integers or slices, not list