In [2]:
import pandas as pd
file_path = "diabetes.csv"
df = pd.read_csv(file_path)
df.info(), df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


(None,
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
 0            6      148             72             35        0  33.6   
 1            1       85             66             29        0  26.6   
 2            8      183             64              0        0  23.3   
 3            1       89             66             23       94  28.1   
 4            0      137             40             35      168  43.1   
 
    DiabetesPedigreeFunction  Age  Outcome  
 0                     0.627   50        1  
 1                     0.351   31        0  
 2                     0.672   32        1  
 3                     0.167   21        0  
 4                     2.288   33        1  )

In [4]:
# Define BMI categories based on standard medical guidelines
def categorize_bmi(bmi):
    if bmi < 18.5:
        return "Underweight"
    elif 18.5 <= bmi < 24.9:
        return "Normal weight"
    elif 25 <= bmi < 29.9:
        return "Overweight"
    else:
        return "Obese"

# Apply the function to create the new column
df["BMI_category"] = df["BMI"].apply(categorize_bmi)

# Display value counts to verify
df["BMI_category"].value_counts()


BMI_category
Obese            478
Overweight       174
Normal weight    101
Underweight       15
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

# Define features and target variable
X = df.drop(columns=["Outcome"])  # Features
y = df["Outcome"]  # Target variable

# Split data into training (80%) and validation (20%) sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display the shape of the resulting datasets
X_train.shape, X_val.shape, y_train.shape, y_val.shape

##The dataset has been split into training (614 samples) and validation (154 samples) sets.


((614, 9), (154, 9), (614,), (154,))

In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Identify numeric and categorical features
numeric_features = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
categorical_features = ["BMI_category"]

# Create column transformer with StandardScaler and OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(), categorical_features)
    ]
)

# Fit and transform the training set
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the validation set
X_val_transformed = preprocessor.transform(X_val)

# Display transformed feature shape
X_train_transformed.shape, X_val_transformed.shape


((614, 12), (154, 12))

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Experimenting with different values of k for KNN
knn_scores = {}
for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_transformed, y_train)
    y_pred = knn.predict(X_val_transformed)
    knn_scores[k] = f1_score(y_val, y_pred)

# Select the best k value
best_k = max(knn_scores, key=knn_scores.get)
best_knn_f1 = knn_scores[best_k]

# Experimenting with different values of max_depth for Decision Tree
dt_scores = {}
for depth in [3, 5, 7]:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train_transformed, y_train)
    y_pred = dt.predict(X_val_transformed)
    dt_scores[depth] = f1_score(y_val, y_pred)

# Select the best max_depth value
best_depth = max(dt_scores, key=dt_scores.get)
best_dt_f1 = dt_scores[best_depth]

best_k, best_knn_f1, best_depth, best_dt_f1


#The best hyperparameters based on F1 score are:

##KNN: K=5 with an F1 score of 0.588.
##Decision Tree: max_depth=5 with an F1 score of 0.704.


(5, 0.5882352941176471, 5, 0.7037037037037037)

In [30]:
import joblib
import os

# Define a writable directory
save_dir = "./saved_models/"  # Ensure this directory exists
scaler_path = os.path.join(save_dir, "scaler.pkl")
encoder_path = os.path.join(save_dir, "encoder.pkl")
model_path = os.path.join(save_dir, "best_model.pkl")

# Create directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

# Save StandardScaler and OneHotEncoder separately
joblib.dump(preprocessor.named_transformers_["num"], scaler_path)  # Save StandardScaler
joblib.dump(preprocessor.named_transformers_["cat"], encoder_path)  # Save OneHotEncoder

# Save the best Decision Tree model
joblib.dump(dt, model_path)

# Confirm that files are saved correctly
print("Files saved successfully in:", save_dir)
print("Saved files:", os.listdir(save_dir))


Files saved successfully in: ./saved_models/
Saved files: ['scaler.pkl', 'encoder.pkl', 'best_model.pkl']


In [34]:
import joblib
import os

# Define the correct path where files were saved
save_dir = "./saved_models/"

# Ensure files exist before loading
if not os.path.exists(os.path.join(save_dir, "scaler.pkl")):
    raise FileNotFoundError("scaler.pkl not found in the expected directory.")
if not os.path.exists(os.path.join(save_dir, "best_model.pkl")):
    raise FileNotFoundError("best_model.pkl not found in the expected directory.")

# Load the saved preprocessor and model
scaler = joblib.load(os.path.join(save_dir, "scaler.pkl"))
encoder = joblib.load(os.path.join(save_dir, "encoder.pkl"))
model = joblib.load(os.path.join(save_dir, "best_model.pkl"))

print("Files loaded successfully!")


Files loaded successfully!


In [38]:
import joblib
import numpy as np
import pandas as pd

# Define correct file paths
save_dir = "./saved_models/"  # Update this to the correct path if needed

# Load the saved preprocessor and model
scaler = joblib.load(os.path.join(save_dir, "scaler.pkl"))
encoder = joblib.load(os.path.join(save_dir, "encoder.pkl"))  # Load OneHotEncoder
model = joblib.load(os.path.join(save_dir, "best_model.pkl"))

# Define numerical and categorical columns (same as training)
num_features = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]
cat_features = ["BMI_category"]

def preprocess(sample):
    """
    Function to preprocess a sample before prediction.
    :param sample: Dictionary containing feature values
    :return: Transformed sample (ready for model prediction)
    """
    # Convert sample to DataFrame
    sample_df = pd.DataFrame([sample])

    # Apply OneHotEncoder transformation to categorical features
    sample_cat = encoder.transform(sample_df[cat_features]).toarray()

    # Apply StandardScaler to numerical features
    sample_num = scaler.transform(sample_df[num_features])

    # Concatenate numerical and encoded categorical features
    sample_transformed = np.hstack((sample_num, sample_cat))

    return sample_transformed

def predict(sample):
    """
    Function to preprocess and predict the outcome for a given sample.
    :param sample: Dictionary containing feature values
    :return: Predicted class (0 or 1)
    """
    sample_transformed = preprocess(sample)
    prediction = model.predict(sample_transformed)
    return prediction[0]

# Example test cases from validation set
test_samples = [
    {
        "Pregnancies": 2,
        "Glucose": 120,
        "BloodPressure": 70,
        "SkinThickness": 20,
        "Insulin": 79,
        "BMI": 25.5,
        "DiabetesPedigreeFunction": 0.45,
        "Age": 30,
        "BMI_category": "Overweight"
    },
    {
        "Pregnancies": 5,
        "Glucose": 140,
        "BloodPressure": 80,
        "SkinThickness": 35,
        "Insulin": 90,
        "BMI": 29.0,
        "DiabetesPedigreeFunction": 0.55,
        "Age": 45,
        "BMI_category": "Overweight"
    }
]

# Run predictions
for i, sample in enumerate(test_samples):
    print(f"Sample {i+1}: Predicted Outcome =", predict(sample))


Sample 1: Predicted Outcome = 0
Sample 2: Predicted Outcome = 1
