# Machine Learning Classification model
In this section we'll preprocess the data and build a classification model that classifies patients as Diabetic/Non-Diabetic using the features in the dataset.

## 1. Import libraries

In [21]:
# Imports & settings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import (train_test_split, StratifiedKFold, cross_val_score,
                                     cross_validate, GridSearchCV)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (roc_auc_score, roc_curve, confusion_matrix, accuracy_score,
                             classification_report, f1_score, precision_score, recall_score)
# Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, \
AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.svm import SVC


## 2. Load & Quick sanity check

In [7]:
# ----------------------------------------------------------------
# Read the train and test CSV files, then show their (rows, cols)
# ----------------------------------------------------------------

# Both files are read with the same default pandas settings, train first.
data, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../Data/train.csv", "../Data/test.csv")
)

# Last expression of the cell -> rendered as the cell output.
(data.shape, test.shape)

((4105, 10), (1027, 10))

In [None]:
#-------------------------------------------
# Quick fix: EDA showed that Gender holds the values F, f and M,
# so the lowercase "f" has to be merged into "F".
#-------------------------------------------

# Normalize BOTH frames. The categorical pipeline built later in this notebook
# uses OneHotEncoder(handle_unknown="ignore"), so any "f" left in `test` would
# not raise an error; it would be encoded as an all-zero Gender row.
# `replace` is idempotent, so re-running this cell is safe.
data["Gender"] = data["Gender"].replace("f", "F")
test["Gender"] = test["Gender"].replace("f", "F")
print("-----Fixed-----")

-----Fixed-----


In [15]:
# Sanity check: count the training rows that still carry the lowercase "f" label
data["Gender"].eq("f").sum()

np.int64(0)

## 3. Split
Split the data into train/validation sets

In [17]:
# -------------------------------------------------------------
# Hold out 20% of the labelled data as a stratified validation set
# -------------------------------------------------------------

# Seed reused wherever this notebook needs reproducible randomness
RANDOM_STATE = 42

# Target first, then every remaining column as a feature
y = data["Diagnosis"]
X = data.drop(columns=["Diagnosis"], errors="ignore")

# `stratify=y` keeps the class proportions the same in both partitions
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Report the resulting shapes
print(f"Shapes:\n\tx_train shape: {X_train.shape}\n\tx_val shape: {X_val.shape}")
print(f"\ty_train shape: {y_train.shape}\n\ty_val shape: {y_val.shape}", end="\n\n")
print("-----Done-----")


Shapes:
	x_train shape: (3284, 9)
	x_val shape: (821, 9)
	y_train shape: (3284,)
	y_val shape: (821,)

-----Done-----


## 4. Preprocess
- **Numeric columns:** impute (median), log1p for skewed features, scale.
- **Categorical columns:** impute (most_frequent) + OneHotEncoder.
- We'll include `FunctionTransformer(np.log1p)` only for selected skewed columns.

In [None]:
"""
Preprocess:
* Impute: fills in missing values if any are present (just in case)
* Scale: StandardScaler to standardize numeric features
* log1p: log1p Transformation technique applied on the skewed features
* Encode: OneHotEncoder for categorical features to be encoded 
"""

# Identify columns
#------------------

# Numeric columns
num_cols = ["Age", "BMI", "Chol", "TG", "HDL","LDL", "Cr", "BUN"]
# Categorical columns
cat_col = ["Gender"]
# Skewed columns get the extra log1p step; every other numeric column is "basic"
skewed = ["HDL", "TG", "BUN"]
non_skewed = [c for c in num_cols if c not in skewed]


# Pipeline pieces
#------------------

# Skewed columns pipeline: median impute -> log1p -> standardize
num_skewed_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    # feature_names_out="one-to-one" lets this step report its output names,
    # so preprocessor.get_feature_names_out() works after fitting. Without it
    # FunctionTransformer exposes no get_feature_names_out and the call raises.
    # The transformed values themselves are unchanged.
    ("log1p", FunctionTransformer(np.log1p, validate=False,
                                  feature_names_out="one-to-one")),
    ("scaler", StandardScaler())
])

# Non-skewed (basic) columns pipeline: median impute -> standardize
num_basic_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns pipeline: most-frequent impute -> one-hot encode.
# handle_unknown="ignore" means categories not seen during fit do not raise.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Build Preprocessor: route each column group to its pipeline and drop
# any column that is not listed above (remainder="drop").
preprocessor = ColumnTransformer(transformers=[
    ("num_skewed", num_skewed_pipeline, skewed),
    ("num_basic", num_basic_pipeline, non_skewed),
    ("cat", cat_pipeline, cat_col)], remainder="drop")

print("-----Preprocessor was built-----")

-----Preprocessor was built-----


## 5. Baseline models & Benchmarking


In [22]:
"""
Check all the models possible to be used for the project,
then FIT, TRAIN, and EVALUATE to see their performances and benchmark.
* Strong candidates will be considered for optimization.
"""

# Candidate classifiers, keyed by the label shown in the results table
models = {
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "ExtraTrees": ExtraTreesClassifier(random_state=RANDOM_STATE),
    "DecisionTree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "LogisticRegression": LogisticRegression(random_state=RANDOM_STATE),
    "SVC": SVC(probability=True, random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(),
    "GaussianNB": GaussianNB(),
    "BernoulliNB": BernoulliNB(),
    "GaussianProcess": GaussianProcessClassifier(),
}

# Result column -> `average` argument handed to f1_score
f1_variants = {"F1_weighted": "weighted", "F1_macro": "macro", "F1_binary": "binary"}

# Fit each model behind the shared preprocessor on the training split and
# collect one row of validation metrics per model in `results`
results = []

for name, model in models.items():
    pipe = Pipeline(steps=[("pre", preprocessor), ("clf", model)])
    y_pred = pipe.fit(X_train, y_train).predict(X_val)

    results.append({
        "Model": name,
        **{
            column: f1_score(y_val, y_pred, average=average, zero_division=0)
            for column, average in f1_variants.items()
        },
        "Accuracy": accuracy_score(y_val, y_pred),
    })

# Tabulate and rank by the primary metric (weighted F1), best model first
results_df = (
    pd.DataFrame(results)
    .sort_values(by="F1_weighted", ascending=False)
    .reset_index(drop=True)
)

results_df


Unnamed: 0,Model,F1_weighted,F1_macro,F1_binary,Accuracy
0,SVC,0.84432,0.834259,0.791461,0.845311
1,GradientBoosting,0.836333,0.825223,0.777963,0.838002
2,GaussianProcess,0.834264,0.823338,0.77686,0.835566
3,RandomForest,0.8331,0.822152,0.775578,0.834348
4,ExtraTrees,0.829087,0.817597,0.768719,0.830694
5,AdaBoost,0.824707,0.813379,0.765189,0.825822
6,KNN,0.815031,0.802172,0.747475,0.817296
7,LogisticRegression,0.814183,0.801569,0.747913,0.816078
8,DecisionTree,0.802385,0.791898,0.747287,0.801462
9,GaussianNB,0.800779,0.784019,0.712727,0.807552


In [None]:
# Stratified 5-fold cross-validation benchmark on the training split
#--------------------------------------------------------------------

# Seeded + shuffled folds: every model is evaluated on the same splits
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

results = []
for name, base_model in models.items():
    pipe = Pipeline(steps=[("pre", preprocessor), ("clf", base_model)])
    # Compute F1-weighted and ROC-AUC from ONE cross-validation run, so each
    # pipeline is fitted once per fold instead of once per fold per metric.
    # With the default error_score=np.nan, a metric that cannot be computed
    # for a fold is reported as NaN (with a warning) rather than raising,
    # which is why the AUC aggregation below is NaN-aware.
    cv_scores = cross_validate(pipe, X_train, y_train, cv=cv,
                               scoring=("f1_weighted", "roc_auc"), n_jobs=-1)
    f1_scores = cv_scores["test_f1_weighted"]
    auc_scores = cv_scores["test_roc_auc"]
    results.append({
        "Model": name,
        "F1_mean": f1_scores.mean(),
        "F1_std": f1_scores.std(),
        "AUC_mean": np.nanmean(auc_scores),
        "AUC_std": np.nanstd(auc_scores)
    })

# Rank by mean weighted F1 (best first) and add "mean ± std" display columns
results_df = pd.DataFrame(results).sort_values(by='F1_mean', ascending=False).reset_index(drop=True)
results_df['F1_str'] = results_df.apply(lambda r: f"{r['F1_mean']:.3f} ± {r['F1_std']:.3f}", axis=1)
# AUC is shown as "n/a" when no fold produced a usable score
results_df['AUC_str'] = results_df.apply(lambda r: (f"{r['AUC_mean']:.3f} ± {r['AUC_std']:.3f}"
                                                   if not np.isnan(r['AUC_mean']) else "n/a"), axis=1)
# Readable columns first, raw numbers after
results_df = results_df[["Model","F1_str","AUC_str","F1_mean","F1_std","AUC_mean","AUC_std"]]

results_df


Unnamed: 0,Model,F1_str,AUC_str,F1_mean,F1_std,AUC_mean,AUC_std
0,SVC,0.826 ± 0.011,0.899 ± 0.008,0.825845,0.010604,0.898766,0.00845
1,ExtraTrees,0.825 ± 0.010,0.907 ± 0.008,0.824575,0.009605,0.907266,0.007743
2,GaussianProcess,0.822 ± 0.009,0.894 ± 0.007,0.8222,0.008586,0.894467,0.006521
3,GradientBoosting,0.819 ± 0.012,0.911 ± 0.011,0.818664,0.011786,0.910733,0.011031
4,RandomForest,0.817 ± 0.013,0.907 ± 0.007,0.816717,0.01303,0.90743,0.007273
5,AdaBoost,0.814 ± 0.018,0.902 ± 0.011,0.813924,0.017809,0.901675,0.010768
6,LogisticRegression,0.810 ± 0.016,0.886 ± 0.011,0.810217,0.015909,0.885977,0.010793
7,KNN,0.800 ± 0.008,0.869 ± 0.011,0.800097,0.007834,0.869261,0.011218
8,GaussianNB,0.782 ± 0.010,0.882 ± 0.006,0.782087,0.00972,0.882474,0.006014
9,BernoulliNB,0.780 ± 0.005,0.845 ± 0.012,0.779853,0.004672,0.845062,0.01189
