In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

In [4]:
# -------------------------------------------------------------------
# 1. Load & clean data (from previous script)
# -------------------------------------------------------------------
df = pd.read_csv("ObesityDataSet.csv")
df.replace("?", np.nan, inplace=True)

numeric_cols = ['Age','Height','Weight','FCVC','NCP','CH2O','FAF','TUE']
for c in numeric_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
df = df[(df['Age'].between(14,80)) &
        (df['Height'].between(1.2,2.2)) &
        (df['Weight'].between(30,200))]

def remove_iqr(df_in, cols):
    df_out = df_in.copy()
    for c in cols:
        q1, q3 = df_out[c].quantile([0.25, 0.75])
        iqr = q3 - q1
        lb, ub = q1 - 1.5*iqr, q3 + 1.5*iqr
        df_out = df_out[df_out[c].between(lb, ub)]
    return df_out

df = remove_iqr(df, ['FCVC','NCP','CH2O','FAF','TUE'])
df.reset_index(drop=True, inplace=True)
df['BMI'] = df['Weight'] / df['Height']**2

le = LabelEncoder()
df['NObeyesdad'] = le.fit_transform(df['NObeyesdad'])

cat_cols = ['Gender','CALC','FAVC','SCC','SMOKE',
            'family_history_with_overweight','CAEC','MTRANS']
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

X = df.drop('NObeyesdad', axis=1)
y = df['NObeyesdad']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
num_feats = ['Age','Height','Weight','FCVC','NCP','CH2O','FAF','TUE','BMI']
X_train[num_feats] = scaler.fit_transform(X_train[num_feats])
X_test[num_feats]  = scaler.transform(X_test[num_feats])

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [5]:
# -------------------------------------------------------------------
# 2. Define classifiers (default)
# -------------------------------------------------------------------
knn_default = KNeighborsClassifier()
svm_default = SVC(probability=True, random_state=42)
xgb_default = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# -------------------------------------------------------------------
# 3. Train default models
# -------------------------------------------------------------------
knn_default.fit(X_train_res, y_train_res)
svm_default.fit(X_train_res, y_train_res)
xgb_default.fit(X_train_res, y_train_res)

# -------------------------------------------------------------------
# 4. Evaluate default models
# -------------------------------------------------------------------
print("=== KNN Default ===")
print(classification_report(y_test, knn_default.predict(X_test)))
print("=== SVM Default ===")
print(classification_report(y_test, svm_default.predict(X_test)))
print("=== XGB Default ===")
print(classification_report(y_test, xgb_default.predict(X_test)))

Parameters: { "use_label_encoder" } are not used.



=== KNN Default ===
              precision    recall  f1-score   support

           0       0.75      0.95      0.84        22
           1       0.83      0.59      0.69        34
           2       0.90      0.88      0.89        42
           3       1.00      0.97      0.99        38
           4       1.00      1.00      1.00        52
           5       0.61      0.79      0.69        29
           6       0.75      0.68      0.71        31

    accuracy                           0.85       248
   macro avg       0.83      0.84      0.83       248
weighted avg       0.86      0.85      0.85       248

=== SVM Default ===
              precision    recall  f1-score   support

           0       1.00      0.95      0.98        22
           1       0.92      0.97      0.94        34
           2       0.98      0.95      0.96        42
           3       1.00      0.97      0.99        38
           4       1.00      1.00      1.00        52
           5       0.87      0.90     

In [6]:
# -------------------------------------------------------------------
# 5. Hyperparameter grids
# -------------------------------------------------------------------
param_grid_knn = {
    'n_neighbors': [3,5,7,9],
    'weights': ['uniform','distance'],
    'p': [1,2]
}

param_grid_svm = {
    'C': [0.1,1,10],
    'kernel': ['linear','rbf'],
    'gamma': ['scale','auto']
}

param_grid_xgb = {
    'n_estimators': [50,100,200],
    'max_depth': [3,5,7],
    'learning_rate': [0.01,0.1,0.2]
}

# -------------------------------------------------------------------
# 6. GridSearchCV (with 5‑fold CV)
# -------------------------------------------------------------------
gs_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn,
                      cv=5, n_jobs=-1, scoring='f1_weighted')
gs_svm = GridSearchCV(SVC(probability=True, random_state=42), param_grid_svm,
                      cv=5, n_jobs=-1, scoring='f1_weighted')
gs_xgb = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
                      param_grid_xgb, cv=5, n_jobs=-1, scoring='f1_weighted')

# -------------------------------------------------------------------
# 7. Train tuned models
# -------------------------------------------------------------------
gs_knn.fit(X_train_res, y_train_res)
gs_svm.fit(X_train_res, y_train_res)
gs_xgb.fit(X_train_res, y_train_res)

knn_tuned = gs_knn.best_estimator_
svm_tuned = gs_svm.best_estimator_
xgb_tuned = gs_xgb.best_estimator_

# -------------------------------------------------------------------
# 8. Evaluate tuned models
# -------------------------------------------------------------------
print("=== KNN Tuned ===")
print(classification_report(y_test, knn_tuned.predict(X_test)))
print("Best KNN params:", gs_knn.best_params_)
print("=== SVM Tuned ===")
print(classification_report(y_test, svm_tuned.predict(X_test)))
print("Best SVM params:", gs_svm.best_params_)
print("=== XGB Tuned ===")
print(classification_report(y_test, xgb_tuned.predict(X_test)))
print("Best XGB params:", gs_xgb.best_params_)


=== KNN Tuned ===
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        22
           1       0.86      0.74      0.79        34
           2       0.95      0.95      0.95        42
           3       1.00      0.97      0.99        38
           4       1.00      1.00      1.00        52
           5       0.71      0.86      0.78        29
           6       0.87      0.84      0.85        31

    accuracy                           0.92       248
   macro avg       0.91      0.91      0.91       248
weighted avg       0.92      0.92      0.92       248

Best KNN params: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
=== SVM Tuned ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           1       0.97      0.91      0.94        34
           2       0.98      1.00      0.99        42
           3       1.00      0.97      0.99        38
           4       1.00      

Parameters: { "use_label_encoder" } are not used.



In [7]:
import joblib

# Save scaler
joblib.dump(scaler, 'scaler.pkl')

# Save models - default and tuned versions
joblib.dump(knn_default, 'knn_default.pkl')
joblib.dump(svm_default, 'svm_default.pkl')
joblib.dump(xgb_default, 'xgb_default.pkl')

joblib.dump(knn_tuned, 'knn_tuned.pkl')
joblib.dump(svm_tuned, 'svm_tuned.pkl')
joblib.dump(xgb_tuned, 'xgb_tuned.pkl')


['xgb_tuned.pkl']

In [8]:
# Accuracy reports as strings for Streamlit display
from sklearn.metrics import accuracy_score

accuracy_default = {
    'KNN': accuracy_score(y_test, knn_default.predict(X_test)),
    'SVM': accuracy_score(y_test, svm_default.predict(X_test)),
    'XGB': accuracy_score(y_test, xgb_default.predict(X_test))
}

accuracy_tuned = {
    'KNN': accuracy_score(y_test, knn_tuned.predict(X_test)),
    'SVM': accuracy_score(y_test, svm_tuned.predict(X_test)),
    'XGB': accuracy_score(y_test, xgb_tuned.predict(X_test))
}

joblib.dump(accuracy_default, 'accuracy_default.pkl')
joblib.dump(accuracy_tuned, 'accuracy_tuned.pkl')

# Save dataset description in a markdown or text file, e.g., 'dataset_info.md'
dataset_info = """
# Dataset Overview
- Dataset contains 2111 samples with 17 columns including Age, Gender, Height, Weight, etc.
- Target variable is NObeyesdad, categorizing obesity levels.
- Data cleaning included removing duplicates, nulls, outliers in Age, Weight, Height.
- Feature engineering included BMI calculation.
- Encoding categorical variables and normalization applied.
"""
with open('dataset_info.md', 'w') as f:
    f.write(dataset_info)
