# Model Building

---

1. Import packages
2. Load data
3. Modelling

---

## 1.) Import packages

In [56]:
import warnings
warnings.filterwarnings("ignore")

In [57]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Shows plots in jupyter notebook
# %matplotlib inline

# Set plot style
sns.set(color_codes=True)

---
## 2.) Load data

In [58]:
df=pd.read_csv('../data/feature_engineered_data/indian_youngsters_health_data.csv')

#### Show Top 5 Records

In [59]:
df.head()

Unnamed: 0,Age,Gender,Family_Income,Family_History_Diabetes,Parent_Diabetes_Type,Genetic_Risk_Score,BMI,Physical_Activity_Level,Dietary_Habits,Fast_Food_Intake,...,Cholesterol_Level,Prediabetes,Sleep_Hours,Stress_Level,Screen_Time,BMI_Activity_Risk,Age_Group,Stress_Sleep_Ratio,Genetic_Family_Risk,Unhealthy_Behavior_Score
0,21,Male,2209393,No,No Diabetes,6,31.4,Sedentary,Moderate,1,...,163.3,Yes,7.7,7,6.8,High Risk,Young Adult,0.804598,NoNoNoNoNoNo,YesNo
1,18,Female,387650,No,No Diabetes,5,24.4,Active,Unhealthy,5,...,169.1,Yes,7.9,8,6.0,Low Risk,Teen,0.898876,NoNoNoNoNo,NoNo
2,25,Male,383333,No,No Diabetes,6,20.0,Moderate,Moderate,2,...,296.3,Yes,7.6,8,4.6,Moderate Risk,Young Adult,0.930233,NoNoNoNoNoNo,NoNo
3,22,Male,2443733,No,No Diabetes,4,39.8,Moderate,Unhealthy,4,...,252.8,No,9.5,2,10.9,Moderate Risk,Young Adult,0.190476,NoNoNoNo,NoYes
4,19,Male,1449463,No,No Diabetes,4,19.2,Moderate,Moderate,0,...,252.3,No,6.4,2,1.3,Moderate Risk,Teen,0.27027,NoNoNoNo,NoYes


---

## 3.) Modelling

We now have a dataset containing features that we have engineered and we are ready to start training a predictive model.

In [60]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### 3.1 Map target variable to binary.

In [61]:
df['Prediabetes'] = df['Prediabetes'].map({'Yes': 1, 'No': 0})

### 3.2 Define features and target for the model.

In [62]:
features = [
    'Age', 'Gender', 'Family_Income', 'Family_History_Diabetes', 'Parent_Diabetes_Type',
    'Genetic_Risk_Score', 'BMI', 'Physical_Activity_Level', 'Dietary_Habits', 'Fast_Food_Intake',
    'Fasting_Blood_Sugar', 'HbA1c', 'Cholesterol_Level', 'Sleep_Hours', 'Stress_Level',
    'Screen_Time', 'BMI_Activity_Risk', 'Age_Group', 'Stress_Sleep_Ratio', 'Genetic_Family_Risk',
    'Unhealthy_Behavior_Score'
]
X = df[features]
y = df['Prediabetes']

---
## 4.) Train-test split.

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

---
## 5.) Preprocessing

In [64]:
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

### One-hot encode categorical features

In [65]:
encoder = OneHotEncoder(drop='first', sparse_output=False)  # Use sparse_output instead of sparse
X_train_encoded = encoder.fit_transform(X_train[categorical_cols])
X_test_encoded = encoder.transform(X_test[categorical_cols])

### Scale numerical features

In [66]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numerical_cols])
X_test_scaled = scaler.transform(X_test[numerical_cols])

### Combine preprocessed features

In [67]:
X_train_preprocessed = np.hstack((X_train_scaled, X_train_encoded))
X_test_preprocessed = np.hstack((X_test_scaled, X_test_encoded))

---
## 6.) Evaluate Function to give all metrics after model Training

In [68]:
# Models to compare
models = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'XGBClassifier': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Dictionary to store trained models
trained_models = {}

# Train and evaluate models
results = {}

for name, model in models.items():
    model.fit(X_train_preprocessed, y_train)
    y_pred = model.predict(X_test_preprocessed)
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
   
    results[name] = {
        'accuracy': accuracy,
        'classification_report': report,
        'confusion_matrix': conf_matrix
    }

# Display results
for name, metrics in results.items():
    print(f"\n{name} Results:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print("Classification Report:\n", metrics['classification_report'])
    print("Confusion Matrix:\n", metrics['confusion_matrix'])


LogisticRegression Results:
Accuracy: 0.6982
Classification Report:
               precision    recall  f1-score   support

           0       0.70      1.00      0.82     13964
           1       0.00      0.00      0.00      6036

    accuracy                           0.70     20000
   macro avg       0.35      0.50      0.41     20000
weighted avg       0.49      0.70      0.57     20000

Confusion Matrix:
 [[13964     0]
 [ 6036     0]]

RandomForestClassifier Results:
Accuracy: 0.6975
Classification Report:
               precision    recall  f1-score   support

           0       0.70      1.00      0.82     13964
           1       0.06      0.00      0.00      6036

    accuracy                           0.70     20000
   macro avg       0.38      0.50      0.41     20000
weighted avg       0.51      0.70      0.57     20000

Confusion Matrix:
 [[13949    15]
 [ 6035     1]]

KNeighborsClassifier Results:
Accuracy: 0.6275
Classification Report:
               precision    rec

# Saving the Model to a file

In [69]:
for name, model in trained_models.items():
    joblib.dump(model, f'{name}_model.pkl')

---
 # Conclusion 
---

Our diabetes prediction models achieved ~70% accuracy across all approaches ( Logistic Regression: 69.82%, XGBoost: 69.28%, Random Forest: 69.75% ,KNeighborsClassifier: 62.75%). Logistic Regression emerged as the best performer with more balanced predictions. Key predictors include BMI, HbA1c, Sleep Hours, Fast Food Intake, Physical Activity, Blood Sugar levels, Cholesterol, Family History Diabetes, Parent Diabetes Type, and Genetic Risk Score.