# Diabetes Prediction - Final Project CSCI 4050 (ID: Group 16)

In [1]:
# Imports
import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam

import lightning as L
from lightning.pytorch.callbacks import EarlyStopping
from torchmetrics import Accuracy

from torchinfo import summary

---
## Section 1 : Predicting Diabetes Likelihood
### Learning Problem: 
The purpose of this section of the project is to create a Machine Learning MLP model that can predict whether you have diabetes or not and also give you the percentage of confidence that the prediction is accurate. It uses various information including: demographic information (gender, age, number of pregnancies), physical measurements (BMI, blood pressure, skin thickness), lab measurements (blood glucose, HbA1c, insulin) and health history (hypertension, heart disease, smoking status) to help make the predictions. The model was trained using 2 datasets taken from Kaggle and uses a sequence of linear and ReLu layers to help conduct a binary classification of whether a user has diabetes. The model also uses the PyTorch Lightning module and the Adam optimizer to help improve the model's performance.

---
### Data Loading and Cleaning

#### 1. Loading Raw Data

In [2]:
first_dataset_path  = "data/raw/diabetes.csv"
second_dataset_path = "data/raw/diabetes_prediction_dataset.csv"

first_dataset  = pd.read_csv(first_dataset_path)
second_dataset = pd.read_csv(second_dataset_path)

print("First dataset:", first_dataset.shape)
print("Second dataset:", second_dataset.shape)
first_dataset.head(), second_dataset.head()

First dataset: (768, 9)
Second dataset: (100000, 9)


(   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
 0            6      148             72             35        0  33.6   
 1            1       85             66             29        0  26.6   
 2            8      183             64              0        0  23.3   
 3            1       89             66             23       94  28.1   
 4            0      137             40             35      168  43.1   
 
    DiabetesPedigreeFunction  Age  Outcome  
 0                     0.627   50        1  
 1                     0.351   31        0  
 2                     0.672   32        1  
 3                     0.167   21        0  
 4                     2.288   33        1  ,
    gender   age  hypertension  heart_disease smoking_history    bmi  \
 0  Female  80.0             0              1           never  25.19   
 1  Female  54.0             0              0         No Info  27.32   
 2    Male  28.0             0              0           never  27.32   
 

#### 2. Clean Data and Align Columns

In [3]:
# Fill missing columns in the first dataset
for col in ["gender", "hypertension", "heart_disease", "smoking_history", "HbA1c_level"]:
    if col != "smoking_history":
        first_dataset[col] = 0
    else:
        first_dataset[col] = "never"
first_dataset["blood_glucose_level"] = first_dataset["Glucose"]

# Fill missing columns in the second dataset
for col in ["Pregnancies", "BloodPressure", "SkinThickness", "Insulin",
            "DiabetesPedigreeFunction", "Glucose"]:
    second_dataset[col] = 0
second_dataset["Glucose"] = second_dataset["blood_glucose_level"]

# Concatenate datasets row-wise
combined_df = pd.concat([first_dataset, second_dataset], ignore_index=True)
combined_df.shape

(100768, 18)

#### 3. Shuffle and Split Data

In [4]:
np.random.seed(42)
indices = np.arange(len(combined_df))
np.random.shuffle(indices)
split_idx = int(len(combined_df) * 0.8)  # 80% train, 20% val
train_df = combined_df.iloc[indices[:split_idx]].copy()
val_df   = combined_df.iloc[indices[split_idx:]].copy()

#### 4. Convert Zero Values to Median Values
*For preservation of large datasets*

In [5]:
zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI",
                   "HbA1c_level", "blood_glucose_level"]

# Replace zeros with NaN
for col in zero_as_missing:
    train_df[col].replace(0, np.nan, inplace=True)
    val_df[col].replace(0, np.nan, inplace=True)

# Compute medians on training only
medians = train_df[zero_as_missing].median()

# Fill NaNs
train_df[zero_as_missing] = train_df[zero_as_missing].fillna(medians)
val_df[zero_as_missing]   = val_df[zero_as_missing].fillna(medians)

#### 5. Encode Categorical Data to Continuous Data

In [6]:
# Gender: Male=1, Female=0
train_df['gender'] = train_df['gender'].map({'Male': 1, 'Female': 0}).fillna(0)
val_df['gender']   = val_df['gender'].map({'Male': 1, 'Female': 0}).fillna(0)

# Smoking history: replace 'No Info' with 'never'
train_df['smoking_history'] = train_df['smoking_history'].replace('No Info', 'never')
val_df['smoking_history']   = val_df['smoking_history'].replace('No Info', 'never')

# One-hot encode smoking history
train_smoke = pd.get_dummies(train_df['smoking_history'], prefix='smoking_history', drop_first=False)
val_smoke   = pd.get_dummies(val_df['smoking_history'], prefix='smoking_history', drop_first=False)

# Ensure all dummy columns exist
expected_smoking_cols = ['smoking_history_never', 'smoking_history_former', 'smoking_history_current']
train_smoke = train_smoke.reindex(columns=expected_smoking_cols, fill_value=0)
val_smoke   = val_smoke.reindex(columns=expected_smoking_cols, fill_value=0)

# Add back to original DataFrames
train_df = pd.concat([train_df, train_smoke], axis=1)
val_df   = pd.concat([val_df, val_smoke], axis=1)

# Fill any remaining NaNs with 0
train_df = train_df.fillna(0)
val_df   = val_df.fillna(0)

train_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,gender,...,heart_disease,smoking_history,HbA1c_level,blood_glucose_level,age,bmi,diabetes,smoking_history_never,smoking_history_former,smoking_history_current
77687,0,126.0,72.0,29.0,125.0,32.4,0.0,0.0,0.0,0.0,...,0,never,4.5,126.0,39.0,27.32,0.0,True,False,False
30880,0,159.0,72.0,29.0,125.0,32.4,0.0,0.0,0.0,0.0,...,0,never,5.7,159.0,58.0,33.83,1.0,True,False,False
99917,0,280.0,72.0,29.0,125.0,32.4,0.0,0.0,0.0,0.0,...,0,ever,5.7,280.0,54.0,31.86,1.0,False,False,False
44947,0,145.0,72.0,29.0,125.0,32.4,0.0,0.0,0.0,0.0,...,0,former,6.0,145.0,70.0,32.4,0.0,False,True,False
38535,0,126.0,72.0,29.0,125.0,32.4,0.0,0.0,0.0,0.0,...,0,never,5.7,126.0,33.0,22.4,0.0,True,False,False


#### 6. Final Feature List

In [7]:
numeric_cols = [
    "Pregnancies", "blood_glucose_level", "BloodPressure", "SkinThickness",
    "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "HbA1c_level"
]

categorical_cols = ["gender", "hypertension", "heart_disease"]

smoking_cols = [
    "smoking_history_never",
    "smoking_history_former",
    "smoking_history_current"
]

feature_cols = numeric_cols + categorical_cols + smoking_cols
print("Total features:", len(feature_cols))

Total features: 15


#### 7. Extract X and y Values (Parameters and Validation Outputs)

In [8]:
X_train = train_df[feature_cols].astype(np.float32)
y_train = train_df.get("Outcome", train_df.get("diabetes", pd.Series(0))).astype(int)

X_val = val_df[feature_cols].astype(np.float32)
y_val = val_df.get("Outcome", val_df.get("diabetes", pd.Series(0))).astype(int)

# Ensure no NaNs or Infs
X_train = X_train.fillna(0).replace([np.inf, -np.inf], 0)
X_val   = X_val.fillna(0).replace([np.inf, -np.inf], 0)

X_val[:5], y_val[:5]

(       Pregnancies  blood_glucose_level  BloodPressure  SkinThickness  \
 67078          0.0                140.0           72.0           29.0   
 74397          0.0                280.0           72.0           29.0   
 45823          0.0                 85.0           72.0           29.0   
 44830          0.0                260.0           72.0           29.0   
 19856          0.0                155.0           72.0           29.0   
 
        Insulin        BMI  DiabetesPedigreeFunction  Age  HbA1c_level  gender  \
 67078    125.0  32.400002                       0.0  0.0          5.8     0.0   
 74397    125.0  32.400002                       0.0  0.0          6.5     0.0   
 45823    125.0  32.400002                       0.0  0.0          6.6     0.0   
 44830    125.0  32.400002                       0.0  0.0          6.5     0.0   
 19856    125.0  32.400002                       0.0  0.0          5.8     0.0   
 
        hypertension  heart_disease  smoking_history_never  

#### 8. Standardize Mean and Standard Deviation Data

In [9]:
mean = X_train[numeric_cols].mean()
std  = X_train[numeric_cols].std()
std[std == 0] = 1

# Apply scaling
X_train[numeric_cols] = (X_train[numeric_cols] - mean) / std
X_val[numeric_cols]   = (X_val[numeric_cols] - mean) / std

#### 9. Convert Data to PyTorch Tensors

In [10]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)

X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=32, shuffle=True)
val_loader   = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=32, shuffle=False)

print("Training loader batches:", len(train_loader))
print("Validation loader batches:", len(val_loader))

Training loader batches: 2520
Validation loader batches: 630


---
### Import Diabetes Model

In [11]:
from src.model.model_dgg import DiabetesModel
model = DiabetesModel(input_size=X_val.shape[1])
summary(model, input_size=(1, X_val.shape[1]))

Layer (type:depth-idx)                   Output Shape              Param #
DiabetesModel                            [1, 2]                    --
â”œâ”€Sequential: 1-1                        [1, 2]                    --
â”‚    â””â”€Linear: 2-1                       [1, 128]                  2,048
â”‚    â””â”€ReLU: 2-2                         [1, 128]                  --
â”‚    â””â”€Linear: 2-3                       [1, 64]                   8,256
â”‚    â””â”€ReLU: 2-4                         [1, 64]                   --
â”‚    â””â”€Dropout: 2-5                      [1, 64]                   --
â”‚    â””â”€Linear: 2-6                       [1, 32]                   2,080
â”‚    â””â”€ReLU: 2-7                         [1, 32]                   --
â”‚    â””â”€Linear: 2-8                       [1, 2]                    66
Total params: 12,450
Trainable params: 12,450
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.01
Input size (MB): 0.00
Forward/backward pass size (MB):

---
### Training the Model

In [12]:
# Enable early stopping if val_loss doesn't improve in 5 epochs
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=5,
    mode="min"
)

trainer = L.Trainer(
    max_epochs=20,
    accelerator="auto",
    devices=1,
    log_every_n_steps=10,
    callbacks=[early_stop]
)

trainer.fit(model, train_loader, val_loader)

ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name      | Type               | Params | Mode  | FLOPs
-----------------------------------------------------------------
0 | net       | Sequential         | 12.5 K | train | 0    
1 | train_acc | MulticlassAccuracy | 0      | train | 0    
2 | val_acc   | MulticlassAccuracy | 0      | train | 0    
-----------------------------------------------------------------
12.5 K    Trainable params
0         Non-trainable params
12.5 K    Total params
0.050     Total estimated model params size (MB)
11        Modules in train mode
0         Modules in eval mode
0         Total Flops


Epoch 14: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2520/2520 [00:22<00:00, 109.89it/s, v_num=25, train_loss=7.56e-5, train_acc=1.000, val_loss=0.00462, val_acc=0.997] 


#### Final Evaluation

In [13]:
model.eval()  

# Use DataLoader for batch-wise evaluation
all_preds = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in val_loader:
        logits = model(X_batch)            
        preds = torch.argmax(logits, dim=1)  
        all_preds.append(preds)
        all_labels.append(y_batch)

# Concatenate all batches
all_preds = torch.cat(all_preds)
all_labels = torch.cat(all_labels)

# Compute accuracy
accuracy = (all_preds == all_labels).float().mean().item()
print(f"Final Validation Accuracy: {accuracy:.4f}")

Final Validation Accuracy: 0.9974


---
## Section 2 : Predicting Insulin Dosage
### Learning Problem
The purpose of this section of the project is to create a Machine Learning model that can predict the amount of insulin a person should administer at a given time based on their recent glucose levels, carbohydrate intake, and other relevant health factors. The model uses various types of information, including time of day, blood glucose readings, carbohydrate consumption, previous insulin doses, and meal events to provide a personalized insulin recommendation. The model is trained using a dataset derived from real user insulin and diabetes logs, which has been preprocessed and structured as time-series sequences. It uses MLP architecture with linear and ReLU layers to learn complex relationships between the input features and the target insulin amount. The model also uses the PyTorch Lightning module and the Adam optimizer to help improve the model's performance. 

---
### 1. Data Loading and Cleaning

In [14]:
# Base directory
BASE_DIR = os.getcwd()
DATA_PATH = os.path.join(BASE_DIR, 'data', 'raw', 'home_insulin_clean_target_gc.csv')

# Load cleaned dataset
df = pd.read_csv(DATA_PATH, parse_dates=["dateTime"])
print(f"Dataset loaded with {len(df)} rows and {len(df.columns)} columns")

# Display first few rows
df.head()

Dataset loaded with 6790 rows and 11 columns


Unnamed: 0,dateTime,event,netCarbs,bloodGlucose,insulinToCarbRatio,insulinSensitivityFactor,bloodGlucoseTarget,insulinRec,insulinTaken,targetBG,averageMonthlyBloodGlucose
0,2020-08-09 08:40:00,breakfast,64.0,7.2,7.0,3.0,6.7,9.31,9.0,6.0,6.317568
1,2020-08-09 11:27:00,lunch,50.46,10.3,6.0,3.0,6.7,9.61,10.0,6.0,6.317568
2,2020-08-09 17:30:00,dinner,51.3,3.8,7.0,3.0,6.7,6.36,6.0,6.0,6.317568
3,2020-08-10 07:57:00,breakfast,78.5,3.4,7.0,3.0,6.7,10.11,10.0,6.0,6.317568
4,2020-08-10 12:09:00,lunch,67.08,3.1,7.0,3.0,6.7,8.38,8.0,6.0,6.317568


---
### 2. Importing the Insulin Model

In [15]:
from src.model.model_idg import InsulinDataset, InsulinModule

# Take one sample from dataset
dataset = InsulinDataset(df)
batch_dict, target = dataset[0]

# Add batch dimension for each tensor in the dict
batch_dict = {k: v.unsqueeze(0) for k, v in batch_dict.items()}  # shape (1, 1)
target = target.unsqueeze(0)  # shape (1,)

# Initialize model
model_insulin = InsulinModule()

# Forward pass
y_hat = model_insulin(batch_dict)
print("Output shape:", y_hat.shape)

Output shape: torch.Size([1, 1])


---
### 3. Training the model

In [16]:
# Split dataset 50/50 for training/testing
split_idx = int(len(df) * 0.5)
train_dataset = InsulinDataset(df.iloc[:split_idx])
val_dataset = InsulinDataset(df.iloc[split_idx:])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Initialize trainer
early_stop = L.pytorch.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    mode="min"
)

trainer = L.pytorch.Trainer(
    max_epochs=75,
    deterministic=True,
    callbacks=[early_stop]
)

# Train model
trainer.fit(model_insulin, train_dataloaders=train_loader, val_dataloaders=val_loader)

ðŸ’¡ Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name | Type              | Params | Mode  | FLOPs
-----------------------------------------------------------
0 | net  | InsulinCalculator | 34     | train | 0    
-----------------------------------------------------------
34        Trainable params
0         Non-trainable params
34        Total params
0.000     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode
0         Total Flops


Epoch 74: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 107/107 [00:02<00:00, 50.74it/s, v_num=26, train_loss=2.840, val_loss=11.40]   

`Trainer.fit` stopped: `max_epochs=75` reached.


Epoch 74: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 107/107 [00:02<00:00, 50.56it/s, v_num=26, train_loss=2.840, val_loss=11.40]


#### Final Evaluation

In [17]:
# Evaluate model on validation set
model_insulin.eval()
all_preds, all_targets = [], []

with torch.no_grad():
    for batch in val_loader:
        batch_dict, target = batch
        y_hat = model_insulin(batch_dict)
        all_preds.append(y_hat)
        all_targets.append(target)

# Concatenate all batches
all_preds = torch.cat(all_preds).squeeze()
all_targets = torch.cat(all_targets).squeeze()

# Compute Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)
mse = F.mse_loss(all_preds, all_targets)
rmse = torch.sqrt(mse)

print(f"Final Validation MSE: {mse.item():.4f}")
print(f"Final Validation RMSE: {rmse.item():.4f}")

# Display first 10 predictions vs actuals
comparison_df = pd.DataFrame({
    "Predicted": all_preds.numpy(),
    "Actual": all_targets.numpy()
})

print("\nFirst 10 predictions vs actual insulin doses:")
print(comparison_df.head(10))

Final Validation MSE: 2.9540
Final Validation RMSE: 1.7187

First 10 predictions vs actual insulin doses:
   Predicted  Actual
0   6.698973    7.01
1   4.792777    1.54
2   4.837090    3.95
3   3.453540    4.71
4   8.507829    8.71
5   3.631877    2.02
6   4.399913    3.28
7   5.385919    6.37
8   7.821753    6.79
9   5.291615    4.51
