# Assignment: Health Risk Prediction (Mini Version)
Objective: Build a simple regression model to predict BMI from lifestyle features using either PyTorch or TensorFlow.

## 0) Setup

In [1]:
pip install torch scikit-learn pandas matplotlib

Note: you may need to restart the kernel to use updated packages.


 __Imports__

In [None]:
import pandas as pd
import random, numpy as np, torch
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import nn
from sklearn.metrics import mean_squared_error, r2_score
import pickle

 __Set random seeds (reproducibility)__

In [None]:
# One shared seed for Python's, NumPy's and PyTorch's random generators so
# that later draws from any of them are repeatable from run to run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

## Step 1 – Dataset Generation **Create or upload the dataset**

In [None]:
# Inline CSV text for the toy dataset: one header row followed by 12 records
# with the columns sleep_hours, daily_steps, calories and bmi.
data = """sleep_hours,daily_steps,calories,bmi
8.0,9000,2200,22.0
6.5,4000,2800,29.5
7.0,7000,2400,25.0
5.5,3000,3000,31.0
9.0,12000,2000,20.5
6.0,5000,2600,28.0
7.5,8000,2100,23.5
8.5,10000,1900,21.0
6.8,3500,2700,30.0
7.2,6500,2300,24.5
5.0,2000,3200,34.0
8.0,11000,1800,20.0
"""

In [None]:
# Persist the inline CSV text to disk, then load it back as a DataFrame.
with open('health_dummy.csv', 'w', encoding='utf-8') as f:
    f.write(data)

df = pd.read_csv('health_dummy.csv')

# Only the last expression of a notebook cell is rendered automatically, so
# the preview must be printed explicitly or its result is silently discarded.
print(df.head(12))
df.info()        # prints the column dtypes and non-null counts itself
df.describe()    # last expression: rendered as the cell output

In [None]:
# Scatter plot of daily step count (x-axis) against BMI (y-axis).
plt.scatter(x=df['daily_steps'], y=df['bmi'])
plt.xlabel('daily_steps')
plt.ylabel('bmi')
plt.title('steps vs BMI')
plt.show()

## Step 2 – Dataset Normalization (StandardScaler) __Prepare features, split and normalize__ *Tip: always fit the scaler only on training data — otherwise you leak information.*

In [None]:
# Feature matrix (three lifestyle columns) and the BMI target as a single
# column, both cast to float32.
feature_columns = ['sleep_hours', 'daily_steps', 'calories']
X = df[feature_columns].values.astype(np.float32)
y = df['bmi'].values.astype(np.float32).reshape(-1, 1)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply those
# same statistics to both splits so nothing leaks from the test set.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

__Convert arrays to PyTorch tensors__

In [None]:
import torch

# Wrap the NumPy arrays as float32 tensors for PyTorch.
X_train_t = torch.as_tensor(X_train_scaled, dtype=torch.float32)
y_train_t = torch.as_tensor(y_train, dtype=torch.float32)
X_test_t = torch.as_tensor(X_test_scaled, dtype=torch.float32)
y_test_t = torch.as_tensor(y_test, dtype=torch.float32)

## Step 3 – Training the Model: build a simple linear regression model (PyTorch)

In [None]:
from torch import nn

# A single linear layer: 3 input features -> 1 predicted value.
model = nn.Linear(3, 1)
# Mean squared error is the training objective.
loss_fn = nn.MSELoss()
# Plain SGD; 0.01 is the starting learning rate.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

print(model)   # displays the layer's in/out feature sizes and bias flag

__Training loop (train, track loss, plot)__ *Tip: if the loss diverges, reduce the learning rate (e.g., to 0.001) or switch the optimizer to Adam.*

In [None]:
epochs = 500
loss_history = []

# Full-batch training: every epoch runs one forward/backward pass over the
# entire training set and records the resulting loss.
model.train()
for epoch in range(epochs):
    preds = model(X_train_t)
    loss = loss_fn(preds, y_train_t)

    optimizer.zero_grad()   # clear gradients left over from the previous step
    loss.backward()
    optimizer.step()

    loss_history.append(loss.item())

# Loss curve across epochs.
plt.plot(loss_history)
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.title('Training Loss')
plt.grid(True)
plt.show()

print("Final training loss:", loss_history[-1])

## Step 4 – Evaluation 
### Evaluate on the test set — MSE and R².
#### What they mean:

**MSE: average squared error (lower = better)**

**R²: proportion of variance explained (maximum 1.0; closer to 1 is better).**

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    test_output = model(X_test_t)
preds_test = test_output.numpy().flatten()

# Compare the flattened targets with the flattened predictions.
y_true = y_test.flatten()
mse = mean_squared_error(y_true, preds_test)
r2 = r2_score(y_true, preds_test)

print("Test MSE:", mse)
print("Test R²:", r2)

## Step 5 – Prediction

In [None]:
# One input row: [sleep hours, daily steps, calories].
sample = np.array([[7.0, 7000.0, 2200.0]], dtype=np.float32)   # change values to test
# Scale with the scaler that was fitted on the training data, never a new one.
sample_scaled = scaler.transform(sample)
sample_t = torch.from_numpy(sample_scaled).float()

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    pred_bmi = model(sample_t).item()

# Build the label from the sample itself so the printed inputs always match
# what was actually fed to the model, even after the values above are edited.
# ':g' drops trailing zeros, so 7.0 prints as "7" and 7000.0 as "7000".
sleep_h, steps, cal = sample[0].tolist()
print(f"Predicted BMI for [{sleep_h:g} h, {steps:g} steps, {cal:g} cal]:", round(pred_bmi, 3))

In [None]:
import os
import pickle

# Save the trained weights (state_dict only, not the whole module object).
torch.save(model.state_dict(), 'health_bmi_model.pth')

# Save the fitted scaler so new inputs can be scaled the same way later.
# NOTE: pickle files execute code on load; only unpickle this from a trusted source.
with open('health_bmi_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save the dataset CSV alongside the model artefacts.
df.to_csv('health_dummy.csv', index=False)

# All three paths above are relative, so the files are written to the current
# working directory; report that location instead of a hard-coded one.
print("Saved model, scaler, and CSV to:", os.getcwd())

# A Short Summary of Results: Final Loss, Evaluation Metrics, and Example Predictions
## Final Loss:
_The final training loss is 0.1366 📉. This low value indicates that the model's predictions on the training data are very close to the actual values after 500 epochs of training. The training loss plot confirms this, showing a steep initial decrease and then plateauing as the model converges_. ✅

## Evaluation Metrics:
The model was evaluated on the unseen test dataset, yielding the following metrics:

### Test MSE (Mean Squared Error): 1.1156 (lower is better)

_Test R² (R-squared): 0.9583. This is an excellent score. The R² value, which is at most 1 (and can be negative when a model fits worse than always predicting the mean), indicates that roughly 95.83% of the variance in BMI on the test set is explained by the input features (sleep hours, daily steps, and calories). A value this close to 1 suggests a strong linear relationship between the features and BMI in this dataset, although the test split holds only three samples, so the estimate is noisy_. 📈

## Example Prediction:
*For the specified example input of 7 hours of sleep, 7000 daily steps, and 2200 calories, the model's predicted BMI is 24.646 ✨. This value falls within the normal BMI range (18.5 to 24.9), which is a reasonable and expected outcome for the given inputs*. 🧑‍⚕️