In [1]:
import pandas as pd

# Source: raw PIMA diabetes CSV hosted on GitHub. The file has no header
# row (its first line is already a data record), so column names are supplied here.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigree',
    'Age',
    'Outcome',
]
# header=None spells out what passing `names` alone already implies.
data = pd.read_csv(url, header=None, names=cols)

# Preview the first rows
data.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Predictors: three vitals taken from the loaded frame.
# NOTE(review): the preview of `data` shows 0 readings in SkinThickness and
# Insulin; check whether 0 in these three columns also stands for a missing
# measurement before trusting the model.
vital_columns = ['Glucose', 'BloodPressure', 'BMI']
X = data[vital_columns]

# Label: the 0/1 diabetes outcome column
y = data['Outcome']

# Sanity check: feature preview and class counts
print(X.head())
print(y.value_counts())


   Glucose  BloodPressure   BMI
0      148             72  33.6
1       85             66  26.6
2      183             64  23.3
3       89             66  28.1
4      137             40  43.1
Outcome
0    500
1    268
Name: count, dtype: int64


In [3]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split: stratify=y keeps the class proportions of the full
# dataset in both the training and the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,  # hold out 20% of the rows for testing; the remaining 80% train the model
    stratify=y,     # preserves the original class ratio in both splits (it does not equalize the classes)
    random_state=42
)

# Quick check: both partitions should show roughly the same 0-to-1 ratio
print("Train class distribution:\n", y_train.value_counts())
print("Test class distribution:\n", y_test.value_counts())


Train class distribution:
 Outcome
0    400
1    214
Name: count, dtype: int64
Test class distribution:
 Outcome
0    100
1     54
Name: count, dtype: int64


In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both partitions so the test set does not influence it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Fit a default-configured logistic regression on the scaled training data
model = LogisticRegression().fit(X_train_scaled, y_train)

# Score the held-out set: probability of the positive class (second column of
# predict_proba) and the hard 0/1 labels
y_prob = model.predict_proba(X_test_scaled)[:, 1]
y_pred = model.predict(X_test_scaled)

# Report per-class metrics, the confusion matrix and the ROC-AUC
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))


Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.84      0.79       100
           1       0.62      0.48      0.54        54

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.67       154
weighted avg       0.70      0.71      0.70       154

Confusion Matrix:
 [[84 16]
 [28 26]]
ROC-AUC Score: 0.7825925925925926


In [6]:
import numpy as np

def risk_category(prob, low_threshold=0.4, high_threshold=0.7):
    """Map a predicted probability to a coarse risk label.

    Parameters
    ----------
    prob : float
        Predicted probability of the positive class; must lie in [0, 1].
    low_threshold : float, default 0.4
        Probabilities strictly below this value are "Low Risk".
    high_threshold : float, default 0.7
        Probabilities at or above this value are "High Risk"; values in
        [low_threshold, high_threshold) are "Medium Risk".

    Returns
    -------
    str
        One of "Low Risk", "Medium Risk" or "High Risk".

    Raises
    ------
    ValueError
        If ``prob`` is NaN or outside [0, 1], or if ``low_threshold`` is
        greater than ``high_threshold``.
    """
    # NaN fails every comparison, so without this guard it would fall through
    # both branches below and be reported as "High Risk".
    if not 0.0 <= prob <= 1.0:
        raise ValueError(f"prob must be a probability in [0, 1], got {prob!r}")
    if not low_threshold <= high_threshold:
        raise ValueError(
            f"low_threshold ({low_threshold!r}) must not exceed "
            f"high_threshold ({high_threshold!r})"
        )

    if prob < low_threshold:
        return "Low Risk"
    if prob < high_threshold:
        return "Medium Risk"
    return "High Risk"

# Smoke test: categorize the first row of the scaled test set.
# The one-row slice keeps the input 2-D, which is what predict_proba is given here.
example = X_test_scaled[:1]
pred_prob = model.predict_proba(example)[0, 1]
print("Predicted Risk:", risk_category(pred_prob))


Predicted Risk: Medium Risk


In [7]:
import joblib

# Persist both fitted objects. The scaler has to travel with the model because
# the model was trained on scaled inputs.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
for fitted_obj, target_file in ((model, "risk_model.pkl"), (scaler, "scaler.pkl")):
    joblib.dump(fitted_obj, target_file)

print("Model and scaler saved!")


Model and scaler saved!


In [8]:
# Offer the saved artifacts as browser downloads when running on Google Colab.
# `google.colab` is only importable inside Colab, so on any other kernel the
# download step is skipped instead of aborting a Restart & Run All.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download("risk_model.pkl")
    files.download("scaler.pkl")
else:
    print("google.colab not available; risk_model.pkl and scaler.pkl remain in the working directory.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>