In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# **Load the dataset**

In [4]:
# Load the dataset as pandas objects (as_frame=True) rather than the default
# dictionary of arrays.
data = load_diabetes(as_frame=True)

# Target values above this threshold are labelled high risk (1); all others
# are labelled low risk (0).
HIGH_RISK_THRESHOLD = 140

# Build a new frame with .assign() instead of adding a column to data['data']
# in place, so the loader's own feature table is left unmodified.
df = data['data'].assign(
    target=(data['target'] > HIGH_RISK_THRESHOLD).astype(int)
)

df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,1
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,1
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,1
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,0


In [6]:
# Summarize the frame: number of rows, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  target  442 non-null    int32  
dtypes: float64(10), int32(1)
memory usage: 36.4 KB


# **Data Preprocessing**

### **Check for missing values**

In [7]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

age       0
sex       0
bmi       0
bp        0
s1        0
s2        0
s3        0
s4        0
s5        0
s6        0
target    0
dtype: int64

### **Separate features (X) and target variable (y)**

In [8]:
# y is the binary label column; X is every remaining column of df.
y = df['target']
X = df.drop(columns='target')

### **Split the data**

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### **Scale the data**

In [10]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test data never influences the fit.
# Note: X_train / X_test are overwritten with their scaled versions here.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

##### **We ensure that:**
- *there's no missing data*
- *feature and target are split (X and y)*
- *train and test are split in 80/20 ratio*
- *the data is scaled for better performance*

# **Train a Logistic Regression Model**

Why Logistic Regression?
- Nature of the problem: this is a binary classification task (1 or 0), and logistic regression is designed for binary outcomes and also provides class probabilities.
- It's a good baseline model: simple, fast, and computationally efficient.

### **Initialize and train the Logistic Regression Model**

In [11]:
# Baseline classifier: logistic regression with default hyperparameters.
# random_state is pinned to 42, the same seed used elsewhere in this notebook.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)  # train on the scaled training split

### **Make predictions on the test set**

In [12]:
# Predicted class labels (1 = high risk, 0 = low risk) for the held-out test split.
y_pred = model.predict(X_test)

### **Evaluate the model**

In [13]:
# Compute both metrics first, then print them, so each value has a name.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print('Accuracy: ', test_accuracy)
print('\nClassification report:\n', report_text)

Accuracy:  0.7303370786516854

Classification report:
               precision    recall  f1-score   support

           0       0.77      0.73      0.75        49
           1       0.69      0.72      0.71        40

    accuracy                           0.73        89
   macro avg       0.73      0.73      0.73        89
weighted avg       0.73      0.73      0.73        89



To summarize the report:
- Accuracy is the proportion of correct predictions: (number of correct predictions) / (total number of predictions)
- Precision is how many of the predicted positives were actually positive
- Recall is how many of the actual positives were correctly predicted
- F1 score is the harmonic mean of precision and recall

# **Test with new data**

### **Example new patient data**

In [14]:
# Feature values for one hypothetical patient, keyed by the same column names
# as the training features. The values must be on the same scale as the
# columns of df (small, centered numbers like those shown by df.head());
# they are NOT yet standardized - the fitted scaler is applied in a later step.
new_patient_features = {
    'age': 0.05,
    'sex': -0.02,
    'bmi': 0.20,
    'bp': 0.10,
    's1': 0.05,
    's2': 0.15,
    's3': -0.05,
    's4': 0.12,
    's5': 0.18,
    's6': 0.10,
}

# A single record becomes a one-row DataFrame.
new_patient = pd.DataFrame([new_patient_features])

### **Scale the new data**

In [15]:
# Reuse the scaler fitted on the training split (transform only, no refit) so
# the new patient's features get the same scaling as the training data.
new_patient_scaled = scaler.transform(new_patient)

### **Predict for the new patient**

In [16]:
# Predict the class for the single new patient and translate it into a label:
# class 1 means high risk, anything else means low risk.
new_prediction = model.predict(new_patient_scaled)
risk_label = 'High Risk' if new_prediction[0] == 1 else 'Low Risk'
print('Diabetes Risk Prediction:', risk_label)

Diabetes Risk Prediction: High Risk


# **A Different Model**

### **Random Forest Classifier**

In [18]:
# Fit a Random Forest on the same scaled train/test split so its accuracy can
# be compared directly with the logistic-regression baseline.
# RandomForestClassifier is imported in the notebook's import cell.
rf_model = RandomForestClassifier(random_state=42)  # fixed seed, as for the other models
rf_model.fit(X_train, y_train)

rf_y_pred = rf_model.predict(X_test)
print('Random Forest Accuracy: ', accuracy_score(y_test, rf_y_pred))

Random Forest Accuracy:  0.7191011235955056


# **Improving Original Model**

Logistic regression has a regularization parameter that helps prevent overfitting by penalizing large coefficients. In scikit-learn this is `C`, the inverse of the regularization strength (a smaller `C` means stronger regularization), and we can tune it to find the best value. 
The solver is tuned in the same way: it is the optimization algorithm used to fit the model's coefficients.

In [19]:
# Candidate settings to search: five values of C crossed with two solvers.
# GridSearchCV is imported in the notebook's import cell.
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'saga']}

# Seed the estimator with the same random_state=42 used by the other models in
# this notebook, so the search is repeatable from run to run.
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)  # 5-fold cross-validation on the training split only
print('Best Hyperparameters: ', grid_search.best_params_)

Best Hyperparameters:  {'C': 0.01, 'solver': 'saga'}




### **Apply new hyperparameters to the original model**

In [21]:
# Refit logistic regression with the hyperparameters chosen by the grid search.
# They are read from grid_search.best_params_ (keys 'C' and 'solver') rather
# than copied by hand from the printed output, so this cell stays in sync with
# the search result.
# Note: this replaces the baseline `model` and `y_pred` defined earlier.
model = LogisticRegression(**grid_search.best_params_, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('\nClassification Report:\n', classification_report(y_test, y_pred))

Accuracy: 0.7752808988764045

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.80      0.80        49
           1       0.75      0.75      0.75        40

    accuracy                           0.78        89
   macro avg       0.77      0.77      0.77        89
weighted avg       0.78      0.78      0.78        89



**We can see an increase in:**
- Accuracy, from 0.730 to 0.775
- Precision (class 0, class 1), from 0.77, 0.69 to 0.80, 0.75
- F1 score (class 0, class 1), from 0.75, 0.71 to 0.80, 0.75

**Which means that we took a step in the right direction.**