In [11]:
# Cell 1: Imports
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

print("Libraries imported.")

Libraries imported.


In [12]:
# Cell 2: Load Data
# The path is relative to the notebook location (notebooks/):
# up one level (..) to 'backend/', up another (..) to the root, then down into 'data/'.
data_path = '../../data/diabetes.csv'
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"ERROR: File not found at {data_path}")
    print("Please make sure your 'diabetes.csv' file is in the 'data/' folder.")
    # Re-raise after the hint: without data every later cell would either fail
    # with a NameError on `df` or silently reuse a stale `df` left in the kernel.
    raise
else:
    # Only reached when the CSV was actually read.
    print("Data loaded successfully:")
    print(df.head())

Data loaded successfully:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [13]:
# Cell 3: Data Cleaning
print("Cleaning data...")
# A literal 0 in these measurements is treated as a placeholder for "missing".
columns_to_clean = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column_name in columns_to_clean:
    # Blank out the zeros (they become NaN) so they do not drag the average down.
    observed = df[column_name].mask(df[column_name] == 0)
    # Series.mean() skips NaN by default; use that average to fill the gaps.
    df[column_name] = observed.fillna(observed.mean())
print("Data cleaned. 0 values have been replaced with the mean.")
print(df.describe())

Cleaning data...
Data cleaned. 0 values have been replaced with the mean.
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  121.686763      72.405184      29.153420  155.548223   
std       3.369578   30.435949      12.096346       8.790942   85.021108   
min       0.000000   44.000000      24.000000       7.000000   14.000000   
25%       1.000000   99.750000      64.000000      25.000000  121.500000   
50%       3.000000  117.000000      72.202592      29.153420  155.548223   
75%       6.000000  140.250000      80.000000      32.000000  155.548223   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    32.457464                  0.471876   33.240885    0.348958  
std      6.875151              

In [14]:
# Cell 4: Define Features (X) and Target (y)
target_column = 'Outcome'
X = df.drop(columns=[target_column])  # feature matrix: every column but the target
y = df[target_column]                 # label vector: the target column alone

print("Features (X) and Target (y) are defined.")

Features (X) and Target (y) are defined.


In [15]:
# Cell 5: Scale Data
# Standardise the feature columns; the scaler's statistics are learned
# from every row of X at this point.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

print("Data has been scaled.")

Data has been scaled.


In [16]:
# Cell 6: Split Data
# 80% for training, 20% for testing.
# The split is taken on the *unscaled* features so that the scaler can be fitted
# on the training rows only. Fitting it on all rows before splitting lets test-set
# statistics leak into preprocessing and makes the reported accuracy optimistic.
# Same random_state / test_size as before, so the same rows land in each set.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn mean/std from the training rows, then apply that same transform to both sets.
# `scaler` is rebound here, so later cells that use it get the train-only scaler.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Training set shape: (614, 8)
Test set shape: (154, 8)


In [17]:
# Cell 7: Train Model
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(random_state=42).fit(X_train, y_train)

print("Model trained successfully.")

Model trained successfully.


In [18]:
# Cell 8: Evaluate Model
# Predict on the held-out rows, then report overall accuracy and per-class metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
per_class_report = classification_report(y_test, y_pred)
print(
    f"--- Model Accuracy: {accuracy * 100:.2f}% ---",
    "\nClassification Report:",
    per_class_report,
    sep="\n",
)

--- Model Accuracy: 75.32% ---

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154



In [20]:
# Cell 9: Save Model and Scaler
# Both artifacts are written to the ml_models folder (path relative to the notebook).
model_path = '../ml_models/diabetes_model.pkl'
scaler_path = '../ml_models/diabetes_scaler.pkl'

# joblib.dump does not create missing directories, so make sure the target
# folder exists first (no-op when it is already there).
for artifact_path in (model_path, scaler_path):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)

print(f"Model saved to: {model_path}")
print(f"Scaler saved to: {scaler_path}")
print("\n--- You can now build and run the Docker containers! ---")

Model saved to: ../ml_models/diabetes_model.pkl
Scaler saved to: ../ml_models/diabetes_scaler.pkl

--- You can now build and run the Docker containers! ---
