In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
import os

In [2]:
# Load the dataset
# DATA_PATH is relative to the directory the notebook is run from.
DATA_PATH = '../datasets/diabetes.csv'

try:
    data = pd.read_csv(DATA_PATH)
    print("Dataset loaded successfully.")
    print(data.head())
except FileNotFoundError:
    # Leave `data` as None so the later cells, which all check
    # `data is not None`, are skipped instead of failing one by one.
    data = None
    print(f"Error: '{DATA_PATH}' not found.")
    # Build the hint from DATA_PATH itself so the directory named in the message
    # always matches the location that was actually tried.
    print(f"Please download the dataset and place it in the '{os.path.dirname(DATA_PATH)}' directory.")

Dataset loaded successfully.
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [3]:
# Separate the label column (y) from the remaining feature columns (X)
if data is not None:
    y = data['Outcome']
    X = data.drop(columns='Outcome')

In [4]:
# Hold out a test_size=0.2 share of the rows for evaluation; random_state pins the shuffle
if data is not None:
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )
    print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")

Training samples: 614, Test samples: 154


In [5]:
# Build a logistic-regression classifier and fit it on the training split
if data is not None:
    print("Training the model...")
    # fit() returns the fitted estimator, so construction and training can be chained
    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    print("Model trained.")

Training the model...
Model trained.


In [6]:
# Score the fitted model against the held-out test split
if data is not None:
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
    print(f"Model Accuracy on Test Data: {accuracy * 100:.2f}%")

Model Accuracy on Test Data: 74.68%


In [7]:
# Persist the fitted model to disk with joblib
if data is not None:
    MODEL_DIR = '../models'
    # Create the target directory up front; exist_ok=True keeps re-runs of this cell from failing
    os.makedirs(MODEL_DIR, exist_ok=True)

    MODEL_FILENAME = os.path.join(MODEL_DIR, 'diabetes_model.pkl')
    joblib.dump(model, MODEL_FILENAME)
    print(f"Model saved to {MODEL_FILENAME}")

Model saved to ../models\diabetes_model.pkl
