In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

# Load the dataset
df = pd.read_csv('diabetes.csv')

# --- 1. Data Exploration ---
print("First 5 rows of the dataset:")
print(df.head())

print("\nDataset Information:")
df.info()

print("\nStatistical Summary:")
print(df.describe())

# Check for missing values. In this dataset, 0 can be a missing value for certain columns.
print("\nNumber of 0 values in key columns:")
print((df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] == 0).sum())

First 5 rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non

In [2]:
# --- 2. Data Preprocessing ---

# Replace 0s with the mean of the column for features where 0 is not a valid value
df['Glucose'] = df['Glucose'].replace(0, df['Glucose'].mean())
df['BloodPressure'] = df['BloodPressure'].replace(0, df['BloodPressure'].mean())
df['SkinThickness'] = df['SkinThickness'].replace(0, df['SkinThickness'].mean())
df['Insulin'] = df['Insulin'].replace(0, df['Insulin'].mean())
df['BMI'] = df['BMI'].replace(0, df['BMI'].mean())

# Define features (X) and target (y)
X = df.drop('Outcome', axis=1) # All columns except the outcome
y = df['Outcome']             # The outcome column

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nShape of training data:", X_train.shape)
print("Shape of testing data:", X_test.shape)


Shape of training data: (614, 8)
Shape of testing data: (154, 8)


In [3]:
# --- 3. Model Training ---

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000) # Increased max_iter for convergence

# Train the model on the training data
model.fit(X_train, y_train)

# --- 4. Model Evaluation ---

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 76.62%


In [4]:
# --- 5. Save the Trained Model ---

# Create a file name for the model
filename = 'diabetes_model.pkl'

# Open the file in write-binary mode and save the model
with open(filename, 'wb') as file:
    pickle.dump(model, file)

print(f"\nModel saved successfully as {filename}")


Model saved successfully as diabetes_model.pkl


In [5]:
import tkinter as tk
from tkinter import messagebox
import pickle
import numpy as np

# --- Load the Saved Model ---
# Make sure 'diabetes_model.pkl' is in the same directory
try:
    with open('diabetes_model.pkl', 'rb') as file:
        model = pickle.load(file)
except FileNotFoundError:
    messagebox.showerror("Error", "Model file 'diabetes_model.pkl' not found!")
    exit()

# --- GUI Creation ---
# Function to get values from GUI, make prediction, and show result
def predict_diabetes():
    try:
        # Get values from entry fields and convert them to float
        input_data = [float(entry.get()) for entry in entry_fields]
        
        # Convert to a NumPy array and reshape for the model
        input_array = np.array(input_data).reshape(1, -1)
        
        # Make a prediction
        prediction = model.predict(input_array)
        
        # Display the result
        if prediction[0] == 1:
            result_label.config(text="Prediction: You may have Diabetes.", fg="red")
        else:
            result_label.config(text="Prediction: You likely do not have Diabetes.", fg="green")
            
    except ValueError:
        messagebox.showerror("Input Error", "Please enter valid numbers in all fields.")
    except Exception as e:
        messagebox.showerror("Prediction Error", f"An error occurred: {e}")

# Main window
root = tk.Tk()
root.title("Diabetes Prediction System")
root.geometry("400x450")

# --- Create GUI Components ---
main_frame = tk.Frame(root, padx=15, pady=15)
main_frame.pack(expand=True)

title_label = tk.Label(main_frame, text="Enter Patient Details", font=("Helvetica", 16, "bold"))
title_label.grid(row=0, column=0, columnspan=2, pady=10)

# Labels for the input fields
feature_labels = [
    "Pregnancies:", "Glucose:", "Blood Pressure (mm Hg):", "Skin Thickness (mm):",
    "Insulin (mu U/ml):", "BMI (kg/m^2):", "Diabetes Pedigree Function:", "Age (years):"
]

entry_fields = []

# Create and place labels and entry fields in a grid
for i, label_text in enumerate(feature_labels):
    label = tk.Label(main_frame, text=label_text, font=("Helvetica", 10))
    label.grid(row=i+1, column=0, sticky="w", pady=5)
    
    entry = tk.Entry(main_frame, width=20)
    entry.grid(row=i+1, column=1, pady=5)
    entry_fields.append(entry)

# Prediction button
predict_button = tk.Button(main_frame, text="Predict", font=("Helvetica", 12, "bold"), command=predict_diabetes)
predict_button.grid(row=len(feature_labels)+1, column=0, columnspan=2, pady=15)

# Label to display the result
result_label = tk.Label(main_frame, text="Prediction: Awaiting input...", font=("Helvetica", 12, "italic"))
result_label.grid(row=len(feature_labels)+2, column=0, columnspan=2)

# Start the GUI event loop
root.mainloop()