In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:

# Step 1: Load the dataset
# Read the CSV (path is relative to the current working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='diabetes.csv')



In [3]:

# Step 2: Explore the dataset
# Print basic information about the dataset
print("Dataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() emits a stray "None" line after the report.
data.info()
print("\nFirst few rows of the dataset:")
print(data.head())


Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

First few rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66   

In [4]:
# Step 3: Split features and target variable
# Assuming the target column is named 'Outcome'
y = data['Outcome']  # Target variable
X = data.drop(columns=['Outcome'])  # Features: every column except the target



In [5]:

# Step 4: Split the dataset into training and testing sets
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [6]:

# Step 5: Train the model
# Random Forest Classifier with a fixed seed, fitted on the training split
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)



In [7]:

# Step 6: Evaluate the model
# Predict a label for every row of the test split
y_pred = model.predict(X=X_test)



In [8]:
# Calculate accuracy
# Compare the test labels with the predictions and report the score as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")



Model Accuracy: 72.08%


In [9]:
# Step 7: Predict diabetes based on user input
def predict_diabetes(user_input, estimator=None, feature_names=None):
    """Classify a single record as "Diabetes" or "No Diabetes".

    Parameters
    ----------
    user_input : sequence
        Feature values for one record, ordered like ``feature_names``.
    estimator : object, optional
        Fitted classifier exposing ``predict``. When omitted, the
        notebook-level ``model`` is used.
    feature_names : sequence of str, optional
        Column labels for the one-row frame handed to the classifier. When
        omitted, the notebook-level ``X.columns`` is used.

    Returns
    -------
    str
        "Diabetes" when the predicted label equals 1, otherwise "No Diabetes".

    Raises
    ------
    ValueError
        Raised by pandas when a flat sequence in ``user_input`` does not have
        one value per column label.
    """
    # Fall back to the notebook globals only when the caller gave no override,
    # so the function can also be used with a reloaded model.
    clf = model if estimator is None else estimator
    columns = X.columns if feature_names is None else feature_names

    # Wrap the record in a one-row DataFrame carrying the column labels.
    input_df = pd.DataFrame([user_input], columns=columns)
    prediction = clf.predict(input_df)[0]
    return "Diabetes" if prediction == 1 else "No Diabetes"



In [10]:
import joblib

# Persist the fitted model to disk; dump() returns the list of file names written
joblib.dump(value=model, filename="diabetes_model.pkl")


['diabetes_model.pkl']

In [None]:
 # Step 8: Interactive user input for prediction
def _ask_number(prompt, cast):
    """Prompt until the reply can be converted with ``cast`` (``int`` or ``float``).

    A reply that ``cast`` rejects raises ValueError; instead of aborting the
    cell and losing the values already entered, report it and ask again.
    """
    while True:
        reply = input(prompt)
        try:
            return cast(reply)
        except ValueError:
            print(f"Invalid value {reply!r}; please enter a number.")


print("\nProvide input for prediction:")

# Collecting user inputs
pregnancies = _ask_number("Number of Pregnancies: ", int)
glucose = _ask_number("Glucose Level: ", float)
blood_pressure = _ask_number("Blood Pressure: ", float)
skin_thickness = _ask_number("Skin Thickness: ", float)
insulin = _ask_number("Insulin Level: ", float)
bmi = _ask_number("Body Mass Index (BMI): ", float)
diabetes_pedigree_function = _ask_number("Diabetes Pedigree Function: ", float)
age = _ask_number("Age: ", int)

# Values are listed in the same order as the feature columns of the dataset
# (Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
# DiabetesPedigreeFunction, Age), which is the order predict_diabetes labels them in.
example_input = [
    pregnancies,
    glucose,
    blood_pressure,
    skin_thickness,
    insulin,
    bmi,
    diabetes_pedigree_function,
    age,
]

result = predict_diabetes(example_input)
print(f"Prediction based on input: {result}")


Provide input for prediction:


Number of Pregnancies:  1
Glucose Level:  89
Blood Pressure:  66
Skin Thickness:  23
Insulin Level:  94
Body Mass Index (BMI):  28
