In [None]:
# Import libraries
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [None]:
# Read the diabetes CSV into a DataFrame named `df`
df = pd.read_csv(filepath_or_buffer='/content/diabetes.csv')

# Show the first five rows as the cell output (sanity check of columns/values)
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Count missing values per column (axis=0 is the default, spelled out for clarity)
df.isnull().sum(axis=0)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [None]:
# Check for missing values.
# A bare `df.isnull().sum()` in the middle of a cell is evaluated and thrown
# away (only the last expression of a cell is displayed), so assert instead:
# the cell now fails loudly if any value is missing.
assert not df.isnull().values.any(), "Dataset contains missing values"

# Separate features and target variable
X = df.drop(columns='Outcome')   # every column except the label
y = df['Outcome']                # label column (0/1 in the preview above)

# Scale the features to zero mean / unit variance.
# NOTE(review): this scaler is fitted on every row of X. If X_scaled is split
# into train/test afterwards, statistics of the test rows have already
# influenced the scaling; for an unbiased evaluation fit the scaler on the
# training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
# Split the dataset into training and testing sets.
# The raw (unscaled) features are split first so that the test rows stay unseen
# during preprocessing. `random_state` makes the split -- and therefore every
# metric computed downstream -- reproducible; `stratify=y` keeps the proportion
# of the two Outcome classes the same in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply that same
# transformation to both splits. `scaler` is (re)bound here so that later
# cells which scale new inputs use exactly the statistics the model was
# trained with.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [None]:
# LogisticRegression and the metric helpers come from the notebook's top
# import cell; they are not re-imported here.

# Initialize Logistic Regression model (library default hyper-parameters)
model = LogisticRegression()

# Train the model on the training split
model.fit(X_train, y_train)

# Predict class labels for the held-out test split
y_pred = model.predict(X_test)

# Evaluate the model: overall accuracy, confusion matrix and per-class report
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(cm)
print('Classification Report:')
print(report)


Accuracy: 0.8051948051948052
Confusion Matrix:
[[91 11]
 [19 33]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       102
           1       0.75      0.63      0.69        52

    accuracy                           0.81       154
   macro avg       0.79      0.76      0.77       154
weighted avg       0.80      0.81      0.80       154



In [None]:
# Relies on two notebook-level objects created in earlier cells:
#   `scaler` - the fitted StandardScaler
#   `model`  - the fitted Logistic Regression classifier

def predict_diabetes(pregnancies, glucose, blood_pressure, skin_thickness, insulin, bmi, diabetes_pedigree, age, threshold=0.5):
    """Classify a single patient record as "Diabetic" or "Non-diabetic".

    The eight measurements are arranged in the same order as the training
    columns (Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
    DiabetesPedigreeFunction, Age), scaled with the notebook-level ``scaler``
    and scored with the notebook-level ``model``.

    Args:
        pregnancies, glucose, blood_pressure, skin_thickness, insulin, bmi,
        diabetes_pedigree, age: numeric feature values for one patient.
        threshold: probability of class 1 at or above which the record is
            labelled "Diabetic". Defaults to 0.5.

    Returns:
        The string "Diabetic" or "Non-diabetic".

    Raises:
        ValueError: if ``threshold`` is not within [0, 1].
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

    # One row, eight columns, in training-column order
    new_data = np.array(
        [[pregnancies, glucose, blood_pressure, skin_thickness,
          insulin, bmi, diabetes_pedigree, age]],
        dtype=float,
    )

    # A scaler fitted on a DataFrame records the column names and warns when it
    # is later given a bare array. Hand it a frame with those same names when
    # they are available; otherwise fall back to the plain array.
    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        new_data = pd.DataFrame(new_data, columns=feature_names)

    # Scale the new data with the already-fitted scaler
    new_data_scaled = scaler.transform(new_data)

    # Column 1 of predict_proba is the predicted probability for class 1
    predicted_probability = model.predict_proba(new_data_scaled)[0, 1]

    # Apply the decision threshold for binary classification
    return "Diabetic" if predicted_probability >= threshold else "Non-diabetic"



In [None]:
# Example usage: score one hand-written patient record and show the label
result = predict_diabetes(
    pregnancies=2,
    glucose=120,
    blood_pressure=70,
    skin_thickness=30,
    insulin=85,
    bmi=33.6,
    diabetes_pedigree=0.627,
    age=45,
)
print(f'Prediction: {result}')

Prediction: Non-diabetic




In [None]:
# Persist the fitted objects created in the earlier cells.
# `pickle` is imported in the notebook's top import cell.

# Save the model to disk. The `with` block guarantees the file handle is
# flushed and closed even if pickling raises.
filename = 'diabetes_prediction_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the scaler too, since it is needed to preprocess new data the same way
scaler_filename = 'diabetes_scaler.pkl'
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
