# **DIABETES PREDICTION**

In [62]:
# Importing the Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [63]:
# Load the dataset
data = pd.read_csv("diabetes_prediction_dataset.csv")

In [64]:
# Separate features and target
X = data.drop("diabetes", axis=1)
y = data["diabetes"]

In [65]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
# Define which features need one-hot encoding
categorical_features = ["gender", "smoking_history"]


In [67]:
# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), X_train.columns.difference(categorical_features)),
        ("cat", OneHotEncoder(), categorical_features)
    ])

In [68]:
# Fit and transform the data
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [69]:
# Initialize models
models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("Support Vector Machine", SVC()),
    ("K-Nearest Neighbors", KNeighborsClassifier())
]

In [70]:
# Dictionary to store results
results = {}

In [71]:
# Train and evaluate each model
for name, model in models:
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{name} Accuracy: {accuracy:.2f}")
    print(f"Classification Report for {name}:\n{classification_report(y_test, y_pred)}")
    print("="*50)

print("Done!")

Training Logistic Regression...
Logistic Regression Accuracy: 0.96
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     18292
           1       0.86      0.62      0.72      1708

    accuracy                           0.96     20000
   macro avg       0.91      0.80      0.85     20000
weighted avg       0.96      0.96      0.96     20000

Training Decision Tree...
Decision Tree Accuracy: 0.95
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97     18292
           1       0.72      0.73      0.72      1708

    accuracy                           0.95     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.95      0.95      0.95     20000

Training Random Forest...
Random Forest Accuracy: 0.97
Classification Report for Random Forest:
              precision    recall  f1-sc