# Diabetes Prediction using Multiple Classifiers

This notebook implements three classifiers from scratch (scikit-learn is used only for splitting, scaling, and metrics) to predict diabetes:
1. K-Nearest Neighbors (KNN)
2. Support Vector Machine (SVM)
3. Gradient Descent Classifier (logistic regression trained with batch gradient descent)

We'll analyze the dataset, preprocess the data, and compare the performance of these classifiers.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## 1. Data Loading and Exploration

In [None]:
# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('diabetes.csv')

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
display(df.head())

print("\nDataset Information:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display() -- doing so only appends a stray "None" to the output.
df.info()

print("\nStatistical Summary:")
display(df.describe())

## 2. Data Visualization

In [None]:
# Use the seaborn-flavoured matplotlib style for every figure below
plt.style.use('seaborn-v0_8')

# --- Correlation heatmap over all columns of df ---
correlations = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap')
plt.show()

# --- One box plot per feature, split by Outcome ---
# Every column except the last one is plotted.
# NOTE(review): this presumes 'Outcome' is the last column and that there are
# at most 8 other columns (4 x 2 grid) -- confirm against the CSV.
feature_columns = df.columns[:-1]
fig, axes = plt.subplots(4, 2, figsize=(15, 20))
axes = axes.ravel()

for panel_no in range(len(feature_columns)):
    feature = feature_columns[panel_no]
    panel = axes[panel_no]
    sns.boxplot(x='Outcome', y=feature, data=df, ax=panel)
    panel.set_title(f'Distribution of {feature} by Outcome')

plt.tight_layout()
plt.show()

## 3. Data Preprocessing

In [None]:
# Target vector and feature matrix
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Min-max scaling: the scaler is fitted on the training rows only and then
# applied unchanged to both splits, so no test-set statistics leak into training.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train_scaled.shape)
print("Testing set shape:", X_test_scaled.shape)

## 4. Model Implementation

In [None]:
# Implement the classifiers
from math import sqrt, exp

class KNNClassifier:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote on each prediction. If ``k`` exceeds
        the number of training samples, all training samples vote.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=3):
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Memorise the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (NumPy array, DataFrame or nested list).
        y : array-like of shape (n_samples,)
            Training labels. A pandas Series is accepted; its index is ignored.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``y`` differ in length.
        """
        features = np.asarray(X, dtype=float)
        labels = np.asarray(y)
        if features.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if len(features) != len(labels):
            raise ValueError(
                f"X and y have different lengths: {len(features)} != {len(labels)}"
            )
        self.X_train = features
        self.y_train = labels

    def euclidean_distance(self, x1, x2):
        """Return the Euclidean distance between two equal-length vectors as a float."""
        diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
        return float(np.sqrt(np.dot(diff, diff)))

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray
            One predicted label per query row. Vote ties are broken in favour
            of the smallest label so the result is deterministic.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``X`` is non-empty and not 2-D.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNNClassifier.predict() called before fit()")

        queries = np.asarray(X, dtype=float)
        if queries.size == 0:
            return np.array([])
        if queries.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_queries, n_features)")

        predictions = []
        for query in queries:
            # Squared distances order neighbours exactly like true distances,
            # so the square root is skipped; one vectorised pass per query.
            sq_dists = np.sum((self.X_train - query) ** 2, axis=1)
            # Stable sort: equidistant points keep their training-set order.
            nearest = np.argsort(sq_dists, kind="stable")[: self.k]
            # np.unique returns labels sorted ascending and argmax picks the
            # first maximum, hence ties resolve to the smallest label.
            labels, votes = np.unique(self.y_train[nearest], return_counts=True)
            predictions.append(labels[np.argmax(votes)])
        return np.array(predictions)

class SVMClassifier:
    """Linear soft-margin SVM trained with per-sample sub-gradient descent.

    The decision function is ``f(x) = w . x + b`` and the per-sample objective
    is ``lambda * ||w||^2 + max(0, 1 - y * f(x))`` with labels mapped to
    ``{-1, +1}``.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size of every update.
    lambda_param : float, default 0.01
        L2 regularisation strength applied to ``w``.
    n_iterations : int, default 1000
        Number of full passes over the training data.
    """

    def __init__(self, learning_rate=0.01, lambda_param=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iterations = n_iterations
        self.w = None
        self.b = None

    def fit(self, X, y):
        """Learn ``w`` and ``b`` from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Labels; values ``<= 0`` are treated as the negative class (-1),
            everything else as the positive class (+1).
        """
        X = np.asarray(X, dtype=float)
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)
        self.b = 0.0

        y_ = np.where(np.asarray(y) <= 0, -1, 1)

        for _ in range(self.n_iterations):
            for x_i, y_i in zip(X, y_):
                reg_grad = 2 * self.lambda_param * self.w
                if y_i * (np.dot(x_i, self.w) + self.b) >= 1:
                    # Outside the margin: only the regulariser contributes.
                    self.w -= self.lr * reg_grad
                else:
                    # Margin violated: hinge sub-gradient is -y*x for w and
                    # -y for b (because f(x) = w.x + b), so gradient descent
                    # must ADD lr*y to the bias. Subtracting it pushes the
                    # bias away from the violated sample's class.
                    self.w -= self.lr * (reg_grad - y_i * x_i)
                    self.b += self.lr * y_i

    def predict(self, X):
        """Return 1 where ``w . x + b >= 0`` and 0 elsewhere.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.w is None or self.b is None:
            raise RuntimeError("SVMClassifier.predict() called before fit()")
        linear_output = np.dot(np.asarray(X, dtype=float), self.w) + self.b
        return np.where(linear_output >= 0, 1, 0)

class GradientDescentClassifier:
    """Binary logistic regression fitted with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size of every gradient update.
    n_iterations : int, default 1000
        Number of gradient steps; each step uses all training samples.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, x):
        """Numerically stable logistic function ``1 / (1 + exp(-x))``.

        ``exp`` is only ever evaluated on non-positive values, so large
        negative inputs no longer overflow (the naive form computes
        ``exp(+large)``). Returns a NumPy array shaped like ``x``.
        """
        z = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(z))  # always in (0, 1]
        # z >= 0: 1 / (1 + e^-z);   z < 0: e^z / (1 + e^z) -- same function.
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def fit(self, X, y):
        """Fit weights and bias by minimising the mean log-loss.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Binary labels encoded as 0/1. A pandas Series is accepted; its
            index is ignored.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty training set")
        if n_samples != len(y):
            raise ValueError(
                f"X and y have different lengths: {n_samples} != {len(y)}"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iterations):
            y_predicted = self.sigmoid(np.dot(X, self.weights) + self.bias)
            residual = y_predicted - y

            # Gradient of the mean log-loss w.r.t. weights and bias
            dw = np.dot(X.T, residual) / n_samples
            db = np.sum(residual) / n_samples

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        """Return 1 where the predicted probability is ``>= 0.5``, else 0.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError(
                "GradientDescentClassifier.predict() called before fit()"
            )
        linear_model = np.dot(np.asarray(X, dtype=float), self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        return np.where(y_predicted >= 0.5, 1, 0)

## 5. Model Training and Evaluation

In [None]:
def evaluate_model(y_true, y_pred, model_name):
    """Print accuracy and a classification report, then plot the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    model_name : str
        Name used in the printed header and the figure title.

    Returns
    -------
    None
        Everything is emitted as printed text and a matplotlib figure.
    """
    separator = "-" * 50
    print(f"\n{model_name} Results:")
    print(separator)

    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.2%}")

    print("\nClassification Report:")
    report = classification_report(y_true, y_pred)
    print(report)

    # Confusion matrix rendered as an annotated heatmap of integer counts
    plt.figure(figsize=(8, 6))
    counts = confusion_matrix(y_true, y_pred)
    sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

def _fit_predict_report(classifier, title):
    """Fit on the scaled training split, evaluate on the test split, return the predictions."""
    classifier.fit(X_train_scaled, y_train)
    predicted = classifier.predict(X_test_scaled)
    evaluate_model(y_test, predicted, title)
    return predicted


# K-Nearest Neighbors (3 neighbours)
knn = KNNClassifier(k=3)
y_pred_knn = _fit_predict_report(knn, "K-Nearest Neighbors")

# Support Vector Machine (default hyper-parameters)
svm = SVMClassifier()
y_pred_svm = _fit_predict_report(svm, "Support Vector Machine")

# Gradient Descent classifier (default hyper-parameters)
gd = GradientDescentClassifier()
y_pred_gd = _fit_predict_report(gd, "Gradient Descent")

## 6. Model Comparison

In [None]:
# Test-set accuracy of each model, keyed by its display name
predictions_by_model = (
    ('KNN', y_pred_knn),
    ('SVM', y_pred_svm),
    ('Gradient Descent', y_pred_gd),
)
models = {
    label: accuracy_score(y_test, predicted)
    for label, predicted in predictions_by_model
}

# Bar chart with the accuracy axis fixed to the full 0-1 range
plt.figure(figsize=(10, 6))
plt.bar(list(models), list(models.values()))
plt.title('Model Comparison - Accuracy Scores')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

# Write each accuracy as a percentage just above its bar
for position, score in enumerate(models.values()):
    plt.text(position, score + 0.01, f'{score:.2%}', ha='center')

plt.show()