In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# Load the diabetes dataset into a DataFrame. The path is relative, so the
# CSV must be present in the current working directory. The script later
# reads the 'Outcome' column of this frame as the class label.
df = pd.read_csv('diabetes (1).csv')

# Function for Min-Max Scaling
def min_max_scaling(data):
    """Rescale each column of ``data`` linearly so its values span [0, 1].

    The minimum and maximum are taken along axis 0, i.e. per column for a
    2-D input.

    Args:
        data: Array-like of numeric values.

    Returns:
        The rescaled data, ``(data - min) / (max - min)`` per column.
        A column whose values are all identical has a zero range; it is
        mapped to all zeros instead of producing NaN from a 0/0 division.
    """
    min_vals = np.min(data, axis=0)
    value_range = np.max(data, axis=0) - min_vals
    # Substitute 1 for zero ranges: the numerator is already 0 for every
    # entry of such a column, so the result is a well-defined 0.
    safe_range = np.where(value_range == 0, 1, value_range)
    return (data - min_vals) / safe_range

# Function for computing Euclidean distance
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The inputs are subtracted element-wise, so they must be NumPy arrays
    (or scalars) with compatible shapes.
    """
    delta = x1 - x2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

# Function to perform KNN classification
def knn_classify(train_data, train_labels, test_instance, k):
    """Predict a label for ``test_instance`` by distance-weighted k-NN voting.

    Each of the ``k`` training instances nearest to ``test_instance``
    (Euclidean distance) votes for its own label with weight
    ``1 / distance``. If one or more of those neighbours lie at distance
    zero, only the exact matches vote, each with weight 1; this avoids the
    division by zero (and the resulting infinite, indistinguishable
    weights) that a plain inverse would produce.

    Args:
        train_data: Sequence of training instances, one per row.
        train_labels: Labels indexable by training-row position.
        test_instance: The instance to classify.
        k: Number of neighbours to consult; must be at least 1. If it
            exceeds the number of training instances, all of them vote.

    Returns:
        The label with the largest total vote. On an exact tie the label
        encountered first among the sorted neighbours wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``train_data`` is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    n_train = len(train_data)
    if n_train == 0:
        raise ValueError("train_data must contain at least one instance")

    # Distances from the test instance to every training row in one pass.
    # The reshape lets scalar-valued instances be handled like 1-column rows.
    diffs = np.asarray(train_data, dtype=float) - np.asarray(test_instance, dtype=float)
    distances = np.sqrt(np.sum(diffs.reshape(n_train, -1) ** 2, axis=1))

    # Get indices of k nearest neighbors
    k_neighbors_indices = np.argsort(distances)[:k]
    neighbor_distances = distances[k_neighbors_indices]

    exact_match = neighbor_distances == 0
    if exact_match.any():
        # Exact matches dominate: they vote with weight 1, the rest with 0.
        weights = exact_match.astype(float)
    else:
        weights = 1.0 / neighbor_distances  # Inverse of distance

    # Accumulate the weighted votes per label
    class_votes = {}
    for idx, weight in zip(k_neighbors_indices, weights):
        label = train_labels[idx]
        class_votes[label] = class_votes.get(label, 0) + weight

    # Return the class with the highest weighted votes
    return max(class_votes, key=class_votes.get)

# Function to split data into training and testing sets
def train_test_split(data, labels, split_ratio=0.7):
    """Split ``data`` and ``labels`` sequentially into train and test parts.

    The first ``int(split_ratio * len(data))`` entries form the training
    set and the remainder the test set. No shuffling is performed, so the
    original ordering of the rows decides which part each one lands in.

    NOTE: this definition rebinds the name ``train_test_split`` imported
    from ``sklearn.model_selection`` at the top of the file, and its return
    order (train data, train labels, test data, test labels) is specific
    to this function.

    Args:
        data: Sliceable sequence of instances.
        labels: Sliceable sequence of labels, the same length as ``data``.
        split_ratio: Fraction of the entries assigned to the training set;
            must lie in [0, 1].

    Returns:
        A tuple ``(train_data, train_labels, test_data, test_labels)``.

    Raises:
        ValueError: If ``data`` and ``labels`` differ in length, or if
            ``split_ratio`` is outside [0, 1].
    """
    if len(data) != len(labels):
        raise ValueError("data and labels must have the same length")
    # A ratio outside [0, 1] would turn into a negative or overshooting
    # slice index and silently yield a meaningless split.
    if not 0 <= split_ratio <= 1:
        raise ValueError("split_ratio must be between 0 and 1")

    split_index = int(split_ratio * len(data))
    return (
        data[:split_index],
        labels[:split_index],
        data[split_index:],
        labels[split_index:],
    )

# Function to evaluate KNN with different k values
def evaluate_knn(data, labels, k_values):
    """Classify every instance in ``data`` for each k and print the accuracy.

    For each value in ``k_values`` the function predicts a label for every
    instance of ``data`` with ``knn_classify`` and compares it with the
    corresponding entry of ``labels``. A per-k report is printed, followed
    by the mean accuracy over all k values.

    The training set is not a parameter: ``train_data`` and
    ``train_labels`` are read from module scope and must be defined before
    this function is called.

    Args:
        data: Instances to classify (the evaluation set).
        labels: True labels, aligned with ``data``.
        k_values: Iterable of neighbour counts to evaluate.

    Returns:
        None. Results are reported via ``print``.
    """
    accuracies = []
    total_instances = len(data)

    for k in k_values:
        # Count predictions that agree with the true label. The instances
        # evaluated are the ones passed in, not a module-level test set.
        correct_classifications = sum(
            1
            for instance, true_label in zip(data, labels)
            if knn_classify(train_data, train_labels, instance, k) == true_label
        )

        accuracy = correct_classifications / total_instances * 100
        accuracies.append(accuracy)

        print(f'k value: {k}\nNumber of correctly classified instances: {correct_classifications}\n'
              f'Total number of instances: {total_instances}\nAccuracy: {accuracy:.2f}%\n')

    # Calculate and print average accuracy
    avg_accuracy = np.mean(accuracies)
    print(f'Average Accuracy Across All Iterations: {avg_accuracy:.2f}%')

# Separate the target column ('Outcome') from the predictor columns
labels = df['Outcome'].values
features = df.drop(columns=['Outcome']).values

# Bring every feature column onto the [0, 1] range
# NOTE(review): scaling is applied to the full dataset before it is split,
# so the min/max used here also reflect rows that end up in the test set.
scaled_features = min_max_scaling(features)

# Partition into training and testing portions (default 70/30, in file order)
(train_data,
 train_labels,
 test_data,
 test_labels) = train_test_split(scaled_features, labels)

# Neighbour counts to evaluate
k_values = list(range(26, 29))

# Report accuracy on the held-out portion for each k
evaluate_knn(test_data, test_labels, k_values)

k value: 26
Number of correctly classified instances: 183
Total number of instances: 231
Accuracy: 79.22%

k value: 27
Number of correctly classified instances: 181
Total number of instances: 231
Accuracy: 78.35%

k value: 28
Number of correctly classified instances: 181
Total number of instances: 231
Accuracy: 78.35%

Average Accuracy Across All Iterations: 78.64%
