## Libraries

In [1]:
import numpy as np
import torch
import math
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split



## Data classes

In [2]:
class X_values:
    def __init__(self, dataset):
        self.feature_dim = dataset.shape[1]
        self.data_size = dataset.shape[0]
        self.tensor_form = torch.tensor(dataset, dtype=torch.float32)
        print(f"X_values created successfully with dimensions {self.tensor_form.shape}, feature_dim: {self.feature_dim}, data_size: {self.data_size}")

class Y_values:
    def __init__(self, targets):
        self.data_size = targets.shape[0]
        self.tensor_form = torch.tensor(targets, dtype=torch.float32)
        print(f"Y_values created successfully with dimensions {self.tensor_form.shape}")


class Dataset: # !! Use your classes while constructing an instance !!
    def __init__(self, x_values, y_values):
        self.x_tensor = x_values.tensor_form
        self.y_tensor = y_values.tensor_form

        if self.x_tensor.shape[0] != self.y_tensor.shape[0]:
            raise ValueError("Mismatch between X and y dimensions")

        self.feature_dim = x_values.feature_dim
        self.data_size = x_values.data_size

        print(f"Dataset created with X: {self.x_tensor.shape}, Y: {self.y_tensor.shape}")


## Splitting Function & Distance Metrics

In [39]:
def split_dataset(dataset, train_rate, val_rate, test_rate):
    if abs(train_rate + val_rate + test_rate - 1.0) > 1e-6:
        raise ValueError("Split rates must sum to 1")

    total_size = dataset.x_tensor.shape[0]
    train_size = int(total_size * train_rate)
    val_size = int(total_size * val_rate)

    indices = torch.randperm(total_size)
    train_indices = indices[:train_size]
    val_indices = indices[train_size:train_size + val_size]
    test_indices = indices[train_size + val_size:]

    x_train, y_train = dataset.x_tensor[train_indices], dataset.y_tensor[train_indices]
    x_val, y_val = dataset.x_tensor[val_indices], dataset.y_tensor[val_indices]
    x_test, y_test = dataset.x_tensor[test_indices], dataset.y_tensor[test_indices]

    train_dataset = Dataset(X_values(x_train.numpy()), Y_values(y_train.numpy()))
    val_dataset = Dataset(X_values(x_val.numpy()), Y_values(y_val.numpy()))
    test_dataset = Dataset(X_values(x_test.numpy()), Y_values(y_test.numpy()))

    return train_dataset, val_dataset, test_dataset


def distance_function(vec_x, vec_x_train, distance='euclidean', sigma=1, p=1):
    if distance == 'euclidean':
        # Squared Euclidean distance
        return torch.sum((vec_x - vec_x_train)**2, dim=1)

    elif distance == 'cosine':
        # Cosine similarity: 1 - cosine_similarity
        vec_x_norm = vec_x / torch.norm(vec_x, dim=1, keepdim=True)
        vec_x_train_norm = vec_x_train / torch.norm(vec_x_train, dim=1, keepdim=True)
        cosine_similarity = torch.sum(vec_x_norm * vec_x_train_norm, dim=1)
        return 1 - cosine_similarity  # Convert similarity to distance

    elif distance == 'manhattan':
        # Manhattan distance: sum of absolute differences
        return torch.sum(torch.abs(vec_x - vec_x_train), dim=1)

    elif distance == 'gaussian':
        # Gaussian distance: e^(-||vec_x - vec_x_train||^2 / (2 * sigma^2))
        euclidean_squared = torch.sum((vec_x - vec_x_train)**2, dim=1)
        return torch.exp(-euclidean_squared / (2 * sigma**2))

    elif distance == 'L_p':
        # Generalized L_p norm: (sum(|x_i - y_i|^p))^(1/p)
        return torch.sum(torch.abs(vec_x - vec_x_train)**p, dim=1)**(1/p)

    # !!Check whether this really works or not!!
    elif distance == 'chebyshev':
        return torch.max(torch.abs(vec_x - vec_x_train), dim=1)

    else:
        raise ValueError(f"Unsupported distance metric: {distance}")



## KNN from Scratch

In [43]:
# KNN algorithm
def knn(train_dataset, test_dataset, distance='euclidean', K=5):
    x_train, y_train = train_dataset.x_tensor, train_dataset.y_tensor
    x_test, y_test = test_dataset.x_tensor, test_dataset.y_tensor
    y_prediction = []

    for vec_x in x_test:
        # Compute distances in parallel between the test vector and all training vectors
        distances = distance_function(vec_x.unsqueeze(0), x_train, distance=distance)
        #print(distances)

        # Find the indices of the K smallest distances
        k_indices = torch.topk(distances, k=K, largest=False).indices

        # Get the labels of the K nearest neighbors
        k_labels = y_train[k_indices]

        # Perform majority voting (+1 or -1) for prediction
        majority_vote = torch.sign(torch.sum(k_labels))
        y_prediction.append(majority_vote)

    # Convert predictions to a tensor
    y_prediction = torch.tensor(y_prediction)

    # Evaluate accuracy
    accuracy = torch.mean((y_prediction == y_test).float())
    print("/////////////////////////")
    print(f"Distance: {distance}, K: {K}, KNN Accuracy: {accuracy.item() * 100:.2f}%")
    return y_prediction







## Testing function (Scratch)

In [72]:
# Test the implementation using Iris dataset
def test_knn():
    # Load Iris dataset (binary classification: classes 0 and 1)
    iris = datasets.load_iris()
    X = iris.data[iris.target != 2]  # Select only class 0 and 1
    y = iris.target[iris.target != 2]
    y = np.where(y == 0, -1, 1)      # Convert labels to -1 and 1

    # Standardize the dataset
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split the dataset
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42)
    print(f"Number of training samples: {len(x_train)}, Number of features: {x_train.shape[1]}")

    # Create dataset objects
    train_dataset = Dataset(X_values(x_train), Y_values(y_train))
    test_dataset = Dataset(X_values(x_test), Y_values(y_test))

    # Run KNN
    knn(train_dataset, test_dataset, distance='euclidean', K=4)



## Running the code

Distance: euclidean, K: 1, KNN Accuracy: 98.89%

Distance: euclidean, K: 2, KNN Accuracy: 97.78%

Distance: euclidean, K: 3, KNN Accuracy: 98.89%

Distance: euclidean, K: 4, KNN Accuracy: 46.67%




In [73]:
test_knn()

Number of training samples: 10, Number of features: 4
X_values created successfully with dimensions torch.Size([10, 4]), feature_dim: 4, data_size: 10
Y_values created successfully with dimensions torch.Size([10])
Dataset created with X: torch.Size([10, 4]), Y: torch.Size([10])
X_values created successfully with dimensions torch.Size([90, 4]), feature_dim: 4, data_size: 90
Y_values created successfully with dimensions torch.Size([90])
Dataset created with X: torch.Size([90, 4]), Y: torch.Size([90])
/////////////////////////
Distance: euclidean, K: 4, KNN Accuracy: 46.67%
