#  1. Implementing k-NN from Scratch


In [2]:
import numpy as np
from collections import Counter

# Function to calculate the Euclidean distance between two points
def euclidean_distance(a, b):
    return np.sqrt(np.sum((a - b) ** 2))

# k-NN function to classify test points based on the k nearest neighbors
def knn(X_train, y_train, X_test, k=3):
    predictions = []  # List to store predictions for each test point

    # Loop over each test point
    for test_point in X_test:
        # Calculate distance between test point and all training points
        distances = [euclidean_distance(test_point, x) for x in X_train]

        # Get the indices of the k nearest neighbors
        k_indices = np.argsort(distances)[:k]

        # Get the labels of the k nearest neighbors
        k_nearest_labels = [y_train[i] for i in k_indices]

        # Determine the most common label among the nearest neighbors
        most_common = Counter(k_nearest_labels).most_common(1)

        # Add the predicted label to the list of predictions
        predictions.append(most_common[0][0])

    return np.array(predictions)  # Return the list of predictions as a numpy array

# Example usage of the knn function:
X_train = np.array([[1, 2], [2, 3], [3, 4], [5, 6]])  # Training features
y_train = np.array([0, 0, 1, 1])  # Training labels
X_test = np.array([[1, 1], [4, 4]])  # Test features

# Call the knn function to get predictions for the test points
predictions = knn(X_train, y_train, X_test, k=4)
print(predictions)  # Output: array([0, 1])


[0 1]


# 2. Optimizing with NumPy
    Optimized Python Implementation:

In [3]:
# Optimized k-NN function using NumPy for distance calculation
def knn_optimized(X_train, y_train, X_test, k=3):
    predictions = []  # List to store predictions for each test point

    # Loop over each test point
    for test_point in X_test:
        # Calculate the distance between the test point and all training points using NumPy
        distances = np.linalg.norm(X_train - test_point, axis=1)

        # Get the indices of the k nearest neighbors
        k_indices = np.argsort(distances)[:k]

        # Get the labels of the k nearest neighbors
        k_nearest_labels = y_train[k_indices]

        # Determine the most common label among the nearest neighbors
        most_common = Counter(k_nearest_labels).most_common(1)

        # Add the predicted label to the list of predictions
        predictions.append(most_common[0][0])

    return np.array(predictions)  # Return the list of predictions as a numpy array


# 3. k-NN with Scikit-Learn
    Using KNeighborsClassifier and KNeighborsRegressor:

In [4]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate a k-NN classifier with k=3 and distance-based weighting
classifier = KNeighborsClassifier(n_neighbors=3, weights='distance')

# Fit the classifier to the training data
classifier.fit(X_train, y_train)

# Predict the labels for the test data
predictions = classifier.predict(X_test)

# Print the accuracy of the model on the test data
print("Accuracy:", accuracy_score(y_test, predictions))

# Example for regression with k-NN
# Instantiate a k-NN regressor with k=3 and distance-based weighting
regressor = KNeighborsRegressor(n_neighbors=3, weights='distance')

# Fit the regressor to the training data
regressor.fit(X_train, y_train)

# Predict the target values for the test data
predictions = regressor.predict(X_test)


Accuracy: 0.9666666666666667


# 4. Performance Evaluation
    Metrics:
           

In [5]:
from sklearn.metrics import classification_report, confusion_matrix, cross_val_score

# Generate and print the confusion matrix
print(confusion_matrix(y_test, predictions))

# Generate and print the classification report with precision, recall, and F1-score
print(classification_report(y_test, predictions))

# Perform cross-validation to evaluate model performance
scores = cross_val_score(classifier, X, y, cv=5)
print("Cross-validation scores:", scores)


ImportError: cannot import name 'cross_val_score' from 'sklearn.metrics' (/usr/local/lib/python3.10/dist-packages/sklearn/metrics/__init__.py)

# 6. Hyperparameter Tuning
    Grid Search and Random Search:



In [6]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Define a parameter grid for GridSearchCV or RandomizedSearchCV
param_grid = {
    'n_neighbors': [3, 5, 7],  # Different values for k
    'weights': ['uniform', 'distance'],  # Weighting options
    'metric': ['euclidean', 'manhattan']  # Distance metrics
}

# Perform Grid Search to find the best hyperparameters
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters (Grid Search):", grid_search.best_params_)

# Perform Random Search to find the best hyperparameters
random_search = RandomizedSearchCV(KNeighborsClassifier(), param_distributions=param_grid, cv=5, n_iter=10)
random_search.fit(X_train, y_train)
print("Best parameters (Random Search):", random_search.best_params_)


Best parameters (Grid Search): {'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'uniform'}
Best parameters (Random Search): {'weights': 'distance', 'n_neighbors': 5, 'metric': 'euclidean'}


# Implementing k-NN Regression in Scikit-Learn
    Here's a basic implementation of k-NN regression using Scikit-Learn:

In [7]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Generate a synthetic regression dataset
X, y = make_regression(n_samples=500, n_features=2, noise=0.1, random_state=42)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the k-NN regressor with k=3
knn_regressor = KNeighborsRegressor(n_neighbors=3)

# Fit the model to the training data
knn_regressor.fit(X_train, y_train)

# Predict the target values for the test set
y_pred = knn_regressor.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Mean Absolute Error (MAE): 1.0848932458375187
Mean Squared Error (MSE): 2.9243728452646134
Root Mean Squared Error (RMSE): 1.7100797774561902


# Faiss

In [12]:
import faiss
import numpy as np

# Generate a large dataset
d = 64  # dimension
nb = 1000000  # number of database vectors
nq = 10000  # number of query vectors

np.random.seed(1234)
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.

# Create index
index = faiss.IndexFlatL2(d)  # L2 distance
index.add(xb)  # add vectors to the index

# Perform the search
k = 5  # number of nearest neighbors
D, I = index.search(xq, k)  # D stores distances, I stores indices of nearest neighbors
print(I[:5])  # print the indices of the nearest neighbors for the first 5 queries


[[ 774  175  917  413  365]
 [  98   66  357  596  969]
 [ 504  263  740  738   59]
 [ 756 1102  454  353 1136]
 [ 352  381  545  264   34]]
