<a href="https://colab.research.google.com/github/RIR-Is-Everywhere/KNN_from_Scratch/blob/main/KNN_from_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Setup
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier as SklearnKNN
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

In [6]:
#  KNN Algorithm from Scratch
class KNNClassifier:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Parameters
    ----------
    k : int, default=3
        Number of closest training samples that vote on each prediction.
        When ``k`` exceeds the number of stored training samples, all of
        them vote.
    """

    def __init__(self, k=3):
        self.k = k

    def _check_k(self):
        """Raise ``ValueError`` unless ``self.k`` is an integer >= 1."""
        if not isinstance(self.k, (int, np.integer)) or self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}")

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Parameters
        ----------
        X_train : array-like
            Training samples, one per entry along the first axis.
        y_train : array-like
            Labels, one per training sample.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer, if ``X_train`` is empty, or
            if ``X_train`` and ``y_train`` hold a different number of samples.
        """
        self._check_k()
        X_train = np.array(X_train)
        y_train = np.array(y_train)
        if X_train.size == 0:
            raise ValueError("X_train must contain at least one sample")
        if X_train.shape[:1] != y_train.shape[:1]:
            raise ValueError(
                "X_train and y_train disagree on the number of samples: "
                f"{X_train.shape} vs {y_train.shape}"
            )
        self.X_train = X_train
        self.y_train = y_train

    def _distance(self, a, b):
        """Return the Euclidean (L2) distance between points ``a`` and ``b``."""
        return np.linalg.norm(a - b)

    def predict(self, X_test):
        """Predict a label for every sample in ``X_test``.

        Parameters
        ----------
        X_test : array-like
            Samples to classify, one per entry along the first axis.

        Returns
        -------
        numpy.ndarray
            One predicted label per test sample.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer.
        """
        self._check_k()
        # Convert up front: iterating a pandas DataFrame yields its column
        # labels, not its rows, which would silently classify the wrong thing.
        X_test = np.asarray(X_test)
        predictions = []
        for test_point in X_test:
            distances = [self._distance(test_point, train_point) for train_point in self.X_train]
            k_indices = np.argsort(distances)[:self.k]
            k_labels = self.y_train[k_indices]
            # most_common lists equal counts in first-seen order and k_labels is
            # ordered by distance, so a tied vote goes to the closest neighbour.
            majority = Counter(k_labels).most_common(1)[0][0]
            predictions.append(majority)
        return np.array(predictions)

In [7]:
# 📊 Custom Evaluation Metrics
def custom_accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    numpy.floating
        Mean of the element-wise equality between the two inputs.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Coerce to arrays so that plain Python lists are compared element-wise;
    # `list == list` would collapse to a single bool.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

def custom_confusion_matrix(y_true, y_pred):
    """Build a confusion matrix over the labels present in either input.

    Rows correspond to true labels and columns to predicted labels, both in
    the sorted order of the unique labels found in ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    numpy.ndarray
        Square integer matrix whose entry ``[i, j]`` counts the samples with
        true label ``labels[i]`` and predicted label ``labels[j]``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    labels = np.unique(np.concatenate((y_true, y_pred)))
    matrix = np.zeros((len(labels), len(labels)), dtype=int)
    # Index by each label's position in the sorted label set rather than by its
    # raw value, so non-contiguous, negative or string labels all work.
    true_idx = np.searchsorted(labels, y_true)
    pred_idx = np.searchsorted(labels, y_pred)
    # add.at accumulates correctly when the same (row, column) pair repeats.
    np.add.at(matrix, (true_idx, pred_idx), 1)
    return matrix

def custom_precision(y_true, y_pred):
    """Return the unweighted mean of the per-class precision values.

    Per-class precision is the confusion-matrix diagonal divided by the
    column sums (the number of times each label was predicted). A class that
    was never predicted contributes 0 instead of NaN.
    """
    confusion = custom_confusion_matrix(y_true, y_pred)
    correct = confusion.diagonal()
    predicted_totals = confusion.sum(axis=0)
    # Silence the 0/0 warning; nan_to_num maps the resulting NaN to 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        per_class = np.nan_to_num(correct / predicted_totals)
    return per_class.mean()

def custom_recall(y_true, y_pred):
    """Return the unweighted mean of the per-class recall values.

    Per-class recall is the confusion-matrix diagonal divided by the row sums
    (the number of samples whose true label is that class). A class with no
    true samples contributes 0 instead of NaN.
    """
    confusion = custom_confusion_matrix(y_true, y_pred)
    correct = confusion.diagonal()
    actual_totals = confusion.sum(axis=1)
    # Silence the 0/0 warning; nan_to_num maps the resulting NaN to 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        per_class = np.nan_to_num(correct / actual_totals)
    return per_class.mean()

def custom_f1(prec, rec):
    """Return the harmonic mean of ``prec`` and ``rec`` (0 when both are 0).

    Note: when fed macro-averaged precision and recall, the result is the
    harmonic mean of those averages, which is not necessarily equal to the
    average of per-class F1 scores.
    """
    total = prec + rec
    # Guard the 0/0 case: no precision and no recall means an F1 of 0.
    return 2 * (prec * rec) / total if total != 0 else 0

In [9]:
# Part 1: Iris Flower Dataset
# Load the bundled dataset and pull out the feature matrix and the targets.
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

In [22]:
# Optimal K and split test
# The split uses the same arguments and seed for every k, so it is computed
# once here instead of being recomputed on each loop iteration.
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, test_size=0.2, random_state=42)

# NOTE(review): k is selected by accuracy on the same test split that the later
# cells report on, so those scores are optimistic. Consider choosing k on a
# separate validation split or with cross-validation on the training data.
best_k = 0
best_acc = 0
for k in range(1, 11):
    knn = KNNClassifier(k=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = custom_accuracy(y_test, y_pred)
    # Strict '>' keeps the smallest k among equally accurate candidates.
    if acc > best_acc:
        best_acc = acc
        best_k = k

In [14]:
# Final run with best K
print(f"\n Iris Dataset - Best k: {best_k}")
knn = KNNClassifier(k=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

prec = custom_precision(y_test, y_pred)
rec = custom_recall(y_test, y_pred)
f1 = custom_f1(prec, rec)

# Score the predictions made in this cell instead of reusing `best_acc` from
# the search cell, so the printed accuracy always describes `y_pred` even when
# cells are re-run out of order.
print("Custom Accuracy:", round(custom_accuracy(y_test, y_pred), 4))
print("Custom Confusion Matrix:\n", custom_confusion_matrix(y_test, y_pred))
print("Custom Precision:", round(prec, 4))
print("Custom Recall:", round(rec, 4))
print("Custom F1-Score:", round(f1, 4))


 Iris Dataset - Best k: 1
Custom Accuracy: 1.0
Custom Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Custom Precision: 1.0
Custom Recall: 1.0
Custom F1-Score: 1.0


In [16]:

# Compare with sklearn: fit the library KNN with the same k on the same split
# and report macro-averaged metrics next to the custom ones.
sk_model = SklearnKNN(n_neighbors=best_k)
sk_model.fit(X_train, y_train)
sk_pred = sk_model.predict(X_test)

sk_scores = (
    ("\n Scikit-learn Accuracy:", accuracy_score(y_test, sk_pred)),
    ("Scikit-learn Precision:", precision_score(y_test, sk_pred, average='macro')),
    ("Scikit-learn Recall:", recall_score(y_test, sk_pred, average='macro')),
    ("Scikit-learn F1 Score:", f1_score(y_test, sk_pred, average='macro')),
)
for caption, score in sk_scores:
    print(caption, round(score, 4))



 Scikit-learn Accuracy: 1.0
Scikit-learn Precision: 1.0
Scikit-learn Recall: 1.0
Scikit-learn F1 Score: 1.0


In [17]:
# Part 2: News Dataset (Custom)
# Six hand-written (text, label) pairs covering three categories.
# NOTE(review): every text begins with its category word ("Politics", "Sports",
# "Tech"/"Technology"), so that word becomes a TF-IDF feature that nearly spells
# out the label. Consider dropping the prefix for a more realistic benchmark.
news_data = [
    ("Politics: Prime Minister announces new budget.", "Politics"),
    ("Sports: Local team wins championship.", "Sports"),
    ("Technology: New AI tool released.", "Tech"),
    ("Tech: Quantum computing breakthrough.", "Tech"),
    ("Sports: Olympic games to begin soon.", "Sports"),
    ("Politics: Election campaigns begin.", "Politics"),
]

news_df = pd.DataFrame(news_data, columns=["text", "label"])

# Turn each text into a dense TF-IDF feature vector.
vectorizer = TfidfVectorizer()
X_news = vectorizer.fit_transform(news_df['text']).toarray()

# Keep the fitted encoder so the integer labels (and confusion-matrix rows and
# columns) can be mapped back to category names, e.g. via inverse_transform.
label_encoder = LabelEncoder()
y_news = label_encoder.fit_transform(news_df['label'])

In [18]:
# Split the News Dataset into train and test parts (seeded, test_size=0.33).
news_split = train_test_split(X_news, y_news, test_size=0.33, random_state=42)
Xn_train, Xn_test, yn_train, yn_test = news_split


In [19]:
# Test optimal k for news: try k = 1..5 and remember the first k that reaches
# the highest test accuracy.
best_k_news, best_acc_news = 0, 0
for k in range(1, 6):
    knn = KNNClassifier(k=k)
    knn.fit(Xn_train, yn_train)
    yn_pred = knn.predict(Xn_test)
    acc = custom_accuracy(yn_test, yn_pred)
    # Only a strictly better score replaces the current best.
    if not (acc > best_acc_news):
        continue
    best_acc_news, best_k_news = acc, k


In [20]:
# Evaluate with best k: refit on the news training split, predict the held-out
# samples and report the custom metrics.
print(f"\n News Dataset - Best k: {best_k_news}")
knn = KNNClassifier(k=best_k_news)
knn.fit(Xn_train, yn_train)
yn_pred = knn.predict(Xn_test)

prec = custom_precision(yn_test, yn_pred)
rec = custom_recall(yn_test, yn_pred)
f1 = custom_f1(prec, rec)

print("Custom Accuracy:", round(custom_accuracy(yn_test, yn_pred), 4))
print("Custom Confusion Matrix:\n", custom_confusion_matrix(yn_test, yn_pred))
for caption, score in (
    ("Custom Precision:", prec),
    ("Custom Recall:", rec),
    ("Custom F1-Score:", f1),
):
    print(caption, round(score, 4))


 News Dataset - Best k: 1
Custom Accuracy: 1.0
Custom Confusion Matrix:
 [[1 0]
 [0 1]]
Custom Precision: 1.0
Custom Recall: 1.0
Custom F1-Score: 1.0


In [21]:
# Compare with sklearn KNN: same k, same news split, macro-averaged metrics.
sk_news = SklearnKNN(n_neighbors=best_k_news)
sk_news.fit(Xn_train, yn_train)
yn_pred_sk = sk_news.predict(Xn_test)

sk_news_scores = (
    ("\n Scikit-learn News Accuracy:", accuracy_score(yn_test, yn_pred_sk)),
    ("Scikit-learn Precision:", precision_score(yn_test, yn_pred_sk, average='macro')),
    ("Scikit-learn Recall:", recall_score(yn_test, yn_pred_sk, average='macro')),
    ("Scikit-learn F1 Score:", f1_score(yn_test, yn_pred_sk, average='macro')),
)
for caption, score in sk_news_scores:
    print(caption, round(score, 4))


 Scikit-learn News Accuracy: 1.0
Scikit-learn Precision: 1.0
Scikit-learn Recall: 1.0
Scikit-learn F1 Score: 1.0
