In [13]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.datasets import load_iris, fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [14]:
class CustomKNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    The training data is memorised by ``fit``; ``predict`` labels each query
    row with the majority label among its ``k`` closest training rows.

    Parameters
    ----------
    k : int, default=3
        Number of neighbours that vote. If ``k`` exceeds the number of
        training rows, every training row votes.
    """

    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        """Store the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training feature rows (ndarray, list of lists or DataFrame).
        y : array-like of shape (n_samples,)
            Training labels, aligned with the rows of ``X``.

        Returns
        -------
        CustomKNN
            ``self``, so calls can be chained like scikit-learn estimators.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)
        if len(self.X_train) != len(self.y_train):
            raise ValueError(
                f"X and y have inconsistent lengths: "
                f"{len(self.X_train)} != {len(self.y_train)}"
            )
        return self

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)
            Query rows.

        Returns
        -------
        numpy.ndarray
            One predicted label per query row.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        if self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}")
        # Iterating a DataFrame yields column labels, not rows, so convert to
        # an ndarray first to guarantee row-wise iteration.
        rows = np.asarray(X)
        return np.array([self._predict_single(row) for row in rows])

    def _predict_single(self, x):
        """Return the majority label among the ``k`` training rows nearest to ``x``."""
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        k_indices = distances.argsort()[:self.k]
        k_labels = self.y_train[k_indices]
        # Counter orders equal counts by first occurrence, so a tied vote goes
        # to the label that appears first in nearest-first order.
        return Counter(k_labels).most_common(1)[0][0]

In [15]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels.
    y_pred : array-like of shape (n_samples,)
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    numpy.floating
        Share of positions where both inputs agree.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Plain lists compare as a whole (a single bool), which would collapse the
    # result to 0.0 or 1.0; converting forces an element-wise comparison.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Guard against silent broadcasting, e.g. (n,) against (n, 1).
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

def confusion_matrix(y_true, y_pred):
    """Count (true label, predicted label) pairs.

    Returns a square DataFrame whose rows are true labels and whose columns
    are predicted labels; both axes hold the sorted union of the labels seen
    in either input.
    """
    labels = np.unique(np.concatenate((y_true, y_pred)))
    # Map each label to its row/column position in the matrix.
    position = {label: idx for idx, label in enumerate(labels)}
    counts = np.zeros((len(labels), len(labels)), dtype=np.int64)
    for actual, predicted in zip(y_true, y_pred):
        counts[position[actual], position[predicted]] += 1
    return pd.DataFrame(counts, index=labels, columns=labels)

def precision_recall_f1(y_true, y_pred):
    """Compute per-class precision, recall and F1 score.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels.
    y_pred : array-like of shape (n_samples,)
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``(precisions, recalls, f1s)``. Each dict maps every label found in
        either input to a Python ``float``. A metric whose denominator is
        zero is reported as ``0.0``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    classes = np.unique(np.concatenate((y_true, y_pred)))
    precisions, recalls, f1s = {}, {}, {}
    # tolist() yields native Python labels so the returned dicts print cleanly
    # (no np.int64(...) wrappers); native and NumPy scalars hash and compare
    # equal, so lookups with either kind of key keep working.
    for cls in classes.tolist():
        predicted = y_pred == cls
        actual = y_true == cls
        tp = int(np.sum(predicted & actual))
        fp = int(np.sum(predicted & ~actual))
        fn = int(np.sum(~predicted & actual))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        if (precision + recall) > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        precisions[cls] = precision
        recalls[cls] = recall
        f1s[cls] = f1
    return precisions, recalls, f1s

In [16]:
def run_experiment(X, y, dataset_name="Dataset", splits=(0.2, 0.3, 0.4),
                   k_values=range(1, 16), random_state=42):
    """Evaluate CustomKNN over several train/test splits and print a report.

    For every test fraction in ``splits`` the data is split with
    stratification, each ``k`` in ``k_values`` is tried, and the ``k`` with
    the highest test accuracy is reported together with its confusion matrix
    and per-class precision/recall/F1. A scikit-learn
    ``KNeighborsClassifier`` with the same ``k`` is scored for comparison.

    Note that ``k`` is chosen by accuracy on the test split itself, so the
    reported accuracy is an optimistic estimate.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Class labels; also used for stratified splitting.
    dataset_name : str, default="Dataset"
        Title printed in the report header.
    splits : iterable of float, default=(0.2, 0.3, 0.4)
        Test-set fractions to evaluate, each in the open interval (0, 1).
    k_values : iterable of int, default=range(1, 16)
        Candidate neighbour counts; on ties the earliest candidate wins.
    random_state : int, default=42
        Seed forwarded to ``train_test_split``.

    Raises
    ------
    ValueError
        If no candidate ``k`` produced a usable accuracy for a split
        (for example when ``k_values`` is empty).
    """
    print(f"\n===== {dataset_name} =====")
    for split in splits:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=split, random_state=random_state, stratify=y
        )

        # Reset per split and start below any reachable accuracy so the first
        # candidate is always recorded. Starting at 0 left best_preds unbound
        # (or stale from the previous split) when every k scored 0.
        best_acc, best_k, best_preds = -1.0, None, None

        for k in k_values:
            model = CustomKNN(k=k)
            model.fit(X_train, y_train)
            preds = model.predict(X_test)

            acc = accuracy(y_test, preds)
            if acc > best_acc:
                best_acc, best_k, best_preds = acc, k, preds

        if best_preds is None:
            raise ValueError(
                f"no candidate k produced a usable accuracy for test_size={split}"
            )

        # round() instead of int(): float products such as 0.29 * 100 can land
        # just below the intended integer and would be truncated.
        train_pct = round((1 - split) * 100)
        test_pct = round(split * 100)
        print(f"\nBest split {train_pct}:{test_pct}, Best k={best_k}, Accuracy={best_acc:.4f}")
        cm = confusion_matrix(y_test, best_preds)
        precisions, recalls, f1s = precision_recall_f1(y_test, best_preds)

        print("Confusion Matrix:\n", cm)
        print("Precision:", precisions)
        print("Recall:", recalls)
        print("F1-score:", f1s)

        # Reference implementation with the same k, as a sanity check.
        sk_model = KNeighborsClassifier(n_neighbors=best_k)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy(y_test, sk_preds)
        print(f"Sklearn Accuracy={sk_acc:.4f}")

In [17]:
# Run the KNN experiment on the Iris data bundled with scikit-learn.
iris = load_iris()
X_iris = iris.data
y_iris = iris.target
run_experiment(
    X_iris,
    y_iris,
    dataset_name="Iris Dataset",
)


===== Iris Dataset =====

Best split 80:20, Best k=3, Accuracy=1.0000
Confusion Matrix:
     0   1   2
0  10   0   0
1   0  10   0
2   0   0  10
Precision: {np.int64(0): np.float64(1.0), np.int64(1): np.float64(1.0), np.int64(2): np.float64(1.0)}
Recall: {np.int64(0): np.float64(1.0), np.int64(1): np.float64(1.0), np.int64(2): np.float64(1.0)}
F1-score: {np.int64(0): np.float64(1.0), np.int64(1): np.float64(1.0), np.int64(2): np.float64(1.0)}
Sklearn Accuracy=1.0000

Best split 70:30, Best k=5, Accuracy=0.9778
Confusion Matrix:
     0   1   2
0  15   0   0
1   0  15   0
2   0   1  14
Precision: {np.int64(0): np.float64(1.0), np.int64(1): np.float64(0.9375), np.int64(2): np.float64(1.0)}
Recall: {np.int64(0): np.float64(1.0), np.int64(1): np.float64(1.0), np.int64(2): np.float64(0.9333333333333333)}
F1-score: {np.int64(0): np.float64(1.0), np.int64(1): np.float64(0.967741935483871), np.int64(2): np.float64(0.9655172413793104)}
Sklearn Accuracy=0.9778

Best split 60:40, Best k=3, Accura

In [18]:
# Two-category subset of 20 Newsgroups with headers, footers and quoted
# replies removed from each post.
categories = ['sci.space', 'rec.sport.baseball']
news = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# TF-IDF features capped at 1000 terms, converted to a dense array because
# CustomKNN does plain NumPy arithmetic on its inputs.
# NOTE(review): the vectorizer is fitted on the full corpus before
# run_experiment splits it, so test documents influence the features -
# confirm whether that is acceptable for this comparison.
vectorizer = TfidfVectorizer(max_features=1000)
X_news = vectorizer.fit_transform(news.data).toarray()
y_news = news.target

run_experiment(
    X_news,
    y_news,
    dataset_name="News Dataset",
)


===== News Dataset =====

Best split 80:20, Best k=9, Accuracy=0.5995
Confusion Matrix:
      0   1
0  162  37
1  122  76
Precision: {np.int64(0): np.float64(0.5704225352112676), np.int64(1): np.float64(0.672566371681416)}
Recall: {np.int64(0): np.float64(0.8140703517587939), np.int64(1): np.float64(0.3838383838383838)}
F1-score: {np.int64(0): np.float64(0.670807453416149), np.int64(1): np.float64(0.48874598070739556)}
Sklearn Accuracy=0.5617

Best split 70:30, Best k=1, Accuracy=0.6538
Confusion Matrix:
      0    1
0  148  151
1   55  241
Precision: {np.int64(0): np.float64(0.729064039408867), np.int64(1): np.float64(0.6147959183673469)}
Recall: {np.int64(0): np.float64(0.49498327759197325), np.int64(1): np.float64(0.8141891891891891)}
F1-score: {np.int64(0): np.float64(0.5896414342629481), np.int64(1): np.float64(0.7005813953488372)}
Sklearn Accuracy=0.6118

Best split 60:40, Best k=1, Accuracy=0.5712
Confusion Matrix:
      0    1
0  302   96
1  244  151
Precision: {np.int64(0): n