<a href="https://colab.research.google.com/github/Tajuddin80/Machine-Learning/blob/main/221002622_CSE312_221D13_LabReport02_knnFromScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import numpy as np

def euclidean_distance(point1, point2):

  distance = 0.0
  for i in range(len(point1)):
    distance += (point1[i] - point2[i])**2
  return np.sqrt(distance)

In [23]:
def get_neighbors(train_data, test_point, k):

  distances = []
  for train_point in train_data:
    dist = euclidean_distance(test_point, train_point)
    distances.append((dist, train_point))

  distances.sort(key=lambda x: x[0])

  neighbors = [distances[i][1] for i in range(k)]
  return neighbors

In [24]:
from collections import Counter

def predict_classification(neighbors):

  classes = [neighbor[-1] for neighbor in neighbors]
  class_counts = Counter(classes)
  predicted_class = max(class_counts, key=class_counts.get)
  return predicted_class

In [25]:
def k_nearest_neighbors(train_data, test_data, k):

  predictions = []
  for test_point in test_data:
    neighbors = get_neighbors(train_data, test_point, k)
    predicted_class = predict_classification(neighbors)
    predictions.append(predicted_class)
  return predictions

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1. Create a sample dataset
# Each inner list is a data point, the last element is the class label.
dataset = [
    [2.7810836, 2.550537003, 'A'],
    [1.465489372, 2.362125076, 'A'],
    [3.396561688, 4.400293529, 'A'],
    [1.38807019, 1.850220317, 'A'],
    [3.06407232, 3.005305971, 'A'],
    [7.627531214, 2.759262235, 'B'],
    [5.332441248, 2.088626775, 'B'],
    [6.922596716, 1.77106367, 'B'],
    [8.675418651, -0.242068655, 'B'],
    [7.673756466, 3.508563011, 'B']
]

# 2. Split the sample dataset into training and testing sets
train_data, test_data = train_test_split(dataset, test_size=0.4, random_state=42)

# Separate features and labels for test data for evaluation
X_test = [point[:-1] for point in test_data]
y_test_actual = [point[-1] for point in test_data]

# 3. Call the k_nearest_neighbors function
k = 3
predictions = k_nearest_neighbors(train_data, X_test, k)

# 4. Compare the predicted class labels with the actual class labels
accuracy = accuracy_score(y_test_actual, predictions)

print(f"Actual class labels: {y_test_actual}")
print(f"Predicted class labels: {predictions}")
print(f"Accuracy: {accuracy}")

Actual class labels: ['B', 'A', 'B', 'A']
Predicted class labels: ['B', 'A', 'B', 'A']
Accuracy: 1.0


In [27]:
def calculate_accuracy(y_true, y_pred):
  """
  Calculates the accuracy of predictions.

  Args:
    y_true: A list of actual labels.
    y_pred: A list of predicted labels.

  Returns:
    The accuracy of the predictions as a float.
  """
  correct_predictions = 0
  for true_label, pred_label in zip(y_true, y_pred):
    if true_label == pred_label:
      correct_predictions += 1
  accuracy = correct_predictions / len(y_true)
  return accuracy

In [28]:
def generate_confusion_matrix(y_true, y_pred):
  """
  Generates a confusion matrix.

  Args:
    y_true: A list of actual labels.
    y_pred: A list of predicted labels.

  Returns:
    A dictionary representing the confusion matrix.
  """
  classes = sorted(list(set(y_true + y_pred)))
  matrix = {true_class: {pred_class: 0 for pred_class in classes} for true_class in classes}

  for true_label, pred_label in zip(y_true, y_pred):
    matrix[true_label][pred_label] += 1

  return matrix

In [29]:
def calculate_precision(confusion_matrix, class_label):
  """
  Calculates the precision for a given class.

  Args:
    confusion_matrix: A dictionary representing the confusion matrix.
    class_label: The class label for which to calculate precision.

  Returns:
    The precision for the class as a float, or 0.0 if the denominator is zero.
  """
  true_positives = confusion_matrix.get(class_label, {}).get(class_label, 0)
  predicted_positives = sum(confusion_matrix.get(other_class, {}).get(class_label, 0) for other_class in confusion_matrix)
  if predicted_positives == 0:
    return 0.0
  return true_positives / predicted_positives

def calculate_recall(confusion_matrix, class_label):
  """
  Calculates the recall for a given class.

  Args:
    confusion_matrix: A dictionary representing the confusion matrix.
    class_label: The class label for which to calculate recall.

  Returns:
    The recall for the class as a float, or 0.0 if the denominator is zero.
  """
  true_positives = confusion_matrix.get(class_label, {}).get(class_label, 0)
  actual_positives = sum(confusion_matrix.get(class_label, {}).get(other_class, 0) for other_class in confusion_matrix.get(class_label, {}))
  if actual_positives == 0:
    return 0.0
  return true_positives / actual_positives

def calculate_f1_score(precision, recall):
  """
  Calculates the F1-score.

  Args:
    precision: The precision value.
    recall: The recall value.

  Returns:
    The F1-score as a float, or 0.0 if the sum of precision and recall is zero.
  """
  if precision + recall == 0:
    return 0.0
  return 2 * (precision * recall) / (precision + recall)

In [30]:
def classification_report(y_true, y_pred):
    """
    Generates a comprehensive classification report including accuracy,
    confusion matrix, precision, recall, and F1-score for each class.

    Args:
        y_true: A list of actual labels.
        y_pred: A list of predicted labels.

    Returns:
        A dictionary containing the classification report.
    """
    report = {}

    # 1. Generate confusion matrix
    confusion_matrix = generate_confusion_matrix(y_true, y_pred)
    report['confusion_matrix'] = confusion_matrix

    # 2. Get unique class labels
    classes = sorted(list(set(y_true + y_pred)))

    # 3. Calculate metrics for each class
    class_metrics = {}
    for class_label in classes:
        precision = calculate_precision(confusion_matrix, class_label)
        recall = calculate_recall(confusion_matrix, class_label)
        f1 = calculate_f1_score(precision, recall)
        class_metrics[class_label] = {
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }
    report['class_metrics'] = class_metrics

    # 6. Calculate overall accuracy
    accuracy = calculate_accuracy(y_true, y_pred)
    report['overall_accuracy'] = accuracy

    return report

# Example usage with the sample dataset from the previous cell (assuming it's available)
# Make sure to have y_test_actual and predictions defined from the previous cell
if 'y_test_actual' in locals() and 'predictions' in locals():
  classification_report_output = classification_report(y_test_actual, predictions)
  print("\nClassification Report:")
  display(classification_report_output)
else:
  print("Please run the previous cell to get y_test_actual and predictions.")



Classification Report:


{'confusion_matrix': {'A': {'A': 2, 'B': 0}, 'B': {'A': 0, 'B': 2}},
 'class_metrics': {'A': {'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0},
  'B': {'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}},
 'overall_accuracy': 1.0}

In [31]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Calculate scikit-learn metrics
sklearn_accuracy = accuracy_score(y_test_actual, predictions)
sklearn_confusion_matrix = confusion_matrix(y_test_actual, predictions)
sklearn_precision = precision_score(y_test_actual, predictions, average=None)
sklearn_recall = recall_score(y_test_actual, predictions, average=None)
sklearn_f1_score = f1_score(y_test_actual, predictions, average=None)

# Print custom metrics
print("Custom Metrics:")
display(classification_report_output)

# Print scikit-learn metrics
print("\nScikit-learn Metrics:")
print(f"Accuracy: {sklearn_accuracy}")
print("Confusion Matrix:")
display(sklearn_confusion_matrix)

# Map class labels to precision, recall, f1-score from sklearn output for easier comparison
classes = sorted(list(set(y_test_actual + predictions)))
sklearn_class_metrics = {}
for i, class_label in enumerate(classes):
    sklearn_class_metrics[class_label] = {
        'precision': sklearn_precision[i],
        'recall': sklearn_recall[i],
        'f1_score': sklearn_f1_score[i]
    }

print("Class Metrics (Precision, Recall, F1-score):")
display(sklearn_class_metrics)

# Compare the results (visual inspection from the output)
print("\nComparison:")
print("Compare the 'Custom Metrics' and 'Scikit-learn Metrics' outputs above to ensure correctness.")

Custom Metrics:


{'confusion_matrix': {'A': {'A': 2, 'B': 0}, 'B': {'A': 0, 'B': 2}},
 'class_metrics': {'A': {'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0},
  'B': {'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}},
 'overall_accuracy': 1.0}


Scikit-learn Metrics:
Accuracy: 1.0
Confusion Matrix:


array([[2, 0],
       [0, 2]])

Class Metrics (Precision, Recall, F1-score):


{'A': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1_score': np.float64(1.0)},
 'B': {'precision': np.float64(1.0),
  'recall': np.float64(1.0),
  'f1_score': np.float64(1.0)}}


Comparison:
Compare the 'Custom Metrics' and 'Scikit-learn Metrics' outputs above to ensure correctness.


In [3]:
import numpy as np
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import math

# -----------------------
# Step 1: KNN Algorithm
# -----------------------
class KNNClassifier:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = np.array(X_train)
        self.y_train = np.array(y_train)

    def predict(self, X_test):
        predictions = [self._predict_point(x) for x in X_test]
        return np.array(predictions)

    def _predict_point(self, x):
        distances = [np.linalg.norm(x - train_x) for train_x in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_indices]
        return Counter(k_nearest_labels).most_common(1)[0][0]

# -----------------------
# Step 2: Custom Metrics
# -----------------------
def custom_metrics(y_true, y_pred):
    labels = np.unique(y_true)
    results = {}

    for label in labels:
        TP = np.sum((y_pred == label) & (y_true == label))
        FP = np.sum((y_pred == label) & (y_true != label))
        FN = np.sum((y_pred != label) & (y_true == label))

        precision = TP / (TP + FP) if TP + FP else 0
        recall = TP / (TP + FN) if TP + FN else 0
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall else 0

        results[label] = {
            'Precision': round(precision, 4),
            'Recall': round(recall, 4),
            'F1 Score': round(f1, 4)
        }

    accuracy = np.mean(y_true == y_pred)
    results['Accuracy'] = round(accuracy, 4)

    return results

# -----------------------
# Step 3: Iris Dataset
# -----------------------
def run_iris_classification():
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    model = KNNClassifier(k=3)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(" Iris Dataset Evaluation:")
    print(custom_metrics(y_test, y_pred))

# -----------------------
# Step 4: News Dataset Example
# -----------------------
def run_news_classification():
    # Sample News Dataset
    data = {
        'title': [
            "Stocks fall amid inflation fears",
            "Champions League: Madrid wins again",
            "Scientists discover new exoplanet",
            "New iPhone released this week",
            "Elections coming up next month"
        ],
        'category': ['business', 'sports', 'science', 'tech', 'politics']
    }

    df = pd.DataFrame(data)

    # Convert text to numerical vectors (simple representation using word counts)
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['title']).toarray()

    label_enc = LabelEncoder()
    y = label_enc.fit_transform(df['category'])

    # Split into train/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

    model = KNNClassifier(k=3)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("\n News Dataset Evaluation:")
    print(custom_metrics(y_test, y_pred))

# -----------------------
# Run Both
# -----------------------
run_iris_classification()
run_news_classification()


 Iris Dataset Evaluation:
{np.int64(0): {'Precision': np.float64(1.0), 'Recall': np.float64(1.0), 'F1 Score': np.float64(1.0)}, np.int64(1): {'Precision': np.float64(1.0), 'Recall': np.float64(1.0), 'F1 Score': np.float64(1.0)}, np.int64(2): {'Precision': np.float64(1.0), 'Recall': np.float64(1.0), 'F1 Score': np.float64(1.0)}, 'Accuracy': np.float64(1.0)}

 News Dataset Evaluation:
{np.int64(1): {'Precision': 0, 'Recall': np.float64(0.0), 'F1 Score': 0}, np.int64(3): {'Precision': 0, 'Recall': np.float64(0.0), 'F1 Score': 0}, 'Accuracy': np.float64(0.0)}


In [4]:
import numpy as np
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# -------------------------
# Custom KNN Implementation
# -------------------------
class CustomKNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = np.array(X_train)
        self.y_train = np.array(y_train)

    def predict(self, X_test):
        return np.array([self._predict_single(x) for x in X_test])

    def _predict_single(self, x):
        distances = [np.linalg.norm(x - train_x) for train_x in self.X_train]
        nearest_indices = np.argsort(distances)[:self.k]
        nearest_labels = self.y_train[nearest_indices]
        return Counter(nearest_labels).most_common(1)[0][0]

# -------------------------
# Evaluation Function
# -------------------------
def evaluate(y_true, y_pred):
    return {
        'accuracy': round(accuracy_score(y_true, y_pred), 4),
        'precision_macro': round(precision_score(y_true, y_pred, average='macro'), 4),
        'recall_macro': round(recall_score(y_true, y_pred, average='macro'), 4),
        'f1_macro': round(f1_score(y_true, y_pred, average='macro'), 4)
    }

# -------------------------
# Test with Various k and Split Ratios
# -------------------------
def compare_knn_models():
    iris = load_iris()
    X, y = iris.data, iris.target

    best_config = None
    best_custom = None
    best_sklearn = None
    best_score = 0

    for test_size in [0.2, 0.3, 0.4]:
        for k in range(1, 11):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

            # Custom KNN
            custom_knn = CustomKNN(k=k)
            custom_knn.fit(X_train, y_train)
            y_pred_custom = custom_knn.predict(X_test)
            custom_scores = evaluate(y_test, y_pred_custom)

            # Sklearn KNN
            sk_knn = KNeighborsClassifier(n_neighbors=k)
            sk_knn.fit(X_train, y_train)
            y_pred_sklearn = sk_knn.predict(X_test)
            sklearn_scores = evaluate(y_test, y_pred_sklearn)

            # Compare by F1 (macro)
            avg_f1 = (custom_scores['f1_macro'] + sklearn_scores['f1_macro']) / 2
            if avg_f1 > best_score:
                best_score = avg_f1
                best_config = {'k': k, 'test_size': test_size}
                best_custom = custom_scores
                best_sklearn = sklearn_scores

    # Final Output
    print(f" Best Config → k = {best_config['k']}, Test Size = {best_config['test_size']}\n")

    print(" Custom KNN Evaluation:")
    for k, v in best_custom.items():
        print(f"{k}: {v}")

    print("\n Scikit-learn KNN Evaluation:")
    for k, v in best_sklearn.items():
        print(f"{k}: {v}")

# Run Comparison
compare_knn_models()


 Best Config → k = 1, Test Size = 0.2

 Custom KNN Evaluation:
accuracy: 1.0
precision_macro: 1.0
recall_macro: 1.0
f1_macro: 1.0

 Scikit-learn KNN Evaluation:
accuracy: 1.0
precision_macro: 1.0
recall_macro: 1.0
f1_macro: 1.0
