In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report, accuracy_score

def evaluate_knn_classifier(X_train, X_test, y_train, y_test, k, metric, p=None):
    """
    Train a KNN classifier with the given settings and score it on the test set.

    Parameters:
    - X_train (numpy.ndarray): Training features.
    - X_test (numpy.ndarray): Testing features.
    - y_train (list): Training labels.
    - y_test (list): Testing labels.
    - k (int): Number of neighbors.
    - metric (str): Distance metric name passed to KNeighborsClassifier.
    - p (float, optional): Power parameter for the Minkowski distance. Only
      used when metric is 'minkowski'; when omitted (None) it falls back to 2,
      scikit-learn's own default.

    Returns:
    - dict: Performance metrics under the keys 'accuracy', 'auroc',
      'confusion_matrix', and 'classification_report'.
    """
    knn_kwargs = {'n_neighbors': k, 'metric': metric}
    if metric == 'minkowski':
        # None is not a usable Minkowski power, so substitute the library
        # default instead of forwarding it to the estimator.
        knn_kwargs['p'] = 2 if p is None else p
    neigh = KNeighborsClassifier(**knn_kwargs)

    neigh.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in
    # neigh.classes_; this indexing assumes a two-class problem.
    y_test_prob = neigh.predict_proba(X_test)[:, 1]
    y_test_pred = neigh.predict(X_test)

    # AUROC is computed from the probabilities, everything else from the
    # hard label predictions.
    return {
        'accuracy': accuracy_score(y_test, y_test_pred),
        'auroc': roc_auc_score(y_test, y_test_prob),
        'confusion_matrix': confusion_matrix(y_test, y_test_pred),
        'classification_report': classification_report(y_test, y_test_pred)
    }

def _print_results(results):
    """
    Print the metrics dictionary returned by evaluate_knn_classifier.

    Parameters:
    - results (dict): Mapping with the keys 'accuracy', 'auroc',
      'confusion_matrix', and 'classification_report'.
    """
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"AUROC: {results['auroc']:.4f}")
    print("Confusion Matrix:")
    print(results['confusion_matrix'])
    print("Classification Report:")
    print(results['classification_report'])


def main(data_path='/content/Book1.xlsx'):
    """
    Build an English-vs-Hindi text classification task and report KNN results.

    Reads the 'ENGLISH' and 'HINDI' columns from an Excel workbook, labels the
    texts 0 and 1 respectively, vectorizes them with TF-IDF, holds out 30% of
    the samples for testing, and prints evaluation metrics for several
    distance metrics at k=3 and for the package-default settings at k=5.

    Parameters:
    - data_path (str, optional): Path of the Excel workbook to load.
    """
    # Load the dataset from the Excel file
    df = pd.read_excel(data_path)

    # Strip leading/trailing spaces from column names
    df.columns = df.columns.str.strip()

    # Empty cells are read as NaN and non-text cells keep their numeric type;
    # both would break the vectorizer, so drop the former and stringify the rest.
    english_texts = df['ENGLISH'].dropna().astype(str).tolist()
    hindi_texts = df['HINDI'].dropna().astype(str).tolist()

    # Combine the texts and labels (0 for English, 1 for Hindi)
    texts = english_texts + hindi_texts
    labels = [0] * len(english_texts) + [1] * len(hindi_texts)

    # Create a TF-IDF Vectorizer and fit_transform the text data
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts).toarray()

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf_matrix, labels, test_size=0.3, random_state=42
    )

    # Evaluate developed KNN classifier
    k = 3
    metrics = [('euclidean', None), ('manhattan', None), ('minkowski', 1), ('minkowski', 2)]

    print("Performance of Developed KNN Classifier:")
    for metric, p in metrics:
        results = evaluate_knn_classifier(X_train, X_test, y_train, y_test, k, metric, p)
        print(f"\nMetric: {metric} (p={p})")
        _print_results(results)

    # Evaluate package-provided KNN classifier with default settings.
    # KNeighborsClassifier(n_neighbors=5) defaults to metric='minkowski', p=2,
    # so the shared evaluation helper is called with those explicit values.
    print("\nPerformance of Package-Provided KNN Classifier:")
    default_results = evaluate_knn_classifier(
        X_train, X_test, y_train, y_test, 5, 'minkowski', 2
    )
    _print_results(default_results)

# Run the evaluation only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Performance of Developed KNN Classifier:

Metric: euclidean (p=None)
Accuracy: 0.5841
AUROC: 0.6932
Confusion Matrix:
[[163 764]
 [  0 910]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.18      0.30       927
           1       0.54      1.00      0.70       910

    accuracy                           0.58      1837
   macro avg       0.77      0.59      0.50      1837
weighted avg       0.77      0.58      0.50      1837


Metric: manhattan (p=None)
Accuracy: 0.5683
AUROC: 0.6731
Confusion Matrix:
[[134 793]
 [  0 910]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.14      0.25       927
           1       0.53      1.00      0.70       910

    accuracy                           0.57      1837
   macro avg       0.77      0.57      0.47      1837
weighted avg       0.77      0.57      0.47      1837


Metric: minkowski (p=1)
Accuracy: 0.5683
AUROC: 0.6731
Con