In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def evaluate_knn_with_metrics(X_train, X_test, y_train, y_test, metrics, *, n_neighbors=3):
    """Fit one KNN classifier per distance metric and report test accuracy.

    Args:
        X_train: Training feature matrix passed to ``KNeighborsClassifier.fit``.
        X_test: Test feature matrix passed to ``KNeighborsClassifier.predict``.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``, compared against the predictions.
        metrics: Iterable of ``(metric, p)`` pairs. ``p`` is forwarded to the
            classifier only when ``metric == 'minkowski'`` and ``p`` is not
            None; otherwise the classifier's own default is used.
        n_neighbors: Number of neighbours each classifier votes over.
            Defaults to 3.

    Returns:
        A list of ``(metric, p, accuracy)`` tuples, one per entry in
        ``metrics`` and in the same order.
    """
    results = []
    for metric, p in metrics:
        knn_kwargs = {'n_neighbors': n_neighbors, 'metric': metric}
        # Only the Minkowski metric is given the power parameter; a None
        # value is left out so an explicit p=None never reaches the estimator.
        if metric == 'minkowski' and p is not None:
            knn_kwargs['p'] = p

        neigh = KNeighborsClassifier(**knn_kwargs)
        neigh.fit(X_train, y_train)

        # Score the fitted model on the held-out split.
        y_test_pred = neigh.predict(X_test)
        accuracy = accuracy_score(y_test, y_test_pred)
        results.append((metric, p, accuracy))

    return results

def main(excel_path='/content/Book1.xlsx'):
    """Compare KNN distance metrics on an English-vs-Hindi text classification task.

    Reads the ``ENGLISH`` and ``HINDI`` columns of an Excel workbook, labels
    their entries 0 and 1 respectively, converts all texts to TF-IDF features,
    holds out 30% of the samples (``random_state=42``), and prints the test
    accuracy obtained with each configured distance metric.

    Args:
        excel_path: Path of the workbook to load. Defaults to the location
            the script originally hard-coded.
    """
    df = pd.read_excel(excel_path)

    # Column headers may carry stray leading/trailing spaces.
    df.columns = df.columns.str.strip()

    # Blank cells load as NaN and numeric cells as numbers; TfidfVectorizer
    # accepts only strings, so drop the former and coerce the rest to str.
    english_texts = df['ENGLISH'].dropna().astype(str).tolist()
    hindi_texts = df['HINDI'].dropna().astype(str).tolist()

    # Labels: 0 for English, 1 for Hindi.
    texts = english_texts + hindi_texts
    labels = [0] * len(english_texts) + [1] * len(hindi_texts)

    # Dense TF-IDF features for every text.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts).toarray()

    # Fixed seed keeps the 70/30 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf_matrix, labels, test_size=0.3, random_state=42
    )

    # (metric, p) pairs; p is only meaningful for 'minkowski'.
    metrics = [('euclidean', None), ('manhattan', None), ('minkowski', 1), ('minkowski', 2)]

    results = evaluate_knn_with_metrics(X_train, X_test, y_train, y_test, metrics)

    for metric, p, accuracy in results:
        print(f"Accuracy with {metric} (p={p}): {accuracy:.4f}")

# Run the experiment only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Accuracy with euclidean (p=None): 0.5841
Accuracy with manhattan (p=None): 0.5683
Accuracy with minkowski (p=1): 0.5683
Accuracy with minkowski (p=2): 0.5841
