In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def main():
    # Load the dataset from the Excel file
    df = pd.read_excel('/content/Book1.xlsx')

    # Strip leading/trailing spaces from column names
    df.columns = df.columns.str.strip()

    # Extract texts from ENGLISH and HINDI columns
    english_texts = df['ENGLISH'].tolist()
    hindi_texts = df['HINDI'].tolist()

    # Create labels for the texts (0 for English, 1 for Hindi)
    english_labels = [0] * len(english_texts)
    hindi_labels = [1] * len(hindi_texts)

    # Combine the texts and labels
    texts = english_texts + hindi_texts
    labels = english_labels + hindi_labels

    # Create a TF-IDF Vectorizer and fit_transform the text data
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts).toarray()

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, labels, test_size=0.3, random_state=42)

    # Train a K-Nearest Neighbors classifier
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(X_train, y_train)

    # Test the accuracy of the classifier using the test set
    accuracy = neigh.score(X_test, y_test)
    print(f"Accuracy of K-Nearest Neighbors classifier: {accuracy:.2f}")

    # Predict the classes for all test vectors
    test_predictions = neigh.predict(X_test)
    print("Predictions for the test set:")
    print(test_predictions)

    # Classify a specific test vector (e.g., the first test vector)
    test_vect = X_test[0].reshape(1, -1)  # Reshape to 2D array as predict expects 2D input
    specific_prediction = neigh.predict(test_vect)
    print(f"Classification for the specific test vector: {specific_prediction[0]}")

if __name__ == "__main__":
    main()


Accuracy of K-Nearest Neighbors classifier: 0.58
Predictions for the test set:
[1 1 1 ... 1 1 1]
Classification for the specific test vector: 1
