In [17]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def load_and_prepare_data(path='Book1.xlsx', random_state=None):
    """Load the text columns from an Excel workbook and build 0/1 labels.

    Reads the ``ENGLISH`` and ``HINDI`` columns and pairs them with a
    balanced label vector: the first ``n // 2`` entries are 0, the rest
    are 1, and the vector is then shuffled.

    NOTE(review): the labels are generated here rather than read from the
    workbook, and they are shuffled independently of the rows, so they
    carry no information about the text. Any classifier trained on them
    should be expected to score close to chance. Replace this with real
    labels once they are available.

    Args:
        path: Location of the workbook. Defaults to ``'Book1.xlsx'``,
            the file the original code always read.
        random_state: Optional seed for the label shuffle. ``None`` (the
            default) shuffles with NumPy's global random state, exactly
            as before; any other value gives a reproducible shuffle.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a DataFrame holding the two text
        columns and ``y`` is a 1-D integer array with one label per row.

    Raises:
        KeyError: Raised by pandas if either column is missing.
    """
    df = pd.read_excel(path)
    X = df[['ENGLISH', 'HINDI']]
    num_samples = len(X)

    # Built directly from num_samples, so len(y) == len(X) by construction
    # and no separate length check is needed.
    y = np.zeros(num_samples, dtype=int)
    y[num_samples // 2:] = 1

    if random_state is None:
        # Keep honouring a global np.random.seed() set by the caller.
        np.random.shuffle(y)
    else:
        np.random.default_rng(random_state).shuffle(y)

    return X, y

def preprocess_data(X):
    """Convert both text columns to TF-IDF features and concatenate them.

    Each column gets its own freshly fitted ``TfidfVectorizer``, so the
    English and Hindi vocabularies stay separate. The two dense matrices
    are then stacked side by side.

    Cells are coerced to strings first (missing values become ``''``),
    because ``TfidfVectorizer`` raises on NaN or numeric documents, which
    spreadsheets readily produce for blank or number-only cells.

    NOTE(review): the vectorizers are fitted on every row passed in.
    ``main()`` calls this before ``train_test_split``, so the vocabulary
    and IDF weights are learned from the test rows as well.

    Args:
        X: DataFrame with ``ENGLISH`` and ``HINDI`` columns.

    Returns:
        A dense 2-D NumPy array with one row per sample; the English
        TF-IDF columns come first, followed by the Hindi ones.
    """
    feature_blocks = []
    for column in ('ENGLISH', 'HINDI'):
        documents = X[column].fillna('').astype(str)
        tfidf = TfidfVectorizer().fit_transform(documents)
        feature_blocks.append(tfidf.toarray())

    # Column order (English, then Hindi) matches the original layout.
    return np.hstack(feature_blocks)

def find_best_k(X_train, y_train):
    """Use GridSearchCV to find the best k value for KNN.

    The search runs over a ``StandardScaler`` + ``KNeighborsClassifier``
    pipeline, so k is tuned on standardised features, the same
    preprocessing ``classify_and_evaluate`` applies before fitting the
    final model. The scaler is refitted inside every CV fold, so the
    validation fold never influences the scaling statistics.

    Args:
        X_train: Dense 2-D array-like of training features.
        y_train: Training labels, one per row of ``X_train``.

    Returns:
        The ``n_neighbors`` value (between 1 and 20) with the best mean
        5-fold cross-validated accuracy.
    """
    cv_folds = 5

    # A KNN fit fails when n_neighbors exceeds the number of fitted
    # samples, and each CV fit sees only about (cv - 1) / cv of the rows.
    # Cap the candidates so small datasets do not produce failing fits.
    fold_train_size = len(X_train) * (cv_folds - 1) // cv_folds
    max_k = max(1, min(20, fold_train_size))

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('knn', KNeighborsClassifier()),
    ])
    param_grid = {'knn__n_neighbors': list(range(1, max_k + 1))}

    grid_search = GridSearchCV(pipeline, param_grid, cv=cv_folds, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_['knn__n_neighbors']

def classify_and_evaluate(X_train, y_train, X_test, y_test, best_k):
    """Fit a scaled KNN classifier with ``best_k`` and report test accuracy.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix with the same columns as ``X_train``.
        y_test: True labels for ``X_test``.
        best_k: Number of neighbours for the classifier.

    Returns:
        The test-set accuracy. It is also printed, as before; returning it
        lets callers record or compare the score.
    """
    # Fit the scaler on the training split only, then reuse its statistics
    # for the test split so no test information leaks into the scaling.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = KNeighborsClassifier(n_neighbors=best_k)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy with k={best_k}: {accuracy:.2f}")
    return accuracy

def main():
    """Run the whole experiment: load, vectorize, split, tune k, evaluate."""
    texts, labels = load_and_prepare_data()
    features = preprocess_data(texts)

    # Hold out 20% of the rows for testing; the fixed seed keeps the split
    # the same from run to run.
    split = train_test_split(features, labels, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split

    # Tune the neighbour count on the training portion only.
    best_k = find_best_k(X_train, y_train)
    print(f"The best k value is: {best_k}")

    # Score the tuned classifier on the held-out rows.
    classify_and_evaluate(X_train, y_train, X_test, y_test, best_k)

# Run the experiment only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


The best k value is: 11
Accuracy with k=11: 0.51
