In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

def main():
    # Load the dataset from the Excel file
    file_path = "C:/Users/year3/Downloads/Book1.xlsx"  # Update this path as needed
    df = pd.read_excel(file_path)
    
    # Strip leading/trailing spaces from column names
    df.columns = df.columns.str.strip()
    
    # Extract texts from ENGLISH and HINDI columns
    english_texts = df['ENGLISH'].tolist()
    hindi_texts = df['HINDI'].tolist()
    
    # Create labels for the texts (0 for English, 1 for Hindi)
    english_labels = [0] * len(english_texts)
    hindi_labels = [1] * len(hindi_texts)
    
    # Combine the texts and labels
    texts = english_texts + hindi_texts
    labels = english_labels + hindi_labels
    
    # Create a TF-IDF Vectorizer and fit_transform the text data
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts).toarray()
    
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, labels, test_size=0.3, random_state=42)
    
    # Train a K-Nearest Neighbors classifier with k=3
    k = 3
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train)
    
    # Predict on both training and test sets
    y_train_pred = neigh.predict(X_train)
    y_test_pred = neigh.predict(X_test)
    
    # Generate confusion matrices
    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)
    
    # Print confusion matrices
    print("Confusion Matrix for Training Set:")
    print(cm_train)
    print("Confusion Matrix for Test Set:")
    print(cm_test)
    
    # Print classification reports
    print("\nClassification Report for Training Set:")
    print(classification_report(y_train, y_train_pred))
    
    print("\nClassification Report for Test Set:")
    print(classification_report(y_test, y_test_pred))
    
    # Infer model performance
    train_accuracy = neigh.score(X_train, y_train)
    test_accuracy = neigh.score(X_test, y_test)
    
    print(f"\nTraining Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    if train_accuracy > test_accuracy:
        print("The model might be overfitting.")
    elif train_accuracy < test_accuracy:
        print("The model might be underfitting.")
    else:
        print("The model appears to be well-fitted.")

if __name__ == "__main__":
    main()


Confusion Matrix for Training Set:
[[ 797 1337]
 [   0 2151]]
Confusion Matrix for Test Set:
[[163 764]
 [  0 910]]

Classification Report for Training Set:
              precision    recall  f1-score   support

           0       1.00      0.37      0.54      2134
           1       0.62      1.00      0.76      2151

    accuracy                           0.69      4285
   macro avg       0.81      0.69      0.65      4285
weighted avg       0.81      0.69      0.65      4285


Classification Report for Test Set:
              precision    recall  f1-score   support

           0       1.00      0.18      0.30       927
           1       0.54      1.00      0.70       910

    accuracy                           0.58      1837
   macro avg       0.77      0.59      0.50      1837
weighted avg       0.77      0.58      0.50      1837


Training Accuracy: 0.6880
Test Accuracy: 0.5841
The model might be overfitting.
