In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Function to load the dataset
def load_data(file_name):
    """Read an Excel workbook and hand back its two text columns.

    Args:
        file_name: Path of the Excel file, passed straight to
            ``pandas.read_excel``.

    Returns:
        A pair ``(english, hindi)`` holding the ``'ENGLISH'`` and ``'HINDI'``
        columns of the loaded sheet, in that order.
    """
    frame = pd.read_excel(file_name)
    english_column = frame['ENGLISH']
    hindi_column = frame['HINDI']
    return english_column, hindi_column

# Function to split the data into training and testing sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Partition features and labels into training and testing subsets.

    Args:
        X: Feature container to split.
        y: Labels aligned with ``X``.
        test_size: Forwarded to ``train_test_split`` (default ``0.2``).
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible (default ``42``).

    Returns:
        Whatever ``train_test_split`` returns for two inputs; callers in this
        file unpack it as ``X_train, X_test, y_train, y_test``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)

# Function to initialize classifiers
def initialize_classifiers():
    """Build a fresh, unfitted instance of every model under comparison.

    Returns:
        A dict mapping each model's display name to its classifier object,
        in the order the models are listed below.
    """
    named_models = [
        ('Support Vector Machine', SVC(probability=True)),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest', RandomForestClassifier()),
        ('AdaBoost', AdaBoostClassifier(algorithm='SAMME')),
        # verbose=0 keeps CatBoost's per-iteration training log quiet.
        ('CatBoost', CatBoostClassifier(verbose=0)),
        ('XGBoost', XGBClassifier(eval_metric='logloss')),
        ('Naive Bayes', GaussianNB()),
    ]
    return dict(named_models)

# Function to evaluate classifiers and return performance metrics
def _densify(features):
    """Return a dense array for sparse input; pass anything else through."""
    return features.toarray() if hasattr(features, 'toarray') else features


def evaluate_classifiers(classifiers, X_train, X_test, y_train, y_test):
    """Fit every classifier and tabulate its test-set metrics.

    Each classifier is trained on ``(X_train, y_train)`` and scored on
    ``(X_test, y_test)``. Precision, recall and F1 use weighted averaging
    with ``zero_division=1``.

    Some estimators (for example ``GaussianNB``) reject sparse feature
    matrices with a ``TypeError``. When that happens and the features are
    sparse, the estimator is retried on dense copies instead of aborting the
    whole evaluation.

    Args:
        classifiers: Mapping of display name to an estimator exposing
            ``fit`` and ``predict``.
        X_train: Training features (dense or sparse).
        X_test: Test features (dense or sparse).
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A ``pandas.DataFrame`` with one row per classifier and the columns
        ``'Classifier'``, ``'Accuracy (%)'``, ``'Precision (%)'``,
        ``'Recall (%)'`` and ``'F1 Score (%)'``; metric values are scaled
        to percentages.

    Raises:
        TypeError: Re-raised from ``fit``/``predict`` when neither feature
            matrix is sparse, i.e. when densifying cannot be the remedy.
    """
    results = []
    # Dense copies are built at most once and shared by every estimator that
    # needs them; densifying can be memory-hungry for large matrices.
    dense_train = dense_test = None
    has_sparse_input = hasattr(X_train, 'toarray') or hasattr(X_test, 'toarray')

    for name, clf in classifiers.items():
        try:
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
        except TypeError:
            if not has_sparse_input:
                raise
            if dense_train is None:
                dense_train = _densify(X_train)
                dense_test = _densify(X_test)
            # Retry on dense data; any error raised here propagates as-is.
            clf.fit(dense_train, y_train)
            y_pred = clf.predict(dense_test)

        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

        results.append([name, accuracy * 100, precision * 100, recall * 100, f1 * 100])

    return pd.DataFrame(results, columns=['Classifier', 'Accuracy (%)', 'Precision (%)', 'Recall (%)', 'F1 Score (%)'])

# Main function to run the code
def main(file_name='Book1.xlsx', max_features=5000):
    """Load the data, vectorize the text, and report classifier metrics.

    The raw text is split into training and testing parts *before* the
    TF-IDF vectorizer is fitted, so the vocabulary and IDF weights are
    learned from the training documents only and the test documents are
    merely transformed. Fitting on the full corpus would leak test-set
    statistics into training.

    Args:
        file_name: Excel workbook to read; defaults to ``'Book1.xlsx'``.
        max_features: Vocabulary cap passed to ``TfidfVectorizer``;
            defaults to ``5000``.
    """
    english_text, hindi_text = load_data(file_name)

    # Split the raw documents first so the vectorizer never sees test text.
    train_text, test_text, y_train, y_test = split_data(english_text, hindi_text)

    # TF-IDF Vectorization (fit on the training documents only)
    print("Performing TF-IDF transformation...")
    tfidf = TfidfVectorizer(max_features=max_features)
    X_train = tfidf.fit_transform(train_text)
    X_test = tfidf.transform(test_text)

    # Initialize classifiers
    classifiers = initialize_classifiers()

    # Evaluate classifiers and display results
    results_df = evaluate_classifiers(classifiers, X_train, X_test, y_train, y_test)

    # Print the results
    print("Classifier Performance Metrics:")
    print(results_df)


# Run the evaluation only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


Performing TF-IDF transformation...
