# In [9]:  (Jupyter notebook cell prompt; the code below is the cell body)
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import time

def load_data(file_name):
    """Read an Excel workbook and return its English and Hindi text columns.

    Args:
        file_name: Path of the Excel file handed to ``pandas.read_excel``.

    Returns:
        A 2-tuple ``(english, hindi)`` holding the ``'ENGLISH'`` and
        ``'HINDI'`` columns of the loaded sheet, in that order.

    Raises:
        KeyError: If either column is missing from the sheet.
    """
    sheet = pd.read_excel(file_name)
    english_column = sheet['ENGLISH']
    hindi_column = sheet['HINDI']
    return english_column, hindi_column

def remove_rare_classes(X, y, min_samples=3):
    """Drop every sample whose label occurs fewer than ``min_samples`` times.

    Args:
        X: Feature container that supports boolean-mask indexing and is
            aligned row-for-row with ``y``.
        y: pandas Series of class labels.
        min_samples: Minimum number of occurrences a label needs to be kept.

    Returns:
        A 2-tuple ``(X_kept, y_kept)`` restricted to the surviving rows.
    """
    class_sizes = y.value_counts()
    # Collect the labels that are frequent enough to keep.
    frequent_labels = [
        label for label, size in class_sizes.items() if size >= min_samples
    ]
    keep = y.isin(frequent_labels)
    kept_features = X[keep]
    kept_labels = y[keep]
    return kept_features, kept_labels

def split_data(X, y, test_size=0.3, random_state=42):
    """Produce a stratified train/test partition of ``X`` and ``y``.

    Args:
        X: Feature rows.
        y: Class labels; also used as the stratification target so each
            split keeps the label proportions.
        test_size: Share of the data placed in the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        The result of ``train_test_split``, i.e.
        ``X_train, X_test, y_train, y_test``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    return partitions

def grid_search_cv(X_train, y_train, n_splits=2, random_state=42):
    """Tune a Random Forest with an exhaustive grid search and time the fit.

    Every combination in the 3 x 3 x 3 parameter grid is evaluated with
    stratified k-fold cross-validation.

    Args:
        X_train: Training feature rows.
        y_train: Training class labels.
        n_splits: Number of stratified cross-validation folds.
        random_state: Seed for the Random Forest. Seeding the estimator
            makes the reported score repeatable between runs; pass ``None``
            to get the previous unseeded behaviour.

    Returns:
        A 3-tuple ``(best_params, best_score, elapsed_seconds)`` taken from
        the fitted search's ``best_params_`` and ``best_score_`` plus the
        time spent in ``fit``.
    """
    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [10, 50, 100],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }
    rf = RandomForestClassifier(class_weight='balanced', random_state=random_state)
    cv = StratifiedKFold(n_splits=n_splits)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=0)

    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    grid_search.fit(X_train, y_train)
    elapsed = time.perf_counter() - start_time

    return grid_search.best_params_, grid_search.best_score_, elapsed

def randomized_search_cv(X_train, y_train, n_splits=2, random_state=42):
    """Tune a Random Forest with a randomized search and time the fit.

    Ten parameter settings (``n_iter=10``) are sampled from the candidate
    lists below and evaluated with stratified k-fold cross-validation.

    Args:
        X_train: Training feature rows.
        y_train: Training class labels.
        n_splits: Number of stratified cross-validation folds.
        random_state: Seed used for both the parameter sampling and the
            Random Forest. The default of 42 keeps the sampling seed the
            search already used and additionally seeds the estimator, which
            was previously left unseeded.

    Returns:
        A 3-tuple ``(best_params, best_score, elapsed_seconds)`` taken from
        the fitted search's ``best_params_`` and ``best_score_`` plus the
        time spent in ``fit``.
    """
    # Define the parameter distribution for hyperparameter tuning
    param_dist = {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10]
    }
    rf = RandomForestClassifier(class_weight='balanced', random_state=random_state)
    cv = StratifiedKFold(n_splits=n_splits)
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=10,
        cv=cv,
        n_jobs=-1,
        verbose=0,
        random_state=random_state,
    )

    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    random_search.fit(X_train, y_train)
    elapsed = time.perf_counter() - start_time

    return random_search.best_params_, random_search.best_score_, elapsed

def main():
    """Load the data, build features, and compare the two tuning strategies.

    Steps: read ``Book1.xlsx``, vectorize the English text with TF-IDF,
    reduce it to 100 components with truncated SVD, drop labels with fewer
    than 3 samples, make a stratified 70/30 split, then run the grid and
    randomized searches on the training portion and print each one's best
    cross-validation score and fit time.
    """
    file_name = "Book1.xlsx"

    english_text, hindi_text = load_data(file_name)

    # NOTE(review): the vectorizer and SVD are fitted on all rows before the
    # train/test split (and before the CV folds are formed), so the reported
    # cross-validation scores have seen held-out text. Consider moving these
    # steps into a Pipeline that is fitted per fold.
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_tfidf = tfidf_vectorizer.fit_transform(english_text)

    # Seed the SVD so the reduced features are the same on every run;
    # otherwise the two searches are compared on features that change
    # between executions.
    svd = TruncatedSVD(n_components=100, random_state=42)
    X_reduced = svd.fit_transform(X_tfidf)

    X_reduced, hindi_text = remove_rare_classes(X_reduced, hindi_text, min_samples=3)
    # The test split is not used below; only the training portion is tuned on.
    X_train, _X_test, y_train, _y_test = split_data(X_reduced, hindi_text, test_size=0.3)

    # Run GridSearchCV and capture its best score and fit time
    # (the best parameters are returned as well but are not reported here)
    _grid_params, grid_score, grid_time = grid_search_cv(X_train, y_train, n_splits=2)
    # Run RandomizedSearchCV and capture its best score and fit time
    _random_params, random_score, random_time = randomized_search_cv(X_train, y_train, n_splits=2)

    # Print performance comparison results
    print("\nPerformance Comparison:")
    print(f"GridSearchCV Best Score: {grid_score:.4f}, Time Taken: {grid_time:.4f} seconds")
    print(f"RandomizedSearchCV Best Score: {random_score:.4f}, Time Taken: {random_time:.4f} seconds")

# Run the tuning comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



# Output recorded from one run of the cell above:
# Performance Comparison:
# GridSearchCV Best Score: 0.8944, Time Taken: 5.7969 seconds
# RandomizedSearchCV Best Score: 0.9444, Time Taken: 2.4508 seconds
