In [7]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
import warnings

# Function to load the data
def load_data(file_name):
    """Read an Excel workbook and return its two text columns.

    Args:
        file_name: Path (or anything ``pandas.read_excel`` accepts) of the
            workbook to read.

    Returns:
        A ``(english, hindi)`` tuple holding the ``'ENGLISH'`` and
        ``'HINDI'`` columns of the loaded sheet, in that order.

    Raises:
        KeyError: If either column is absent from the sheet.
    """
    frame = pd.read_excel(file_name)
    english_column = frame['ENGLISH']
    hindi_column = frame['HINDI']
    return english_column, hindi_column

# Function to create Perceptron pipeline
def create_perceptron_pipeline():
    """Build an unfitted pipeline: TF-IDF features fed into a Perceptron.

    Both components are created with their default settings. The steps are
    named ``'tfidf'`` and ``'model'``.

    Returns:
        A new ``Pipeline`` instance with the two steps above.
    """
    steps = [
        ('tfidf', TfidfVectorizer()),
        ('model', Perceptron()),
    ]
    return Pipeline(steps)

# Function to create MLP pipeline
def create_mlp_pipeline():
    """Build an unfitted pipeline: TF-IDF features fed into an MLP classifier.

    The steps are named ``'tfidf'`` and ``'model'``.

    Returns:
        A new ``Pipeline`` instance with the two steps above.
    """
    # max_iter is pinned explicitly to 200. NOTE(review): per the scikit-learn
    # docs this is also the library default, so it does not by itself give the
    # optimizer extra iterations to converge.
    steps = [
        ('tfidf', TfidfVectorizer()),
        ('model', MLPClassifier(max_iter=200)),
    ]
    return Pipeline(steps)

# Function to define the hyperparameter grid for Perceptron
def get_perceptron_hyperparameters():
    """Return the candidate hyperparameter values for the Perceptron pipeline.

    Every key carries the ``model__`` prefix. A fresh dictionary is built on
    each call.

    Returns:
        A dict mapping parameter names to lists of candidate values.
    """
    penalties = ['l2', 'elasticnet']
    alphas = [0.0001, 0.001, 0.01]
    iteration_caps = [1000, 2000, 3000]
    return {
        'model__penalty': penalties,
        'model__alpha': alphas,
        'model__max_iter': iteration_caps,
    }

# Function to define the hyperparameter grid for MLP
def get_mlp_hyperparameters():
    """Return the candidate hyperparameter values for the MLP pipeline.

    Every key carries the ``model__`` prefix. A fresh dictionary is built on
    each call.

    Returns:
        A dict mapping parameter names to lists of candidate values.
    """
    search_space = {}
    search_space['model__hidden_layer_sizes'] = [(50,), (100,), (50, 50)]
    search_space['model__activation'] = ['tanh', 'relu']
    search_space['model__solver'] = ['sgd', 'adam']
    search_space['model__alpha'] = [0.0001, 0.001, 0.01]
    search_space['model__learning_rate'] = ['constant', 'adaptive']
    return search_space

# Function to perform RandomizedSearchCV for hyperparameter tuning
def tune_hyperparameters(pipeline, param_grid, X, y, *, n_splits=2, n_iter=5, random_state=42):
    """Run a randomized hyperparameter search and return the best result.

    Rows whose label occurs fewer than ``n_splits`` times are dropped before
    the search, since such a class cannot be represented in every stratified
    fold. With the default ``n_splits=2`` this removes exactly the labels
    that appear once.

    Args:
        pipeline: Estimator (here a scikit-learn pipeline) to tune.
        param_grid: Mapping of parameter names to candidate values, passed to
            ``RandomizedSearchCV`` as ``param_distributions``.
        X: pandas Series of inputs; filtered with a boolean mask derived
            from ``y``, so it must share ``y``'s index.
        y: pandas Series of labels.
        n_splits: Number of stratified folds; also the minimum number of
            samples a label needs in order to be kept.
        n_iter: Number of parameter settings sampled by the search.
        random_state: Seed forwarded to ``RandomizedSearchCV``.

    Returns:
        A ``(best_params, best_score)`` tuple taken from the fitted search.

    Raises:
        ValueError: If no rows remain after removing the rare labels.
    """
    # Drop labels too rare to be stratified across n_splits folds.
    label_counts = y.value_counts()
    frequent_labels = label_counts[label_counts >= n_splits].index
    keep_mask = y.isin(frequent_labels)
    X_filtered = X[keep_mask]
    y_filtered = y[keep_mask]

    # Fail early with a clear message instead of an opaque error from fit().
    if y_filtered.empty:
        raise ValueError(
            f"No samples left after removing labels with fewer than "
            f"{n_splits} occurrences; cannot run cross-validation."
        )

    stratified_kfold = StratifiedKFold(n_splits=n_splits)
    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_grid,
        n_iter=n_iter,
        cv=stratified_kfold,
        n_jobs=-1,
        random_state=random_state,
    )
    random_search.fit(X_filtered, y_filtered)
    return random_search.best_params_, random_search.best_score_

# Main function to run the code
def main(file_name='Book1.xlsx'):
    """Tune the Perceptron and MLP pipelines and print the best results.

    Loads the English/Hindi columns from ``file_name``, runs the randomized
    search for each model in turn (Perceptron first, then MLP), and prints
    the best parameters and best score for each.

    Args:
        file_name: Workbook to load; defaults to ``'Book1.xlsx'``.
    """
    experiments = (
        ("Perceptron", create_perceptron_pipeline, get_perceptron_hyperparameters),
        ("MLP", create_mlp_pipeline, get_mlp_hyperparameters),
    )

    # Silence warnings only for the duration of this run: catch_warnings()
    # restores the previous filters on exit, so later code in the same
    # process or notebook session still sees its warnings.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        english_text, hindi_text = load_data(file_name)

        for label, build_pipeline, build_param_grid in experiments:
            best_params, best_score = tune_hyperparameters(
                build_pipeline(), build_param_grid(), english_text, hindi_text
            )
            print(f"Best {label} Parameters:", best_params)
            print(f"Best {label} Score:", best_score)

# Run the tuning workflow only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Best Perceptron Parameters: {'model__penalty': 'l2', 'model__max_iter': 1000, 'model__alpha': 0.0001}
Best Perceptron Score: 0.7
Best MLP Parameters: {'model__solver': 'sgd', 'model__learning_rate': 'constant', 'model__hidden_layer_sizes': (100,), 'model__alpha': 0.0001, 'model__activation': 'tanh'}
Best MLP Score: 0.03333333333333333
