In [None]:
import shap
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD  # For dimensionality reduction
import matplotlib.pyplot as plt

# Load and split the data (English to Hindi dataset)
def load_data(file_name):
    """Read the English/Hindi parallel corpus from an Excel workbook.

    Args:
        file_name: Workbook location, passed straight to ``pd.read_excel``.

    Returns:
        A ``(english, hindi)`` tuple holding the ``ENGLISH`` and ``HINDI``
        columns of the loaded sheet, in that order.
    """
    sheet = pd.read_excel(file_name)
    english_column = sheet['ENGLISH']
    hindi_column = sheet['HINDI']
    return english_column, hindi_column

def split_data(X, y, test_size=0.2, random_state=42):
    """Partition features and targets into training and testing subsets.

    Args:
        X: Feature matrix.
        y: Targets aligned with the rows of ``X``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        The result of ``train_test_split(X, y, ...)``, which ``main`` unpacks
        as ``X_train, X_test, y_train, y_test``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    return train_test_split(X, y, **split_options)

def train_random_forest(X_train, y_train, n_estimators=10):
    """Fit a random-forest classifier on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Target labels, one per row of ``X_train``.
        n_estimators: Number of trees in the forest; the default of 10 is
            deliberately small (a reduced tree count).

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # ``fit`` returns the estimator itself, so build and train in one step.
    return RandomForestClassifier(n_estimators=n_estimators).fit(X_train, y_train)

# SHAP Explanation and Feature Importance
def explain_model_with_shap(model, X_train, feature_names):
    """Plot a SHAP summary of feature importance for a fitted tree model.

    Args:
        model: Fitted tree-based estimator handed to ``shap.TreeExplainer``
            (a random forest in this file).
        X_train: Feature matrix the SHAP values are computed for.
        feature_names: One label per column of ``X_train``, or ``None`` to
            let SHAP choose default labels.

    Returns:
        None. The only visible effect is the plot drawn by
        ``shap.summary_plot``.
    """
    import warnings

    # A label list whose length differs from the column count cannot describe
    # the columns; as far as SHAP's documentation shows, summary_plot does not
    # validate this itself, so the plot could end up with misleading labels.
    # Fall back to SHAP's default labels instead of plotting wrong ones.
    shape = getattr(X_train, "shape", ())
    if (
        feature_names is not None
        and len(shape) == 2
        and len(feature_names) != shape[1]
    ):
        warnings.warn(
            f"feature_names has {len(feature_names)} entries but X_train has "
            f"{shape[1]} columns; using SHAP's default feature labels instead.",
            stacklevel=2,
        )
        feature_names = None

    # Initialize SHAP explainer for tree-based models (Random Forest in this case)
    explainer = shap.TreeExplainer(model)

    # Calculate SHAP values for the supplied rows
    shap_values = explainer.shap_values(X_train)

    # Plot the summary of feature importance
    shap.summary_plot(shap_values, X_train, feature_names=feature_names)

def main():
    """Run the full pipeline: load, vectorize, reduce, split, train, explain.

    Reads ``Book1.xlsx``, turns the English sentences into TF-IDF vectors,
    reduces them to 100 SVD components, trains a random forest to predict the
    Hindi column, and draws a SHAP summary plot for the trained model.
    Progress is reported with ``print``.
    """
    file_name = "Book1.xlsx"

    print("Loading data...")
    english_text, hindi_text = load_data(file_name)
    print("Data loaded successfully.")

    # Initialize the TF-IDF vectorizer with a max_features limit
    print("Initializing TF-IDF vectorizer...")
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit the number of features
    X_tfidf = tfidf_vectorizer.fit_transform(english_text)
    print("TF-IDF transformation completed.")

    # Apply dimensionality reduction (truncated SVD)
    svd = TruncatedSVD(n_components=100)
    X_reduced = svd.fit_transform(X_tfidf)

    # The model is trained on SVD components, not on TF-IDF terms, so the
    # plot labels must name the components: one label per reduced column.
    # Passing the TF-IDF vocabulary here would attach term names to columns
    # they do not describe.
    feature_names = [f"SVD component {i}" for i in range(X_reduced.shape[1])]

    # Split the data
    print("Splitting data into training and testing sets...")
    X_train, X_test, y_train, _ = split_data(X_reduced, hindi_text)
    print(f"Data split completed. Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}.")

    # Train the RandomForest model
    print("Training Random Forest model...")
    model = train_random_forest(X_train, y_train)
    print("Model training completed.")

    # Explain the model with SHAP and plot feature importances.
    # SHAP produces one value per (row, feature, class), and the classes here
    # are whole Hindi sentences, so explaining every training row is very
    # expensive. Explain a bounded slice instead; the split has already
    # shuffled the rows, so the leading rows act as a random sample.
    print("Explaining the model with SHAP...")
    shap_sample_size = 100
    X_explain = X_train[:shap_sample_size]
    print(f"Computing SHAP values for {X_explain.shape[0]} of {X_train.shape[0]} training rows.")
    explain_model_with_shap(model, X_explain, feature_names)
    print("SHAP explanation completed.")

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


Loading data...
Data loaded successfully.
Initializing TF-IDF vectorizer...
TF-IDF transformation completed.
Splitting data into training and testing sets...
Data split completed. Training set size: 2448, Testing set size: 613.
Training Random Forest model...
Model training completed.
Explaining the model with SHAP...
