In [1]:
!pip install plotly==5.22.0 regex==2024.5.15 requests==2.31.0 sentence-transformers==2.2.2 torch==2.3.0 tqdm==4.66.4 transformers==4.41.0 umap==0.1.1 umap-learn==0.5.6



In [2]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
from sklearn.cluster import KMeans, AgglomerativeClustering
from umap import UMAP
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio
from sklearn.metrics import f1_score, accuracy_score
import logging

# Setup logging: the helper functions below report the exceptions they catch through
# logging.error; this sends records of level ERROR and above to 'error_log.txt'.
logging.basicConfig(filename='error_log.txt', level=logging.ERROR)

def cluster_embeddings(embeddings, method='kmeans', n_clusters=2):
    """Cluster ``embeddings`` and return ``(labels, fitted_clusterer)``.

    Args:
        embeddings: Samples to cluster, passed unchanged to ``fit_predict``.
        method: ``'kmeans'`` or ``'agglomerative'``.
        n_clusters: Number of clusters to form.

    Returns:
        The labels produced by ``fit_predict`` together with the fitted
        clusterer. Any failure (including an unknown ``method``) is logged and
        reported as ``(np.array([]), None)`` instead of being raised.
    """
    try:
        if method not in ('kmeans', 'agglomerative'):
            raise ValueError("Invalid clustering method")

        if method == 'kmeans':
            model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        else:
            model = AgglomerativeClustering(n_clusters=n_clusters)

        assignments = model.fit_predict(embeddings)
    except Exception as e:
        logging.error(f"Error in clustering: {str(e)}")
        return np.array([]), None

    return assignments, model

def flip_labels(predicted_labels, label1, label2):
    """Return a new array in which ``label1`` and ``label2`` are swapped.

    Entries equal to neither label are carried over unchanged. Both masks are
    computed from the original ``predicted_labels``, so the two replacements
    cannot interfere with each other.
    """
    is_first = predicted_labels == label1
    is_second = predicted_labels == label2
    first_replaced = np.where(is_first, label2, predicted_labels)
    return np.where(is_second, label1, first_replaced)

def get_word_embeddings(sentences, words, true_labels, tokenizer, model):
    """Extract one contextual embedding per sentence for its target word.

    All sentences are encoded in a single padded batch. For every
    ``(sentence, word, label)`` triple the word is tokenized on its own and its
    token-id sequence is searched for in the sentence's input ids; the hidden
    states at every matching position (all occurrences) are averaged into one
    vector. Triples whose word is not found are dropped.

    Args:
        sentences: List of sentence strings.
        words: Target word for each sentence (same length as ``sentences``).
        true_labels: Gold label for each sentence (same length as ``sentences``).
        tokenizer: Callable tokenizer that also offers ``tokenize`` and
            ``convert_tokens_to_ids``; its output must expose ``input_ids``.
        model: Callable whose output exposes ``last_hidden_state``.

    Returns:
        ``(embeddings, valid_sentences, valid_words, valid_true_labels)`` where
        ``embeddings`` is a NumPy array with one row per kept sentence. On any
        exception the error is logged and ``(np.array([]), [], [], [])`` is
        returned.
    """
    try:
        inputs = tokenizer(sentences, return_tensors='pt', truncation=True, padding=True)
        # Inference only: without no_grad the forward pass records an autograd graph
        # for the whole batch, although the result is merely detached and read.
        with torch.no_grad():
            outputs = model(**inputs)
        token_embeddings = outputs.last_hidden_state

        # Convert the id tensor to plain lists once instead of slicing the tensor
        # and calling .tolist() for every candidate position.
        all_input_ids = inputs.input_ids.tolist()

        word_embeddings = []
        valid_sentences = []
        valid_words = []
        valid_true_labels = []

        for idx, (sentence, word, label) in enumerate(zip(sentences, words, true_labels)):
            word_token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
            span = len(word_token_ids)
            if span == 0:
                # A word with no tokens can never yield positions; skip it.
                continue

            sentence_ids = all_input_ids[idx]
            word_positions = []
            for start in range(len(sentence_ids) - span + 1):
                if sentence_ids[start:start + span] == word_token_ids:
                    word_positions.extend(range(start, start + span))

            if word_positions:
                word_embedding = token_embeddings[idx, word_positions, :].mean(dim=0).detach().cpu().numpy()
                word_embeddings.append(word_embedding)
                valid_sentences.append(sentence)
                valid_words.append(word)
                valid_true_labels.append(label)

        return np.array(word_embeddings), valid_sentences, valid_words, valid_true_labels
    except Exception as e:
        logging.error(f"Error processing sentences: {str(e)}")
        return np.array([]), [], [], []

def evaluate_clustering(true_labels, predicted_labels):
    """Score predicted labels against the true labels.

    Returns:
        ``(accuracy, weighted_f1)``. If either metric raises, the error is
        logged and ``(0, 0)`` is returned instead.
    """
    try:
        acc = accuracy_score(true_labels, predicted_labels)
        weighted_f1 = f1_score(true_labels, predicted_labels, average='weighted')
        scores = (acc, weighted_f1)
    except Exception as e:
        logging.error(f"Error evaluating clustering: {str(e)}")
        scores = (0, 0)
    return scores

def dimensionality_reduction(embeddings, method='pca', n_components=2, random_state=42):
    """Project ``embeddings`` to ``n_components`` dimensions.

    Args:
        embeddings: Samples to project, passed unchanged to ``fit_transform``.
        method: ``'pca'``, ``'mds'``, ``'umap'`` or ``'tsne'``.
        n_components: Target dimensionality.
        random_state: Seed handed to the reducer so repeated runs give the same
            layout (the clustering in this file is likewise seeded with 42).

    Returns:
        The array produced by ``fit_transform``. Any failure (including an
        unknown ``method``) is logged and reported as an empty ``np.array([])``.
    """
    try:
        if method == 'pca':
            reducer = PCA(n_components=n_components, random_state=random_state)
        elif method == 'mds':
            reducer = MDS(n_components=n_components, random_state=random_state)
        elif method == 'umap':
            reducer = UMAP(n_components=n_components, random_state=random_state)
        elif method == 'tsne':
            # t-SNE requires perplexity < n_samples; clamp the usual default of 30
            # so that small per-word sample sets can still be projected.
            perplexity = min(30.0, max(1.0, len(embeddings) - 1))
            reducer = TSNE(n_components=n_components, perplexity=perplexity, random_state=random_state)
        else:
            raise ValueError("Invalid dimensionality reduction method")
        return reducer.fit_transform(embeddings)
    except Exception as e:
        logging.error(f"Error in dimensionality reduction: {str(e)}")
        return np.array([])

def process_word(word, df, model_name, cluster_method):
    """Embed, cluster and score every occurrence of ``word`` with one model.

    Args:
        word: Target word; the rows of ``df`` whose 'Word' column equals it are used.
        df: DataFrame providing the 'Word', 'Sentence' and 'Sense' columns.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        cluster_method: Method name forwarded to ``cluster_embeddings``.

    Returns:
        ``(embeddings, predicted_labels, valid_sentences, f1, accuracy, misclassified)``,
        or six ``None`` values when no embeddings or no cluster labels could be
        produced. Cluster ids are arbitrary, so both the raw labeling and the one
        with ids 0 and 1 swapped are scored; the labeling with the higher accuracy
        is the one returned, and ``f1``, ``accuracy`` and ``misclassified`` (indices
        where it differs from the true labels) all refer to that same labeling.
    """
    word_rows = df[df['Word'] == word]
    sentences = word_rows['Sentence'].tolist()
    target_words = word_rows['Word'].tolist()
    true_labels = word_rows['Sense'].tolist()

    # NOTE(review): tokenizer and model are reloaded on every call; callers that
    # loop over many words with the same model pay that cost each time.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    embeddings, valid_sentences, _valid_words, valid_true_labels = get_word_embeddings(
        sentences, target_words, true_labels, tokenizer, model
    )

    if embeddings.size == 0:
        return None, None, None, None, None, None

    predicted_labels, _ = cluster_embeddings(
        embeddings, method=cluster_method, n_clusters=len(set(valid_true_labels))
    )
    if predicted_labels.size == 0:
        return None, None, None, None, None, None

    # Score each candidate labeling separately. Taking one max() over the accuracy
    # and F1 values of both labelings would mix the two metrics into a single number.
    # NOTE(review): only ids 0 and 1 are swapped, so this alignment presumably
    # assumes two senses encoded as 0/1 -- confirm for words with more senses.
    candidates = []
    for candidate_labels in (predicted_labels, flip_labels(predicted_labels, 0, 1)):
        candidate_accuracy, candidate_f1 = evaluate_clustering(valid_true_labels, candidate_labels)
        candidates.append((candidate_accuracy, candidate_f1, candidate_labels))

    # max() keeps the first candidate on ties, i.e. the unflipped labeling.
    accuracy, f1, best_labels = max(candidates, key=lambda candidate: candidate[0])

    # Derive misclassifications from the labeling that produced the reported scores.
    misclassified = np.where(np.array(valid_true_labels) != best_labels)[0]

    return embeddings, best_labels, valid_sentences, f1, accuracy, misclassified


def create_plot_grid(df, model_names, cluster_methods, reduce_methods, output_dir='/Users/syamimmanuelpaulbondada/Downloads/html_report'):
    """Build an HTML report of clustering scores and scatter plots per word.

    For every unique value in ``df['Word']`` and every clustering method, each
    model is run through ``process_word``. Its accuracy and F1 go into a results
    table, and its embeddings are reduced with every method in ``reduce_methods``
    and drawn into a subplot grid (rows = models, columns = reduction methods).
    A final table lists the mean accuracy and F1 per model and clustering method.

    Args:
        df: DataFrame with a 'Word' column (plus whatever ``process_word`` reads).
        model_names: Model identifiers, one subplot row each.
        cluster_methods: Clustering method names forwarded to ``process_word``.
        reduce_methods: Reduction method names forwarded to
            ``dimensionality_reduction``, one subplot column each.
        output_dir: Directory that receives 'Accuracies_and_plots.html'; created
            if missing. NOTE(review): the default is a machine-specific path.

    Returns:
        None. The report is written to disk.
    """
    import html  # stdlib; imported locally so this block needs no file-level change

    os.makedirs(output_dir, exist_ok=True)

    # Fragments are collected in a list and joined once; repeatedly extending one
    # string re-copies the (large) embedded plot markup on every append.
    # NOTE(review): the <head> references the 'plotly-latest' CDN bundle, which
    # Plotly reportedly no longer updates; the first figure below embeds its own
    # plotly.js, which is what the figures actually render with.
    report_parts = ["""<html>
<head>
<style>
    body { font-family: Arial, sans-serif; }
    table { width: 50%; border-collapse: collapse; margin-bottom: 20px; }
    th, td { border: 1px solid black; padding: 8px; text-align: left; }
    th { background-color: #f2f2f2; }
    ul { list-style-type: none; padding: 0; }
    ul li { margin: 5px 0; }
</style>
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
</head>
<body>"""]
    overall_results = {model: {method: {'f1_scores': [], 'accuracies': []} for method in cluster_methods} for model in model_names}

    # plotly.js is several megabytes; embed it with the first figure only.
    plotlyjs_embedded = False

    for word in df['Word'].unique():
        # Values taken from the data are escaped before going into raw HTML.
        report_parts.append(f"<h2>Word: {html.escape(str(word))}</h2>")

        for cluster_method in cluster_methods:
            report_parts.append(f"<h3>Clustering Method: {html.escape(str(cluster_method))}</h3>")
            report_parts.append("<table><tr><th>Model</th><th>Accuracy</th><th>F1 Score</th></tr>")

            fig = make_subplots(
                rows=len(model_names),
                cols=len(reduce_methods),
                subplot_titles=[f"{model.split('/')[-1]} - {reduce}" for model in model_names for reduce in reduce_methods],
                vertical_spacing=0.1,
                horizontal_spacing=0.05
            )

            for i, model_name in enumerate(model_names):
                # Unpack the 6 values returned by the process_word function
                embeddings, labels, sentences, f1, accuracy, _misclassified = process_word(word, df, model_name, cluster_method)
                model_cell = html.escape(str(model_name))

                if embeddings is None:
                    # Keep a visible row for models that produced nothing instead
                    # of silently dropping them from the table.
                    report_parts.append(f"<tr><td>{model_cell}</td><td>N/A</td><td>N/A</td></tr>")
                    continue

                if f1 is not None and accuracy is not None:
                    overall_results[model_name][cluster_method]['f1_scores'].append(f1)
                    overall_results[model_name][cluster_method]['accuracies'].append(accuracy)

                hover_texts = [f"Sentence: {sent}" for sent in sentences]

                for j, reduce_method in enumerate(reduce_methods):
                    reduced_embeddings = dimensionality_reduction(embeddings, method=reduce_method)
                    if reduced_embeddings.size == 0:
                        continue

                    scatter = go.Scatter(
                        x=reduced_embeddings[:, 0],
                        y=reduced_embeddings[:, 1],
                        mode='markers',
                        marker=dict(
                            color=labels,
                            colorscale='Viridis',
                            size=10
                        ),
                        text=hover_texts,
                        hoverinfo='text',
                        showlegend=False
                    )

                    fig.add_trace(scatter, row=i + 1, col=j + 1)

                accuracy_str = f"{accuracy:.2f}" if accuracy is not None else "N/A"
                f1_str = f"{f1:.2f}" if f1 is not None else "N/A"
                report_parts.append(f"<tr><td>{model_cell}</td><td>{accuracy_str}</td><td>{f1_str}</td></tr>")

            plot_height_per_row = 700  # Height of each subplot row in pixels
            plot_width_per_col = 700  # Width of each subplot column in pixels
            total_height = plot_height_per_row * len(model_names)
            total_width = plot_width_per_col * len(reduce_methods)

            fig.update_layout(
                height=total_height,
                width=total_width,
                title_text=f"Word: {word} - Clustering: {cluster_method}",
            )

            # Close the table before adding the plot: a <div> directly inside
            # <table> is invalid markup.
            report_parts.append("</table>")
            report_parts.append(pio.to_html(fig, full_html=False, include_plotlyjs=not plotlyjs_embedded))
            plotlyjs_embedded = True

    report_parts.append("<h2>Overall model evaluation</h2>")
    report_parts.append("<table><tr><th>Model</th><th>Clustering Method</th><th>Average Accuracy</th><th>Average F1 Score</th></tr>")
    for model_name, cluster_results in overall_results.items():
        for cluster_method, scores in cluster_results.items():
            avg_accuracy = np.mean(scores['accuracies']) if scores['accuracies'] else 0
            avg_f1 = np.mean(scores['f1_scores']) if scores['f1_scores'] else 0
            report_parts.append(f"<tr><td>{html.escape(str(model_name))}</td><td>{html.escape(str(cluster_method))}</td><td>{avg_accuracy:.2f}</td><td>{avg_f1:.2f}</td></tr>")
    report_parts.append("</table>")

    report_parts.append("""
    <script>
        document.addEventListener('DOMContentLoaded', function() {
            function setPlotHeight(height) {
                var plotDivs = document.querySelectorAll('.plotly-graph-div');
                plotDivs.forEach(function(div) {
                    div.style.height = height + 'px';
                });
            }
            var plotHeightPerRow = 700;  // Increased to match the new height
            var totalHeight = plotHeightPerRow * """ + str(len(model_names)) + """;
            setPlotHeight(totalHeight);
            var plots = document.querySelectorAll('[id^="plotly"]');
            plots.forEach(function(plot) {
                Plotly.relayout(plot, { height: totalHeight });
            });
        });
    </script>
    </body>
    </html>""")

    with open(os.path.join(output_dir, 'Accuracies_and_plots.html'), 'w', encoding='utf-8') as f:
        f.write("".join(report_parts))

In [3]:
# Input data. process_word reads the 'Word', 'Sentence' and 'Sense' columns of this frame.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
csv_path = '/Users/syamimmanuelpaulbondada/Downloads/kooli_words.csv'  
df = pd.read_csv(csv_path)

# Model identifiers; each is passed to AutoTokenizer/AutoModel.from_pretrained
# by process_word and becomes one row of the subplot grid.
model_names = [
    'pierluigic/xl-lexeme',
    'google/muril-base-cased',
    'l3cube-pune/telugu-bert',
    'google/muril-large-cased'
]
# Method names understood by cluster_embeddings and dimensionality_reduction respectively.
cluster_methods = ['kmeans']
reduce_methods = ['umap']

# Builds the report and writes 'Accuracies_and_plots.html' into create_plot_grid's
# default output_dir (no output_dir is passed here).
create_plot_grid(df, model_names, cluster_methods, reduce_methods)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertModel were not initialized from the model checkpoint at l3cube-pune/telugu-bert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of the model checkpoint at google/muril-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.pre