# Evaluation of Topic Modelling

Please install the relevant packages for the evaluation: 
- sentence_transformers
- umap
- hdbscan
- sklearn
- bertopic
- datasets
- octis

In [1]:
from sentence_transformers import SentenceTransformer
from umap.umap_ import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
umap_model = UMAP(n_neighbors=3, n_components=3, min_dist=0.05)
hdbscan_model = HDBSCAN(min_cluster_size=80, min_samples=40,
                        gen_min_span_tree=True,
                        prediction_data=True)
vectorizer_model = CountVectorizer(ngram_range=(1, 2))
model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    top_n_words=5,
    min_topic_size=10,
    language='english',
    verbose=True
)

In [4]:
# Loading a sample test set from the Hugging Face's dataset repository
from datasets import load_dataset
hf_dataset = 'HHousen/ParaSCI'
dataset = load_dataset(hf_dataset, split="test")

# Extracting the 'sentence1' field from each data entry to create a list of documents/sentences
docs = [data['sentence1'] for data in list(dataset)]

# Applying the BERTopic model on the sample list of documents
# This will return two lists:
# - 'topics': a list where each entry is the topic assigned to the corresponding document
# - 'probs': a list of probabilities associated with each topic assignment
topics, probs = model.fit_transform(docs)

Repo card metadata block was not found. Setting CardData to empty.
Batches: 100%|████████████████████████████████| 153/153 [00:12<00:00, 12.15it/s]
2023-09-25 00:28:58,529 - BERTopic - Transformed documents to Embeddings
2023-09-25 00:29:06,721 - BERTopic - Reduced dimensionality
2023-09-25 00:29:06,813 - BERTopic - Clustered reduced embeddings


In [5]:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

# Retrieving the vectorizer model from BERTopic and building an analyzer 
# The analyzer will be a function to split the documents into tokens (words or n-grams)
vectorizer = model.vectorizer_model
analyzer = vectorizer.build_analyzer()
tokens = [analyzer(doc) for doc in docs]

def get_metrics(topk=5):
    """Prepare evaluation measures using OCTIS"""
    
    # Initializing Coherence measure with the "c_npmi" (normalized pointwise mutual information) metric
    # This metric evaluates the semantic similarity between the top words in each topic
    # This metric represents Topic Coherence
    npmi = Coherence(texts=tokens, topk=topk, measure="c_npmi")
    
    # Initializing TopicDiversity measure
    # This metric evaluates how diverse the top words are across different topics
    topic_diversity = TopicDiversity(topk=topk)

    # Grouping the metrics into categories for clarity
    coherence = [(npmi, "npmi")]
    diversity = [(topic_diversity, "diversity")]

    # Combining the metrics into a list
    metrics = [(coherence, "Coherence"), (diversity, "Diversity")]

    return metrics

In [6]:
# Using the function defined in the previous block to get evaluation metrics
metrics = get_metrics()

# Retrieving the top 10 words for each topic identified by BERTopic
# Each topic is represented as a list of words
bertopic_topics = [
    [
        vals[0]  # Get the word from the tuple (word, weight)
        for vals in model.get_topic(i)[:10]  # Retrieve top 10 words for topic 'i'
    ]
    for i in range(len(set(topics)) - 1)  # Loop through all unique topics
]

# Organizing the retrieved topics into a dictionary format for further processing or evaluation
output_tm = {"topics": bertopic_topics}

In [7]:
# Initializing an empty dictionary to store the evaluation results
results = {}

# Looping through the metrics to compute the scores for the extracted topics
for scorers, _ in metrics:
    for scorer, name in scorers:
        # Using the scorer to evaluate the topics and storing the score
        score = scorer.score(output_tm)
        results[name] = float(score)  # Converting the score to float for consistent formatting

# Printing the evaluation results, which will be used as Topic Coherence and Topic Diversity respectively
print("Results")
print("============")
for metric, score in results.items():
    print(f"{metric}: {str(score)}")
print(" ")  # Adding a space for cleaner output

Results
npmi: 0.11711191329782415
diversity: 0.4666666666666667
 


# Evaluation of Paraphrasing

Please install the relevant packages for the evaluation: 
- datasets
- transformers
- rouge
- sacrebleu

In [8]:
# Loading a sample test set from the Hugging Face's dataset repository for the evaluation of paraphrasing
from datasets import load_dataset
hf_dataset = 'HHousen/ParaSCI'
dataset = load_dataset(hf_dataset, split="test")

Repo card metadata block was not found. Setting CardData to empty.


In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rouge import Rouge 
from sacrebleu.metrics import BLEU

# Initializing the ROUGE and BLEU metrics for evaluation
rouge = Rouge()
bleu = BLEU()

def paraphrase(sent, en_lan, lan_en):
    """Paraphrase a sentence by translating it to another language and then back to the original language."""
    
    # Forward translation: English to the target language (e.g., French, German, etc.)
    
    # Initializing the tokenizer and model for forward translation
    forward_tokenizer = AutoTokenizer.from_pretrained(en_lan)
    forward_model = AutoModelForSeq2SeqLM.from_pretrained(en_lan)
    
    # Encoding the sentence to input IDs and generating the translation
    input_ids = forward_tokenizer.encode(sent, return_tensors="pt")
    forward_outputs = forward_model.generate(input_ids)
    
    # Decoding the output IDs to get the translated sentence
    forward_decoded = forward_tokenizer.decode(forward_outputs[0], skip_special_tokens=True)
    
    # Backward translation: Target language back to English
    
    # Initializing the tokenizer and model for backward translation
    back_tokenizer = AutoTokenizer.from_pretrained(lan_en)
    back_model = AutoModelForSeq2SeqLM.from_pretrained(lan_en)
    
    # Encoding the translated sentence to input IDs and generating the paraphrased version
    back_input_ids = back_tokenizer.encode(forward_decoded, return_tensors="pt")
    back_outputs = back_model.generate(back_input_ids)
    
    # Decoding the output IDs to get the paraphrased sentence
    back_decoded = back_tokenizer.decode(back_outputs[0], skip_special_tokens=True)

    return back_decoded

In [10]:
# Prefix commonly used for the Helsinki-NLP models in the Hugging Face model repository
prefix = "Helsinki-NLP/opus-mt-"

# Setting up model paths for different language pairs

# English-German and reverse
f1 = 'facebook/wmt19-en-de'  # Facebook's WMT English-German model
f2 = 'Helsinki-NLP/opus-mt-gem-en'  # Helsinki-NLP's model for English-German

# English-Chinese (Mandarin) and reverse
zh1 = prefix + 'zh-en'  # Chinese to English
zh2 = prefix + 'en-zh'  # English to Chinese

# English-German and reverse
ge1 = prefix + 'de-en'  # German to English
ge2 = prefix + 'en-de'  # English to German

# English-French and reverse
fr1 = prefix + 'fr-en'  # French to English
fr2 = prefix + 'en-fr'  # English to French

# English-Russian and reverse
ru1 = prefix + 'ru-en'  # Russian to English
ru2 = prefix + 'en-ru'  # English to Russian

# English-Arabic and reverse
ar1 = prefix + 'ar-en'  # Arabic to English
ar2 = prefix + 'en-ar'  # English to Arabic

# English-Japanese and reverse
ja1 = prefix + 'jap-en'  # Japanese to English
ja2 = prefix + 'en-jap'  # English to Japanese

In [11]:
# Initialize empty lists to store reference paraphrases and generated paraphrases
refs, paras = [], []

# Looping through a subset of the dataset to generate paraphrases
for data in list(dataset):
    # Extract the sentence that needs to be paraphrased
    to_para = data['sentence1']
    
    # Default setting: Generate the paraphrase by translating to Chinese (zh) and then translating back to English (en)
    paraed = paraphrase(to_para, zh1, zh2)
    
    # Extract the reference paraphrase from the dataset
    ref = data['sentence2']
    
    # Append the reference and generated paraphrase to the lists
    refs.append(ref)
    paras.append(paraed)

In [12]:
# Compute the corpus-level BLEU score for the generated paraphrases against the reference paraphrases
bleu_score = bleu.corpus_score(paras, [refs])

# Display the BLEU score
print(f"Corpus BLEU Score: {bleu_score.score}")

Corpus BLEU Score: 0.34391269612522707


In [13]:
# Compute the average ROUGE scores for the generated paraphrases against the reference paraphrases
rouge_scores = rouge.get_scores(paras, refs, avg=True)

# Display the ROUGE scores
print(f"Average ROUGE Scores: {rouge_scores}")

Average ROUGE Scores: {'rouge-1': {'r': 0.00200677409268527, 'p': 0.006522342934275908, 'f': 0.00293086131506436}, 'rouge-2': {'r': 0.00030902965492835157, 'p': 0.0009451539776787983, 'f': 0.0004490302780307705}, 'rouge-l': {'r': 0.0019690569260281243, 'p': 0.0064420697134223845, 'f': 0.0028797348814809014}}
