In [None]:
# 0. Setup - Run this cell first if you don't have the libraries installed
%pip install nltk rouge-score bert_score sentence-transformers sacrebleu transformers torch accelerate

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting accelerate
  Using cached accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting absl-py (from rouge-score)
  Using cached absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting matplotlib (from bert_score)
  Using cached matplotlib-3.10.3-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting portalocker (from sacrebleu)
  Dow


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# # NLTK downloads (run once in Python console or a separate cell)
import nltk
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True) # Needed for WordNet in some NLTK versions

True

# Import Libs

In [3]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score # Note: NLTK's meteor is single_meteor_score
from rouge_score import rouge_scorer
import sacrebleu # For chrF
from sentence_transformers import SentenceTransformer, util
from bert_score import score as bert_score_calc
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
import re
import pandas as pd

print("Libraries imported successfully!")

  from .autonotebook import tqdm as notebook_tqdm


Libraries imported successfully!


# Define sample sentences

In [4]:
source_sentence = "The cat sat on the mat."

target_sentences = [
    "A feline was resting on the rug.",                            # High similarity
    "The cat was on the mat.",                                   # Very high similarity, slight variation
    "There is a cat on the mat.",                                # High similarity
    "The dog chased the ball.",                                  # Low similarity
    "Weather is pleasant today.",                                # No similarity
    "Le chat est assis sur le tapis."                            # French translation - for some metrics to show 0
]

print("Source Sentence:", source_sentence)
print("Target Sentences:", target_sentences)

Source Sentence: The cat sat on the mat.
Target Sentences: ['A feline was resting on the rug.', 'The cat was on the mat.', 'There is a cat on the mat.', 'The dog chased the ball.', 'Weather is pleasant today.', 'Le chat est assis sur le tapis.']


# Preprocessing Function

In [5]:
def normalize_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip() # Normalize whitespace
    return text

normalized_source = normalize_text(source_sentence)
normalized_targets = [normalize_text(t) for t in target_sentences]


# Lexical Sim. Metrices

In [None]:
# 4.1 BLEU (Bilingual Evaluation Understudy)
# Measures precision of n-grams. Output is 0-1 (higher is better).
# NLTK's sentence_bleu expects tokenized input.
print("\n--- BLEU Scores ---")
smoothie = SmoothingFunction().method1 # Smoothing for short sentences

for target in target_sentences:
    tokenized_source = normalized_source.split() # Using normalized for consistency here
    tokenized_target = normalize_text(target).split()
    bleu_score = sentence_bleu([tokenized_source], tokenized_target, smoothing_function=smoothie)
    print(f"Target: \"{target}\" -> BLEU: {bleu_score:.4f}")


--- BLEU Scores ---
Target: "A feline was resting on the rug." -> BLEU: 0.0699
Target: "The cat was on the mat." -> BLEU: 0.2541
Target: "There is a cat on the mat." -> BLEU: 0.1757
Target: "The dog chased the ball." -> BLEU: 0.0523
Target: "Weather is pleasant today." -> BLEU: 0.0000
Target: "Le chat est assis sur le tapis." -> BLEU: 0.0000


In [None]:
# 4.2 ROUGE (Recall-Oriented Understudy for Gisting Evaluation)
# Measures recall of n-grams. We'll look at ROUGE-L F-score. Output is 0-1.
print("\n--- ROUGE-L F-scores ---")
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

for target in target_sentences:
    # ROUGE scorer expects untokenized strings
    scores = scorer.score(source_sentence, target)
    print(f"Target: \"{target}\" -> ROUGE-L F1: {scores['rougeL'].fmeasure:.4f}")


--- ROUGE-L F-scores ---
Target: "A feline was resting on the rug." -> ROUGE-L F1: 0.3077
Target: "The cat was on the mat." -> ROUGE-L F1: 0.8333
Target: "There is a cat on the mat." -> ROUGE-L F1: 0.6154
Target: "The dog chased the ball." -> ROUGE-L F1: 0.3636
Target: "Weather is pleasant today." -> ROUGE-L F1: 0.0000
Target: "Le chat est assis sur le tapis." -> ROUGE-L F1: 0.0000


In [8]:
# 4.3 METEOR (Metric for Evaluation of Translation with Explicit ORdering)
# Considers synonyms and stemming. Output is 0-1.
# NLTK's single_meteor_score expects untokenized strings.
print("\n--- METEOR Scores ---")
for target in target_sentences:
    # METEOR is sensitive to tokenization; NLTK's implementation handles it.
    # It's generally better for English.
    try:
        # For non-English or very different sentences, METEOR might be low or error if word alignment fails.
        meteor_val = single_meteor_score(source_sentence, target)
    except Exception as e: # Can sometimes have issues with very dissimilar/non-alpha sentences
        meteor_val = 0.0
        print(f"Note: METEOR encountered an issue with '{target}', score set to 0. Error: {e}")
    print(f"Target: \"{target}\" -> METEOR: {meteor_val:.4f}")


--- METEOR Scores ---
Note: METEOR encountered an issue with 'A feline was resting on the rug.', score set to 0. Error: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): A feline was resting on the rug.
Target: "A feline was resting on the rug." -> METEOR: 0.0000
Note: METEOR encountered an issue with 'The cat was on the mat.', score set to 0. Error: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): The cat was on the mat.
Target: "The cat was on the mat." -> METEOR: 0.0000
Note: METEOR encountered an issue with 'There is a cat on the mat.', score set to 0. Error: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): There is a cat on the mat.
Target: "There is a cat on the mat." -> METEOR: 0.0000
Note: METEOR encountered an issue with 'The dog chased the ball.', score set to 0. Error: "hypothesis" expects pre-tokenized hypothesis (Iterable[str]): The dog chased the ball.
Target: "The dog chased the ball." -> METEOR: 0.0000
Note: METEOR encountered a

In [9]:
# 4.4 chrF (character n-gram F-score)
# Good for morphological variations and less sensitive to tokenization issues. Output is 0-100 (higher is better).
print("\n--- chrF Scores ---")
for target in target_sentences:
    # sacrebleu.sentence_chrf expects untokenized strings.
    # It takes the candidate first, then a list of references.
    chrf_score = sacrebleu.sentence_chrf(target, [source_sentence]).score
    print(f"Target: \"{target}\" -> chrF: {chrf_score:.2f}") # Score is typically 0-100


--- chrF Scores ---
Target: "A feline was resting on the rug." -> chrF: 18.01
Target: "The cat was on the mat." -> chrF: 64.69
Target: "There is a cat on the mat." -> chrF: 64.21
Target: "The dog chased the ball." -> chrF: 16.73
Target: "Weather is pleasant today." -> chrF: 17.94
Target: "Le chat est assis sur le tapis." -> chrF: 12.12


# Embedding-based Similarity Metrics

In [10]:
# 5.1 Sentence-BERT (Sentence Transformers)
# Calculates cosine similarity between sentence embeddings. Output is -1 to 1 (higher is better).
print("\n--- Sentence-BERT Similarity ---")
# You can choose different models from huggingface.co/models?library=sentence-transformers
# 'all-MiniLM-L6-v2' is fast and good. 'all-mpnet-base-v2' is more robust.
sbert_model_name = 'all-MiniLM-L6-v2'
try:
    sbert_model = SentenceTransformer(sbert_model_name)
    source_embedding = sbert_model.encode(source_sentence, convert_to_tensor=True)
    for target in target_sentences:
        target_embedding = sbert_model.encode(target, convert_to_tensor=True)
        cosine_similarity = util.pytorch_cos_sim(source_embedding, target_embedding).item()
        print(f"Target: \"{target}\" -> SBERT Cosine Sim: {cosine_similarity:.4f}")
except Exception as e:
    print(f"Error loading SentenceTransformer model {sbert_model_name}: {e}")
    print("Skipping Sentence-BERT.")


--- Sentence-BERT Similarity ---


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Target: "A feline was resting on the rug." -> SBERT Cosine Sim: 0.5631
Target: "The cat was on the mat." -> SBERT Cosine Sim: 0.9177
Target: "There is a cat on the mat." -> SBERT Cosine Sim: 0.9126
Target: "The dog chased the ball." -> SBERT Cosine Sim: 0.1213
Target: "Weather is pleasant today." -> SBERT Cosine Sim: -0.0260
Target: "Le chat est assis sur le tapis." -> SBERT Cosine Sim: 0.0319


In [None]:
# 5.2 BERTScore
# Computes similarity by matching token embeddings from BERT, weighted by IDF.
# Returns Precision, Recall, and F1. We'll use F1. Output is 0-1 (higher is better).
print("\n--- BERTScore F1 ---")
# BERTScore can be slow for many pairs without a GPU.
# It automatically uses a default BERT model (can be specified).
# It expects lists of candidates and references.
try:
    # bert_score_calc returns (P, R, F1) tensors
    # We calculate it one by one for clarity here, but batching is more efficient.
    for target in target_sentences:
        P, R, F1 = bert_score_calc([target], [source_sentence], lang="en", verbose=False, idf=False) # Disable IDF for pure semantic similarity
        # For more robust scores, across domains,  idf=True or idf_sents=[list of sentences for corpus stats]
        print(f"Target: \"{target}\" -> BERTScore F1: {F1.mean():.4f}")
except Exception as e:
    print(f"Error calculating BERTScore: {e}")
    print("Skipping BERTScore. Ensure you have a compatible PyTorch version and transformers.")


--- BERTScore F1 ---


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Target: "A feline was resting on the rug." -> BERTScore F1: 0.9447


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Target: "The cat was on the mat." -> BERTScore F1: 0.9728


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Target: "There is a cat on the mat." -> BERTScore F1: 0.9496


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Target: "The dog chased the ball." -> BERTScore F1: 0.9350


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Target: "Weather is pleasant today." -> BERTScore F1: 0.8659


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Target: "Le chat est assis sur le tapis." -> BERTScore F1: 0.8546


# LLM-based Similarity (Prompt Engineering)

In [12]:
# 6. LLM-based Similarity (FLAN-T5-small)
print("\n--- LLM (FLAN-T5-small) Prompted Similarity ---")
llm_model_name = "google/flan-t5-small" 

try:
    tokenizer_llm = AutoTokenizer.from_pretrained(llm_model_name)
    model_llm = AutoModelForSeq2SeqLM.from_pretrained(llm_model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_llm.to(device)
    print(f"Loaded LLM: {llm_model_name}")

    def get_llm_similarity(sentence1, sentence2):
        prompt = f"""
        Sentence 1: "{sentence1}"
        Sentence 2: "{sentence2}"
        Question: How semantically similar are Sentence 1 and Sentence 2?
        Provide a similarity score from 0.0 (not similar) to 1.0 (identical in meaning).
        Answer (Score only, e.g., 0.75):
        """
        inputs = tokenizer_llm(prompt, return_tensors="pt", max_length=512, truncation=True)
        inputs = inputs.to(device)

        outputs = model_llm.generate(**inputs, max_new_tokens=10) # Generate a short response
        response_text = tokenizer_llm.decode(outputs[0], skip_special_tokens=True)

        # [!] Attempt to parse the score from the response
        match = re.search(r"(\d\.\d+)", response_text)
        if match:
            try:
                return float(match.group(1))
            except ValueError:
                return f"Could not parse float from: {response_text}"
        else:
            return f"Could not find score in: {response_text}"

    for target in target_sentences:
        llm_score = get_llm_similarity(source_sentence, target)
        print(f"Target: \"{target}\" -> LLM Similarity: {llm_score}")
    print("\nNote: LLM-based scores are highly dependent on the prompt and model capabilities.")
    print("The parsing of the score is also heuristic and might need refinement.")

except Exception as e:
    print(f"Error with LLM model {llm_model_name}: {e}")
    print("Skipping LLM-based similarity. Ensure transformers and a model are correctly set up.")


--- LLM (FLAN-T5-small) Prompted Similarity ---


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Loaded LLM: google/flan-t5-small
Target: "A feline was resting on the rug." -> LLM Similarity: 1.0
Target: "The cat was on the mat." -> LLM Similarity: 1.0
Target: "There is a cat on the mat." -> LLM Similarity: 0.0
Target: "The dog chased the ball." -> LLM Similarity: 1.0
Target: "Weather is pleasant today." -> LLM Similarity: 1.0
Target: "Le chat est assis sur le tapis." -> LLM Similarity: 1.0

Note: LLM-based scores are highly dependent on the prompt and model capabilities.
The parsing of the score is also heuristic and might need refinement.


# Results

In [14]:
# 7. Consolidating Results (Fully Functional)

import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score
from rouge_score import rouge_scorer
import sacrebleu
from sentence_transformers import util # Ensure util is imported if sbert_model is used
# bert_score_calc should be imported from bert_score import score as bert_score_calc
# Ensure get_llm_similarity is defined from the previous cell

results_summary = []
smoothie = SmoothingFunction().method1 # For BLEU

# Define a rouge scorer instance once
rouge_l_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

print("Starting result consolidation...\n")

for i, target in enumerate(target_sentences):
    print(f"Processing target {i+1}/{len(target_sentences)}: \"{target[:30]}...\"")
    current_scores = {"Target Sentence": target}
    normalized_target_for_bleu = normalize_text(target).split()
    normalized_source_for_bleu = normalize_text(source_sentence).split()

    # BLEU
    try:
        bleu_val = sentence_bleu([normalized_source_for_bleu], normalized_target_for_bleu, smoothing_function=smoothie)
        current_scores["BLEU"] = f"{bleu_val:.3f}"
    except Exception as e:
        current_scores["BLEU"] = f"Error: {e}"

    # ROUGE-L
    try:
        rouge_scores = rouge_l_scorer.score(source_sentence, target)
        current_scores["ROUGE-L"] = f"{rouge_scores['rougeL'].fmeasure:.3f}"
    except Exception as e:
        current_scores["ROUGE-L"] = f"Error: {e}"

    # METEOR
    try:
        meteor_val = single_meteor_score(source_sentence, target)
        current_scores["METEOR"] = f"{meteor_val:.3f}"
    except Exception as e: # Handles cases like ZeroDivisionError for very dissimilar sentences or non-alpha
        current_scores["METEOR"] = "Error/Low" # Or f"{e}" for specific error

    # chrF
    try:
        chrf_val = sacrebleu.sentence_chrf(target, [source_sentence]).score
        current_scores["chrF"] = f"{chrf_val:.2f}" # chrF score is 0-100
    except Exception as e:
        current_scores["chrF"] = f"Error: {e}"

    # Sentence-BERT
    try:
        # Check if sbert_model was loaded successfully in the previous cell
        if 'sbert_model' in globals() and sbert_model is not None:
            source_emb = sbert_model.encode(source_sentence, convert_to_tensor=True)
            target_emb = sbert_model.encode(target, convert_to_tensor=True)
            sbert_sim = util.pytorch_cos_sim(source_emb, target_emb).item()
            current_scores["SBERT"] = f"{sbert_sim:.3f}"
        else:
            current_scores["SBERT"] = "Skipped (Model not loaded)"
    except Exception as e:
        current_scores["SBERT"] = f"Error: {e}"

    # BERTScore F1
    try:
        # bert_score_calc should be available if imports were successful
        P, R, F1 = bert_score_calc([target], [source_sentence], lang="en", verbose=False, idf=False)
        current_scores["BERTScore F1"] = f"{F1.mean():.3f}"
    except NameError:
        current_scores["BERTScore F1"] = "Skipped (bert_score_calc not found)"
    except Exception as e:
        current_scores["BERTScore F1"] = f"Error: {e}"


    # LLM Similarity (FLAN-T5)
    try:
        # Check if LLM models and function were loaded/defined
        if ('model_llm' in globals() and model_llm is not None and
            'tokenizer_llm' in globals() and tokenizer_llm is not None and
            'get_llm_similarity' in globals()):
            llm_sim = get_llm_similarity(source_sentence, target)
            current_scores["LLM Sim (FLAN)"] = llm_sim # llm_sim might already be a string (score or error message)
        else:
            current_scores["LLM Sim (FLAN)"] = "Skipped (Model/Func not loaded)"
    except Exception as e:
        current_scores["LLM Sim (FLAN)"] = f"Error: {e}"

    results_summary.append(current_scores)

df_results = pd.DataFrame(results_summary)

# Set pandas display options for neat printing
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000) # Adjust width as needed for your display
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3) # For float formatting if not already string

print("\n\n--- Consolidated Results Table ---")
print(df_results.to_string(index=False))

print("\nProcessing complete.")

Starting result consolidation...

Processing target 1/6: "A feline was resting on the ru..."


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing target 2/6: "The cat was on the mat...."


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing target 3/6: "There is a cat on the mat...."


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing target 4/6: "The dog chased the ball...."


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing target 5/6: "Weather is pleasant today...."


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing target 6/6: "Le chat est assis sur le tapis..."


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




--- Consolidated Results Table ---
        Target Sentence           BLEU ROUGE-L   METEOR   chrF SBERT  BERTScore F1  LLM Sim (FLAN)
A feline was resting on the rug. 0.070  0.308  Error/Low 18.01  0.563    0.945           1.0      
         The cat was on the mat. 0.254  0.833  Error/Low 64.69  0.918    0.973           1.0      
      There is a cat on the mat. 0.176  0.615  Error/Low 64.21  0.913    0.950           0.0      
        The dog chased the ball. 0.052  0.364  Error/Low 16.73  0.121    0.935           1.0      
      Weather is pleasant today. 0.000  0.000  Error/Low 17.94 -0.026    0.866           1.0      
 Le chat est assis sur le tapis. 0.000  0.000  Error/Low 12.12  0.032    0.855           1.0      

Processing complete.
