Link: https://learn.microsoft.com/en-us/python/api/azure-ai-evaluation/azure.ai.evaluation.ungroundedattributesevaluator?view=azure-python

# METEOR using NLTK

In [1]:
import os

In [2]:
reference = "The cat is sitting on the mat"
translation_a = "The feline is sitting on the mat"
translation_b = "Mat the one sitting is cat the"
translation_c = "The kitten is seated on mat"

In [3]:
pip install nltk



In [4]:
import nltk
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [5]:
from nltk.translate.meteor_score import meteor_score

score_1 = meteor_score([reference.split()], translation_a.split())
score_2 = meteor_score([reference.split()], translation_b.split())
score_3 = meteor_score([reference.split()], translation_c.split())


score_1, score_2, score_3

(0.8412698412698414, 0.4285714285714286, 0.6463768115942029)

In [6]:
ref = [
    "The quick brown fox jumps over the lazy dog.",
    "The swift brown fox leaps above the sleepy hound."
]

h1 = "The fast brown fox jumps over the lazy dog."
h2 = "Brown quick the fox jumps over the dog lazy."

ref_tokenized = [r.split() for r in ref]

s1 = meteor_score(ref_tokenized, h1.split())
s2 = meteor_score(ref_tokenized, h2.split())

s1, s2

(0.9993141289437586, 0.7052154195011338)

In [10]:
r1 = "GPT-4 is doing wonders for summarization."
hh = "GPT-4 works really awesome for summary kind of related tasks."

ss = meteor_score([r1.split()], hh.split())
ss

0.15625

In [11]:
pip install azure-ai-evaluation

Collecting azure-ai-evaluation
  Downloading azure_ai_evaluation-1.8.0-py3-none-any.whl.metadata (38 kB)
Collecting promptflow-devkit>=1.17.1 (from azure-ai-evaluation)
  Downloading promptflow_devkit-1.18.1-py3-none-any.whl.metadata (5.7 kB)
Collecting promptflow-core>=1.17.1 (from azure-ai-evaluation)
  Downloading promptflow_core-1.18.1-py3-none-any.whl.metadata (2.7 kB)
Collecting azure-identity>=1.16.0 (from azure-ai-evaluation)
  Downloading azure_identity-1.23.0-py3-none-any.whl.metadata (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-core>=1.30.2 (from azure-ai-evaluation)
  Downloading azure_core-1.34.0-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting azure-storage-blob>=12.10.0 (from azure-ai-evaluation)
  Downloading azure_storage_blob-12.25.1-py3-none-any.whl

# BLEU

In [2]:
import os

from azure.ai.evaluation import evaluate, RelevanceEvaluator, ViolenceEvaluator, BleuScoreEvaluator

# NLP bleu score evaluator
bleu_score_evaluator = BleuScoreEvaluator()
result = bleu_score_evaluator(
    response="Tokyo is the capital of Japan.",
    ground_truth="The capital of Japan is Tokyo."
)

result

[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Unzipping misc/perluniprops.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


{'bleu_score': 0.22961813530951883,
 'bleu_result': 'fail',
 'bleu_threshold': 0.5}

In [4]:
r1 = "GPT-4 is doing wonders for summarization."
hh = "GPT-4 works really awesome for summary kind of related tasks."

In [5]:
result = bleu_score_evaluator(
    response=r1,
    ground_truth=hh
)

result

{'bleu_score': 0.024049818735296713,
 'bleu_result': 'fail',
 'bleu_threshold': 0.5}

# F1 Score

In [6]:
from azure.ai.evaluation import F1ScoreEvaluator

f1_evaluator = F1ScoreEvaluator(threshold=0.6)
result = f1_evaluator(
    response=r1,
    ground_truth=hh
)
result

{'f1_score': 0.25, 'f1_result': 'fail', 'f1_threshold': 0.6}

# GLEU

In [7]:
from azure.ai.evaluation import GleuScoreEvaluator

gleu_evaluator = GleuScoreEvaluator()
result = gleu_evaluator(
    response=r1,
    ground_truth=hh
)
result

{'gleu_score': 0.07894736842105263,
 'gleu_result': 'fail',
 'gleu_threshold': 0.5}

# METEOR

In [8]:
from azure.ai.evaluation import MeteorScoreEvaluator

meteor_evaluator = MeteorScoreEvaluator(alpha=0.8, threshold=0.3)
result = meteor_evaluator(
    response=r1,
    ground_truth=hh
)
result

{'meteor_score': 0.14705882352941177,
 'meteor_result': 'fail',
 'meteor_threshold': 0.3}

# ROUGE

In [9]:
from azure.ai.evaluation import RougeScoreEvaluator, RougeType

rouge_evaluator = RougeScoreEvaluator(
    rouge_type=RougeType.ROUGE_4,
    precision_threshold=0.5,
    recall_threshold=0.5,
    f1_score_threshold=0.5
)
result = rouge_evaluator(
    response=r1,
    ground_truth=hh
)
result

{'rouge_precision': 0.0,
 'rouge_recall': 0.0,
 'rouge_f1_score': 0.0,
 'rouge_precision_result': 'fail',
 'rouge_recall_result': 'fail',
 'rouge_f1_score_result': 'fail',
 'rouge_precision_threshold': 0.5,
 'rouge_recall_threshold': 0.5,
 'rouge_f1_score_threshold': 0.5}

In [10]:
pip install -U sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

# Similarity

In [11]:


from sentence_transformers import SentenceTransformer, util
sentences = ["I'm happy", "I'm full of happiness"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Compute embedding for both lists
embedding_1 = model.encode(r1, convert_to_tensor=True)
embedding_2 = model.encode(hh, convert_to_tensor=True)

util.pytorch_cos_sim(embedding_1, embedding_2)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tensor([[0.7941]])