In [None]:
!pip install sacrebleu
!pip install bert-score
!pip install unbabel-comet


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [None]:
import pandas as pd
import sacrebleu
import re

# Load your CSV
df = pd.read_csv("/content/drive/MyDrive/ssmt_project/MT_translated.csv")

# Optional cleaning function (you can customize this)
def clean_text(text):
    return re.sub(r'(హుహ్[,.\s]*)+', '', str(text)).strip()

# Apply cleaning
df["actual_te"] = df["actual_te"].apply(clean_text)
df["actual_en_translated"] = df["actual_en_translated"].apply(clean_text)
df["asr_decoded_translated"] = df["asr_decoded_translated"].apply(clean_text)

# References and hypotheses
references = df["actual_te"].tolist()
actual_en_hypotheses = df["actual_en_translated"].tolist()
asr_hypotheses = df["asr_decoded_translated"].tolist()

# BLEU
bleu_actual = sacrebleu.corpus_bleu(actual_en_hypotheses, [references])
bleu_asr = sacrebleu.corpus_bleu(asr_hypotheses, [references])

# ChrF
chrf_metric = sacrebleu.metrics.CHRF()
chrf_actual = chrf_metric.corpus_score(actual_en_hypotheses, [references])
chrf_asr = chrf_metric.corpus_score(asr_hypotheses, [references])

# TER
ter_metric = sacrebleu.metrics.TER()
ter_actual = ter_metric.corpus_score(actual_en_hypotheses, [references])
ter_asr = ter_metric.corpus_score(asr_hypotheses, [references])

# Print results
print("===== Scores for actual_en_translated =====")
print("BLEU:", bleu_actual.score)
print("ChrF:", chrf_actual.score)
print("TER:", ter_actual.score)

print("\n===== Scores for asr_decoded_translated =====")
print("BLEU:", bleu_asr.score)
print("ChrF:", chrf_asr.score)
print("TER:", ter_asr.score)


===== Scores for actual_en_translated =====
BLEU: 9.441641551996806
ChrF: 45.15947215998387
TER: 94.23497267759564

===== Scores for asr_decoded_translated =====
BLEU: 7.420697887792014
ChrF: 43.9219458032578
TER: 93.7568306010929


In [4]:
!pip install sacrebleu


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


In [5]:
import pandas as pd
import sacrebleu
import re
import nltk
from nltk.translate.meteor_score import meteor_score

# Download necessary NLTK data
nltk.download('wordnet')

# ---------- Load & Clean ----------
df = pd.read_csv("/content/drive/MyDrive/ssmt_project/eval_translated.csv")

# Optional text cleaning
def clean_text(text):
    return re.sub(r'(హుహ్[,.\s]*)+', '', str(text)).strip()

for col in ["actual_te", "actual_en_translated", "asr_decoded_translated"]:
    df[col] = df[col].apply(clean_text)

# ---------- Evaluation Inputs ----------
references = df["actual_te"].tolist()
actual_en_hypotheses = df["actual_en_translated"].tolist()
asr_hypotheses = df["asr_decoded_translated"].tolist()

# ---------- BLEU ----------
bleu_actual = sacrebleu.corpus_bleu(actual_en_hypotheses, [references])
bleu_asr = sacrebleu.corpus_bleu(asr_hypotheses, [references])

# ---------- ChrF ----------
chrf_metric = sacrebleu.metrics.CHRF()
chrf_actual = chrf_metric.corpus_score(actual_en_hypotheses, [references])
chrf_asr = chrf_metric.corpus_score(asr_hypotheses, [references])

# ---------- TER ----------
ter_metric = sacrebleu.metrics.TER()
ter_actual = ter_metric.corpus_score(actual_en_hypotheses, [references])
ter_asr = ter_metric.corpus_score(asr_hypotheses, [references])

# ---------- METEOR ----------
def compute_meteor(refs, hyps):
    scores = []
    for ref, hyp in zip(refs, hyps):
        ref_tokens = ref.split()
        hyp_tokens = hyp.split()
        score = meteor_score([ref_tokens], hyp_tokens)
        scores.append(score)
    return sum(scores) / len(scores)

meteor_actual = compute_meteor(references, actual_en_hypotheses)
meteor_asr = compute_meteor(references, asr_hypotheses)

# ---------- Print Results ----------
print("===== Scores for actual_en_translated =====")
print(f"BLEU  : {bleu_actual.score:.2f}")
print(f"ChrF  : {chrf_actual.score:.2f}")
print(f"TER   : {ter_actual.score:.2f}")
print(f"METEOR: {meteor_actual:.4f}")

print("\n===== Scores for asr_decoded_translated =====")
print(f"BLEU  : {bleu_asr.score:.2f}")
print(f"ChrF  : {chrf_asr.score:.2f}")
print(f"TER   : {ter_asr.score:.2f}")
print(f"METEOR: {meteor_asr:.4f}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


===== Scores for actual_en_translated =====
BLEU  : 9.44
ChrF  : 45.16
TER   : 94.23
METEOR: 0.2632

===== Scores for asr_decoded_translated =====
BLEU  : 7.42
ChrF  : 43.92
TER   : 93.76
METEOR: 0.2471


In [6]:
!pip install bert-score

from bert_score import score

# Compute BERTScore for actual_en_translated
P1, R1, F1 = score(actual_en_hypotheses, references, lang="te", verbose=True)
bertscore_actual = F1.mean().item()

# Compute BERTScore for asr_decoded_translated
P2, R2, F2 = score(asr_hypotheses, references, lang="te", verbose=True)
bertscore_asr = F2.mean().item()

# Print Results
print("===== BERTScore =====")
print(f"actual_en_translated     : {bertscore_actual:.4f}")
print(f"asr_decoded_translated   : {bertscore_asr:.4f}")


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/14 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 229.27 seconds, 1.92 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/14 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 245.55 seconds, 1.80 sentences/sec
===== BERTScore =====
actual_en_translated     : 0.8080
asr_decoded_translated   : 0.8044


In [None]:
!pip install jiwer


In [8]:
!pip install unbabel-comet -q

from comet import download_model, load_from_checkpoint
import torch

# Download and load a pretrained COMET model
model_path = download_model("Unbabel/wmt22-comet-da")  # You can use other models too
comet_model = load_from_checkpoint(model_path)

# Prepare data in COMET format
comet_data_actual = [
    {"src": src, "mt": mt, "ref": ref}
    for src, mt, ref in zip(df["actual_en"], df["actual_en_translated"], df["actual_te"])
]

comet_data_asr = [
    {"src": src, "mt": mt, "ref": ref}
    for src, mt, ref in zip(df["actual_en"], df["asr_decoded_translated"], df["actual_te"])
]

# Predict
comet_actual = comet_model.predict(comet_data_actual, batch_size=8, gpus=1 if torch.cuda.is_available() else 0)
comet_asr = comet_model.predict(comet_data_asr, batch_size=8, gpus=1 if torch.cuda.is_available() else 0)

# Print COMET scores
print("===== COMET Score =====")
print(f"actual_en_translated     : {comet_actual['mean_score']:.4f}")
print(f"asr_decoded_translated   : {comet_asr['mean_score']:.4f}")


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Predicting 

===== COMET Score =====





KeyError: 'mean_score'

In [11]:
# Extract the list of scores
scores = comet_actual['scores']

# Calculate the average
average_score = sum(scores) / len(scores)

# Print the average score
print(f"Average COMET Score: {average_score:.4f}")


Average COMET Score: 0.7250


In [12]:
# Extract the list of scores
scores = comet_asr['scores']

# Calculate the average
average_score = sum(scores) / len(scores)

# Print the average score
print(f"Average COMET Score: {average_score:.4f}")


Average COMET Score: 0.7197


In [15]:
!pip install jiwer
import pandas as pd
from jiwer import wer

# Load your dataset
df = pd.read_csv("/content/drive/MyDrive/ssmt_project/eval_translated.csv")

# Calculate WER for each row
df["wer"] = df.apply(lambda row: wer(row["actual_en"], row["asr_decoded"]), axis=1)

# Filter rows where WER > 0 (i.e., errors)
df_with_errors = df[df["wer"] > 0]

# Save only the error rows to a new CSV
df_with_errors.to_csv("rows_with_wer_errors.csv", index=False)




In [17]:
df_with_errors.shape

(339, 8)