In [1]:
import torch
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, Dataset
from sacrebleu import corpus_bleu
import pandas as pd
from comet import download_model, load_from_checkpoint

# Report the library versions and the accelerator visible to this kernel.
print(f"NumPy version: {np.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
# Query the device name only when CUDA is present: get_device_name raises on a
# CPU-only machine, which would abort the notebook in its very first cell.
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
else:
    print("GPU name: none (running on CPU)")


NumPy version: 1.25.2
PyTorch version: 2.5.1+cu118
CUDA available: True
GPU name: NVIDIA A100-PCIE-40GB


In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load the tokenizer and seq2seq model for the checkpoint below, then place
# the model on the selected device.
model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# Smoke test: push a single sentence through tokenize -> generate -> decode.
input_text = "This is a test sentence for translation."
inputs = tokenizer(input_text, return_tensors="pt", padding=True)
inputs = inputs.to(device)
outputs = model.generate(**inputs)
first_sequence = outputs[0]
translated_text = tokenizer.decode(first_sequence, skip_special_tokens=True)

print("Translated Text:", translated_text)

Translated Text: Dies ist ein Testsatz für die Übersetzung.


In [4]:
# Load the "de-en" configuration of the "wmt14" dataset.
dataset = load_dataset("wmt14", "de-en")

# Show which splits exist and what a single training record looks like.
available_splits = dataset.keys()
print("Available Splits:", available_splits)
first_train_example = dataset['train'][0]
print("Sample Data:", first_train_example)

Available Splits: dict_keys(['train', 'validation', 'test'])
Sample Data: {'translation': {'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}}


In [5]:
train_data = dataset['train']

# Walk the training split once, collecting the English and German side of each
# translation pair into two parallel lists.
# NOTE(review): the cells visible here only use the first 100 entries of these
# lists -- consider selecting the subset before materializing the whole split.
english_sentences = []
german_sentences = []
for example in train_data:
    pair = example['translation']
    english_sentences.append(pair['en'])
    german_sentences.append(pair['de'])

print(f"Number of samples: {len(english_sentences)}")

Number of samples: 4508785


In [6]:
# Work with only the first `subset_size` sentence pairs.
subset_size = 100
leading_rows = slice(subset_size)
english_subset = english_sentences[leading_rows]
german_subset = german_sentences[leading_rows]


In [7]:
def translate_batch(sentences, batch_size=16):
    """Translate sentences using the notebook-level `model` and `tokenizer`.

    Sentences are processed in consecutive slices of `batch_size`; each slice
    is tokenized with padding and truncation, passed to `model.generate`, and
    every generated sequence is decoded with special tokens stripped.

    Args:
        sentences: Sequence of source strings to translate.
        batch_size: Number of sentences per generation step; must be >= 1.

    Returns:
        A list with the decoded text of every generated sequence, in input order.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently drop all input.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Send inputs to the device the model lives on rather than a hard-coded
    # "cuda", so the function also works when the model is on the CPU.
    target_device = model.device

    translations = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(target_device)
        outputs = model.generate(**inputs)
        translations.extend(
            tokenizer.decode(output, skip_special_tokens=True) for output in outputs
        )
    return translations

# Translate the English subset using the default batch size.
translated_subset = translate_batch(sentences=english_subset)

In [8]:
# corpus_bleu receives the hypotheses and a list of reference streams; a single
# reference stream (the German subset) is supplied here.
reference_streams = [german_subset]
bleu_score = corpus_bleu(translated_subset, reference_streams)
print(f"BLEU Score: {bleu_score.score}")


BLEU Score: 25.59905338780902


In [9]:
# One row per sentence: the English source, the German reference and the
# model's translation, written out as a CSV without the index column.
results_columns = {
    "source": english_subset,
    "reference": german_subset,
    "translated": translated_subset,
}
results = pd.DataFrame(results_columns)
results.to_csv("translated_subset_results.csv", index=False)
print("Results saved to translated_subset_results.csv")

Results saved to translated_subset_results.csv


In [10]:
# Read the saved translations back from disk and preview the first ten rows.
results = pd.read_csv("translated_subset_results.csv")

print("Sample translations with references:")
print(results.head(10))


Sample translations with refrences:
                                              source  \
0                          Resumption of the session   
1  I declare resumed the session of the European ...   
2  Although, as you will have seen, the dreaded '...   
3  You have requested a debate on this subject in...   
4  In the meantime, I should like to observe a mi...   
5     Please rise, then, for this minute' s silence.   
6  (The House rose and observed a minute' s silence)   
7              Madam President, on a point of order.   
8  You will be aware from the press and televisio...   
9  One of the people assassinated very recently i...   

                                           reference  \
0                 Wiederaufnahme der Sitzungsperiode   
1  Ich erkläre die am Freitag, dem 17. Dezember u...   
2  Wie Sie feststellen konnten, ist der gefürchte...   
3  Im Parlament besteht der Wunsch nach einer Aus...   
4  Heute möchte ich Sie bitten - das ist auch der...   
5  Ich bitt

In [11]:
# Download the COMET checkpoint and load it for scoring.
model_path = download_model("Unbabel/wmt20-comet-da")
comet_model = load_from_checkpoint(model_path)

# One dict per segment with the source ("src"), the machine translation ("mt")
# and the reference ("ref").
data_for_comet = [
    {"src": src, "mt": mt, "ref": ref}
    for src, mt, ref in zip(english_subset, translated_subset, german_subset)
]

# Request a GPU only when one is available so this cell also runs on CPU-only
# machines, matching the CPU fallback used for the translation model.
comet_gpus = 1 if torch.cuda.is_available() else 0
comet_scores = comet_model.predict(data_for_comet, batch_size=8, gpus=comet_gpus)
print("COMET Scores:", comet_scores)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/huggingface/hub/models--Unbabel--wmt20-comet-da/snapshots/4c372befe4d603e6d0363f434248ecad66945607/checkpoints/model.ckpt`
Encoder model frozen.
/opt/conda/lib/python3.11/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA A100-PCIE-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_m

COMET Scores: Prediction([('scores', [1.2063143253326416, 0.712995707988739, 0.23980432748794556, 0.20518940687179565, 0.5901339650154114, 0.7375058531761169, 1.026737093925476, 1.093149185180664, 0.6110635995864868, 0.2614041268825531, 0.5620208978652954, 0.6270941495895386, 0.6581820249557495, 1.093149185180664, 0.6186625957489014, 0.695594310760498, 0.6161750555038452, 0.404382586479187, 0.7712704539299011, 0.22877958416938782, 0.654088020324707, 0.25909215211868286, 0.7377778887748718, 0.38653793931007385, 0.7547405362129211, 0.795854389667511, 0.6559374928474426, 0.6802090406417847, 1.1843135356903076, 0.734921395778656, 0.6526519656181335, 0.560311496257782, 0.5602272152900696, 0.2370995283126831, 0.5649402141571045, 0.6802854537963867, 0.1352175772190094, 0.583405077457428, 0.7225415110588074, 0.20118583738803864, 0.7264267206192017, 0.7179704308509827, 0.6021649241447449, 0.5071976780891418, 0.5306804776191711, 0.22495430707931519, 0.6968742609024048, 0.6516973376274109, 0.6726

In [12]:
# Per-segment table: every sentence triple next to its COMET score.
comet_columns = {
    "source": english_subset,
    "reference": german_subset,
    "translated": translated_subset,
    "comet_score": comet_scores['scores'],
}
comet_results = pd.DataFrame(comet_columns)

# The system-level score is only printed; the CSV holds the per-segment rows.
system_score = comet_scores['system_score']
comet_results.to_csv("comet_results_with_system_score.csv", index=False)
print(f"System COMET Score: {system_score}")


System COMET Score: 0.5830645172856748


In [13]:
# Segments whose COMET score is strictly below this value are kept for annotation.
threshold = 0.5

low_score_data = []
for src, mt, ref, score in zip(english_subset, translated_subset, german_subset, comet_scores['scores']):
    if score < threshold:
        low_score_data.append({
            "source": src,
            "translated": mt,
            "reference": ref,
            "comet_score": score
        })

def _length_based_annotation(translated, reference):
    """Wrap `translated` in <bad> tags with a label chosen by comparing word counts."""
    translated_words = len(translated.split())
    reference_words = len(reference.split())
    if translated_words < reference_words:
        return f"<bad>{translated}</bad> (Under-translation: missing words)"
    if translated_words > reference_words:
        return f"<bad>{translated}</bad> (Over-translation: extra words)"
    return f"<bad>{translated}</bad> (Possible terminology or grammar issues)"


def annotate_errors(data):
    """Attach a heuristic error annotation to each translation record.

    Args:
        data: Iterable of dicts with the keys "source", "translated",
            "reference" and "comet_score".

    Returns:
        A new list of dicts holding the same four fields plus
        "error_annotation": the translation wrapped in <bad>...</bad> followed
        by a label. The label depends only on whether the translation has
        fewer, more, or the same number of whitespace-separated words as the
        reference.
    """
    annotated_data = []
    for row in data:
        source, translated, reference = row["source"], row["translated"], row["reference"]
        annotation = _length_based_annotation(translated, reference)
        annotated_data.append({
            "source": source,
            "translated": translated,
            "reference": reference,
            "comet_score": row["comet_score"],
            "error_annotation": annotation,
        })
    return annotated_data

# Annotate every low-scoring record, load the result into a DataFrame and
# write it to CSV without the index column.
annotated_results = annotate_errors(data=low_score_data)
annotated_results_df = pd.DataFrame(data=annotated_results)
annotated_results_df.to_csv("Annotated_Low_Score_Translations.csv", index=False)
print("Annotated results saved to 'Annotated_Low_Score_Translations.csv'")


Annotated results saved to 'Annotated_Low_Score_Translations.csv'


In [14]:
# Reshape the annotated rows into source/target/reference records.
# NOTE(review): "target" is filled with the error-annotation string (the tagged
# machine translation), not the reference -- confirm this is the intended
# training target.
fine_tuning_data = [
    {"source": row['source'], "target": row['error_annotation'], "reference": row['reference']}
    for _, row in annotated_results_df.iterrows()
]

fine_tuning_df = pd.DataFrame(fine_tuning_data)

fine_tuning_file = "fine_tuning_dataset.csv"
fine_tuning_df.to_csv(fine_tuning_file, index=False)
print(f"Fine-tuning dataset saved to {fine_tuning_file}")


Fine-tuning dataset saved to fine_tuning_dataset.csv
