In [1]:
import torch
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, Dataset
from sacrebleu import corpus_bleu
import pandas as pd
from comet import download_model, load_from_checkpoint
import os
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

print(f"NumPy version: {np.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU name: {torch.cuda.get_device_name(0)}")


NumPy version: 1.25.2
PyTorch version: 2.5.1+cu118
CUDA available: True
GPU name: Tesla V100-SXM2-32GB


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"  # Replace with your actual token


In [4]:
de_en_dataset = load_dataset("wmt14", "de-en")

hi_en_dataset = load_dataset("cfilt/iitb-english-hindi")

In [5]:
de_en_model_name = "Helsinki-NLP/opus-mt-de-en"
hi_en_model_name = "Helsinki-NLP/opus-mt-hi-en"

In [6]:
login('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')

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [7]:
print("Loading LLaMA 2 model and tokenizer...")
llama_model_name = "meta-llama/Llama-2-7b-hf"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name, padding_side='left')
llama_model = AutoModelForCausalLM.from_pretrained(
    llama_model_name,
    device_map="auto",
    torch_dtype=torch.float16
).to(device)

Loading LLaMA 2 model and tokenizer...


We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_model.config.pad_token_id = llama_tokenizer.eos_token_id

In [9]:
print("Loading German-English model and tokenizer...")
de_en_tokenizer = AutoTokenizer.from_pretrained(de_en_model_name)
de_en_model = AutoModelForSeq2SeqLM.from_pretrained(de_en_model_name).to(device)

Loading German-English model and tokenizer...


In [10]:
print("Loading Hindi-English model and tokenizer...")
hi_en_tokenizer = AutoTokenizer.from_pretrained(hi_en_model_name)
hi_en_model = AutoModelForSeq2SeqLM.from_pretrained(hi_en_model_name).to(device)

Loading Hindi-English model and tokenizer...


In [11]:
print("Processing German-English data...")
de_train_data = de_en_dataset['train']
german_sentences = [example['translation']['de'] for example in de_train_data]
english_de_references = [example['translation']['en'] for example in de_train_data]

Processing German-English data...


In [12]:
print("Loading Hindi-English dataset...")
hi_en_dataset = load_dataset("cfilt/iitb-english-hindi")
print("Dataset structure:", hi_en_dataset['train'].features)
print("Sample data:", hi_en_dataset['train'][0])

Loading Hindi-English dataset...
Dataset structure: {'translation': {'en': Value(dtype='string', id=None), 'hi': Value(dtype='string', id=None)}}
Sample data: {'translation': {'en': 'Give your application an accessibility workout', 'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'}}


In [13]:
print("Processing Hindi-English data...")
hi_train_data = hi_en_dataset['train']
hindi_sentences = [example['translation']['hi'] for example in hi_train_data]
english_hi_references = [example['translation']['en'] for example in hi_train_data]

Processing Hindi-English data...


In [14]:
subset_size = 2000
german_subset = german_sentences[:subset_size]
english_de_subset = english_de_references[:subset_size]
hindi_subset = hindi_sentences[:subset_size]
english_hi_subset = english_hi_references[:subset_size]

In [15]:
def translate_batch(sentences, model, tokenizer, batch_size=16):
    translations = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        outputs = model.generate(**inputs)
        translated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        translations.extend(translated_texts)
    return translations

In [16]:
def translate_batch_llama(sentences, model, tokenizer, batch_size=16):
    translations = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        prompts = [f"Translate the following text to English: {text}\nTranslation: " for text in batch]
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,  
            temperature=None,  
            top_p=None,       
            pad_token_id=tokenizer.eos_token_id
        )
        translated_texts = [
            tokenizer.decode(output[inputs.input_ids.shape[1]:], skip_special_tokens=True)
            for output in outputs
        ]
        translations.extend(translated_texts)
    return translations

In [17]:
print("Translating German to English...")
de_en_translations = translate_batch(german_subset, de_en_model, de_en_tokenizer)
print("Translating Hindi to English...")
hi_en_translations = translate_batch(hindi_subset, hi_en_model, hi_en_tokenizer)

Translating German to English...
Translating Hindi to English...


In [18]:
print("Translating German to English using LLaMA 2")
llama_de_en_translations = translate_batch_llama(german_subset, llama_model, llama_tokenizer)

print("Translating Hindi to English using LLaMA 2")
llama_hi_en_translations = translate_batch_llama(hindi_subset, llama_model, llama_tokenizer)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Translating German to English using LLaMA 2
Translating Hindi to English using LLaMA 2


In [19]:
print("Calculating BLEU scores...")
de_en_bleu = corpus_bleu(de_en_translations, [english_de_subset])
hi_en_bleu = corpus_bleu(hi_en_translations, [english_hi_subset])

Calculating BLEU scores...


In [20]:
print(f"German-English BLEU Score: {de_en_bleu.score}")
print(f"Hindi-English BLEU Score: {hi_en_bleu.score}")

German-English BLEU Score: 30.73252691188548
Hindi-English BLEU Score: 27.678089024616547


In [21]:
print("Calculating BLEU scores for LLaMA 2")
llama_de_en_bleu = corpus_bleu(llama_de_en_translations, [english_de_subset])
llama_hi_en_bleu = corpus_bleu(llama_hi_en_translations, [english_hi_subset])

print(f"LLaMA 2 German-English BLEU Score: {llama_de_en_bleu.score}")
print(f"LLaMA 2 Hindi-English BLEU Score: {llama_hi_en_bleu.score}")

Calculating BLEU scores for LLaMA 2
LLaMA 2 German-English BLEU Score: 5.865690850642541
LLaMA 2 Hindi-English BLEU Score: 0.3542982006686051


In [22]:
de_en_results = pd.DataFrame({
    "source": german_subset,
    "reference": english_de_subset,
    "translated": de_en_translations
})
de_en_results.to_csv("translated_subset_results_de_en.csv", index=False)

In [23]:
hi_en_results = pd.DataFrame({
    "source": hindi_subset,
    "reference": english_hi_subset,
    "translated": hi_en_translations
})
hi_en_results.to_csv("translated_subset_results_hi_en.csv", index=False)

In [24]:
llama_de_en_results = pd.DataFrame({
    "source": german_subset,
    "reference": english_de_subset,
    "translated": llama_de_en_translations
})
llama_de_en_results.to_csv("translated_subset_results_llama_de_en.csv", index=False)

llama_hi_en_results = pd.DataFrame({
    "source": hindi_subset,
    "reference": english_hi_subset,
    "translated": llama_hi_en_translations
})
llama_hi_en_results.to_csv("translated_subset_results_llama_hi_en.csv", index=False)

In [25]:
print("\nSample German-English translations:")
for i in range(3):
    print(f"\nSource (DE): {german_subset[i]}")
    print(f"Translation (EN): {de_en_translations[i]}")
    print(f"Reference (EN): {english_de_subset[i]}")

print("\nSample Hindi-English translations:")
for i in range(3):
    print(f"\nSource (HI): {hindi_subset[i]}")
    print(f"Translation (EN): {hi_en_translations[i]}")
    print(f"Reference (EN): {english_hi_subset[i]}")


Sample German-English translations:

Source (DE): Wiederaufnahme der Sitzungsperiode
Translation (EN): Resumption of the session
Reference (EN): Resumption of the session

Source (DE): Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.
Translation (EN): I declare resumed the session of the European Parliament adjourned on Friday, 17 December, and wish you once again all the best for the turn of the year and hope that you have had a good holiday.
Reference (EN): I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.

Source (DE): Wie Sie feststellen konnten, ist der gefürchtete "Millenium-Bug " nicht eingetreten. Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von

In [26]:
print("\nLoading COMET model...")
model_path = download_model("Unbabel/wmt20-comet-da")
comet_model = load_from_checkpoint(model_path)


Loading COMET model...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/huggingface/hub/models--Unbabel--wmt20-comet-da/snapshots/4c372befe4d603e6d0363f434248ecad66945607/checkpoints/model.ckpt`
Encoder model frozen.
/opt/conda/lib/python3.11/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [27]:
print("Preparing COMET evaluation data...")
de_en_comet_data = [
    {"src": src, "mt": mt, "ref": ref}
    for src, mt, ref in zip(german_subset, de_en_translations, english_de_subset)
]

hi_en_comet_data = [
    {"src": src, "mt": mt, "ref": ref}
    for src, mt, ref in zip(hindi_subset, hi_en_translations, english_hi_subset)
]

Preparing COMET evaluation data...


In [28]:
print("Preparing COMET evaluation data for LLaMA 2...")
llama_de_en_comet_data = [
    {"src": src, "mt": mt, "ref": ref}
    for src, mt, ref in zip(german_subset, llama_de_en_translations, english_de_subset)
]

llama_hi_en_comet_data = [
    {"src": src, "mt": mt, "ref": ref}
    for src, mt, ref in zip(hindi_subset, llama_hi_en_translations, english_hi_subset)
]

Preparing COMET evaluation data for LLaMA 2...


In [29]:
print("Running COMET evaluation for German-English...")
de_en_comet_scores = comet_model.predict(de_en_comet_data, batch_size=8, gpus=1)

print("Running COMET evaluation for Hindi-English...")
hi_en_comet_scores = comet_model.predict(hi_en_comet_data, batch_size=8, gpus=1)

print(f"\nGerman-English System COMET Score: {de_en_comet_scores['system_score']}")
print(f"Hindi-English System COMET Score: {hi_en_comet_scores['system_score']}")

Running COMET evaluation for German-English...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|████████████████████████████████████████████████████████| 250/250 [00:21<00:00, 11.46it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [

Running COMET evaluation for Hindi-English...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|████████████████████████████████████████████████████████| 250/250 [00:17<00:00, 14.52it/s]



German-English System COMET Score: 0.5341249912087805
Hindi-English System COMET Score: 0.4155202876748517


In [30]:
print("Running COMET evaluation for LLaMA 2 German-English")
llama_de_en_comet_scores = comet_model.predict(llama_de_en_comet_data, batch_size=8, gpus=1)

print("Running COMET evaluation for LLaMA 2 Hindi-English")
llama_hi_en_comet_scores = comet_model.predict(llama_hi_en_comet_data, batch_size=8, gpus=1)

print(f"\nLLaMA 2 German-English System COMET Score: {llama_de_en_comet_scores['system_score']}")
print(f"LLaMA 2 Hindi-English System COMET Score: {llama_hi_en_comet_scores['system_score']}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Running COMET evaluation for LLaMA 2 German-English


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|████████████████████████████████████████████████████████| 250/250 [00:31<00:00,  7.95it/s]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Running COMET evaluation for LLaMA 2 Hindi-English


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|████████████████████████████████████████████████████████| 250/250 [00:24<00:00, 10.07it/s]



LLaMA 2 German-English System COMET Score: -0.6147800029972568
LLaMA 2 Hindi-English System COMET Score: -1.5059212676659226


In [31]:
de_en_comet_results = pd.DataFrame({
    "source": german_subset,
    "reference": english_de_subset,
    "translated": de_en_translations,
    "comet_score": de_en_comet_scores['scores']
})
de_en_comet_results.to_csv("comet_results_de_en.csv", index=False)

hi_en_comet_results = pd.DataFrame({
    "source": hindi_subset,
    "reference": english_hi_subset,
    "translated": hi_en_translations,
    "comet_score": hi_en_comet_scores['scores']
})
hi_en_comet_results.to_csv("comet_results_hi_en.csv", index=False)

In [32]:
llama_de_en_comet_results = pd.DataFrame({
    "source": german_subset,
    "reference": english_de_subset,
    "translated": llama_de_en_translations,
    "comet_score": llama_de_en_comet_scores['scores']
})
llama_de_en_comet_results.to_csv("comet_results_llama_de_en.csv", index=False)

llama_hi_en_comet_results = pd.DataFrame({
    "source": hindi_subset,
    "reference": english_hi_subset,
    "translated": llama_hi_en_translations,
    "comet_score": llama_hi_en_comet_scores['scores']
})
llama_hi_en_comet_results.to_csv("comet_results_llama_hi_en.csv", index=False)

In [33]:
llama_de_en_comet_results = pd.DataFrame({
    "source": german_subset,
    "reference": english_de_subset,
    "translated": llama_de_en_translations,
    "comet_score": llama_de_en_comet_scores['scores']
})
llama_de_en_comet_results.to_csv("comet_results_llama_de_en.csv", index=False)

llama_hi_en_comet_results = pd.DataFrame({
    "source": hindi_subset,
    "reference": english_hi_subset,
    "translated": llama_hi_en_translations,
    "comet_score": llama_hi_en_comet_scores['scores']
})
llama_hi_en_comet_results.to_csv("comet_results_llama_hi_en.csv", index=False)

In [34]:
threshold = 0.5

de_en_low_score_data = [
    {
        "source": src,
        "translated": mt,
        "reference": ref,
        "comet_score": score
    }
    for src, mt, ref, score in zip(german_subset, de_en_translations, english_de_subset, de_en_comet_scores['scores'])
    if score < threshold
]

hi_en_low_score_data = [
    {
        "source": src,
        "translated": mt,
        "reference": ref,
        "comet_score": score
    }
    for src, mt, ref, score in zip(hindi_subset, hi_en_translations, english_hi_subset, hi_en_comet_scores['scores'])
    if score < threshold
]

def annotate_errors(data, language_pair):
    annotated_data = []
    for row in data:
        source = row["source"]
        translated = row["translated"]
        reference = row["reference"]

        if len(translated.split()) < len(reference.split()):
            error = f"<bad>{translated}</bad> (Under-translation: missing content)"
        elif len(translated.split()) > len(reference.split()):
            error = f"<bad>{translated}</bad> (Over-translation: extra content)"
        else:
            error = f"<bad>{translated}</bad> (Possible semantic or grammatical issues)"

        annotated_data.append({
            "source": source,
            "translated": translated,
            "reference": reference,
            "comet_score": row["comet_score"],
            "error_annotation": error,
            "language_pair": language_pair
        })
    return annotated_data

print("\nGenerating error analysis...")
de_en_annotated = annotate_errors(de_en_low_score_data, "German-English")
hi_en_annotated = annotate_errors(hi_en_low_score_data, "Hindi-English")

all_annotated_results = pd.DataFrame(de_en_annotated + hi_en_annotated)
all_annotated_results.to_csv("Annotated_Low_Score_Translations_Both_Pairs.csv", index=False)

print("\nSummary Statistics:")
print(f"German-English translations below threshold: {len(de_en_low_score_data)}")
print(f"Hindi-English translations below threshold: {len(hi_en_low_score_data)}")
print(f"Total translations analyzed: {len(german_subset)}")



Generating error analysis...

Summary Statistics:
German-English translations below threshold: 723
Hindi-English translations below threshold: 954
Total translations analyzed: 2000


In [35]:
fine_tuning_data = []
for annot in de_en_annotated + hi_en_annotated:
    fine_tuning_data.append({
        "source": annot['source'],
        "target": annot['error_annotation'],
        "reference": annot['reference'],
        "language_pair": annot['language_pair']
    })

fine_tuning_df = pd.DataFrame(fine_tuning_data)
fine_tuning_df.to_csv("fine_tuning_dataset_both_pairs.csv", index=False)
print("\nFine-tuning dataset saved to 'fine_tuning_dataset_both_pairs.csv'")


Fine-tuning dataset saved to 'fine_tuning_dataset_both_pairs.csv'


In [None]:
#changes made

In [36]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

from huggingface_hub import login
login("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") 

llama_model_name = "meta-llama/Llama-2-7b-hf"  
tokenizer_llama = AutoTokenizer.from_pretrained(llama_model_name)
model_llama = AutoModelForCausalLM.from_pretrained(llama_model_name, device_map="auto")

input_text_llama = "This is a test sentence for translation."

tokens_llama = tokenizer_llama(input_text_llama, return_tensors="pt").to("cuda")

outputs_llama = model_llama.generate(**tokens_llama, max_length=50)

translated_text_llama = tokenizer_llama.decode(outputs_llama[0], skip_special_tokens=True)
print("LLaMA 2 Translated Text:", translated_text_llama)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


LLaMA 2 Translated Text: This is a test sentence for translation.
This is a test sentence for translation. This is a test sentence for translation. This is a test sentence for translation. This is a test sentence for translation. This is a test sentence for translation.


In [37]:
from openai import OpenAI
client = OpenAI(api_key="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")  

print("OpenAI package imported and configured")

OpenAI package imported and configured


In [38]:
def translate_with_gpt(input_text):
    messages = [
        {"role": "system", "content": "Translate the following English text to German. Only provide the translation."},
        {"role": "user", "content": input_text}
    ]
    
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.3
    )
    
    return response.choices[0].message.content.strip()

test_input = "This is a test sentence for translation."
gpt_translated_text = translate_with_gpt(test_input)

print("GPT Translated Text:", gpt_translated_text)

HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to /chat/completions in 0.453463 seconds
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
Retrying request to /chat/completions in 0.816318 seconds
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}