In [None]:
# Install required packages
!pip install sacrebleu evaluate

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Down

In [None]:
import pandas as pd
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load the fine-tuned model and its tokenizer from the same checkpoint directory.
model_path = "/content/drive/MyDrive/ssmt_project/Models/mbart-en-te-final"
model = MBartForConditionalGeneration.from_pretrained(model_path)
tokenizer = MBart50TokenizerFast.from_pretrained(model_path)

# Set language codes: "en_XX" is the source language tag, "te_IN" the target.
tokenizer.src_lang = "en_XX"
target_lang_id = tokenizer.lang_code_to_id["te_IN"]
model.config.forced_bos_token_id = target_lang_id
# generate() takes its defaults from model.generation_config. If the checkpoint
# ships its own generation_config.json, a value set only on model.config may be
# ignored, so mirror the target-language id there as well.
if getattr(model, "generation_config", None) is not None:
    model.generation_config.forced_bos_token_id = target_lang_id

# Load the new data to translate
input_csv_path = "/content/drive/MyDrive/ssmt_project/asr_output_all.csv"
df = pd.read_csv(input_csv_path)

# Translation function
def translate(text_list, batch_size=8):
    """Translate a list of texts with the module-level ``model`` and ``tokenizer``.

    Args:
        text_list: Sequence of source texts. Missing values (``None`` / NaN,
            e.g. from empty CSV cells) are treated as empty strings; any other
            non-string value is converted with ``str()``.
        batch_size: Number of texts sent to the model per ``generate`` call.

    Returns:
        A list of translated strings, the same length and order as
        ``text_list``. Blank inputs map to ``""`` without calling the model.
    """
    # The tokenizer only accepts strings, so normalise missing/non-string cells.
    texts = ["" if pd.isna(text) else str(text) for text in text_list]
    translations = [""] * len(texts)
    # Only non-blank texts are worth a beam search; blanks keep the "" default.
    pending = [idx for idx, text in enumerate(texts) if text.strip()]

    gen_kwargs = {"max_length": 128, "num_beams": 5}
    # Pass the target-language BOS id explicitly instead of relying on
    # generate() picking it up from the model's configuration objects.
    forced_bos = getattr(model.config, "forced_bos_token_id", None)
    if forced_bos is not None:
        gen_kwargs["forced_bos_token_id"] = forced_bos

    model.eval()
    for start in range(0, len(pending), batch_size):
        batch_indices = pending[start:start + batch_size]
        batch = [texts[idx] for idx in batch_indices]
        # Inputs longer than 128 tokens are truncated; tensors follow the model's device.
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128,
        ).to(model.device)
        with torch.no_grad():
            generated_tokens = model.generate(**inputs, **gen_kwargs)
        outputs = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        # Write each result back to the position of its source text.
        for idx, output in zip(batch_indices, outputs):
            translations[idx] = output
    return translations

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fail fast, before the slow translation step, if an input column is missing.
source_columns = ["actual_en", "asr_decoded"]
missing_columns = [col for col in source_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Input CSV is missing required column(s): {missing_columns}")

# Translate both columns; results go to "<column>_translated".
# Empty CSV cells are read by pandas as NaN (a float), which is not valid text
# input for the tokenizer, so they are replaced with empty strings first.
for col in source_columns:
    df[f"{col}_translated"] = translate(df[col].fillna("").astype(str).tolist())

# Save the output to a new CSV
output_csv_path = "/content/drive/MyDrive/ssmt_project/eval_translated.csv"
df.to_csv(output_csv_path, index=False)

print(f"Translated file saved at: {output_csv_path}")




Translated file saved at: /content/drive/MyDrive/ssmt_project/eval_translated.csv
