In [12]:
import torch
from transformers import MBartForConditionalGeneration, MBart50Tokenizer

# Path prefix that is prepended to the checkpoint directory name below.
model_path = "models/"

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(torch.cuda.is_available())

# Language codes handed to the tokenizers as src_lang / tgt_lang.
sl, tl = "en_XX", "ta_IN"

# A single model instance serves both translation directions.
model = MBartForConditionalGeneration.from_pretrained(
  model_path + "mbart-large-50-many-to-many-mmt"
).to(device)

# One tokenizer per direction: sl -> tl and tl -> sl.
tokenizer_EnTa = MBart50Tokenizer.from_pretrained(
  model_path + "mbart-large-50-many-to-many-mmt",
  src_lang=sl,
  tgt_lang=tl,
)
tokenizer_TaEn = MBart50Tokenizer.from_pretrained(
  model_path + "mbart-large-50-many-to-many-mmt",
  src_lang=tl,
  tgt_lang=sl,
)


True


In [20]:
import sys

# Number of rows preprocess() has handled so far; preprocess() increments it
# and writes it to stdout as the numerator of its progress display.
idx = 0

def preprocess(text, tokenizer, tl):
  """Translate one input with the module-level ``model`` and return the text.

  Args:
    text: Input handed to ``tokenizer``; it is truncated to 200 tokens and
      padded to that length.
    tokenizer: Tokenizer used for both encoding and decoding. Its
      ``lang_code_to_id`` mapping is indexed with ``tl``.
    tl: Language code whose token id is forced as the first generated token.

  Returns:
    The first decoded sequence, with special tokens skipped.

  Side effects:
    Increments the global ``idx`` and rewrites the progress line
    ``idx/len(df)`` on stdout. Reads the globals ``model``, ``device`` and
    ``df``, so all three must exist when this is called.
  """
  global idx
  idx = idx + 1
  progress = f"{idx}/{len(df)}"
  sys.stdout.write("\r" + progress)

  encoded = tokenizer(
    text,
    truncation=True,
    max_length=200,
    padding="max_length",
    return_tensors="pt",
  ).to(device)

  target_lang_id = tokenizer.lang_code_to_id[tl]
  output_ids = model.generate(**encoded, forced_bos_token_id=target_lang_id)
  decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

  return decoded[0]

In [21]:
import pandas as pd

data_path = "cleaned_snopes.csv"

# Pass 1 (sl -> tl). Presumably already run once, since its output file
# "snopes_tamil.csv" is read below. It is kept disabled for reference. The
# read of `data_path` is disabled with it: the loaded frame was overwritten by
# the read below before anything used it.
# df = pd.read_csv(data_path)
# df["claim"] = df["claim"].apply(preprocess, args=(tokenizer_EnTa, tl))
# df.to_csv("snopes_tamil.csv", encoding="utf-8", index=None)

# Pass 2 (tl -> sl): translate the saved pass-1 claims back.
df = pd.read_csv("snopes_tamil.csv")
# preprocess() counts rows in the global `idx`; restart it here so re-running
# this cell shows 1/N .. N/N again instead of continuing from the last run.
idx = 0
df["claim"] = df["claim"].apply(preprocess, args=(tokenizer_TaEn, sl))
df.to_csv("snopes_backtranslation_TaEn.csv", encoding="utf-8", index=None)

1/4433



4433/4433

In [None]:
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForSeq2Seq
from tqdm import tqdm
import torch

# NOTE(review): `tokenizer` and `tok_data` are not defined in any visible cell;
# confirm which cell (or earlier session) provides them.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# Collect the "input_ids" and "labels" entry of every record in tok_data into
# one tensor each; tqdm shows one progress bar per pass.
x_ids = torch.as_tensor(
    [record["input_ids"] for record in tqdm(tok_data)]
)
y_ids = torch.as_tensor(
    [record["labels"] for record in tqdm(tok_data)]
)

print(x_ids.shape, y_ids.shape)

100%|██████████| 50001/50001 [00:00<00:00, 1431492.62it/s]
100%|██████████| 50001/50001 [00:00<00:00, 1671976.80it/s]


torch.Size([50001, 128]) torch.Size([50001, 128])


In [None]:
def convert_to_dict(data, lang, limit=6):
  """Map the leading entries of ``data`` to ``{position: {lang: entry}}``.

  Args:
    data: Sliceable, iterable collection (list, tensor, ...).
    lang: Key under which each entry is stored in its inner dict.
    limit: Maximum number of leading entries to convert. Defaults to 6, the
      value that was previously hard-coded; pass ``None`` to convert all of
      ``data``.

  Returns:
    A dict keyed by 0-based position whose values are ``{lang: entry}``.
    Empty when ``data`` is empty.
  """
  # data[:None] is the whole collection, so limit=None needs no special case.
  return {pos: {lang: entry} for pos, entry in enumerate(data[:limit])}

# Fixed seed so that the rows landing in each split are the same on every run.
SPLIT_SEED = 42

# First cut: 75% train / 25% held out. Second cut: the held-out part is halved
# into test and validation.
x_train, x_test, y_train, y_test = train_test_split(
    x_ids, y_ids, test_size=0.25, random_state=SPLIT_SEED
)
x_test, x_val, y_test, y_val = train_test_split(
    x_test, y_test, test_size=0.5, random_state=SPLIT_SEED
)

print(x_train.shape, x_test.shape, x_val.shape)

# NOTE(review): `s_lang` / `t_lang` are not defined in any visible cell (the
# earlier cells use `sl` / `tl`); confirm where they come from.
x_train = convert_to_dict(x_train, s_lang)
x_test = convert_to_dict(x_test, s_lang)
x_val = convert_to_dict(x_val, s_lang)

y_train = convert_to_dict(y_train, t_lang)
y_test = convert_to_dict(y_test, t_lang)
y_val = convert_to_dict(y_val, t_lang)

# Disabled draft of an alternative record layout (references an undefined `ids`).
# train = [{"id": str(idx), "translation": {s_lang: x_train, t_lang: y_train}} for idx in ids][0]
# test = [{"id": str(idx), "translation": {s_lang: x_test, t_lang: y_test}} for idx in ids][0]

torch.Size([37500, 128]) torch.Size([6250, 128]) torch.Size([6251, 128])


In [None]:
class Data(torch.utils.data.Dataset):
  """Dataset that pairs column-wise inputs with per-example labels.

  Args:
    x: Mapping from feature name to an indexable collection; ``x[name][i]``
      is the value of that feature for example ``i``.
    y: Indexable, sized collection; ``y[i]`` is the label data of example
      ``i`` and ``len(y)`` is the dataset size.

  NOTE(review): the dicts produced by ``convert_to_dict`` in this notebook are
  keyed by position (``{i: {lang: row}}``), not by feature name, which does not
  match the layout ``__getitem__`` indexes into. Confirm what is meant to be
  passed here.
  """

  def __init__(self, x, y):
    self.x = x
    self.y = y

  def __getitem__(self, idx):
    """Return example ``idx`` as a dict of tensors, including ``"labels"``."""
    # No printing here: this runs once per example fetched by the data loader.
    item = {key: torch.as_tensor(val[idx]) for key, val in self.x.items()}
    item["labels"] = torch.as_tensor(self.y[idx])
    return item

  def __len__(self):
    """Number of examples, taken from the label collection."""
    return len(self.y)

# Wrap each (inputs, labels) split in a Data instance, in the order
# train, test, validation.
train, test, val = (
  Data(inputs, targets)
  for inputs, targets in (
    (x_train, y_train),
    (x_test, y_test),
    (x_val, y_val),
  )
)

In [None]:
!pip install -q evaluate sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import evaluate

# Load the "sacrebleu" metric through the `evaluate` library. compute_metrics()
# calls metric.compute() on it and reads the "score" entry of the result.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for metric scoring.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions, and a
        list in which every stripped reference is wrapped in its own
        one-element list.
    """
    stripped_preds = []
    for hypothesis in preds:
        stripped_preds.append(hypothesis.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Score generated token ids against reference ids.

    Args:
        eval_preds: A ``(preds, labels)`` pair of token-id arrays. ``preds``
            may also be a tuple, in which case its first element is used.
            Positions equal to ``-100`` in either array are treated as
            padding.

    Returns:
        dict with ``"bleu"`` (the ``"score"`` returned by the module-level
        ``metric``) and ``"gen_len"`` (mean count of non-pad tokens per
        prediction), both rounded to 4 decimals.

    Relies on the module-level ``tokenizer``, ``metric`` and
    ``postprocess_text``.
    """
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore marker, not a vocabulary id, so it cannot be decoded.
    # Swap it for the pad id in BOTH arrays before decoding (previously only
    # the labels were cleaned, so -100-padded predictions broke batch_decode).
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of non-pad positions in each prediction row.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# NOTE(review): `s_lang` / `t_lang` and `tokenizer` are not defined in any
# visible cell; confirm where they come from.
training_args = Seq2SeqTrainingArguments(
    output_dir=f"{s_lang}-{t_lang}_model",
    evaluation_strategy="epoch",  # evaluate once at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,  # compute_metrics scores generated sequences
    # Enable mixed precision only when CUDA is present, matching the CPU
    # fallback used when the model was placed on `device`.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train,
    # Evaluate during training on the validation split; `test` stays held out
    # for a final evaluation (e.g. trainer.evaluate(test) or trainer.predict(test)).
    eval_dataset=val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Start training with the `trainer` configured in the previous cell.
trainer.train()



In [None]:
# sl = "English"
# tl = "French"
# prefix = f"translate {sl} to {tl}: "

# text = prefix + "My name is Eduard"

# inputs = tokenizer(text, return_tensors="pt").input_ids
# outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))