# Train Translator

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [32]:
import sys, os, datetime
import json
import torch
import random
import glob
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets

In [33]:
import cdli
import languages

In [34]:
def get_finetune_model_id(model_id):
    model_dir = f"../results/{model_id}"
    checkpoints = [(os.path.abspath(x), int(os.path.split(x)[1].split("-")[1])) for x in
                   glob.glob(f"{model_dir}/checkpoint-*")]
    checkpoints = sorted(checkpoints, key=lambda x: x[1])[-1]
    return checkpoints[0]

In [35]:
# os.environ["WANDB_NOTEBOOK_NAME"] = "TrainTranslatorNew.ipynb"
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

source_langs = set(["akk", "sux"])

# target_langs = set(["en", "it", "es", "fr", "de"])
target_langs = set(["en"])

base_model_id = "t5-small"
finetune_model_id = None
# finetune_model_id = get_finetune_model_id("t5-base-p-akksux-en-20220722-173018")

model_max_length = 512
batch_size = 4 if os.path.basename(base_model_id).startswith("t5-large") else (
    8 if os.path.basename(base_model_id).startswith("t5-small") else 128)

# num_train_epochs = 30
num_train_epochs = 10

is_bi = True
is_finetune = finetune_model_id is not None and len(finetune_model_id) > 1

In [36]:
date_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
flags = ""
suffix = ""
if is_bi:
    flags += "-bi"
if is_finetune:
    flags += "-f"
    suffix += f"-{os.path.basename(os.path.split(finetune_model_id)[0])}-{os.path.basename(finetune_model_id)}"
model_id = f"{os.path.basename(base_model_id)}{flags}-{''.join(sorted(list(source_langs)))}-{''.join(sorted(list(target_langs)))}-{date_id}{suffix}"
model_id

't5-small-bi-akksux-en-20240111-203816'

In [37]:
has_cuda = torch.cuda.is_available()
device = torch.cuda.device(0) if has_cuda else "cpu"
has_cuda, device

(True, <torch.cuda.device at 0x749930b29e70>)

In [38]:
!nvidia-smi

Thu Jan 11 20:38:17 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.29.06              Driver Version: 545.29.06    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti     Off | 00000000:0A:00.0  On |                  N/A |
|  0%   49C    P8              26W / 220W |    913MiB /  8192MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Load Training Data

In [39]:
def get_prefix(src_lang, tgt_lang):
    s = languages.all_languages[src_lang]
    t = languages.all_languages[tgt_lang]
    return f"translate {s} to {t}: "


get_prefix("suxts", "es")

'translate Sumerian to Spanish: '

In [40]:
def load_translations_dataset(from_lang, to_lang):
    ds = Dataset.from_json(f"../data/translations_{from_lang}_to_{to_lang}.jsonl")
    srcs = [get_prefix(from_lang, to_lang) + x[from_lang] for x in ds]
    targets = [x[to_lang] for x in ds]
    if is_bi:
        srcs.extend([get_prefix(to_lang, from_lang) + x[to_lang] for x in ds])
        targets.extend([x[from_lang] for x in ds])
    #     ds = ds.add_column("source", srcs)
    #     ds = ds.add_column("target", targets)
    #     ds = ds.remove_columns([from_lang])
    #     ds = ds.rename_column(to_lang, "target")
    ds = Dataset.from_dict({"source": srcs, "target": targets})
    return ds


translation_datasets = {lang: load_translations_dataset(lang, "en") for lang in source_langs}


In [42]:
translation_datasets

{'sux': Dataset({
     features: ['source', 'target'],
     num_rows: 42350
 }),
 'akk': Dataset({
     features: ['source', 'target'],
     num_rows: 191258
 })}

In [43]:
translation_datasets["sux"][1]

{'source': 'translate Sumerian to English: # (gesz)isimu3(+mu2) - |_gisz_-(U)-_ad-sar_| = %a pe-er-hu-um # (gesz)ildagx(|A-(_gu4_xKUR)|) - |_gisz-a-gu4_xKUR| = %a a-da-ru-um # (gesz)szinig - |_gisz-szinig_|# = %a bi-nu-um # (gesz)asal2 - |_gisz-a-tu-gaba-lisz_|# = %a s,a-ar-ba-tum!(_lum_) # - [...] = %a [...] sza# _dingir#_',
 'target': 'bud poplar tamarisk poplar divine weapon'}

In [44]:
merged_dataset = concatenate_datasets(translation_datasets.values())

In [45]:
merged_dataset

Dataset({
    features: ['source', 'target'],
    num_rows: 233608
})

In [46]:
dataset = merged_dataset.shuffle()

In [47]:
dataset[1]

{'source': 'translate Sumerian to English: en-nu bad3 za3-mu gal-ug3 _uru_-_ka_-gi-na lugal lagasz(ki)',
 'target': 'Watch of the wall, Zamu, a chief of personnel. URU-KA-gina, king of Lagash.'}

In [48]:
avg_src_chars_per_token = 1.8338974021110785
avg_tgt_chars_per_token = 2.829482016086902

In [49]:
translations = dataset.train_test_split(test_size=0.1)

In [50]:
train_dataset, test_dataset = translations["train"], translations["test"]

In [51]:
test_dataset

Dataset({
    features: ['source', 'target'],
    num_rows: 23361
})

In [52]:
test_dataset[1120:1200]

{'source': ['translate Sumerian to English: gir4',
  'translate Sumerian to English: gazi al-gu2-ga2',
  'translate Akkadian to English: il-la-ku x#+[x x x x x x x]',
  "translate English to Sumerian: With your looking at the people: it is a bountiful wind. With your looking at the righteous hero, the man: for him life is prolonged. I am a no-mother-haver, you are my mother. I am a no-father-haver, you are my father. You have placed my semen in the inside: you bore me in the cella. Gatumdu, your shiny name is sweet. I will lay me down here this night, you are my big dagger, following my side. You are a reed, planted in the big water, you have placed life in me. You are a broad parasol, under your shade let me cool myself there. With the index finger? of your right hand, 'stroke' me with it my lady Gatumdu. I will go to the city, may my sign be good.",
  'translate Akkadian to English: [...] _sag-du igi e2 bar dingir_-_mesz_ : ru-u2-tum (d)e2-a _en ku6_ [i-s,ar]-ru-ru : s,a-ra-ra : a-la

## Tokenize the Data

In [53]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)

In [54]:

if tokenizer.model_max_length == model_max_length:
    print("tokenizer.model_max_length == model_max_length")
tokenizer.model_max_length

tokenizer.model_max_length == model_max_length


512

In [55]:
print("pad", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos", tokenizer.eos_token, tokenizer.eos_token_id)
print("unk", tokenizer.unk_token, tokenizer.unk_token_id)

pad <pad> 0
eos </s> 1
unk <unk> 2


In [None]:
ccc = 0
sum_src_chars_per_token = 0.0
num_src_chars_per_token = 0
sum_tgt_chars_per_token = 0.0
num_tgt_chars_per_token = 0


def preprocess_function(examples):
    global ccc, sum_src_chars_per_token, sum_tgt_chars_per_token, num_src_chars_per_token, num_tgt_chars_per_token
    # print(examples) run this wihtout this comment if you want the true debug++ experience
    inputs = [example for example in examples["source"]]
    targets = [example for example in examples["target"]]
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=model_max_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    nexamples = len(inputs)
    for i in range(nexamples):
        nchar = len(inputs[i])
        ntoks = len(model_inputs["input_ids"][i])
        if ntoks > 0:
            sum_src_chars_per_token += nchar / ntoks
            num_src_chars_per_token += 1
        nchar = len(targets[i])
        ntoks = len(model_inputs["labels"][i])
        if ntoks > 0:
            sum_tgt_chars_per_token += nchar / ntoks
            num_tgt_chars_per_token += 1

    ccc += 1
    if ccc == 1:
        print(model_inputs["input_ids"][0])
        print(model_inputs["labels"][0])
        nchar = len(targets[0])
        ntoks = len(model_inputs["labels"][0])
        print(nchar, ntoks, nchar / ntoks)

    return model_inputs


tokenized_translations = translations.map(preprocess_function, batched=True)
tokenized_translations

In [57]:
avg_src_chars_per_token = sum_src_chars_per_token / num_src_chars_per_token
avg_tgt_chars_per_token = sum_tgt_chars_per_token / num_tgt_chars_per_token
print("avg_src_chars_per_token", "=", avg_src_chars_per_token)
print("avg_tgt_chars_per_token", "=", avg_tgt_chars_per_token)

avg_src_chars_per_token = 2.554184432226667
avg_tgt_chars_per_token = 2.0533358474040417


In [58]:
tokenized_translations["train"] = tokenized_translations["train"].remove_columns(["source", "target"])
tokenized_translations["test"] = tokenized_translations["test"].remove_columns(["source", "target"])
tokenized_translations

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 210247
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23361
    })
})

In [59]:
tokenized_translations["train"][0]["labels"][:10]

[784, 226, 3, 226, 3, 226, 3, 226, 3, 226]

## Load the Model

In [60]:
model = AutoModelForSeq2SeqLM.from_pretrained(finetune_model_id if is_finetune else base_model_id,
                                              max_length=model_max_length, )

In [61]:
model.config

T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "max_length": 512,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "ma

## Train

In [67]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
data_collator


DataCollatorForSeq2Seq(tokenizer=T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id

In [71]:
from torch import optim

model_path = f"../results/{model_id}-{num_train_epochs}-{batch_size}-last"

training_args = Seq2SeqTrainingArguments(
    output_dir=model_path,
    evaluation_strategy="epoch",
    learning_rate=2.0e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    fp16=has_cuda,
    include_tokens_per_second=True

)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optim.AdamW(model.parameters(), lr=2.0e-5), None),
)

print(training_args.__dict__, "\n" * 3)
print(trainer.__dict__)

Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
, '_n_gpu': 1, '__cached__setup_devices': device(type='cuda', index=0), 'deepspeed_plugin': None} 



	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<extra_id_99>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<extra_id_98>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<extra_id_97>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32003: AddedToken("<extra_id_96>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32004: AddedToken("<extra_id_95>", rstrip=False, lst

In [72]:
trainer.train()
trainer.save_model()  # Save the trained model

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.4321,2.182954
2,2.1348,1.944015
3,2.0283,1.816833
4,1.9647,1.731686
5,1.9067,1.674687
6,1.8761,1.635022
7,1.8165,1.605134
8,1.7754,1.585951
9,1.7515,1.575035
10,1.7391,1.571051


## Sample

In [73]:
pipeline = TranslationPipeline(model=model.to("cpu"), tokenizer=tokenizer, max_length=model_max_length)

In [74]:
pipeline

<transformers.pipelines.text2text_generation.TranslationPipeline at 0x749967e13e80>

In [75]:
pipeline("translate English to French: hello my name is Frank")



[{'translation_text': '_mu_-ia _mu_-_mesz_-ni'}]

In [76]:
source_test = translations["test"][0]["source"]
target_test = translations["test"][0]["target"]
print(source_test)
print("-" * 80)
print(target_test)

translate English to Akkadian: given; so as not to be forgotten written down; ... month "qarratu," 29th day, eponym: "Sîn-sheya."
--------------------------------------------------------------------------------
ta-ad-nu a-na la ma-sza-e sza-t,i2-ir _ku!_ _iti_ qar-ra-a-tu _u4 2(u) 9(disz)-kam2 li-mu [(disz)](d)3(asz)-sze-ia


In [81]:
def translate(text):
    return pipeline(text)

print(source_test)
print(translate(source_test))

translate English to Akkadian: given; so as not to be forgotten written down; ... month "qarratu," 29th day, eponym: "Sîn-sheya."
[{'translation_text': 'szu-u2 sza2 _nu_ sza2 _szu_-_min_ x#+[x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x '}]


## Save to Huggingface

In [78]:
model_path = os.path.relpath(f"../results/{model_id}-{num_train_epochs}-{batch_size}-last")
trainer.save_model(model_path)
model_path

'../results/t5-small-bi-akksux-en-20240111-203816-10-8-last'

In [79]:
tokenizer.save_pretrained(model_path)

('../results/t5-small-bi-akksux-en-20240111-203816-10-8-last/tokenizer_config.json',
 '../results/t5-small-bi-akksux-en-20240111-203816-10-8-last/special_tokens_map.json',
 '../results/t5-small-bi-akksux-en-20240111-203816-10-8-last/tokenizer.json')