# Train Translator

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
import sys, os, datetime
import json
import torch
import random
import glob
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets

In [2]:
import cdli
import languages

asdasdadadasdasdasdadadasdasdasdadadasdasdasdadadasdasdasdadadasdasdasdadadasdasdasdadadasdasdasdada... (truncated)


In [3]:
def get_finetune_model_id(model_id):
    model_dir = f"../results/{model_id}"
    checkpoints = [(os.path.abspath(x), int(os.path.split(x)[1].split("-")[1])) for x in
                   glob.glob(f"{model_dir}/checkpoint-*")]
    checkpoints = sorted(checkpoints, key=lambda x: x[1])[-1]
    return checkpoints[0]

In [4]:
# os.environ["WANDB_NOTEBOOK_NAME"] = "TrainTranslatorNew.ipynb"
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

source_langs = set(["akk", "sux"])

# target_langs = set(["en", "it", "es", "fr", "de"])
target_langs = set(["en"])

base_model_id = "t5-small"
finetune_model_id = None
# finetune_model_id = get_finetune_model_id("t5-base-p-akksux-en-20220722-173018")

model_max_length = 512
batch_size = 4 if os.path.basename(base_model_id).startswith("t5-large") else (
    8 if os.path.basename(base_model_id).startswith("t5-small") else 128)

num_train_epochs = 30
# num_train_epochs = 10

is_bi = True
is_finetune = finetune_model_id is not None and len(finetune_model_id) > 1

In [5]:
date_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
flags = ""
suffix = ""
if is_bi:
    flags += "-bi"
if is_finetune:
    flags += "-f"
    suffix += f"-{os.path.basename(os.path.split(finetune_model_id)[0])}-{os.path.basename(finetune_model_id)}"
model_id = f"{os.path.basename(base_model_id)}{flags}-{''.join(sorted(list(source_langs)))}-{''.join(sorted(list(target_langs)))}-{date_id}{suffix}"
model_id

't5-small-bi-akksux-en-20240112-074206'

In [6]:
has_cuda = torch.cuda.is_available()
device = torch.cuda.device(0) if has_cuda else "cpu"
has_cuda, device

(True, <torch.cuda.device at 0x709b56d5f790>)

In [7]:
!nvidia-smi

Fri Jan 12 07:42:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.29.06              Driver Version: 545.29.06    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti     Off | 00000000:0A:00.0  On |                  N/A |
| 56%   43C    P0              53W / 220W |    855MiB /  8192MiB |      1%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Load Training Data

In [8]:
def get_prefix(src_lang, tgt_lang):
    s = languages.all_languages[src_lang]
    t = languages.all_languages[tgt_lang]
    return f"translate {s} to {t}: "


get_prefix("suxts", "es")

'translate Sumerian to Spanish: '

In [9]:
def load_translations_dataset(from_lang, to_lang):
    ds = Dataset.from_json(f"../data/translations_{from_lang}_to_{to_lang}.jsonl")
    srcs = [get_prefix(from_lang, to_lang) + x[from_lang] for x in ds]
    targets = [x[to_lang] for x in ds]
    if is_bi:
        srcs.extend([get_prefix(to_lang, from_lang) + x[to_lang] for x in ds])
        targets.extend([x[from_lang] for x in ds])
    #     ds = ds.add_column("source", srcs)
    #     ds = ds.add_column("target", targets)
    #     ds = ds.remove_columns([from_lang])
    #     ds = ds.rename_column(to_lang, "target")
    ds = Dataset.from_dict({"source": srcs, "target": targets})
    return ds


translation_datasets = {lang: load_translations_dataset(lang, "en") for lang in source_langs}


In [10]:
translation_datasets

{'sux': Dataset({
     features: ['source', 'target'],
     num_rows: 42350
 }),
 'akk': Dataset({
     features: ['source', 'target'],
     num_rows: 191258
 })}

In [11]:
translation_datasets["sux"][1]

{'source': 'translate Sumerian to English: # (gesz)isimu3(+mu2) - |_gisz_-(U)-_ad-sar_| = %a pe-er-hu-um # (gesz)ildagx(|A-(_gu4_xKUR)|) - |_gisz-a-gu4_xKUR| = %a a-da-ru-um # (gesz)szinig - |_gisz-szinig_|# = %a bi-nu-um # (gesz)asal2 - |_gisz-a-tu-gaba-lisz_|# = %a s,a-ar-ba-tum!(_lum_) # - [...] = %a [...] sza# _dingir#_',
 'target': 'bud poplar tamarisk poplar divine weapon'}

In [12]:
merged_dataset = concatenate_datasets(translation_datasets.values())

In [13]:
merged_dataset

Dataset({
    features: ['source', 'target'],
    num_rows: 233608
})

In [14]:
dataset = merged_dataset.shuffle()

In [15]:
dataset[1]

{'source': 'translate Akkadian to English: ina _ugu lu-sanga_-_mesz_ sza _uru_-kal(*)-[ha] sza _lugal_ be-li isz-pur-a-ni a-na-ku ki-i ra-ma-ni-ia(*)# a-na _lu-sanga_ as-sa-al(*)# ki(*)-i(*)# an(*)-ni(*)-i(*) iq(*)-t,i(*)#-[bi-a]',
 'target': 'Concerning the priests of Calah about whom the king, my lord, wrote to me, I questioned a priest personally; he said as follows:'}

In [16]:
avg_src_chars_per_token = 1.8338974021110785
avg_tgt_chars_per_token = 2.829482016086902

In [17]:
translations = dataset.train_test_split(test_size=0.1)

In [18]:
train_dataset, test_dataset = translations["train"], translations["test"]

In [19]:
test_dataset

Dataset({
    features: ['source', 'target'],
    num_rows: 23361
})

In [20]:
test_dataset[1120:1200]

{'source': ['translate Akkadian to English: _igi#_ (m)u-su-na-a',
  'translate English to Sumerian: an interest rate of 100 sila of barley per gur is to be charged,',
  'translate English to Akkadian: Ring of Sha-Anu-ishu',
  'translate Akkadian to English: x _kur_-_mesz ni_ x [...]',
  'translate English to Akkadian: He seats the chanters and retires to the side room. At the time of making the chanters rise, the king makes the chanters rise. He provides for the House of God in the house of Dagan.',
  'translate English to Akkadian: If Mars comes close to the front of the moon and stands there: the moon god will resettle a ruined land.',
  'translate Akkadian to English: _igi_ (m)a-bi-[x x]',
  'translate Akkadian to English: [...] _mul#-mul_ [...]',
  'translate English to Sumerian: Sin-kashid, king of Uruk and king of Amnanum, provider of the Eanna, when the Eanna, he built, for Nishi-inishu the erish-dingir priestess of Lugalbanda, his beloved daughter who for his life had been inst

## Tokenize the Data

In [21]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, model_max_length=model_max_length)

In [22]:

if tokenizer.model_max_length == model_max_length:
    print("tokenizer.model_max_length == model_max_length")
tokenizer.model_max_length

tokenizer.model_max_length == model_max_length


512

In [23]:
print("pad", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos", tokenizer.eos_token, tokenizer.eos_token_id)
print("unk", tokenizer.unk_token, tokenizer.unk_token_id)

pad <pad> 0
eos </s> 1
unk <unk> 2


In [24]:
ccc = 0
sum_src_chars_per_token = 0.0
num_src_chars_per_token = 0
sum_tgt_chars_per_token = 0.0
num_tgt_chars_per_token = 0


def preprocess_function(examples):
    global ccc, sum_src_chars_per_token, sum_tgt_chars_per_token, num_src_chars_per_token, num_tgt_chars_per_token
    # print(examples) run this wihtout this comment if you want the true debug++ experience
    inputs = [example for example in examples["source"]]
    targets = [example for example in examples["target"]]
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=model_max_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    nexamples = len(inputs)
    for i in range(nexamples):
        nchar = len(inputs[i])
        ntoks = len(model_inputs["input_ids"][i])
        if ntoks > 0:
            sum_src_chars_per_token += nchar / ntoks
            num_src_chars_per_token += 1
        nchar = len(targets[i])
        ntoks = len(model_inputs["labels"][i])
        if ntoks > 0:
            sum_tgt_chars_per_token += nchar / ntoks
            num_tgt_chars_per_token += 1

    ccc += 1
    if ccc == 1:
        print(model_inputs["input_ids"][0])
        print(model_inputs["labels"][0])
        nchar = len(targets[0])
        ntoks = len(model_inputs["labels"][0])
        print(nchar, ntoks, nchar / ntoks)

    return model_inputs


tokenized_translations = translations.map(preprocess_function, batched=True)
tokenized_translations

Map:   0%|          | 0/210247 [00:00<?, ? examples/s]



[13959, 4823, 1258, 8603, 12, 1566, 10, 3, 834, 17, 23, 834, 18, 1824, 23, 3, 1258, 7, 18, 4987, 3, 8758, 18, 11054, 3, 17, 9, 18, 2644, 3, 834, 9, 18, 7, 1629, 834, 3, 7, 1000, 18, 9, 18, 17, 76, 3, 1629, 18, 5082, 50, 18, 1824, 23, 3, 17, 76, 18, 9, 18, 52, 76, 20, 18, 29, 76, 3, 834, 1259, 122, 591, 18, 1259, 122, 591, 834, 1]
[37, 540, 19, 1866, 1551, 5, 466, 1322, 19, 3907, 11, 7347, 5, 2372, 3, 60, 15044, 6, 9953, 6, 42, 13326, 19, 3, 12186, 5, 1]
114 27 4.222222222222222


Map:   0%|          | 0/23361 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 210247
    })
    test: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 23361
    })
})

In [25]:
avg_src_chars_per_token = sum_src_chars_per_token / num_src_chars_per_token
avg_tgt_chars_per_token = sum_tgt_chars_per_token / num_tgt_chars_per_token
print("avg_src_chars_per_token", "=", avg_src_chars_per_token)
print("avg_tgt_chars_per_token", "=", avg_tgt_chars_per_token)

avg_src_chars_per_token = 2.5541844322267124
avg_tgt_chars_per_token = 2.0533358474041217


In [26]:
tokenized_translations["train"] = tokenized_translations["train"].remove_columns(["source", "target"])
tokenized_translations["test"] = tokenized_translations["test"].remove_columns(["source", "target"])
tokenized_translations

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 210247
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23361
    })
})

In [27]:
tokenized_translations["train"][0]["labels"][:10]

[37, 540, 19, 1866, 1551, 5, 466, 1322, 19, 3907]

## Load the Model

In [28]:
model = AutoModelForSeq2SeqLM.from_pretrained(finetune_model_id if is_finetune else base_model_id,
                                              max_length=model_max_length, )

In [29]:
model.config

T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "max_length": 512,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "ma

## Train

In [30]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
data_collator


DataCollatorForSeq2Seq(tokenizer=T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id

In [31]:
from torch import optim

model_path = f"../results/{model_id}-{num_train_epochs}-{batch_size}-last"

training_args = Seq2SeqTrainingArguments(
    output_dir=model_path,
    evaluation_strategy="epoch",
    learning_rate=2.0e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    fp16=has_cuda,
    include_tokens_per_second=True

)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_translations["train"],
    eval_dataset=tokenized_translations["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    optimizers=(optim.AdamW(model.parameters(), lr=2.0e-5), None),
)

print(training_args.__dict__, "\n" * 3)
print(trainer.__dict__)

Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
, '_n_gpu': 1, '__cached__setup_devices': device(type='cuda', index=0), 'deepspeed_plugin': None} 



	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<extra_id_99>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<extra_id_98>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<extra_id_97>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32003: AddedToken("<extra_id_96>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32004: AddedToken("<extra_id_95>", rstrip=False, lst

In [32]:
trainer.train()
trainer.save_model()  # Save the trained model

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.4132,2.170969
2,2.1649,1.922171
3,2.0003,1.779234
4,1.8767,1.681896
5,1.8287,1.609259
6,1.7721,1.5515
7,1.6797,1.506443
8,1.6439,1.467707
9,1.6171,1.436006
10,1.5775,1.409167


## Sample

In [33]:
pipeline = TranslationPipeline(model=model.to("cpu"), tokenizer=tokenizer, max_length=model_max_length)

In [34]:
pipeline

<transformers.pipelines.text2text_generation.TranslationPipeline at 0x709b56c9c2b0>

In [35]:
pipeline("translate English to French: hello my name is Frank")



[{'translation_text': 'a-ma-a _mu_-ia (d)_utu_'}]

In [36]:
source_test = translations["test"][0]["source"]
target_test = translations["test"][0]["target"]
print(source_test)
print("-" * 80)
print(target_test)

translate English to Akkadian: Nabû-apal-iddin ..., son of Ubru-DN,
--------------------------------------------------------------------------------
(m)(d)_pa_(*)—A—_asz_ [x x] A (m)_suhusz_(*)#—[(d)x]


In [37]:
def translate(text):
    return pipeline(text)

print(source_test)
print(translate(source_test))

translate English to Akkadian: Nabû-apal-iddin ..., son of Ubru-DN,
[{'translation_text': '(disz)(d)_ag_-_ad_-_mu_ [x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x '}]


## Save to Huggingface

In [38]:
model_path = os.path.relpath(f"../results/{model_id}-{num_train_epochs}-{batch_size}-last")
trainer.save_model(model_path)
model_path

'../results/t5-small-bi-akksux-en-20240112-074206-30-8-last'

In [39]:
tokenizer.save_pretrained(model_path)

('../results/t5-small-bi-akksux-en-20240112-074206-30-8-last/tokenizer_config.json',
 '../results/t5-small-bi-akksux-en-20240112-074206-30-8-last/special_tokens_map.json',
 '../results/t5-small-bi-akksux-en-20240112-074206-30-8-last/tokenizer.json')

# Generate a table for the output to be tested.

In [40]:
csv_file_path = f"../results/{model_id}-{num_train_epochs}-{batch_size}-last-eval.csv"
csv_file_path_log = f"../results/{model_id}-{num_train_epochs}-{batch_size}-last-log.csv"


In [41]:
import pandas as pd

In [42]:
pd.DataFrame(trainer.state.log_history).to_csv(csv_file_path)

In [89]:
data = pd.DataFrame().new
data["input"] = tokenized_translations["test"]["source"]
data["target"] = tokenized_translations["test"]["target"]
data["predicted"] = tokenized_translations["test"]["labels"]
data.to_csv(csv_file_path)

KeyError: "Column source not in the dataset. Current columns in the dataset: ['input_ids', 'attention_mask', 'labels']"