# Pretrain Translator

Based on: https://huggingface.co/docs/transformers/tasks/translation

In [1]:
import sys, os, datetime, copy
import json
import torch
import random
import glob
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import TranslationPipeline
from datasets import load_dataset, Dataset, DatasetDict
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, Split, ByteLevel
from tokenizers.processors import TemplateProcessing
import tokenizers.pre_tokenizers
import tokenizers.processors
import tokenizers.decoders

In [2]:
import cdli
import languages

asdasdadadasdasdasdadadasdasdasdadadasdasdasdadadasdasdasdadadasdasdasdadadasdasdasdadadasdasdasdada... (truncated)


In [3]:
os.environ["WANDB_NOTEBOOK_NAME"] = "PretrainTranslator.ipynb"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

source_langs = set(["akk", "sux"])

# target_langs = set(["en", "it", "es", "fr", "de"])
target_langs = set(["en"])

base_model_id = "t5-small" # can be base tho.

max_vocab_size = 50_000
model_max_length = 512
batch_size = 8 if os.path.basename(base_model_id).startswith("t5-small") else 128

num_train_sequences = 524_288 * 128
num_warmup_sequences = 10_000 * 128

warmup_learning_rate = 0.01

use_paragraphs = True
use_lines = True


In [4]:
date_id = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
flags = ""
suffix = ""
if use_paragraphs:
    flags += "-p"
if use_lines:
    flags += "-l"
model_id = f"{os.path.basename(base_model_id)}-pre{flags}-{''.join(sorted(list(source_langs)))}-{''.join(sorted(list(target_langs)))}-{date_id}{suffix}"
model_id

't5-small-pre-p-l-akksux-en-20240108-221858'

In [5]:
has_cuda = torch.cuda.is_available()
device = torch.cuda.device(0) if has_cuda else "cpu"
has_cuda, device

(True, <torch.cuda.device at 0x782baa0ed060>)

In [6]:
!nvidia-smi

Mon Jan  8 22:19:00 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.29.06              Driver Version: 545.29.06    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 Ti     Off | 00000000:0A:00.0  On |                  N/A |
| 48%   49C    P5              29W / 220W |    990MiB /  8192MiB |      9%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

## Load Training Data

In [7]:
avg_src_chars_per_token = 2.6712177445735397

In [8]:
def get_prefix(src_lang, tgt_lang):
    s = languages.all_languages[src_lang]
    t = languages.all_languages[tgt_lang]
    return f"translate {s} to {t}: "
    
get_prefix("suxts", "es") # test, ofcourse we are not going to be translating sumerian to god damn spanish. Turkish maybe? deffinetly english!

'translate Sumerian to Spanish: '

In [9]:
publications = cdli.get_atf()

Downloading https://github.com/cdli-gh/data/raw/master/cdliatf_unblocked.atf
Parsing atf


In [10]:
len(publications), "publications"

In [11]:
def target_ok(target_text):
    if len(target_text) == 0:
        return False
    if len(set(target_text.replace(" ", ""))) < 2:
        return False
    return True
    

def test_target_ok(text):
    ok = target_ok(text)
    print(ok, repr(text))
    
test_target_ok("")
test_target_ok(" ")
test_target_ok("xx xxx x")
test_target_ok(".. . .. ")
test_target_ok("Hi")

False ''
False ' '
False 'xx xxx x'
False '.. . .. '
True 'Hi'


In [12]:
wmax_num_tokens = model_max_length - 192

def wrap_paragraph(paragraph, lines, src_lang, tgt_lang):
    ptag, pline_start_index, pline_end_index = paragraph
    wline_ranges = []
    wline_tok_len = 0.0
    
    def start_new_line(pline_index):
#         print("start", pline_index)
        wline_ranges.append((pline_index, pline_index + 1))
        
    def append_line(pline_index):
#         print("append", pline_index)
        r = wline_ranges[-1]
        if r[1] == pline_index:
            wline_ranges[-1] = (r[0], r[1] + 1)
        else:
            print(f"Missing line: got {pline_index}, expected {r[1]}: {wline_ranges}")

    for pline_index in range(pline_start_index, pline_end_index):
        pline_num_toks = len(lines[pline_index].text) / avg_src_chars_per_token + 1.0
        if len(wline_ranges) == 0 or (wline_tok_len + pline_num_toks > wmax_num_tokens):
            start_new_line(pline_index)
            wline_tok_len = 0.0
        else:
            append_line(pline_index)
        wline_tok_len += pline_num_toks
    return wline_ranges



In [13]:
dataset_index = json.load(open("../data/dataset_index.json", "rt"))
print(dataset_index.keys())

dict_keys(['akk', 'sux'])


In [14]:
print(len(dataset_index["akk"]["train"]), "akk train")
print(len(dataset_index["akk"]["test"]), "akk test")
print(len(dataset_index["sux"]["train"]), "sux train")
print(len(dataset_index["sux"]["test"]), "sux test")

876 akk train
102 akk test
3745 sux train
402 sux test


In [15]:
from asyncio import sleep


def get_pubs_sources():
    debug_fail_state_counter : int = 0
    added_sources = set()

    def add_line_ranges(s, area, b, e):
        ls = " ".join([x.text for x in area.lines[b:e]])
        ls = (s, " ".join(ls.split(" ")))
        added_sources.add(ls)
        for t in target_langs:
            lt = " ".join([(x.languages[t] if t in x.languages else "") for x in area.lines[b:e]])
            lt = (t, " ".join(lt.split(" ")))
            added_sources.add(lt)


    for s in source_langs:
        for t in target_langs:
            print("Preparing", s, "to", t)            
            st_prefix = get_prefix(s, t)
            ts_prefix = get_prefix(t, s)
            for pub in tqdm([p for p in publications if p.language==s]):
                for area in pub.text_areas:
                    if not any(x for x in area.lines if t in x.languages):
                        continue
                    if use_paragraphs:
                        paragraphs = area.lines_to_paragraphs(s, tgt_lang=pub.language)
                        line_ranges = []                
                        for p in paragraphs:             
                            try:
                                wlines = wrap_paragraph(p, area.lines, s, t)
                                line_ranges.extend(wlines)
                            except Exception as e:
                                pass
                            #     print(f"Failed :: {e} {debug_fail_state_counter}, type={type(area.lines)}")
                            #     # print(f"p = {p}\narea.lines = {area.lines}\ns = {s}\nt = {t}")
                            #     debug_fail_state_counter += 1
                            #     sleep(1)
                        print("="*50, len(area.lines))
                        for b, e in line_ranges:
                            add_line_ranges(s, area, b, e)
                    if use_lines:
                        for i, _ in enumerate(area.lines):
                            add_line_ranges(s, area, i, i + 1)
#     random.shuffle(new_sourceandtargets)
#     return Dataset.from_dict({"source": [x[0] for x in new_sourceandtargets], "target": [x[1] for x in new_sourceandtargets]})
    return added_sources

all_sources = get_pubs_sources()
all_sources = list(all_sources)
len(all_sources)



Preparing sux to en


 27%|██▋       | 27449/99858 [00:00<00:00, 274459.51it/s]



 66%|██████▌   | 65575/99858 [00:00<00:00, 334417.01it/s]



100%|██████████| 99858/99858 [00:00<00:00, 233196.32it/s]

Preparing akk to en



100%|██████████| 21962/21962 [00:00<00:00, 177068.88it/s]






128130

In [16]:
all_sources[1]

('en', 'a kind of insect')

In [17]:
all_sources[:5]

[('en', 'labor of the hirelings, on orders of Su-...;'),
 ('en', 'a kind of insect'),
 ('akk', 'i-id-da-an-szi'),
 ('sux', '[...] x x-sze3#? nindaba# [...]'),
 ('en', 'to return what was their land to their control,')]

## Train the Tokenizer

In [18]:
tokenizer_txt_path = os.path.abspath("tokenizer_training_data.txt")
with open(tokenizer_txt_path, "wb") as f:
    for lang, line in tqdm(all_sources):
        f.write(bytes(line, "utf8"))
        f.write(b'\n')
tokenizer_txt_path

100%|██████████| 128130/128130 [00:00<00:00, 1645200.23it/s]


'/home/bruhpc/Documents/cuneiform-stuff/CuneiformTranslators/tools/tokenizer_training_data.txt'

In [19]:
!tail tokenizer_training_data.txt

[mu ha-am]-mu#-ra-pi2# [lugal-e geszkim-ti] an# {d}en-lil2-bi-da# [x x e-mu-ut]-ba-lum x x [... ri-im{d}]suen [...]
he brought down diorite,
1(disz) masz2-gal#
_{iti}apin u4 1(u) 5(disz)-kam2#_
22 strings of dates, dried(?),
ad-da
Bring out a pit in the sand dunes(?), thus(?) is your appropriate (role)
gesz bi2-du3
a#-ru#-a# ur!-{d}ab-u2
year: “The priestess of Nanna with a goat was determined” (Šulgi 43).


In [20]:
special_tokens=["<pad>", "</s>", "<unk>", "[...]"]
additional_special_tokens = [f"<extra_id_{i}>" for i in range(100)]
all_special_tokens = special_tokens + additional_special_tokens
all_special_tokens[:10]

['<pad>',
 '</s>',
 '<unk>',
 '[...]',
 '<extra_id_0>',
 '<extra_id_1>',
 '<extra_id_2>',
 '<extra_id_3>',
 '<extra_id_4>',
 '<extra_id_5>']

In [21]:
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
trainer = BpeTrainer(vocab_size=max_vocab_size, special_tokens=all_special_tokens)

# print(tokenizer.pre_tokenizer)
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=False) # Split("\n", "removed")
tokenizer.post_processor = TemplateProcessing(
    single="$0 </s>",
    pair="$A </s> $B:1 </s>:1",
    special_tokens=[(x, i) for i, x in enumerate(all_special_tokens)],
)
# tokenizer.post_processor = tokenizers.processors. tokenizers.processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = tokenizers.decoders.ByteLevel()
tokenizer.model_max_length=model_max_length
files = [tokenizer_txt_path]
tokenizer.train(files, trainer)





In [22]:
tokenizer.get_vocab_size()

36784

In [23]:
test_txt = all_sources[6][1]
test_tokens = tokenizer.encode(test_txt).ids
print(test_txt)
print(test_tokens)
print(tokenizer.decode(ids=test_tokens))

That storm that knows no mother, that storm that knows no father
[1639, 1038, 479, 3000, 772, 1236, 115, 479, 1038, 479, 3000, 772, 982, 1]
That storm that knows no mother, that storm that knows no father


In [24]:
# tokenizer.decode(tokenizer.encode("Hello, my name is Frank").ids)

In [25]:
ptokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer, model_max_len=model_max_length)
ptokenizer.model_max_length = model_max_length
ptokenizer.pad_token = "<pad>"
ptokenizer.pad_token_id = tokenizer.encode("<pad>").ids[0]
ptokenizer.eos_token = "</s>"
ptokenizer.eos_token_id = tokenizer.encode("</s>").ids[0]
ptokenizer.unk_token = "<unk>"
ptokenizer.unk_token_id = tokenizer.encode("<unk>").ids[0]

In [26]:
ptokenizer.decode(ptokenizer.encode("Hello, my name is Frank"))

'Hello, my name is Frank</s>'

In [27]:
ptokenizer.model_max_length

512

In [28]:
tokenizer = ptokenizer

## Build the Train Dataset

In [29]:
all_sources_dataset = Dataset.from_dict({"source": [x[1] for x in all_sources[:1000]]})

In [30]:
all_sources_dataset[11]

{'source': 'i-{d}utu kur-ra ba-e-dab5-be2'}

In [31]:
dataset = all_sources_dataset.train_test_split(test_size=0.1)

In [32]:
original_tests = dataset["test"]
original_tests

Dataset({
    features: ['source'],
    num_rows: 100
})

## Tokenize the Data

In [33]:
tokenizer.model_max_length

512

In [34]:
print("pad", tokenizer.pad_token, tokenizer.pad_token_id)
print("eos", tokenizer.eos_token, tokenizer.eos_token_id)
print("unk", tokenizer.unk_token, tokenizer.unk_token_id)

pad <pad> 0
eos </s> 1
unk <unk> 2


In [35]:
def corrupt_sources(sources):
    
    nsources = len(sources)
    print("-"*10, nsources)
    targets = []
    for i in range(nsources):
        print(len(sources[i]))
    return sources

In [36]:
ccc = 0
sum_src_chars_per_token = 0.0
num_src_chars_per_token = 0
sum_tgt_chars_per_token = 0.0
num_tgt_chars_per_token = 0

def preprocess_function(examples):
    global ccc, sum_src_chars_per_token, sum_tgt_chars_per_token, num_src_chars_per_token, num_tgt_chars_per_token
#     print(examples)
    inputs = [example for example in examples["source"]]
    
    model_inputs = tokenizer(inputs, max_length=model_max_length, truncation=True)
    
    with tokenizer.as_target_tokenizer():
#         labels = tokenizer(targets, max_length=model_max_length, truncation=True)
        labels = corrupt_sources(model_inputs["input_ids"])

    model_inputs["labels"] = labels
    
    nexamples = len(inputs)
    for i in range(nexamples):
        nchar = len(inputs[i])
        ntoks = len(model_inputs["input_ids"][i])
        if ntoks > 0:
            sum_src_chars_per_token += nchar / ntoks
            num_src_chars_per_token += 1
    
    ccc += 1
    if ccc == 1:
        print(model_inputs["input_ids"][0])
        print(model_inputs["labels"][0])

    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

---------- 900
34
14
14
5
8
3
11
4
7
10
17
33
6
5
10
28
8
8
5
8
6
9
14
12
6
12
57
17
11
9
7
11
4
9
13
4
8
33
18
11
9
36
5
10
19
15
8
43
28
7
14
11
19
6
11
20
12
21
18
12
10
13
5
29
44
19
6
8
6
21
16
13
24
13
11
15
6
9
16
11
5
3
5
12
8
15
4
16
10
30
4
4
13
7
5
5
9
11
7
15
28
8
14
6
12
11
14
16
6
9
4
6
4
11
14
3
28
7
8
8
3
7
26
14
4
6
13
13
19
6
7
8
17
18
15
8
18
14
13
5
5
8
12
22
18
7
4
28
7
11
4
10
9
9
7
20
14
9
4
5
9
14
7
27
5
17
7
28
4
10
12
12
19
8
8
4
18
9
6
8
10
13
8
15
9
21
12
7
7
12
7
17
7
13
14
15
9
7
15
4
22
6
19
20
18
7
12
5
21
8
5
15
7
13
6
4
15
13
25
16
13
40
9
4
3
10
6
8
13
32
16
10
8
9
12
28
6
14
20
12
19
6
12
10
18
26
14
8
16
10
17
9
21
3
7
14
11
8
9
17
8
13
25
18
16
16
13
16
3
12
28
18
5
29
7
9
17
7
6
10
26
10
5
14
9
11
11
6
6
11
16
10
23
14
12
18
16
10
14
12
41
12
23
9
13
33
11
7
9
9
18
4
7
14
27
10
6
3
4
36
15
7
5
9
40
6
20
11
23
11
33
4
14
5
10
20
7
5
28
14
7
6
11
5
11
10
5
24
18
14
19
16
15
21
7
4
22
7
13
15
10
16
9
14
16
6
9
11
25
5
4
11
17
28
13
7
15
44
8
5
5
31
6



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

---------- 100
7
7
4
23
16
13
5
20
3
19
4
5
30
15
22
17
7
11
9
5
32
15
6
10
10
13
7
12
5
24
35
13
9
29
31
6
13
13
9
12
10
9
7
6
62
2
24
7
19
9
11
18
9
4
5
11
18
18
17
10
19
12
20
23
11
6
9
10
20
10
3
9
37
12
27
12
12
17
3
14
12
3
16
23
6
27
17
9
10
19
63
25
15
6
6
17
6
12
2
11


DatasetDict({
    train: Dataset({
        features: ['source', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 900
    })
    test: Dataset({
        features: ['source', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [37]:
avg_src_chars_per_token = sum_src_chars_per_token / num_src_chars_per_token
print("avg_src_chars_per_token", "=", avg_src_chars_per_token)

avg_src_chars_per_token = 2.6098093722429088


In [38]:
tokenized_dataset["train"] = tokenized_dataset["train"].remove_columns(["source"])
tokenized_dataset["test"] = tokenized_dataset["test"].remove_columns(["source"])
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 900
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [39]:
source_max_length = max([len(x["input_ids"]) for x in tokenized_dataset["train"]])
source_max_length

74

In [40]:
tokenized_dataset["train"][0]["labels"]

[321,
 122,
 116,
 468,
 383,
 121,
 650,
 121,
 116,
 768,
 323,
 121,
 116,
 273,
 116,
 286,
 116,
 722,
 124,
 1046,
 122,
 323,
 121,
 116,
 391,
 116,
 405,
 116,
 501,
 116,
 2518,
 116,
 170,
 1]

## Load the Model

In [41]:
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_model_id, 
                                                   max_length=model_max_length)

In [42]:
model_config = copy.deepcopy(base_model.config)
model_config.vocab_size = tokenizer.vocab_size
model_config.max_length, model_config.vocab_size

(512, 36784)

In [43]:
# BE FUCKING CAREFULL. STOP ALL PROCESSES AND OPEN SYSTEM MONITOR. ELSE WILL CRASH PC.!!!!

model = AutoModelForSeq2SeqLM.from_config(model_config)

## Train

In [44]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# data_collator

In [45]:
num_train_sequences = len(tokenized_dataset["train"])
num_train_sequences

900

In [46]:
from torch import optim
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

warm = int(0.1 * num_train_sequences/batch_size)
learning_rate = 5e-5
num_train_epochs = 10

training_args = Seq2SeqTrainingArguments(
    output_dir=f"../results/{model_id}",
    evaluation_strategy="steps",
    eval_steps=int(0.5 * num_train_sequences/batch_size),
    warmup_steps=warm,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    fp16=has_cuda,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [47]:
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=1130, training_loss=5.626481155800608, metrics={'train_runtime': 75.8095, 'train_samples_per_second': 118.719, 'train_steps_per_second': 14.906, 'total_flos': 67072436207616.0, 'train_loss': 5.626481155800608, 'epoch': 10.0})

## Sample

In [48]:
pipeline = TranslationPipeline(model=model.to("cpu"), tokenizer=tokenizer, max_length=model_max_length)

In [49]:
pipeline

<transformers.pipelines.text2text_generation.TranslationPipeline at 0x782bde9cb790>

In [50]:
pipeline("translate English to French: hello my name is Frank")

[{'translation_text': '1(disz)'}]

In [51]:
source_test = {"sux": "# (gesz)esi - |_gisz-kal_|# = %a u2-szu-u2-um # gesz-kinx(|_hi_xNUN|) - |_gisz-hi_xNUN| = %a ki-isz-ka-nu-um # (gesz)szennur - |_gisz_-((x))-_gisz_%_gisz_| = %a sza-lu-ru-u2-um # (gesz)mes - |_gisz-mes_| = %a s,u2-ul-mu-u2 # (gesz)mes - = %a me-su-um # (gesz)mes - = %a me-er-tu-u2-um # (gesz)ha-lu-ub2 - |_gisz-ha-lu-sze3_| = %a hu-lu-pu-um # (gesz)targulx(|_ma2-kak_|) - |_gisz-ma2-kak_| = %a te-er-gu-ul-lu-u2 # (gesz)madal - |_gisz-bu_| = %a er-ru-u2 # (gesz)madal - = %a ma-tu-u2-um", "en": "ebony birch? plum a black wood a kind of tree a kind of tree a kind of ?cherry? tree mooring post peg? a kind of pole"}["test"][0]["source"]
target_test = translations["test"][0]["target"]
print(source_test)
print("-"*80)
print(target_test)

NameError: name 'translations' is not defined

In [52]:
def translate(text):
    return pipeline(text)

translate(source_test)

NameError: name 'source_test' is not defined

In [None]:
tests = original_tests
def sample(num_samples=100):
    for i in range(min(num_samples, tests.num_rows)):
        t = tests[i]
    #     print(t)
        src = t["source"]
        tgt = t["target"]
        query = src
        pred = pipeline(query)[0]["translation_text"]
        print("-"*48)
        print("QUERY ", query)
        print("TARGET", tgt)
        print("PRED  ", pred)
    #     break
    
sample()

## Save to Huggingface

In [53]:
# make model path
model_path = f"../results/{model_id}"
model_path = os.path.relpath(model_path)
trainer.save_model(model_path)
model_path

'../results/t5-small-pre-p-l-akksux-en-20240108-221858'

In [54]:
tokenizer.save_pretrained(model_path)

('../results/t5-small-pre-p-l-akksux-en-20240108-221858/tokenizer_config.json',
 '../results/t5-small-pre-p-l-akksux-en-20240108-221858/special_tokens_map.json',
 '../results/t5-small-pre-p-l-akksux-en-20240108-221858/tokenizer.json')