In [1]:
from data import TranslationDataset, DataLoader
import config
from tokenizer import Tokenizer
import torch
from typing import Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The tokenizer is loaded from the model file located under config.llama_path.
tokenizer_model_file = f"{config.llama_path}/tokenizer.model"
tokenizer = Tokenizer(model_path=tokenizer_model_file)

# Validation split of the "de-en" dataset, translating from "de" to "en".
ds = TranslationDataset(
    tokenizer=tokenizer,
    split="validation",
    dataset_hf_id="de-en",
    source_lang="de",
    target_lang="en",
)

# Loader that serves the dataset above in batches of 32 examples.
dl = DataLoader(tokenizer=tokenizer, dataset=ds, batch_size=32)

In [3]:
# First example: op[0] is the token-id sequence and op[1] is the index that
# separates the non-trainable prefix from the trainable remainder.
op = ds[0]
split_at = op[1]

prefix_text = tokenizer.decode(op[0][:split_at])
print(f"Non trainable part: {prefix_text}")

remainder_text = tokenizer.decode(op[0][split_at:])
print(f"Trainable part: {remainder_text}")

Non trainable part: <|begin_of_text|>Translate: De: Eine republikanische Strategie, um der Wiederwahl von Obama entgegenzutreten En: 
Trainable part: A Republican strategy to counter the re-election of Obama<|end_of_text|>


In [4]:
# Show the raw token ids on either side of the boundary index op[1]:
# everything before it is the non-trainable part, the rest is trainable.
boundary = op[1]

frozen_ids = op[0][:boundary]
print(f"Non trainable part: {frozen_ids}")

trainable_ids = op[0][boundary:]
print(f"Trainable part: {trainable_ids}")

Non trainable part: [128000, 28573, 25, 220, 1951, 25, 52410, 107684, 276, 10782, 29323, 648, 11, 4543, 2761, 468, 22970, 73065, 6675, 7250, 1218, 713, 4469, 89, 332, 86985, 2998, 25, 220]
Trainable part: [32, 9540, 8446, 311, 5663, 279, 312, 43733, 315, 7250, 128001]


In [5]:
# Alignment check between input and target windows over the trainable span.
#
# op[0] is the token-id sequence and op[1] the index of the first trainable
# token. For offset k (1, 2, 3):
#   * the input window starts one token before the trainable span and drops
#     the last k tokens of the sequence;
#   * the target window starts k-1 tokens into the trainable span and runs to
#     the end of the sequence.
# Both windows therefore have the same length, and every target token is the
# token k positions after the corresponding input token (inp[k:] == tgt[:-k]).
#
# One loop replaces three copy-pasted stanzas that differed only in k; the
# printed output is unchanged, and `inp` / `tgt` still hold the k=3 windows
# once the cell has run.
for k, ordinal in enumerate(("1st", "2nd", "3rd"), start=1):
    print(f"for {ordinal} token prediction:")
    inp, tgt = op[0][op[1]-1:-k], op[0][op[1]+k-1:]
    assert len(inp) == len(tgt)
    assert inp[k:] == tgt[:-k]
    print(f"input: {inp}")
    print(f"target: {tgt}")

for 1st token prediction:
input: [220, 32, 9540, 8446, 311, 5663, 279, 312, 43733, 315, 7250]
target: [32, 9540, 8446, 311, 5663, 279, 312, 43733, 315, 7250, 128001]
for 2nd token prediction:
input: [220, 32, 9540, 8446, 311, 5663, 279, 312, 43733, 315]
target: [9540, 8446, 311, 5663, 279, 312, 43733, 315, 7250, 128001]
for 3rd token prediction:
input: [220, 32, 9540, 8446, 311, 5663, 279, 312, 43733]
target: [8446, 311, 5663, 279, 312, 43733, 315, 7250, 128001]


In [6]:
# Index the dataset with a slice: ten consecutive examples starting at index 2.
# The slice result unpacks into two collections whose lengths are displayed.
first_idx, n_examples = 2, 10
tokens, start_positions = ds[first_idx:first_idx + n_examples]
(len(tokens), len(start_positions))

(10, 10)

In [7]:
# Walk through the loader once and print the shapes of the two items each
# batch yields (token batch first, start positions second).
for batch in dl:
    tokens, start_positions = batch
    print(tokens.shape, start_positions.shape)

torch.Size([32, 133]) torch.Size([32, 1])
torch.Size([32, 108]) torch.Size([32, 1])
torch.Size([32, 155]) torch.Size([32, 1])
torch.Size([32, 184]) torch.Size([32, 1])
torch.Size([32, 162]) torch.Size([32, 1])
torch.Size([32, 178]) torch.Size([32, 1])
torch.Size([32, 136]) torch.Size([32, 1])
torch.Size([32, 179]) torch.Size([32, 1])
torch.Size([32, 82]) torch.Size([32, 1])
torch.Size([32, 84]) torch.Size([32, 1])
torch.Size([32, 127]) torch.Size([32, 1])
torch.Size([32, 130]) torch.Size([32, 1])
torch.Size([32, 116]) torch.Size([32, 1])
torch.Size([32, 228]) torch.Size([32, 1])
torch.Size([32, 131]) torch.Size([32, 1])
torch.Size([32, 143]) torch.Size([32, 1])
torch.Size([32, 107]) torch.Size([32, 1])
torch.Size([32, 142]) torch.Size([32, 1])
torch.Size([32, 136]) torch.Size([32, 1])
torch.Size([32, 176]) torch.Size([32, 1])
torch.Size([32, 127]) torch.Size([32, 1])
torch.Size([32, 118]) torch.Size([32, 1])
torch.Size([32, 187]) torch.Size([32, 1])
torch.Size([32, 170]) torch.Size([32