Minimal training script for a tiny decoder-only (causal LM) model.

- Uses a tokenizer saved by train_tokenizer.py (e.g., in fin_lemma.tokenizer)
- Loads dataset JSON created by to_dataset.py with fields: {"input": ..., "target": ...}
- Builds training sequences: <bos> INPUT <lemma> TARGET <eos>
- Trains a very small model for demonstration purposes
- Saves the trained model and tokenizer to output directory


In [1]:
import os
import argparse
import json

In [2]:
import torch
from torch.utils.data import Dataset
from datasets import Dataset as HFDataset

In [3]:
import transformers

In [4]:
def create_tiny_gpt2_config(vocab_size, bos_token_id=None, eos_token_id=None):
    """Return a very small GPT-2 style config for quick training.

    Keep dimensions tiny to fit in memory and train quickly.

    Args:
        vocab_size: Size of the model vocabulary (number of embedding rows).
        bos_token_id: Id of the beginning-of-sequence token. When omitted,
            ``vocab_size - 2`` is used as a placeholder.
        eos_token_id: Id of the end-of-sequence token. When omitted,
            ``vocab_size - 1`` is used as a placeholder.

    Returns:
        A ``transformers.GPT2Config`` with 2 layers, 2 heads, 128-dimensional
        embeddings and a 128-token context.
    """
    # The fallbacks are only guesses: this function does not look at any
    # tokenizer, so nothing here guarantees they match the real special-token
    # ids. Callers that know the real ids should pass them explicitly.
    if bos_token_id is None:
        bos_token_id = vocab_size - 2
    if eos_token_id is None:
        eos_token_id = vocab_size - 1

    return transformers.GPT2Config(
        vocab_size=vocab_size,
        n_positions=128,
        n_ctx=128,
        n_embd=128,
        n_layer=2,
        n_head=2,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
    )

In [5]:
# Module-level parameters (friendly for Jupyter but not so great for production)
# Every tunable value for the run lives in this cell; later cells only read them.
tokenizer_dir = "lemma.tokenizer"  # directory the tokenizer is loaded from
data_path = "dataset.json"  # JSON dataset file read with HFDataset.from_json
out_dir = "fin_lemma.model"  # checkpoints, final model and tokenizer are written here
block_size = 128  # max_length for tokenization; create_tiny_gpt2_config hard-codes n_positions=128
batch_size = 16  # per-device training batch size
epochs = 15  # number of training epochs
lr = 5e-4  # learning rate passed to TrainingArguments
warmup_steps = 0  # scheduler warmup steps
weight_decay = 0.0  # weight decay passed to TrainingArguments
seed = 42  # used for the dataset shuffle/split and for TrainingArguments

In [6]:
# Make output directory
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(out_dir, exist_ok=True)

In [7]:
# Load tokenizer
# Reads the saved tokenizer files from tokenizer_dir; the same object is reused
# for tokenization, the data collator, sample generation and the final save.
tokenizer = transformers.PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)

In [8]:
# Sanity check: encode one sample line and decode it again to eyeball the round trip.
sample_text = "v ä k i v a l t a i s t e n ADJ Case=Gen Degree=Pos Derivation=Inen Number=Plur ::: v ä k i # v a l t a i n e n"
sample_encoding = tokenizer(sample_text)
tokenizer.decode(sample_encoding["input_ids"])

'v ä k i v a l t a i s t e n ADJ Case=Gen Degree=Pos Derivation=Inen Number=Plur ::: v ä k i # v a l t a i n e n'

In [9]:
# Load dataset directly from JSON, shuffle it, and hold out 5% for evaluation
dataset = (
    HFDataset.from_json(data_path)
    .shuffle(seed=seed)
    .train_test_split(test_size=0.05, seed=seed)
)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

print(f"Train examples: {len(train_dataset)}")
print(f"Eval examples: {len(eval_dataset)}")



Train examples: 154674
Eval examples: 8141


In [10]:
# Build full sequences and tokenize using map
def tokenize_function(example):
    """Tokenize one dataset record as a single training sequence.

    The record's ``input`` and ``target`` fields are joined into
    ``<bos> input <lemma> target <eos>`` and passed to the notebook-level
    ``tokenizer``. The encoding is truncated to ``block_size`` tokens, is not
    padded, and is returned as plain Python lists (no tensors).
    """
    source = example['input']
    target = example['target']
    sequence = f"<bos> {source} <lemma> {target} <eos>"
    return tokenizer(
        sequence,
        max_length=block_size,
        truncation=True,
        padding=False,
        return_tensors=None,
    )

In [11]:
# Tokenize and drop original text columns to avoid collator string errors.
# The tokenized splits are bound to a new name so that `dataset` keeps its raw
# text columns; overwriting it made this cell fail on a second run because
# tokenize_function could no longer find the 'input'/'target' fields.
text_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(tokenize_function, remove_columns=text_columns)
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["test"]

Map:   0%|          | 0/8141 [00:00<?, ? examples/s]

In [12]:
# Initialize tiny model
# Seed the RNGs first: the model weights are randomly initialised in this cell,
# before the Trainer (which receives the same seed) has been constructed.
transformers.set_seed(seed)

config = create_tiny_gpt2_config(vocab_size=len(tokenizer))

# create_tiny_gpt2_config only guesses the special-token ids from vocab_size.
# Copy the tokenizer's real ids into the config so the saved model agrees with
# the tokenizer; ids the tokenizer does not define are left untouched.
if tokenizer.bos_token_id is not None:
    config.bos_token_id = tokenizer.bos_token_id
if tokenizer.eos_token_id is not None:
    config.eos_token_id = tokenizer.eos_token_id
if tokenizer.pad_token_id is not None:
    config.pad_token_id = tokenizer.pad_token_id

model = transformers.GPT2LMHeadModel(config)

In [13]:
# Training arguments, grouped by concern
training_args = transformers.TrainingArguments(
    # Checkpointing
    output_dir=out_dir,
    save_steps=5000,
    save_total_limit=1,
    # Optimisation
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    learning_rate=lr,
    lr_scheduler_type="linear",
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    fp16=False,
    # Evaluation
    do_eval=True,
    eval_strategy="steps",
    eval_steps=5000,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Logging and bookkeeping
    logging_steps=50,
    report_to=[],  # disable W&B etc.
    remove_unused_columns=True,
    seed=seed,
)

In [14]:
# Data collator: mask input portion, train on target portion
class DataCollatorForCompletionLM(transformers.DataCollatorForLanguageModeling):
    """Causal-LM collator whose labels cover only the tokens after ``<lemma>``.

    Batching is delegated to the parent collator (with ``mlm=False``). The
    labels it returns are then set to -100 at every position up to and
    including the first ``<lemma>`` token of each row. Rows that contain no
    ``<lemma>`` token are masked entirely.
    """

    def __init__(self, tokenizer, lemma_token_id):
        """Store the id of the ``<lemma>`` separator token."""
        super().__init__(tokenizer=tokenizer, mlm=False)
        self.lemma_token_id = lemma_token_id

    def __call__(self, features):
        """Collate ``features`` and mask the prompt part of the labels."""
        batch = super().__call__(features)

        is_lemma = (batch["input_ids"] == self.lemma_token_id).long()
        # For each position, count the <lemma> tokens strictly to its left.
        # A count of zero means the position is at or before the first <lemma>
        # (or the row has none), which is exactly the span to mask.
        lemmas_before = is_lemma.cumsum(dim=1) - is_lemma
        batch["labels"] = batch["labels"].masked_fill(lemmas_before == 0, -100)
        return batch

In [15]:
lemma_token_id = tokenizer.convert_tokens_to_ids("<lemma>")
# Fail fast when the tokenizer has no dedicated <lemma> token. Otherwise the
# lookup resolves to None or the unknown-token id, and the collator and the
# sampling callback would search for the wrong separator.
if lemma_token_id is None or lemma_token_id == tokenizer.unk_token_id:
    raise ValueError(
        f"Tokenizer loaded from {tokenizer_dir!r} does not define a '<lemma>' token"
    )
data_collator = DataCollatorForCompletionLM(tokenizer, lemma_token_id)

In [16]:
# Callback: sample predictions every `every_n_steps` steps (5000 by default)
class EvalCallback(transformers.TrainerCallback):
    """Print greedy predictions for a few random eval examples during training.

    Relies on the notebook-level ``eval_dataset``, ``tokenizer`` and
    ``lemma_token_id``.

    Args:
        every_n_steps: Print samples whenever ``global_step`` is a positive
            multiple of this value.
        num_samples: Maximum number of eval examples to sample each time.
        max_new_tokens: Generation budget for each sampled example.
    """

    def __init__(self, every_n_steps=5000, num_samples=5, max_new_tokens=50):
        super().__init__()
        self.every_n_steps = every_n_steps
        self.num_samples = num_samples
        self.max_new_tokens = max_new_tokens

    def on_step_end(self, args, state, control, **kwargs):
        """Generate and print sample predictions on the configured steps."""
        import random

        if state.global_step <= 0 or state.global_step % self.every_n_steps != 0:
            return

        # Prefer the model handed to the callback; fall back to the
        # notebook-level `model` when none is supplied.
        lm = kwargs.get("model")
        if lm is None:
            lm = model

        print(f"\n=== Step {state.global_step} - Sample Predictions ===")

        # Pick random eval examples
        sample_count = min(self.num_samples, len(eval_dataset))
        indices = random.sample(range(len(eval_dataset)), sample_count)

        was_training = lm.training
        lm.eval()
        try:
            with torch.no_grad():
                for i, idx in enumerate(indices):
                    example = eval_dataset[idx]
                    input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(lm.device)

                    # Find <lemma> token position; examples without one are skipped
                    lemma_pos = (input_ids[0] == lemma_token_id).nonzero(as_tuple=True)[0]
                    if len(lemma_pos) == 0:
                        continue

                    # Prompt is everything up to and including <lemma>
                    prompt = input_ids[:, :lemma_pos[0] + 1]

                    generated = lm.generate(
                        prompt,
                        # Single unpadded sequence, so every position is attended to
                        attention_mask=torch.ones_like(prompt),
                        max_new_tokens=self.max_new_tokens,
                        do_sample=False,
                        pad_token_id=tokenizer.pad_token_id,
                        eos_token_id=tokenizer.eos_token_id,
                    )

                    # Decode and print
                    input_text = tokenizer.decode(input_ids[0], skip_special_tokens=False)
                    generated_text = tokenizer.decode(generated[0], skip_special_tokens=False)

                    print(f"\nExample {i+1}:")
                    print(f"Input:  {input_text}")
                    print(f"Output: {generated_text}")
        finally:
            # Restore the previous mode even if generation raised
            lm.train(was_training)

In [17]:
# Assemble trainer
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    # Recent transformers releases deprecate the `tokenizer=` keyword in
    # favour of `processing_class=`.
    processing_class=tokenizer,
    callbacks=[EvalCallback()],
)

  trainer = transformers.Trainer(


[2025-10-06 17:49:37,150] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/rh/gcc-toolset-13/root/usr/libexec/gcc/x86_64-redhat-linux/13/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlvsym'
/opt/rh/gcc-toolset-13/root/usr/libexec/gcc/x86_64-redhat-linux/13/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/opt/rh/gcc-toolset-13/root/usr/libexec/gcc/x86_64-redhat-linux/13/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'
/opt/rh/gcc-toolset-13/root/usr/libexec/gcc/x86_64-redhat-linux/13/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'
/opt/rh/gcc-toolset-13/root/usr/libexec/gcc/x86_64-redhat-linux/13/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'
collect2: error: ld returned 1 exit status


[2025-10-06 17:49:42,418] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


In [18]:
# Train
# Runs the whole training loop configured in training_args; periodic evaluation,
# checkpointing and the EvalCallback sample printing all happen inside this call.
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
5000,0.6129,0.406633
10000,0.4268,0.258427
15000,0.3263,0.198146
20000,0.2865,0.173647
25000,0.2904,0.157919
30000,0.2191,0.137031
35000,0.2172,0.131471
40000,0.2002,0.118166
45000,0.2216,0.104444
50000,0.1705,0.106261



=== Step 5000 - Sample Predictions ===

Example 1:
Input:  <bos> s u u n t a a n NOUN Case=Ill Number=Sing <lemma> s u u n t a <eos>
Output: <bos> s u u n t a a n NOUN Case=Ill Number=Sing <lemma> s u u n t a <eos>

Example 2:
Input:  <bos> y k s i l ö k s i NOUN Case=Tra Number=Sing <lemma> y k s i l ö <eos>
Output: <bos> y k s i l ö k s i NOUN Case=Tra Number=Sing <lemma> y k s i l ö <eos>

Example 3:
Input:  <bos> e i AUX Number=Sing Person=3 Polarity=Neg VerbForm=Fin Voice=Act <lemma> e i <eos>
Output: <bos> e i AUX Number=Sing Person=3 Polarity=Neg VerbForm=Fin Voice=Act <lemma> e i <eos>

Example 4:
Input:  <bos> . PUNCT <lemma> . <eos>
Output: <bos> . PUNCT <lemma> . <eos>

Example 5:
Input:  <bos> k u i n SCONJ <lemma> k u i n <eos>
Output: <bos> k u i n SCONJ <lemma> k u i n <eos>

=== Step 10000 - Sample Predictions ===

Example 1:
Input:  <bos> u u d e l t a ADJ Case=Abl Degree=Pos Number=Sing <lemma> u u s i <eos>
Output: <bos> u u d e l t a ADJ Case=Abl Degree=Pos Number=



TrainOutput(global_step=145020, training_loss=0.20552800493295595, metrics={'train_runtime': 1321.8912, 'train_samples_per_second': 1755.144, 'train_steps_per_second': 109.706, 'total_flos': 198387754752000.0, 'train_loss': 0.20552800493295595, 'epoch': 15.0})

In [19]:
# Save artifacts
# Write the trained model and the tokenizer into the same out_dir.
trainer.save_model(out_dir)
tokenizer.save_pretrained(out_dir)

('fin_lemma.model/tokenizer_config.json',
 'fin_lemma.model/special_tokens_map.json',
 'fin_lemma.model/tokenizer.json')

In [20]:
# Report where the artifacts were written
print(f"Model and tokenizer saved to: {out_dir}")

Model and tokenizer saved to: fin_lemma.model
