In [1]:
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling
)
import torch

In [2]:
# Paths
# Training / validation splits, loaded as JSON; main() reads the "text" field of each record.
TRAIN_FILE = "data/train.json"
VAL_FILE = "data/val.json"
# Name of the pretrained checkpoint passed to from_pretrained() for both tokenizer and model.
MODEL_NAME = "gpt2" # Small model for demonstration
# Directory used for Trainer checkpoints and for the final saved model + tokenizer.
OUTPUT_DIR = "models/bmw-gpt2"

In [3]:
def _load_tokenizer(model_name):
    """Load the tokenizer for ``model_name`` and make sure it can pad batches.

    Args:
        model_name: Checkpoint name or path accepted by
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The tokenizer, with ``pad_token`` set.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # GPT-2 doesn't have a pad token, so we use eos_token. Only fill it in when
    # missing so a checkpoint that ships its own pad token keeps it.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def _tokenize_splits(dataset, tokenizer, max_length=512):
    """Tokenize the ``"text"`` column of every split and drop the raw text.

    Args:
        dataset: Object returned by ``load_dataset``; each split must have a
            ``"text"`` column.
        tokenizer: Tokenizer used to encode the text.
        max_length: Sequences longer than this many tokens are truncated.

    Returns:
        The mapped dataset with the ``"text"`` column removed.
    """
    def tokenize_function(examples):
        # No padding here: the data collator pads each batch at collation time.
        return tokenizer(examples["text"], truncation=True, max_length=max_length)

    return dataset.map(tokenize_function, batched=True, remove_columns=["text"])


def _build_training_args(output_dir):
    """Return the ``TrainingArguments`` for the short demonstration run.

    Args:
        output_dir: Directory passed to the Trainer as ``output_dir``.
    """
    return TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=5,              # Short run for demo
        per_device_train_batch_size=2,   # Small batch for compatibility
        per_device_eval_batch_size=2,
        eval_strategy="steps",
        eval_steps=5,                   # Evaluate frequently to show logs
        logging_steps=5,
        save_steps=10,
        learning_rate=5e-5,
        weight_decay=0.01,
        use_cpu=not torch.cuda.is_available(), # Fallback if no GPU
        report_to="none"                 # Disable wandb for simple local demo
    )


def main():
    """Fine-tune ``MODEL_NAME`` on the JSON text dataset and save the result.

    Loads ``TRAIN_FILE`` / ``VAL_FILE``, tokenizes the ``"text"`` field, trains a
    causal language model with the Hugging Face ``Trainer``, then writes the
    model and tokenizer to ``OUTPUT_DIR``.

    Returns:
        The ``Trainer`` after training, so later notebook cells can inspect its
        state or the trained model. Callers that ignore the return value are
        unaffected.
    """
    # 1. Load Dataset
    dataset = load_dataset("json", data_files={"train": TRAIN_FILE, "validation": VAL_FILE})

    # 2. Tokenizer + tokenized splits
    tokenizer = _load_tokenizer(MODEL_NAME)
    tokenized_datasets = _tokenize_splits(dataset, tokenizer)

    # 3. Data Collator (Handles dynamic padding); mlm=False selects the
    #    causal-LM objective rather than masked-LM.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # 4. Model
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    # 5. Training Arguments
    training_args = _build_training_args(OUTPUT_DIR)

    # 6. Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()

    print("Saving model...")
    trainer.save_model(OUTPUT_DIR)
    # Save the tokenizer next to the weights so OUTPUT_DIR can be reloaded on its own.
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"Model saved to {OUTPUT_DIR}")

    return trainer

# Start the pipeline when this code is executed directly (as a script, or as a
# notebook cell, where __name__ is "__main__"); importing it as a module does
# not trigger training.
if __name__ == "__main__":
    main()

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Starting training...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
5,3.5271,3.262907
10,3.2499,3.112149
15,2.9852,3.022456
20,2.8515,2.954504
25,2.677,2.909955
30,2.6877,2.873495
35,2.3091,2.847753
40,2.4575,2.83169
45,2.3707,2.826689


Saving model...
Model saved to models/bmw-gpt2
