In [1]:
# ===============================================
# FINAL KAGGLE SCRIPT: TikZ -> Natural Language GPT-2 Fine-tuning
# ===============================================

import os
import sys
import subprocess
import shutil

# -----------------------------
# INSTALL REQUIRED PACKAGES
# -----------------------------
subprocess.run([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
subprocess.run([sys.executable, "-m", "pip", "install",
                "torch",
                "transformers==4.34.0",
                "datasets>=2.20.0",
                "accelerate",
                "pyarrow==19.0.0"
               ])

# -----------------------------
# IMPORT AFTER INSTALL
# -----------------------------
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# -----------------------------
# CONFIGURATION
# -----------------------------
LOCAL_MODEL_PATH = "/kaggle/input/gpt2-local"   # uploaded GPT-2 model
WRITABLE_MODEL_PATH = "/kaggle/working/gpt2-local-copy"
OUTPUT_DIR = "/kaggle/working/tikz_gpt2_finetuned"
BATCH_SIZE = 2
EPOCHS = 1
MAX_LEN = 512
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# -----------------------------
# COPY MODEL TO WRITABLE LOCATION
# -----------------------------
if not os.path.exists(WRITABLE_MODEL_PATH):
    shutil.copytree(LOCAL_MODEL_PATH, WRITABLE_MODEL_PATH)
print("Copied GPT-2 files to writable folder:", WRITABLE_MODEL_PATH)

# -----------------------------
# LOAD LOCAL GPT-2
# -----------------------------
tokenizer = GPT2Tokenizer.from_pretrained(WRITABLE_MODEL_PATH, local_files_only=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(WRITABLE_MODEL_PATH, local_files_only=True)
model.to(device)
print("✅ Model and tokenizer loaded successfully from local files")

# -----------------------------
# LOAD TikZ DATASET
# -----------------------------
dataset = load_dataset("nllg/datikz-v2", split="train")

print("Dataset columns:", dataset.column_names)
print("Example item:", dataset[0])

# -----------------------------
# CUSTOM DATASET CLASS
# -----------------------------
class TikzDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = []
        for item in dataset:
            # Using flexible key handling
            tikz_code = item.get("tikz_code") or item.get("code") or item.get("source") or ""
            nl_instr = item.get("description") or item.get("target") or ""
            self.texts.append(f"<TIKZ> {tikz_code} <NL> {nl_instr}")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        input_ids = enc["input_ids"].squeeze()
        attention_mask = enc["attention_mask"].squeeze()
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids}

# -----------------------------
# PREPARE DATASET
# -----------------------------
train_dataset = TikzDataset(dataset, tokenizer, max_len=MAX_LEN)

# -----------------------------
# TRAINING ARGUMENTS
# -----------------------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=2,
    fp16=True if device == "cuda" else False,
    report_to="none"
)

# -----------------------------
# TRAINER INITIALIZATION
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

# -----------------------------
# START TRAINING
# -----------------------------
trainer.train()

# -----------------------------
# SAVE FINE-TUNED MODEL
# -----------------------------
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"✅ Fine-tuned model saved to {OUTPUT_DIR}")


Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 22.6 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting transformers==4.34.0
  Downloading transformers-4.34.0-py3-none-any.whl.metadata (121 kB)
Collecting pyarrow==19.0.0
  Downloading pyarrow-19.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers==4.34.0)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.15,>=0.14 (from transformers==4.34.0)
  Downloading tokenizers-0.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers==4.34.0)
  

ERROR: Cannot install datasets==2.20.0, datasets==2.21.0, datasets==3.0.0, datasets==3.0.1, datasets==3.0.2, datasets==3.1.0, datasets==3.2.0, datasets==3.3.0, datasets==3.3.1, datasets==3.3.2, datasets==3.4.0, datasets==3.4.1, datasets==3.5.0, datasets==3.5.1, datasets==3.6.0, datasets==4.0.0, datasets==4.1.0, datasets==4.1.1, datasets==4.2.0, pyarrow==19.0.0, transformers and transformers==4.34.0 because these package versions have conflicting dependencies.
ERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts
2025-10-23 08:20:37.105232: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761207637.274289      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1

Using device: cuda
Copied GPT-2 files to writable folder: /kaggle/working/gpt2-local-copy
✅ Model and tokenizer loaded successfully from local files


README.md:   0%|          | 0.00/785 [00:00<?, ?B/s]

data/train-00000-of-00009-1ba2669275ec2c(…):   0%|          | 0.00/410M [00:00<?, ?B/s]

data/train-00001-of-00009-f994347fd16d78(…):   0%|          | 0.00/445M [00:00<?, ?B/s]

data/train-00002-of-00009-be813d6d08dad7(…):   0%|          | 0.00/457M [00:00<?, ?B/s]

data/train-00003-of-00009-5cd5c0466f303f(…):   0%|          | 0.00/437M [00:00<?, ?B/s]

data/train-00004-of-00009-c417b80528687e(…):   0%|          | 0.00/423M [00:00<?, ?B/s]

data/train-00005-of-00009-722d02ebd0caaa(…):   0%|          | 0.00/443M [00:00<?, ?B/s]

data/train-00006-of-00009-8188eaafbeca1b(…):   0%|          | 0.00/374M [00:00<?, ?B/s]

data/train-00007-of-00009-3192ca07ba28f0(…):   0%|          | 0.00/428M [00:00<?, ?B/s]

data/train-00008-of-00009-0e8a359d9b760f(…):   0%|          | 0.00/433M [00:00<?, ?B/s]

data/test-00000-of-00001-e9214e6870e54ff(…):   0%|          | 0.00/28.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/94532 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/442 [00:00<?, ? examples/s]

Dataset columns: ['caption', 'code', 'image', 'pdf', 'uri', 'origin', 'date']
Example item: {'caption': 'An overview of the object extraction procedure used for the Relational Network experiments by Santoro et al. (2017), illustrating how to use individual pixels of a CNN representation as objects.', 'code': '\\documentclass[crop,tikz]{standalone}\n\\usepackage{tikz}\n\n\\usetikzlibrary{arrows,decorations.pathmorphing,backgrounds,positioning}\n\n\\definecolor{echoreg}{HTML}{2cb1e1}\n\\definecolor{olivegreen}{rgb}{0,0.6,0}\n\\definecolor{mymauve}{rgb}{0.58,0,0.82}\n\n\\usepackage{etoolbox}\n\n\\newtoggle{redraw}\n\\newtoggle{redraw2}\n\n\\tikzset{%\npics/cube/.style args={#1/#2/#3/#4}{code={%\n\t\\begin{scope}[line width=#4mm]\n\t\\begin{scope}\n\t\\clip (-#1,-#2,0) -- (#1,-#2,0) -- (#1,#2,0) -- (-#1,#2,0) -- cycle;\n\t\\filldraw (-#1,-#2,0) -- (#1,-#2,0) -- (#1,#2,0) -- (-#1,#2,0) -- cycle;\n\t\\end{scope}\n\\iftoggle{redraw}{%\n}{%\n\t\\begin{scope}\n\t\\clip (-#1,-#2,0) -- (-#1-#3,-#

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,0.9188
100,0.7096
150,0.6606
200,0.5889
250,0.5746
300,0.5892
350,0.5611
400,0.5771
450,0.525
500,0.5589


✅ Fine-tuned model saved to /kaggle/working/tikz_gpt2_finetuned
