In [5]:

!pip install -q --upgrade transformers datasets accelerate


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.[0m[31m
[0m

In [None]:
# ==========================
# Robust T5-small fine-tune on ProofWriter (Colab-ready)
# Automatically handles older/newer transformers APIs
# ==========================
import inspect
import os
import pprint

import torch

# Report the installed library versions up front (useful when debugging API drift)
import transformers
import datasets

for _lib in (transformers, datasets):
    print(f"{_lib.__name__} version:", _lib.__version__)

from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
)

# Try to import seq2seq helpers (not required, but preferred if available).
# Only a missing name/module (ImportError) triggers the fallback to the plain
# Trainer; any other failure is left to propagate so that a broken install is
# not silently treated as "helpers not available".
try:
    from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
    _has_seq2seq = True
except ImportError as _seq2seq_import_error:
    print(f"Note: seq2seq helpers unavailable ({_seq2seq_import_error}); falling back to Trainer.")
    Seq2SeqTrainer = None
    Seq2SeqTrainingArguments = None
    DataCollatorForSeq2Seq = None
    _has_seq2seq = False

# 1) Pull the ProofWriter splits from the Hub and show what they contain
dataset = load_dataset("D3xter1922/proofwriter-dataset")
print(dataset)
pprint.pprint(dataset["train"][0])  # first training record, to inspect the schema

# 2) Pretrained T5-small tokenizer and model (the tokenizer is loaded first)
tokenizer, model = (
    T5Tokenizer.from_pretrained("t5-small"),
    T5ForConditionalGeneration.from_pretrained("t5-small"),
)

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# 3) Preprocessing function (handles the 'translation' nested field)
def preprocess_function(examples):
    """Tokenize a batch of records into model inputs and loss labels.

    Reads the module-level ``tokenizer``.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(batched=True)``.
            ``examples["translation"]`` is iterated; each item is normally a
            dict such as ``{'en': <input text>, 'ro': <target text>}``.

    Returns:
        The tokenizer output for the input texts (padded/truncated to 512
        tokens) with an added ``"labels"`` entry: the target token ids
        padded/truncated to 128 tokens, with every pad token id replaced by
        -100 so those positions are ignored in the loss.
    """
    inputs = []
    targets = []

    for item in examples["translation"]:
        if isinstance(item, dict):
            # Prefer the 'en'/'ro' keys, then other common names, and finally
            # the first/second value. An empty dict yields empty strings
            # instead of raising IndexError.
            values = list(item.values())
            input_text = item.get("en") or item.get("input") or (values[0] if values else "")
            target_text = (
                item.get("ro")
                or item.get("target")
                or item.get("output")
                or (values[1] if len(values) > 1 else "")
            )
        else:
            # Defensive fallback for non-dict records: no target is available
            input_text = str(item)
            target_text = ""
        inputs.append(input_text)
        targets.append(target_text)

    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing targets.
    labels = tokenizer(text_target=targets, padding="max_length", truncation=True, max_length=128)

    # Replace pad token ids with -100 so padding does not contribute to the loss
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# 4) Tokenize every split in batches (a "Map: ..." progress bar is shown while it runs)
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 5) Training options we would like to use. They are filtered further down
#    against what the chosen arguments class actually accepts.
desired_training_kwargs = dict(
    output_dir="./t5-proofwriter",
    eval_strategy="epoch",            # newer spelling; the older one is tried as an alias
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    predict_with_generate=True,       # dropped later if the arguments class lacks it
)

# Choose the (arguments class, trainer class) pair. Start from the plain
# classes and switch to the seq2seq pair only when it was imported and its
# arguments class exposes 'predict_with_generate'.
TrainingArgsClass, TrainerClass = TrainingArguments, Trainer
if _has_seq2seq:
    try:
        seq2seq_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
    except Exception:
        seq2seq_params = {}
    if "predict_with_generate" in seq2seq_params:
        TrainingArgsClass, TrainerClass = Seq2SeqTrainingArguments, Seq2SeqTrainer

# Names of the options the chosen arguments class takes in its constructor
supported = {
    name
    for name in inspect.signature(TrainingArgsClass.__init__).parameters
    if name not in ("self", "kwargs")
}

# Two spellings of the evaluation-strategy option; whichever is supported is used
alternate_names = {
    "eval_strategy": "evaluation_strategy",
    "evaluation_strategy": "eval_strategy",
}

final_kwargs = {}
for option, value in desired_training_kwargs.items():
    # Use the option's own name when supported, otherwise its alternate spelling
    chosen = option if option in supported else alternate_names.get(option)
    if chosen in supported:
        final_kwargs[chosen] = value
        continue
    # Neither spelling is accepted: leave the option out and say so
    print(f"Note: '{option}' not supported by {TrainingArgsClass.__name__}; skipping it.")

print(f"Using TrainingArgs class: {TrainingArgsClass.__name__}")
print("Final training args:")
pprint.pprint(final_kwargs)

training_args = TrainingArgsClass(**final_kwargs)

# 6) Data collator (only when the seq2seq helpers were imported)
data_collator = None
if _has_seq2seq and DataCollatorForSeq2Seq is not None:
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 7) Initialize Trainer (or Seq2SeqTrainer)
trainer_init_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_datasets["train"],
    # prefer validation split for eval, fall back to the test split
    "eval_dataset": tokenized_datasets.get("validation", tokenized_datasets.get("test", None)),
}

# Newer transformers releases take the tokenizer through 'processing_class'
# and deprecate the 'tokenizer' keyword; older releases only know 'tokenizer'.
# Pick whichever name the chosen trainer class declares, in the same spirit as
# the eval_strategy/evaluation_strategy handling for the training arguments.
trainer_params = inspect.signature(TrainerClass.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"
trainer_init_kwargs[tokenizer_kwarg] = tokenizer
if data_collator is not None:
    trainer_init_kwargs["data_collator"] = data_collator

trainer = TrainerClass(**trainer_init_kwargs)

# 8) Run the fine-tuning loop
trainer.train()

# 9) Write the model first, then the tokenizer, to the output folder
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("./t5-proofwriter")

# 10) Run evaluation and show the resulting metrics
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# 11) Quick sample generation (optional)
# The prompt follows the layout of the dataset's 'en' field
# ("$answer$ ; $proof$ ; $question$ = ... ; $context$ = ...") so the model is
# queried in the same format it was fine-tuned on.
example_input = "$answer$ ; $proof$ ; $question$ = The tiger visits the rabbit. ; $context$ = sent1: The cow is round. sent2: The cow needs the lion. sent3: The cow needs the rabbit."
inputs = tokenizer(example_input, return_tensors="pt", truncation=True, padding="longest").to(model.device)
with torch.no_grad():  # inference only; needs the top-of-cell `import torch`
    out = model.generate(**inputs, max_new_tokens=64)
gen = tokenizer.decode(out[0], skip_special_tokens=True)
print("Sample generation:", gen)


transformers version: 4.57.0
datasets version: 4.0.0
DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 41388
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 6012
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 11820
    })
})
{'translation': {'en': '$answer$ ; $proof$ ; $question$ = The tiger visits the '
                       'rabbit. ; $context$ = sent1: The cow is round. sent2: '
                       'The cow needs the lion. sent3: The cow needs the '
                       'rabbit. sent4: The cow sees the lion. sent5: The cow '
                       'visits the rabbit. sent6: The lion is round. sent7: '
                       'The rabbit is kind. sent8: The rabbit visits the '
                       'tiger. sent9: The tiger is big. sent10: The tiger is '
                       'kind. sent11: The tiger sees the rabbit. sent12: The '
                       'tiger visits the

Map:   0%|          | 0/41388 [00:00<?, ? examples/s]



Map:   0%|          | 0/6012 [00:00<?, ? examples/s]

Map:   0%|          | 0/11820 [00:00<?, ? examples/s]

Using TrainingArgs class: Seq2SeqTrainingArguments
Final training args:
{'eval_strategy': 'epoch',
 'learning_rate': 3e-05,
 'logging_dir': './logs',
 'logging_steps': 100,
 'num_train_epochs': 3,
 'output_dir': './t5-proofwriter',
 'per_device_eval_batch_size': 8,
 'per_device_train_batch_size': 8,
 'predict_with_generate': True,
 'save_steps': 500,
 'save_total_limit': 2,
 'weight_decay': 0.01}


  trainer = TrainerClass(**trainer_init_kwargs)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mharshita-gupta3021[0m ([33mharshita-gupta3021-institute-of-technology-management[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
