In [1]:
import os
import random
import torch
import pandas as pd
from datasets import load_dataset, load_from_disk
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model

In [2]:
# -----------------------------
# 0. Configuration
# -----------------------------
INPUT_CSV_PATH = "../3A2M_EXTENDED.csv"
OUTPUT_DIR = "./gpt2-ner2directions-optimized"
TOKENIZED_CACHE = "./tokenized_data"
TRAIN_FILE = "train_ner2dir.txt"
VAL_FILE = "val_ner2dir.txt"
BLOCK_SIZE = 512  # smaller blocks to reduce padding waste
NUM_PROC = 4      # CPU cores for preprocessing

In [3]:
# -----------------------------
# 1. Load & preprocess CSV into text files
# -----------------------------
df = pd.read_csv(INPUT_CSV_PATH)
if "NER" not in df.columns or "directions" not in df.columns:
    raise ValueError("CSV must contain 'NER' and 'directions' columns.")
DF = df.dropna(subset=["NER", "directions"]).reset_index(drop=True)

def format_example_from_ner(ner_text: str, directions_text: str) -> str:
    parts = ["NER:"]
    for ent in ner_text.split(","):
        ent = ent.strip()
        if ent:
            parts.append(f"- {ent}")
    parts.append("")
    parts.append("Directions:")
    for idx, step in enumerate(directions_text.split("\n"), start=1):
        step = step.strip()
        if step:
            parts.append(f"{idx}. {step}")
    return "\n".join(parts) + "\n\n"

examples = [format_example_from_ner(row["NER"], row["directions"]) for _, row in DF.iterrows()]
random.seed(42)
random.shuffle(examples)
split_idx = int(0.9 * len(examples))
with open(TRAIN_FILE, "w", encoding="utf-8") as f:
    f.writelines(examples[:split_idx])
with open(VAL_FILE, "w", encoding="utf-8") as f:
    f.writelines(examples[split_idx:])

In [4]:
# -----------------------------
# 2. Load datasets and tokenize (cache)
# -----------------------------
raw_datasets = load_dataset(
    "text",
    data_files={"train": TRAIN_FILE, "validation": VAL_FILE}
)
if not os.path.isdir(TOKENIZED_CACHE):
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            return_attention_mask=True,
            padding=False,
            truncation=False
        )
    tokenized = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=NUM_PROC,
        remove_columns=["text"]
    )
    tokenized.save_to_disk(TOKENIZED_CACHE)
else:
    tokenized = load_from_disk(TOKENIZED_CACHE)

def group_texts(examples):
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = (len(concatenated["input_ids"]) // BLOCK_SIZE) * BLOCK_SIZE
    result = {k: [concatenated[k][i : i + BLOCK_SIZE] for i in range(0, total_length, BLOCK_SIZE)]
              for k in concatenated.keys()}
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized.map(
    group_texts,
    batched=True,
    num_proc=NUM_PROC,
    remove_columns=tokenized["train"].column_names
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map (num_proc=4):   0%|          | 0/27071072 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1153 > 1024). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (1140 > 1024). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (1063 > 1024). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (1225 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/3006866 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1935 > 1024). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (1075 > 1024). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (1153 > 1024). Running this sequence through the model will result in indexing errors


Token indices sequence length is longer than the specified maximum sequence length for this model (2397 > 1024). Running this sequence through the model will result in indexing errors


Saving the dataset (0/4 shards):   0%|          | 0/27071072 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3006866 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/27071072 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3006866 [00:00<?, ? examples/s]

In [5]:
# -----------------------------
# 3. Model + PEFT LoRA setup
# -----------------------------
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05
)
model = get_peft_model(model, lora_config)
# model.gradient_checkpointing_enable()
# NOTE: On Kaggle P100 (CUDA 6.0), torch.compile with Triton will error. Skip compilation.



In [6]:
# -----------------------------
# 4. Training setup
# -----------------------------
os.makedirs(OUTPUT_DIR, exist_ok=True)
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=True,
    report_to="none",           # disable all logging integrations
    do_eval=True,
    eval_steps=5000,
    save_steps=5000,
    save_total_limit=2,
    logging_steps=1000,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    optim="adamw_torch"
)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
    # remove_unused_columns=False
)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [7]:
# -----------------------------
# 5. Train & save
# -----------------------------
trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Fine-tuned model saved to {OUTPUT_DIR}")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1000,2.6122
2000,2.3874
3000,2.3401
4000,2.3147
5000,2.2953
6000,2.28
7000,2.2717
8000,2.2601
9000,2.252
10000,2.2441






Fine-tuned model saved to ./gpt2-ner2directions-optimized


In [8]:
# -----------------------------
# 6. Inference helper
# -----------------------------
def generate_directions_from_ner(ner_list, max_new_tokens=150):
    tokenizer = GPT2Tokenizer.from_pretrained(OUTPUT_DIR)
    model = GPT2LMHeadModel.from_pretrained(OUTPUT_DIR)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    prompt = "NER:\n" + "\n".join(f"- {e}" for e in ner_list) + "\n\nDirections:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)[len(prompt):].strip()

In [9]:
sample = ["onion", "garlic", "tomatoes", "olive oil", "basil", "salt", "pepper"]
print("Generated Directions:\n", generate_directions_from_ner(sample, 120))

Generated Directions:
 1. Preheat oven to 350\u00b0F. Grease a 9 x 13-inch baking dish. In a large bowl, combine the onion, garlic, tomatoes, basil, salt, pepper, and pepper. Mix well. Sprinkle with remaining ingredients. Bake in preheated oven for 10-12 minutes or until golden brown. Remove from oven and allow to cool completely before slicing into bite-sized pieces. Cool completely on a wire rack in the refrigerator for at least 1 hour before serving. Makes 1 1/2 cups. Serves 8.NER:"- ["
