In [1]:
import os
import random
import torch
import pandas as pd
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# # -----------------------------
# # 1. Load and preprocess the CSV
# # -----------------------------
# os.environ["WANDB_DISABLED"] = "true"

# INPUT_CSV_PATH = "3A2M_EXTENDED.csv"
OUTPUT_DIR = "./gpt2-ner2directions"
TRAIN_FILE = "train_ner2dir.txt"
VAL_FILE = "val_ner2dir.txt"

# # Load the DataFrame
# df = pd.read_csv(INPUT_CSV_PATH)# [:int(1e)]

# # Ensure NER and directions columns exist; drop rows missing either
# if "NER" not in df.columns or "directions" not in df.columns:
#     raise ValueError("CSV must contain 'NER' and 'directions' columns.")
# df = df.dropna(subset=["NER", "directions"]).reset_index(drop=True)

In [3]:
# # -----------------------------
# # 2. Format examples as text blocks
# # -----------------------------
# def format_example_from_ner(ner_text: str, directions_text: str) -> str:
#     """
#     Combine the NER text and directions text into a single string block:
    
#     NER:
#     - entity1
#     - entity2
#     ...

#     Directions:
#     1. step one
#     2. step two
#     ...
    
#     Ends with two newlines as a delimiter.
#     """
#     parts = []
#     parts.append("NER:")
#     # Split NER on commas; adjust if your NER uses newlines or another delimiter
#     for ent in ner_text.split(","):
#         ent = ent.strip()
#         if ent:
#             parts.append(f"- {ent}")
#     parts.append("")  # blank line between sections
#     parts.append("Directions:")
#     # Split directions on newline; keep existing multi‐line steps
#     for idx, step in enumerate(directions_text.split("\n"), start=1):
#         step = step.strip()
#         if step:
#             parts.append(f"{idx}. {step}")
#     # Two newlines to separate examples
#     return "\n".join(parts) + "\n\n"

# # Build a list of formatted examples
# examples = []
# for _, row in df.iterrows():
#     ner_text = row["NER"]
#     dir_text = row["directions"]
#     formatted = format_example_from_ner(ner_text, dir_text)
#     examples.append(formatted)

In [4]:
# # -----------------------------
# # 3. Split into train/validation
# # -----------------------------
# random.seed(42)
# random.shuffle(examples)

# split_idx = int(0.9 * len(examples))
# train_texts = examples[:split_idx]
# val_texts = examples[split_idx:]

# # Write out train and validation files
# with open(TRAIN_FILE, "w", encoding="utf-8") as f_train:
#     for ex in train_texts:
#         f_train.write(ex)

# with open(VAL_FILE, "w", encoding="utf-8") as f_val:
#     for ex in val_texts:
#         f_val.write(ex)

In [None]:
from datasets import load_dataset
# -----------------------------
# 4. Load GPT-2 tokenizer/model and prepare a Dataset
# -----------------------------
MODEL_NAME = "gpt2"  # or "gpt2-medium", "gpt2-large" if you have enough VRAM

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model     = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# (Optional) If you wish to add a pad token or any special tokens, do it here:
# tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
# model.resize_token_embeddings(len(tokenizer))

# Use the Hugging Face datasets library to load the text files
raw_datasets = load_dataset(
    "text",
    data_files={
        "train": TRAIN_FILE,
        "validation": VAL_FILE
    }
)
# raw_datasets = {"train": Dataset, "validation": Dataset}

# Tokenization function
def tokenize_function(examples):
    # `examples["text"]` is a list of training/validation examples (strings)
    return tokenizer(
        examples["text"],
        return_attention_mask=True,
        return_special_tokens_mask=False,
    )

# Apply tokenization in batches
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    desc="Tokenizing examples",
)

# Now group the tokenized examples into blocks of `block_size` (1024)
block_size = 1024

def group_texts(examples):
    """
    Concatenate all input_ids in a batch, then split into chunks of length block_size.
    Also create `labels` that are identical to `input_ids` (for causal LM).
    """
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size

    # Drop the remainder if it doesn’t fit evenly
    result = {}
    for k in concatenated.keys():
        chunks = [
            concatenated[k][i : i + block_size]
            for i in range(0, total_length, block_size)
        ]
        result[k] = chunks

    # Set labels = input_ids for next‐token prediction
    result["labels"] = result["input_ids"].copy()
    return result

# Group and reorder datasets
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    remove_columns=tokenized_datasets["train"].column_names,
    desc=f"Grouping into blocks of {block_size}",
)

# Now lm_datasets["train"] and lm_datasets["validation"] each have columns:
#  - input_ids (list of length block_size)
#  - attention_mask (list of length block_size)
#  - labels (same as input_ids)

Tokenizing examples:   0%|          | 14000/27071072 [00:01<54:17, 8306.68 examples/s] Token indices sequence length is longer than the specified maximum sequence length for this model (1140 > 1024). Running this sequence through the model will result in indexing errors
Tokenizing examples: 100%|██████████| 27071072/27071072 [52:38<00:00, 8569.62 examples/s] 
Tokenizing examples: 100%|██████████| 3006866/3006866 [05:52<00:00, 8537.81 examples/s]
Grouping into blocks of 1024:  67%|██████▋   | 18194000/27071072 [19:43<09:31, 15528.17 examples/s]

In [None]:
# -----------------------------
# 5. Configure TrainingArguments and Trainer
# -----------------------------
os.makedirs(OUTPUT_DIR, exist_ok=True)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,

    # Training hyperparameters
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=200,

    # Evaluation and logging
    # evaluation_strategy="steps",
    do_eval=True,
    eval_steps=500,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,

    # Mixed precision if supported
    fp16=True if torch.cuda.is_available() else False,

    # Logging directory for TensorBoard
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    dataloader_num_workers=4
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# -----------------------------
# 6. Train and save the model
# -----------------------------
trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Fine-tuned model saved to {OUTPUT_DIR}")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


Fine-tuned model saved to ./gpt2-ner2directions


In [None]:
# -----------------------------
# 7. Inference: generate directions from a NER list
# -----------------------------
def generate_directions_from_ner(ner_list, max_new_tokens=150):
    """
    Given a Python list of entity strings (e.g. ["sugar", "flour", "eggs"]),
    format the prompt as during training and let GPT-2 generate the “Directions:”.
    """
    prompt_lines = ["NER:"]
    for ent in ner_list:
        prompt_lines.append(f"- {ent.strip()}")
    prompt_lines.append("")  # blank line
    prompt_lines.append("Directions:")
    prompt = "\n".join(prompt_lines) + "\n"

    # Tokenize with attention_mask
    encoding = tokenizer(
        prompt,
        return_tensors="pt",
        padding=False,
        truncation=False,
        return_attention_mask=True
    )
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    if torch.cuda.is_available():
        model.to("cuda")
        input_ids = input_ids.to("cuda")
        attention_mask = attention_mask.to("cuda")

    # Generate up to max_new_tokens beyond the prompt
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=input_ids.shape[-1] + max_new_tokens,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id
    )

    full_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Strip off the prompt itself; return only generated directions
    generated = full_output[len(prompt):].strip()
    return generated

In [None]:
# Example inference
sample_ner = ["onion", "garlic", "tomatoes", "olive oil", "basil", "salt", "pepper"]
print("=== Sample NER List ===")
print(sample_ner)
print("\n=== Generated Directions ===")
print(generate_directions_from_ner(sample_ner, max_new_tokens=120))

=== Sample NER List ===
['onion', 'garlic', 'tomatoes', 'olive oil', 'basil', 'salt', 'pepper']

=== Generated Directions ===
1. Preheat oven to 400 degrees.
2. In a large bowl, combine onion, tomatoes, garlic, basil, salt, pepper, and salt. Mix well. Cover with plastic wrap and bake at 350 degrees for 20 minutes or until the tomatoes are golden brown. Remove from the oven and allow to cool completely before serving.
