In [1]:
import os
import random
import torch
import pandas as pd
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# -----------------------------
# 1. Load and preprocess the CSV
# -----------------------------
# Input data plus every artifact path referenced by the later cells.
INPUT_CSV_PATH = "3A2M_EXTENDED.csv"
OUTPUT_DIR = "./gpt2-ner2directions"
TRAIN_FILE = "train_ner2dir.txt"
VAL_FILE = "val_ner2dir.txt"

df = pd.read_csv(INPUT_CSV_PATH)

# Both text columns are mandatory: fail fast if the CSV lacks either one.
if not {"NER", "directions"}.issubset(df.columns):
    raise ValueError("CSV must contain 'NER' and 'directions' columns.")

# Discard rows where either field is missing, then renumber the rows from zero.
df = df.dropna(subset=["NER", "directions"])
df = df.reset_index(drop=True)

In [3]:
# -----------------------------
# 2. Format examples as text blocks
# -----------------------------
def format_example_from_ner(ner_text: str, directions_text: str) -> str:
    """
    Render one training example as a plain-text block.

    The block has this layout and ends with two newlines, which act as the
    delimiter between consecutive examples:

    NER:
    - entity1
    - entity2

    Directions:
    1. step one
    2. step two

    Args:
        ner_text: Entities separated by commas. Each piece is stripped and
            empty pieces are dropped.
            NOTE(review): if the column stores list-like strings (brackets or
            quotes around items), those characters are kept verbatim —
            confirm the actual NER format in the CSV.
        directions_text: Steps separated by "\\n". Each line is stripped and
            blank lines are dropped.

    Returns:
        The formatted block. Steps are numbered consecutively from 1, counting
        only the non-blank lines, so blank lines in the input never leave gaps
        in the numbering.
    """
    # Clean first, number afterwards: numbering the raw split would let blank
    # lines consume step numbers (e.g. "1." followed by "3.").
    entities = [ent.strip() for ent in ner_text.split(",")]
    steps = [step.strip() for step in directions_text.split("\n")]

    parts = ["NER:"]
    parts.extend(f"- {ent}" for ent in entities if ent)
    parts.append("")  # blank line between sections
    parts.append("Directions:")
    parts.extend(
        f"{idx}. {step}"
        for idx, step in enumerate((s for s in steps if s), start=1)
    )
    # Two newlines to separate examples
    return "\n".join(parts) + "\n\n"

# Build a list of formatted examples.
# The two columns are walked in lockstep rather than through
# DataFrame.iterrows(), which materialises a Series for every row and becomes
# very slow on large frames. Values and ordering are the same.
examples = [
    format_example_from_ner(ner_text, dir_text)
    for ner_text, dir_text in zip(df["NER"], df["directions"])
]

In [4]:
# -----------------------------
# 3. Split into train/validation
# -----------------------------
# Shuffle a copy so `examples` itself is never reordered. Re-running this cell
# then reproduces exactly the same split instead of reshuffling a list that an
# earlier run had already shuffled in place.
random.seed(42)
shuffled_examples = list(examples)
random.shuffle(shuffled_examples)

# First 90% of the shuffled examples for training, the rest for validation.
split_idx = int(0.9 * len(shuffled_examples))
train_texts = shuffled_examples[:split_idx]
val_texts = shuffled_examples[split_idx:]

# Write out train and validation files; the examples are written back to back
# with no extra separator added here.
with open(TRAIN_FILE, "w", encoding="utf-8") as f_train:
    f_train.writelines(train_texts)

with open(VAL_FILE, "w", encoding="utf-8") as f_val:
    f_val.writelines(val_texts)

In [5]:
# -----------------------------
# 4. Load GPT-2 tokenizer/model and create datasets
# -----------------------------
# Checkpoint name shared by the tokenizer and the model so the two always match.
MODEL_NAME = "gpt2"  # or "gpt2-medium", "gpt2-large" if you have enough VRAM
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)

# (Optional) If you wish to add a pad token or any special tokens, do it here,
# i.e. after the tokenizer is loaded and before the model is resized below:
# tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
# and then resize the model embeddings after loading the model.

# Pretrained weights with the language-modeling head; this is the object that
# the Trainer fine-tunes and that inference later calls generate() on.
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# If special tokens were added above, uncomment so the embedding matrix size
# matches the enlarged vocabulary:
# model.resize_token_embeddings(len(tokenizer))

# Helper that builds the TextDataset objects used for training and validation
def load_text_dataset(file_path: str, tokenizer, block_size: int = 1024):
    """
    Construct a ``TextDataset`` over a single text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer instance handed to ``TextDataset``.
        block_size: Forwarded unchanged as ``block_size`` (default 1024).

    Returns:
        The ``TextDataset`` instance. ``overwrite_cache=True`` is always
        passed — presumably so features cached by an earlier run are not
        reused; confirm against the transformers documentation.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
        "overwrite_cache": True,
    }
    return TextDataset(**dataset_kwargs)

# Same construction for both splits: training file first, validation file second.
train_dataset, val_dataset = (
    load_text_dataset(path, tokenizer, block_size=1024)
    for path in (TRAIN_FILE, VAL_FILE)
)

# Collator for causal language modeling: mlm=False turns masked-LM masking off.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)




KeyboardInterrupt: 

In [None]:
# -----------------------------
# 5. Configure TrainingArguments and Trainer
# -----------------------------
os.makedirs(OUTPUT_DIR, exist_ok=True)

training_args = TrainingArguments(
    # Output locations: checkpoints in OUTPUT_DIR, TensorBoard logs beneath it
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=200,
    # Batching
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    # Evaluation, checkpointing and logging cadence (in steps)
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    # Mixed precision only when a CUDA device is available
    fp16=torch.cuda.is_available(),
)

# Bind the model, both datasets and the collator into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

In [None]:
# -----------------------------
# 6. Train and save the model
# -----------------------------
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()
# Save the model and the tokenizer into the same OUTPUT_DIR — presumably so the
# directory can later be reloaded as one unit with from_pretrained (confirm).
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Fine-tuned model saved to {OUTPUT_DIR}")

# -----------------------------
# 7. Inference: generate directions from a NER list
# -----------------------------
def generate_directions_from_ner(ner_list, max_new_tokens=150):
    """
    Generate directions text for a list of entities with the fine-tuned model.

    The prompt uses the same layout as the training examples ("NER:" header,
    one "- entity" line per item, a blank line, then "Directions:"), and the
    model continues the text from there using beam search.

    Uses the module-level ``tokenizer`` and ``model``. If CUDA is available the
    model is moved to the GPU. The model is also switched to eval mode, which
    is a lasting side effect on the shared ``model`` object.

    Args:
        ner_list: Iterable of entity strings, e.g. ["sugar", "flour", "eggs"].
            Each item is stripped before being placed in the prompt.
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        The generated continuation only (prompt excluded), decoded with
        special tokens skipped and surrounding whitespace stripped.
    """
    prompt_lines = ["NER:"]
    prompt_lines.extend(f"- {ent.strip()}" for ent in ner_list)
    prompt_lines.append("")  # blank line
    prompt_lines.append("Directions:")
    prompt = "\n".join(prompt_lines) + "\n"

    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    if torch.cuda.is_available():
        model.to("cuda")
        input_ids = input_ids.to("cuda")

    # Single un-padded prompt: every position is a real token, so the mask is
    # all ones. Passing it explicitly avoids generate() having to infer it
    # while pad_token_id is set to the EOS id.
    attention_mask = torch.ones_like(input_ids)
    prompt_len = input_ids.shape[-1]

    # The model may still be in training mode after trainer.train(); eval mode
    # disables dropout so generation is not randomly perturbed.
    model.eval()

    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the prompt by token count rather than by character count: decoding
    # is not guaranteed to reproduce the prompt string exactly, so slicing the
    # decoded text by len(prompt) could cut at the wrong position.
    generated_ids = output_ids[0][prompt_len:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

In [None]:
# Smoke test: print one hand-written entity list and the directions generated for it.
sample_ner = [
    "onion",
    "garlic",
    "tomatoes",
    "olive oil",
    "basil",
    "salt",
    "pepper",
]

print("=== Sample NER List ===")
print(sample_ner)

print("\n=== Generated Directions ===")
sample_directions = generate_directions_from_ner(sample_ner, max_new_tokens=120)
print(sample_directions)