In [None]:
# notebooks/03_generate_training_data.ipynb

import os
import sys
import json
import random
import time
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI

# Add src to path
sys.path.append(os.path.abspath('../src'))
from config import RAW_DATA_DIR, LLM_MODEL_NAME

# --- CONFIG ---
# Path of the JSONL file that receives the generated training pairs.
OUTPUT_FILE = "../data/dnd_training_pairs.jsonl"
# Upper bound on the number of text chunks sampled for generation.
NUM_CHUNKS_TO_PROCESS = 2_000

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Load Rules
# Read the rulebook PDF that lives in the raw-data directory.
print("ðŸ“– Loading PDFs...")
rules_pdf_path = os.path.join(RAW_DATA_DIR, "dnd_rules.pdf")
rule_docs = PyPDFLoader(rules_pdf_path).load()
# Only one source is loaded here, so the full corpus is just the rulebook.
all_docs = rule_docs


ðŸ“– Loading PDFs...


In [None]:
# 2. Chunking
# Split the loaded documents into pieces of at most chunk_size=500 with no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_chunks = text_splitter.split_documents(all_docs)

# Random sample to save money/time.
# A dedicated, seeded generator makes Restart & Run All select the same chunks on
# every run, and leaves the global `random` state untouched.
SAMPLE_SEED = 42
sample_size = min(NUM_CHUNKS_TO_PROCESS, len(all_chunks))
selected_chunks = random.Random(SAMPLE_SEED).sample(all_chunks, sample_size)
print(f"âœ… Selected {len(selected_chunks)} text blocks for data generation.")


âœ… Selected 800 text blocks for data generation.


In [4]:
# 3. Initialize Gemini
# Chat model used by the generation loop; the model name is taken from config.
llm = ChatGoogleGenerativeAI(
    model=LLM_MODEL_NAME,
    temperature=0.7,  # NOTE(review): presumably non-zero to vary the phrasing - confirm
)


Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.


In [None]:
# 4. Generator Loop
def _parse_utterances(raw_reply):
    """Turn the model's raw reply into a list of utterance strings.

    Strips Markdown code fences (Gemini sometimes adds them), parses what is
    left as JSON and checks the result is a non-empty list of non-blank strings.

    Args:
        raw_reply: The `.content` of the model response; expected to be a str.

    Returns:
        The parsed list of utterance strings.

    Raises:
        ValueError: If the reply is not a string, is not valid JSON
            (json.JSONDecodeError is a ValueError subclass), or does not
            decode to a non-empty list of non-blank strings.
    """
    if not isinstance(raw_reply, str):
        raise ValueError(f"expected text content, got {type(raw_reply).__name__}")
    cleaned = raw_reply.replace("```json", "").replace("```", "").strip()
    parsed = json.loads(cleaned)
    # Without this check a JSON object would be iterated as its keys and a JSON
    # string as single characters, silently polluting the training set.
    if not isinstance(parsed, list) or not parsed:
        raise ValueError(f"expected a non-empty JSON list, got {type(parsed).__name__}")
    if not all(isinstance(item, str) and item.strip() for item in parsed):
        raise ValueError("expected every list item to be a non-blank string")
    return parsed


print("ðŸš€ Starting Data Generation...")
dataset = []
total_chunks = len(selected_chunks)

for i, chunk in enumerate(selected_chunks):
    # Flatten line breaks so the excerpt is a single line inside the prompt.
    text_content = chunk.page_content.replace("\n", " ")
    
    # Prompt:
    prompt = f"""
    Given the RULEBOOK EXCERPT, write exactly 3 one-sentence table utterances that fit this excerpt.

    Style mix (exactly):
    1) a direct rules question (ends with "?")
    2) normal table talk / roleplay line (no "?")
    3) DM-style ruling or clarification statement (no "?")

    Rules:
    - Each utterance must be answerable from the excerpt and specific to it.
    - Paraphrase; do not copy >4 consecutive words.
    - Max 25 words each.

    RULEBOOK EXCERPT: "{text_content}"
    
    Format output ONLY as a JSON list of strings, no markdown, no extra text, e.g.: ["scenario 1", "scenario 2", "scenario 3"]
    """
    
    try:
        response = llm.invoke(prompt).content
        questions = _parse_utterances(response)
    except Exception as e:
        # Best effort: one bad chunk must not abort the whole run. The chunk
        # number is 1-based so it matches the progress line below.
        print(f"Skipped chunk {i+1} due to error: {e}")
    else:
        for q in questions:
            # We save: {"query": question, "pos": text_content}
            dataset.append({"query": q, "pos": text_content})
            
        print(f"Chunk {i+1}/{total_chunks}: Generated {len(questions)} pairs.", end="\r")

    # Safety delay for API limits. It runs after failed attempts too, so a burst
    # of errors (e.g. rate limiting) does not turn into back-to-back requests.
    time.sleep(1)

ðŸš€ Starting Data Generation...
Chunk 800/800: Generated 3 pairs.

In [6]:
# 5. Save to JSONL
if not dataset:
    # Every chunk was skipped (bad API key, quota, parse failures, ...). Opening
    # the file in 'w' mode would truncate a previously generated dataset to
    # nothing, so leave whatever is on disk alone.
    print(f"\nNo training pairs were generated; {OUTPUT_FILE} was left untouched.")
else:
    # Make sure the target directory exists so the write cannot fail after the
    # expensive generation step.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # One JSON object per line: {"query": ..., "pos": ...}
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        for entry in dataset:
            f.write(json.dumps(entry) + "\n")

    print(f"\nðŸŽ‰ Saved {len(dataset)} training pairs to {OUTPUT_FILE}")


ðŸŽ‰ Saved 2400 training pairs to ../data/dnd_training_pairs.jsonl
