In [1]:
# notebooks/01_setup_and_ingest.ipynb

# %% [markdown]
# # 01. Setup and Knowledge Ingestion
# 
# ## Goal
# We will load two PDF sources (Rules and Campaign), chunk them, tag them with metadata, and save them to a Vector Database.
#
# ## Prerequisites
# * Ensure you have `dnd_rules.pdf` AND `campaign.pdf` in `data/raw/`. 
# * (If you only have one file, just comment out the second loader code).

# %%
# 1. Imports
import os
import sys
import shutil

# Add src to path to import config
sys.path.append(os.path.abspath('../src'))
from config import RAW_DATA_DIR, VECTOR_STORE_DIR, EMBEDDING_MODEL_PATH

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# %% [markdown]
# ## Step 1: Load PDFs and Assign Metadata
# We need to label the data so we can later ask: "What did the Campaign book say?" vs "What did the Rule book say?"

# %%
# Define your filenames here
RULE_PDF = os.path.join(RAW_DATA_DIR, "dnd_rules.pdf")
CAMPAIGN_PDF = os.path.join(RAW_DATA_DIR, "campaign.pdf")

documents = []

# --- Load Rules ---
if os.path.exists(RULE_PDF):
    print(f"Loading Rules: {RULE_PDF}")
    rule_loader = PyPDFLoader(RULE_PDF)
    rule_docs = rule_loader.load()
    # Add Metadata Tag
    for doc in rule_docs:
        doc.metadata["source_type"] = "rulebook"
    documents.extend(rule_docs)
else:
    print(f"⚠️ Warning: {RULE_PDF} not found.")

# --- Load Campaign ---
if os.path.exists(CAMPAIGN_PDF):
    print(f"Loading Campaign: {CAMPAIGN_PDF}")
    camp_loader = PyPDFLoader(CAMPAIGN_PDF)
    camp_docs = camp_loader.load()
    # Add Metadata Tag
    for doc in camp_docs:
        doc.metadata["source_type"] = "campaign"
    documents.extend(camp_docs)
else:
    print(f"⚠️ Warning: {CAMPAIGN_PDF} not found.")

print(f"✅ Total Pages Loaded: {len(documents)}")

Loading Rules: c:\Users\pfeil\My Drive\Studys\CAS-NLP-Uni-Bern\Module 4  NLP Transformers\dungeon_master_copilot\data\raw\dnd_rules.pdf
Loading Campaign: c:\Users\pfeil\My Drive\Studys\CAS-NLP-Uni-Bern\Module 4  NLP Transformers\dungeon_master_copilot\data\raw\campaign.pdf
✅ Total Pages Loaded: 850


In [3]:
# %% [markdown]
# ## Step 2: Split Text into Chunks

# %%
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=300,
    separators=["\n\n", "\n", ".", " ", ""]
)

splits = text_splitter.split_documents(documents)
print(f"Created {len(splits)} chunks.")

# Verify metadata is preserved
if len(splits) > 0:
    print(f"Example Metadata: {splits[0].metadata}")

Created 1444 chunks.
Example Metadata: {'producer': 'WeasyPrint 58.1', 'creator': 'pandoc', 'creationdate': '', 'title': 'System Reference Document 5.1', 'source': 'c:\\Users\\pfeil\\My Drive\\Studys\\CAS-NLP-Uni-Bern\\Module 4  NLP Transformers\\dungeon_master_copilot\\data\\raw\\dnd_rules.pdf', 'total_pages': 818, 'page': 0, 'page_label': '1', 'source_type': 'rulebook'}


In [4]:
# %% [markdown]
# ## Step 3: Initialize Embeddings and Create Vector Store
# **Note:** If a vector store already exists, we clear it first to avoid duplicate data during testing.

# %%
import shutil
import time
import gc

# Define a helper to force-delete read-only files (common on Windows)
def on_error(func, path, exc_info):
    import stat
    if not os.access(path, os.W_OK):
        os.chmod(path, stat.S_IWUSR)
        func(path)
    else:
        raise

# Initialize Local Fine-Tuned Embeddings
print(f"Loading Embedding Model from: {EMBEDDING_MODEL_PATH}")

# Check if the folder exists (Critical for local models)
if os.path.isabs(EMBEDDING_MODEL_PATH) and not os.path.exists(EMBEDDING_MODEL_PATH):
    raise FileNotFoundError(
        f"❌ Could not find model at {EMBEDDING_MODEL_PATH}.\n"
    )

model_kwargs = {
    "trust_remote_code": True, 
    "device": "cpu"
}

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_PATH,
    model_kwargs=model_kwargs,
    show_progress=True
)

print("✅ Model loaded successfully.")

# Clear old DB if exists (Fresh Start)
if os.path.exists(VECTOR_STORE_DIR):
    print("Attempting to clear old Vector Store...")
    
    # 1. Force Python to release file handles
    # If vectorstore existed in memory from a previous run, this kills it.
    if 'vectorstore' in globals():
        del vectorstore
    gc.collect() 
    
    # 2. Try to delete with retries
    try:
        shutil.rmtree(VECTOR_STORE_DIR, onerror=on_error)
        print("Cleared old Vector Store.")
    except Exception as e:
        print(f"⚠️ Could not fully delete folder: {e}")
        print("   (This is common with Google Drive. We will try to overwrite anyway.)")

# Create and Save
print("Creating new Vector Store (this may take time)...")
# Add a small sleep to let the OS catch up
time.sleep(1)

vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_model,
    persist_directory=VECTOR_STORE_DIR
)

print(f"✅ Vector Store successfully saved to {VECTOR_STORE_DIR}")

Loading Embedding Model from: c:\Users\pfeil\My Drive\Studys\CAS-NLP-Uni-Bern\Module 4  NLP Transformers\dungeon_master_copilot\Output\fine_tuned_qwen_dnd


  embedding_model = HuggingFaceEmbeddings(
The tokenizer you are loading from 'c:\Users\pfeil\My Drive\Studys\CAS-NLP-Uni-Bern\Module 4  NLP Transformers\dungeon_master_copilot\Output\fine_tuned_qwen_dnd' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


✅ Model loaded successfully.
Attempting to clear old Vector Store...
⚠️ Could not fully delete folder: [WinError 5] Access is denied: 'c:\\Users\\pfeil\\My Drive\\Studys\\CAS-NLP-Uni-Bern\\Module 4  NLP Transformers\\dungeon_master_copilot\\data\\vector_store'
   (This is common with Google Drive. We will try to overwrite anyway.)
Creating new Vector Store (this may take time)...


Batches: 100%|██████████| 46/46 [44:45<00:00, 58.38s/it] 


✅ Vector Store successfully saved to c:\Users\pfeil\My Drive\Studys\CAS-NLP-Uni-Bern\Module 4  NLP Transformers\dungeon_master_copilot\data\vector_store
