1.Load & Filter PDF

In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader

loader = UnstructuredPDFLoader(
    file_path="Cryptography and Network Security, 3rd Edition, by Behrouz A Forouzan and Depdeep.pdf",
    mode="elements",                  # Get structured blocks
    strategy="fast",                # Use layout-aware + OCR parsing
    extract_images_in_pdf=False,      # Pull image content
    infer_table_structure=False,      # Parse tables
    languages=["eng"]              # OCR language
)

raw_docs = loader.load()


In [None]:
print(f"Number of documents (elements) extracted: {len(raw_docs)}")

if len(raw_docs) > 0:
    print("\n--- First 5 extracted elements ---")
    for i, doc in enumerate(raw_docs[:5]):
        print(f"\nElement {i} (Category: {doc.metadata.get('category')}):")
        print(doc.page_content[:500]) # Print first 500 characters of content
        print(f"Page: {doc.metadata.get('page_number')}")
        # If infer_table_structure=True, look for 'text_as_html' or 'text_as_csv' in metadata for tables
        if doc.metadata.get('category') == 'Table':
            print(f"Table HTML: {doc.metadata.get('text_as_html', 'N/A')[:200]}...")
    if len(raw_docs) > 5:
        print("\n(and more elements...)")
else:
    print("No documents were extracted. The PDF might be unparsable.")

In [None]:
raw_docs

Filter by page range

In [None]:
start_page = 27
end_page = 44

# 2. Filter the raw_docs
docs = []
for doc in raw_docs:
    page_num = doc.metadata.get('page_number') # Use .get() for safer access

    # Check if page_num is not None and falls within the specified range
    if page_num is not None and start_page <= page_num <= end_page:
        docs.append(doc)

In [None]:
docs

Clean text

In [None]:
relevant_docs = [
    doc for doc in docs
    if doc.metadata.get('category') in ['NarrativeText', 'Title', 'ListItem']
]

In [None]:
for d in relevant_docs:
    d.page_content = d.page_content.strip().replace("\n", " ")

In [None]:
relevant_docs

2. Chunk the Documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

# Set line length / overlap so chunks keep logical coherence
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""]
)

# After splitting...
chunks = text_splitter.split_documents(relevant_docs)

# Generate embeddings for vector store
embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

from nltk.tokenize import sent_tokenize
import nltk
from tqdm import tqdm
import os
import shutil

nltk.download("punkt")

# -------------------------------
# ✅ Custom sentence-based chunker
# -------------------------------
class SentenceSplitter:
    def __init__(self, sentences_per_chunk=5, overlap=1):
        self.sentences_per_chunk = sentences_per_chunk
        self.overlap = overlap

    def split_text(self, text):
        sentences = sent_tokenize(text)
        chunks = []
        step = self.sentences_per_chunk - self.overlap
        for i in range(0, len(sentences), step):
            chunk = " ".join(sentences[i:i + self.sentences_per_chunk])
            chunks.append(chunk)
        return chunks

# -------------------------------
# 1. Load PDF
# -------------------------------
loader = PyPDFLoader("Cryptography and Network Security, 3rd Edition, by Behrouz A Forouzan and Depdeep.pdf")
documents = loader.load()

# -------------------------------
# 2. Sentence-based chunking
# -------------------------------
splitter = SentenceSplitter(sentences_per_chunk=5, overlap=1)
chunks = []

print("📖 Splitting documents into sentence-based chunks...")
for doc in tqdm(documents, desc="Chunking"):
    splits = splitter.split_text(doc.page_content)
    for s in splits:
        # copy the doc and replace page_content with the new sentence chunk
        chunks.append(doc.model_copy(update={"page_content": s}))

print(f"✅ Total sentence-based chunks: {len(chunks)}")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhili\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


📖 Splitting documents into sentence-based chunks...


Chunking: 100%|██████████| 752/752 [00:00<00:00, 2531.11it/s]


✅ Total sentence-based chunks: 3676


In [2]:
chunks

[Document(metadata={'producer': 'PDFTron PDFNet, V7.10742', 'creator': 'PyPDF', 'creationdate': '2013-08-01T13:36:51+00:00', 'moddate': '2020-04-24T15:20:31+00:00', 'author': 'iccp2_134', 'title': 'for38482_fm.fm', 'source': 'Cryptography and Network Security, 3rd Edition, by Behrouz A Forouzan and Depdeep.pdf', 'total_pages': 752, 'page': 0, 'page_label': '1'}, page_content='INTRODUCTION\nTO\nCRYPTOGRAPHY\nAND\nNETWORK SECURITY'),
 Document(metadata={'producer': 'PDFTron PDFNet, V7.10742', 'creator': 'PyPDF', 'creationdate': '2013-08-01T13:36:51+00:00', 'moddate': '2020-04-24T15:20:31+00:00', 'author': 'iccp2_134', 'title': 'for38482_fm.fm', 'source': 'Cryptography and Network Security, 3rd Edition, by Behrouz A Forouzan and Depdeep.pdf', 'total_pages': 752, 'page': 1, 'page_label': '2'}, page_content='McGraw-Hill Forouzan Networking Series\nTitles by Behrouz A. Forouzan:\nCryptography and Network Security\nData Communications and Networking\nTCP/IP Protocol Suite\nLocal Area Networks

3. Embed and Store in Vector DB (FAISS)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = FAISS.from_documents(chunks, embedder)
vectorstore.save_local("chapter_1_cryptography_semantic_chunking")

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

  embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


4. Retrieval Function

In [None]:
def build_context(query, k=3):
    # Use retriever to get top-k relevant documents
    relevant_docs = retriever.invoke(query, config={"configurable": {"top_k": k}})    
    # Combine context to pass into LLM
    context = "\n\n".join(doc.page_content for doc in relevant_docs)
    return context

5. Setup Question Generation Chain

In [5]:
# import os
# from dotenv import load_dotenv
# from langchain.llms import Together
# from langchain.prompts import PromptTemplate
# from langchain.chains import LLMChain
# load_dotenv()

# # Get the API key from the environment
# together_api_key = os.getenv("TOGETHER_API_KEY")

# llm = Together(
#     model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
#     temperature=0.3,
#     together_api_key=together_api_key
# )
from langchain_community.chat_models import ChatOllama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# ⚙️ Create the Ollama LLM object
llm = ChatOllama(
    model="llama3.1:8b",  # You can use any model pulled by Ollama: e.g., llama3, mistral, codellama, etc.
    temperature=0.3
)

prompt = PromptTemplate(
    input_variables=["context"],
    template = """
You are an AI question generator for academic exams.

Your task is to:
1. Read the provided academic or technical context.
2. Generate **one** relevant and insightful **exam-style question** that tests conceptual understanding.
3. Create a **detailed rubric** for a 10-mark question, with **exactly 3 to 5 bullet points**.
   - Each point should describe **what a good answer must contain**.
   - Each bullet must specify the **marks** allocated.
   - All marks must **sum to exactly 10**.

🛑 **Guidelines:**
- Do **not** include any explanations, instructions, or follow-up text after the rubric.
- The rubric should use **clear academic language**.
- Avoid repeating information in multiple rubric points.
- Do not refer back to the context in the question (avoid "According to the passage…").
- Use **neutral and formal academic tone**.
- Do not hallucinate or invent facts not implied in the context.

📌 **Output Format (strictly follow this):**

Question: <Insert your question here>
Rubric:
- <Point 1> - <marks>
- <Point 2> - <marks>
- <Point 3> - <marks>
[optional: - <Point 4> - <marks>]
[optional: - <Point 5> - <marks>]

---

Context:
{context}
"""
)

chain = LLMChain(llm=llm, prompt=prompt)

  llm = ChatOllama(
  chain = LLMChain(llm=llm, prompt=prompt)


6. Generate Question from a Query

In [None]:
import json
import os
import re
from dotenv import load_dotenv

load_dotenv()
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGSMITH_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT")


def generate_question_and_rubric(query, chain, folder, chapter, filename=None, num_questions=1, prompt_version="v1"):
    # 1. Build context from the query/topic
    context = build_context(query)

    # 2. Get model info for smart file naming
    # Safely extract model name and sanitize it for filenames
    model_raw = getattr(chain.llm, "model", "unknown-model")
    model_name = re.sub(r'[^a-zA-Z0-9_-]', '_', model_raw)

    # Handle temperature format (e.g., 0.3 → 0_3)
    temp = str(chain.llm.temperature).replace(".", "_")

    # Add chapter and prompt version
    chapter_str = f"_chapter{chapter}"
    version_str = f"_{prompt_version}"

    # 3. Smart filename
    if filename is None:
        filename = f"{model_name}_temp{temp}{chapter_str}{version_str}.json"

    filepath = os.path.join(folder, filename)
    os.makedirs(folder, exist_ok=True)

    # 4. Load existing data
    if os.path.exists(filepath):
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = []

    # 5. Loop to generate questions
    for i in range(num_questions):
        response = chain.invoke(
    {"context": context},
    config={
        "run_name": f"qgen_ch{chapter}_v{prompt_version}",
        "tags": ["qgen", f"chapter{chapter}", f"v{prompt_version}", "rubric"],
        "metadata": {
            "topic": query,
            "model": chain.llm.model,
            "temperature": chain.llm.temperature,
            "version": prompt_version,
            "chapter": chapter,
            "retrieved_chunks": context[:1500]  # Limit to avoid huge metadata blobs
        }
    }
)
        text = response["text"]

        try:
            question = text.split("Question:")[1].split("Rubric:")[0].strip()
            rubric_block = text.split("Rubric:")[1].strip()
            rubric_points = re.findall(r"- (.+)", rubric_block)
        except IndexError:
            print(f"⚠️ Format issue on question #{i+1}. Skipped.\n{text}")
            continue

        result = {
            "question": question,
            "rubric": rubric_points,
            "prompt_version": prompt_version
        }

        data.append(result)

        print(f"\n✅ Q{i+1} Saved to: {filepath}")
        print(f"🧠 Question: {question}")
        print("📏 Rubric:")
        for j, point in enumerate(rubric_points, 1):
            print(f"  {j}. {point}")

    # 6. Save to JSON
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)


In [7]:
generate_question_and_rubric(
    query="symmetric encryption",
    chain=chain,
    folder=r"C:\Users\dhili\Desktop\SRIP\week2\Dataset",
    chapter=1,
    prompt_version="v1_semantic_chunking",
    num_questions=5
)


✅ Q1 Saved to: C:\Users\dhili\Desktop\SRIP\week2\Dataset\llama3_1_8b_temp0_3_chapter1_v1_semantic_chunking.json
🧠 Question: What is the primary advantage of using symmetric-key cryptography over asymmetric-key cryptography?
📏 Rubric:
  1. Clearly explains why symmetric-key cryptography is faster than asymmetric-key cryptography - 4 marks
  2. Provides a concise definition of what makes a function a "trapdoor one-way function" and its relevance to decryption - 3 marks
  3. Briefly describes how the shared secret key enables both encryption and decryption in symmetric-key cryptography, without repeating information from the context - 3 marks

✅ Q2 Saved to: C:\Users\dhili\Desktop\SRIP\week2\Dataset\llama3_1_8b_temp0_3_chapter1_v1_semantic_chunking.json
🧠 Question: What is the primary advantage of using symmetric-key cryptography over asymmetric-key cryptography?
📏 Rubric:
  1. Clearly explains why symmetric-key cryptography is faster than asymmetric-key cryptography - 4 marks
  2. Provi