In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

from nltk.tokenize import sent_tokenize
import nltk
from tqdm import tqdm
import os
import shutil

# Fetch the Punkt sentence-tokenizer data used by `sent_tokenize`.
# Recent NLTK releases look up the "punkt_tab" resource instead of the legacy
# pickled "punkt" models, so both are requested. An identifier unknown to the
# installed NLTK version makes `download` return False rather than raise.
for _punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(_punkt_resource, quiet=True)

# -------------------------------
# ✅ Custom sentence-based chunker
# -------------------------------
class SentenceSplitter:
    """Group the sentences of a text into fixed-size, overlapping chunks.

    Args:
        sentences_per_chunk: Maximum number of sentences joined into one chunk (>= 1).
        overlap: Number of sentences shared between consecutive chunks; must satisfy
            ``0 <= overlap < sentences_per_chunk`` so the window always advances.

    Raises:
        ValueError: If the two sizes would make the sliding window stand still or
            move backwards (previously this crashed inside ``range`` or silently
            produced no chunks at all).
    """

    def __init__(self, sentences_per_chunk=5, overlap=1):
        if sentences_per_chunk < 1:
            raise ValueError("sentences_per_chunk must be at least 1")
        if not 0 <= overlap < sentences_per_chunk:
            raise ValueError("overlap must satisfy 0 <= overlap < sentences_per_chunk")
        self.sentences_per_chunk = sentences_per_chunk
        self.overlap = overlap

    def split_text(self, text):
        """Split ``text`` into chunks of space-joined sentences.

        Args:
            text: Raw text; sentence boundaries come from ``sent_tokenize``.

        Returns:
            A list of chunk strings in document order (empty for text without sentences).
        """
        sentences = sent_tokenize(text)
        step = self.sentences_per_chunk - self.overlap
        chunks = []
        for start in range(0, len(sentences), step):
            # The previous window ended at `start + overlap`; if that already reached
            # the end of the text, this window would only repeat sentences that were
            # emitted before, so it is skipped instead of stored as a duplicate chunk.
            if start > 0 and start + self.overlap >= len(sentences):
                break
            chunks.append(" ".join(sentences[start:start + self.sentences_per_chunk]))
        return chunks

# -------------------------------
# 1. Load PDF
# -------------------------------
loader = PyPDFLoader("Cryptography and Network Security, 3rd Edition, by Behrouz A Forouzan and Depdeep.pdf")
documents = loader.load()

# -------------------------------
# 2. Sentence-based chunking
# -------------------------------
splitter = SentenceSplitter(sentences_per_chunk=5, overlap=1)

print("📖 Splitting documents into sentence-based chunks...")
chunks = []
for doc in tqdm(documents, desc="Chunking"):
    # Each chunk is a copy of the source document, so its metadata travels along;
    # only page_content is swapped for the sentence window.
    chunks.extend(
        doc.model_copy(update={"page_content": piece})
        for piece in splitter.split_text(doc.page_content)
    )

print(f"✅ Total sentence-based chunks: {len(chunks)}")


📖 Splitting documents into paragraph-based chunks...


Chunking: 100%|██████████| 752/752 [00:00<00:00, 288891.43it/s]

✅ Total paragraph-based chunks: 729





In [2]:
# Import from the maintained packages already used by the first cell. The old
# `langchain.embeddings` / `langchain.vectorstores` paths are deprecated shims, and
# re-importing HuggingFaceEmbeddings from there shadowed the `langchain_huggingface`
# class imported earlier (the recorded output flags the constructor line below).
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers model used to embed every chunk.
embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build the FAISS index over `chunks` (defined by the chunking cell) and persist it.
# NOTE(review): the directory name says "paragraph_chunking" while the chunking cell
# above is sentence-based — confirm which chunking this index is meant to hold.
vectorstore = FAISS.from_documents(chunks, embedder)
vectorstore.save_local("chapter_1_cryptography_paragraph_chunking")

# Retriever that returns the 5 nearest chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k":5})

  embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [3]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import re

from langchain_community.chat_models import ChatOllama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# ⚙️ Create the Ollama LLM object
# Chat model served by a local Ollama instance; used below for the HyDE chain.
# NOTE(review): the recorded cell output flags this constructor and the LLMChain
# line with warnings — presumably deprecation notices; confirm before upgrading.
llm = ChatOllama(
    model="llama3.1:8b",  # You can use any model pulled by Ollama: e.g., llama3, mistral, codellama, etc.
    temperature=0.3
)

# HyDE-style prompt: asks the model for `num` numbered exam questions about `topic`.
# The numbered "1. ..." format matters because the parsing code extracts the
# questions with a `\d+\.` regex.
hyde_question_prompt = PromptTemplate(
    input_variables=["topic", "num"],
    template="""
Generate {num} distinct, high-quality exam-style questions on the topic: "{topic}".

Each question should:
- Focus on a different concept
- Be suitable for a university-level exam
- Avoid repeating comparisons

Format:
1. Question one?
2. Question two?
3. ...
"""
)

# Dedicated chain for HyDE generation; its result dict is read via the "text" key.
hyde_chain = LLMChain(llm=llm, prompt=hyde_question_prompt)

# Function to generate HyDE questions
def generate_hypothetical_questions(query, chain, num=3):
    """Ask an LLM chain for hypothetical exam questions about a topic.

    Args:
        query: Topic passed to the chain as ``topic``.
        chain: Object whose ``invoke({"topic": ..., "num": ...})`` returns a mapping
            with the generated text under ``"text"``. This argument is now actually
            used; previously the function ignored it and always called the
            module-level ``hyde_chain``.
        num: Maximum number of questions requested and returned.

    Returns:
        A list of at most ``num`` non-empty question strings.
    """
    response = chain.invoke({"topic": query, "num": num})
    raw_text = response["text"]

    # Preferred format: numbered lines ("1. ...").
    questions = re.findall(r"\d+\.\s*(.+)", raw_text)
    if not questions:
        # Fallback for un-numbered output: one candidate per line, bullets stripped.
        questions = [line.strip("-• ").strip() for line in raw_text.split("\n")]

    # Drop entries that are empty after cleaning (e.g. a "---" separator line).
    questions = [q for q in questions if q.strip()][:num]

    # 🔍 Print the questions for visibility
    print(f"\n🌀 HyDE Questions for topic: '{query}':")
    for i, q in enumerate(questions, 1):
        print(f"  {i}. {q}")

    return questions


  llm = ChatOllama(
  hyde_chain = LLMChain(llm=llm, prompt=hyde_question_prompt)


In [4]:
def build_context_with_hyde(query, chain, retriever, k_per_q=4, num_hyde_qs=5):
    """Assemble a retrieval context for ``query`` from HyDE-generated questions.

    Args:
        query: Topic handed to ``generate_hypothetical_questions``.
        chain: Chain forwarded to ``generate_hypothetical_questions``.
        retriever: Object exposing ``get_relevant_documents(question)``.
        k_per_q: Number of leading results kept per hypothetical question.
        num_hyde_qs: Number of hypothetical questions to request.

    Returns:
        The ``page_content`` of every distinct retrieved chunk, in first-seen
        order, joined by blank lines.
    """
    hyde_queries = generate_hypothetical_questions(query, chain, num=num_hyde_qs)
    print("🌀 Generated HyDE Questions:")
    for hyde_query in hyde_queries:
        print(f"  • {hyde_query}")

    # Retrieve per question and de-duplicate on the chunk text while preserving order.
    seen_texts = set()
    unique_chunks = []
    for hyde_query in hyde_queries:
        for hit in retriever.get_relevant_documents(hyde_query)[:k_per_q]:
            if hit.page_content in seen_texts:
                continue
            seen_texts.add(hit.page_content)
            unique_chunks.append(hit)

    print(f"🔍 Retrieved {len(unique_chunks)} unique chunks from {len(hyde_queries)} HyDE questions.")
    return "\n\n".join(hit.page_content for hit in unique_chunks)


In [None]:
import os
from dotenv import load_dotenv
from langchain.llms import Together
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
# Load variables from a local .env file into the process environment.
load_dotenv()

# Get the API key from the environment
together_api_key = os.getenv("TOGETHER_API_KEY")

# Hosted Together model used for question + rubric generation.
# Note: this rebinds the notebook-level name `llm`; a chain constructed in an earlier
# cell keeps the LLM object it was built with.
llm = Together(
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
    temperature=0.3,
    together_api_key=together_api_key
)
# Alternative local backend, currently disabled:
# from langchain_community.chat_models import ChatOllama
# from langchain.prompts import PromptTemplate
# from langchain.chains import LLMChain

# # ⚙️ Create the Ollama LLM object
# llm = ChatOllama(
#     model="llama3.1:8b",  # You can use any model pulled by Ollama: e.g., llama3, mistral, codellama, etc.
#     temperature=0.3
# )

# Prompt that turns retrieved context into one exam question plus a 10-mark rubric.
# The "Question:" / "Rubric:" markers and "- <point> - <marks>" bullets are what the
# downstream parser in generate_question_and_rubric splits on.
prompt = PromptTemplate(
    input_variables=["context"],
    template = """
You are an AI question generator for academic exams.

Your task is to:
1. Read the provided academic or technical context.
2. Generate **one** relevant and insightful **exam-style question** that tests conceptual understanding.
3. Create a **detailed rubric** for a 10-mark question, with **exactly 3 to 5 bullet points**.
   - Each point should describe **what a good answer must contain**.
   - Each bullet must specify the **marks** allocated.
   - All marks must **sum to exactly 10**.

🛑 **Guidelines:**
- Do **not** include any explanations, instructions, or follow-up text after the rubric.
- The rubric should use **clear academic language**.
- Avoid repeating information in multiple rubric points.
- Do not refer back to the context in the question (avoid "According to the passage…").
- Use **neutral and formal academic tone**.
- Do not hallucinate or invent facts not implied in the context.

📌 **Output Format (strictly follow this):**

Question: <Insert your question here>
Rubric:
- <Point 1> - <marks>
- <Point 2> - <marks>
- <Point 3> - <marks>
[optional: - <Point 4> - <marks>]
[optional: - <Point 5> - <marks>]

---

Context:
{context}
"""
)

# Question/rubric chain; generate_question_and_rubric reads `chain.llm.model` and
# `chain.llm.temperature` from it to name the output file.
chain = LLMChain(llm=llm, prompt=prompt)

In [6]:
import json
import os
import re
from dotenv import load_dotenv

load_dotenv()
# LangSmith tracing configuration.
# `os.environ[...] = None` raises TypeError, so a variable that is missing from the
# environment/.env must not be copied blindly.
langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
if langsmith_api_key:
    os.environ["LANGCHAIN_API_KEY"] = langsmith_api_key
else:
    print("⚠️ LANGSMITH_API_KEY is not set; LangSmith tracing stays off unless LANGCHAIN_API_KEY is already defined.")

# Trace only when a key is actually available, otherwise every chain call would try
# to report to LangSmith without credentials.
os.environ["LANGCHAIN_TRACING_V2"] = "true" if os.getenv("LANGCHAIN_API_KEY") else "false"

# LANGCHAIN_PROJECT needs no copying: it is read from the environment under that very
# name, and re-assigning it to itself only crashed when it was unset.


def generate_question_and_rubric(query, chain, retriever, hyde_chain, folder, chapter, filename=None, num_questions=1, prompt_version="v1"):
    """Generate exam questions with marking rubrics for a topic and append them to a JSON file.

    Args:
        query: Topic used to seed the HyDE questions.
        chain: Question/rubric chain; ``chain.invoke({"context": ...}, config=...)``
            must return a mapping with the raw answer under ``"text"``. ``chain.llm``
            supplies ``temperature`` and, optionally, ``model``.
        retriever: Object exposing ``get_relevant_documents(question)``.
        hyde_chain: Chain invoked with ``{"topic", "num"}`` that returns hypothetical
            questions under ``"text"``.
        folder: Output directory (created if missing).
        chapter: Chapter identifier used in the file name and trace tags.
        filename: Output file name; derived from model, temperature, chapter and
            prompt version when ``None``.
        num_questions: Number of generation attempts.
        prompt_version: Label stored with each record and used in the file name.

    Returns:
        None. Records ``{"question", "rubric", "prompt_version"}`` are appended to the
        JSON list in ``folder/filename``; everything collected so far is written even
        if a later generation call raises.
    """
    # --------------------------
    # 🔍 1. Build context using HyDE
    # --------------------------
    def build_context_with_hyde(query, hyde_chain, retriever, k_per_q=3, num_hyde_qs=5):
        # Get hypothetical questions
        hyde_response = hyde_chain.invoke({"topic": query, "num": num_hyde_qs})
        raw_text = hyde_response["text"]
        questions = re.findall(r"\d+\.\s*(.+)", raw_text)
        if not questions:
            questions = [line.strip("-• ").strip() for line in raw_text.split("\n")]
        questions = [q for q in questions if q.strip()][:num_hyde_qs]
        if not questions:
            # Nothing usable came back: retrieve with the topic itself rather than
            # prompting the generator with an empty context.
            questions = [query]

        print(f"\n🌀 HyDE Questions for topic: '{query}':")
        for i, q in enumerate(questions, 1):
            print(f"  {i}. {q}")

        # Retrieve chunks from retriever
        all_docs = []
        for q in questions:
            results = retriever.get_relevant_documents(q)
            all_docs.extend(results[:k_per_q])

        # Remove duplicates (by chunk text, keeping first-seen order)
        seen = set()
        unique_chunks = []
        for doc in all_docs:
            if doc.page_content not in seen:
                seen.add(doc.page_content)
                unique_chunks.append(doc)

        print(f"🔍 Retrieved {len(unique_chunks)} unique chunks from {len(questions)} HyDE questions.")

        context = "\n\n".join(doc.page_content for doc in unique_chunks)
        print(f"🧾 Context preview (first 500 chars):\n{context[:500]}")
        return context

    context = build_context_with_hyde(query, hyde_chain, retriever)

    # --------------------------
    # 🧠 2. Generate questions from context
    # --------------------------
    # Not every LLM wrapper exposes `.model`; resolve it once with a fallback and reuse
    # it everywhere (the trace metadata used to read `chain.llm.model` unguarded).
    model_raw = getattr(chain.llm, "model", "unknown-model")
    model_name = re.sub(r'[^a-zA-Z0-9_-]', '_', str(model_raw))
    temp = str(chain.llm.temperature).replace(".", "_")
    chapter_str = f"_chapter{chapter}"
    version_str = f"_{prompt_version}"
    if filename is None:
        filename = f"{model_name}_temp{temp}{chapter_str}{version_str}.json"

    filepath = os.path.join(folder, filename)
    os.makedirs(folder, exist_ok=True)

    if os.path.exists(filepath):
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = []

    try:
        for i in range(num_questions):
            response = chain.invoke(
                {"context": context},
                config={
                    "run_name": f"qgen_ch{chapter}_v{prompt_version}",
                    "tags": ["qgen", f"chapter{chapter}", f"v{prompt_version}", "rubric"],
                    "metadata": {
                        "topic": query,
                        "model": model_raw,
                        "temperature": chain.llm.temperature,
                        "version": prompt_version,
                        "chapter": chapter,
                        "retrieved_chunks": context[:1500]
                    }
                }
            )
            text = response["text"]
            try:
                question = text.split("Question:")[1].split("Rubric:")[0].strip()
                rubric_block = text.split("Rubric:")[1].strip()
                rubric_points = re.findall(r"- (.+)", rubric_block)
            except IndexError:
                print(f"⚠️ Format issue on question #{i+1}. Skipped.\n{text}")
                continue
            if not question or not rubric_points:
                # Markers present but nothing usable between them: do not store an
                # empty record.
                print(f"⚠️ Format issue on question #{i+1}. Skipped.\n{text}")
                continue

            result = {
                "question": question,
                "rubric": rubric_points,
                "prompt_version": prompt_version
            }

            data.append(result)

            print(f"\n✅ Q{i+1} Saved to: {filepath}")
            print(f"🧠 Question: {question}")
            print("📏 Rubric:")
            for j, point in enumerate(rubric_points, 1):
                print(f"  {j}. {point}")
    finally:
        # Persist in `finally` so questions already generated survive a failure
        # (network error, interrupt) in a later iteration.
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)


In [2]:
# Run the pipeline for one topic and append 5 questions to the dataset folder.
# Requires `chain`, `retriever` and `hyde_chain` from earlier cells: the recorded
# output of this cell is a NameError for `chain`, i.e. it was executed before the
# cell that defines it.
# NOTE(review): `folder` is an absolute path on one machine, and the prompt_version
# label says "k_per_q_4" while the function defined in the preceding cell retrieves
# with k_per_q=3 — confirm both before reusing.
generate_question_and_rubric(
    query="symmetric encryption",
    chain=chain,
    retriever=retriever,
    hyde_chain=hyde_chain,
    folder=r"C:\Users\dhili\Desktop\SRIP\week2\Dataset",
    chapter=1,
    prompt_version="v3_hyde_para_chunking_k_per_q_4num_hyde_qs_5",
    num_questions=5
)


NameError: name 'chain' is not defined

In [6]:
import os
import re
import json
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from tqdm import tqdm

# --- Load API keys ---
load_dotenv()
# LangSmith tracing configuration.
# `os.environ[...] = None` raises TypeError, so a variable that is missing from the
# environment/.env must not be copied blindly.
langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
if langsmith_api_key:
    os.environ["LANGCHAIN_API_KEY"] = langsmith_api_key
else:
    print("⚠️ LANGSMITH_API_KEY is not set; LangSmith tracing stays off unless LANGCHAIN_API_KEY is already defined.")

# Trace only when a key is actually available, otherwise every chain call would try
# to report to LangSmith without credentials.
os.environ["LANGCHAIN_TRACING_V2"] = "true" if os.getenv("LANGCHAIN_API_KEY") else "false"

# LANGCHAIN_PROJECT needs no copying: it is read from the environment under that very
# name, and re-assigning it to itself only crashed when it was unset.

# --- PDF Chunking (Paragraph-based) ---
class ParagraphSplitter:
    """Group blank-line-separated paragraphs into fixed-size, overlapping chunks.

    Args:
        paragraphs_per_chunk: Maximum number of paragraphs joined into one chunk (>= 1).
        overlap: Number of paragraphs shared between consecutive chunks; must satisfy
            ``0 <= overlap < paragraphs_per_chunk`` so the window always advances.

    Raises:
        ValueError: If the two sizes would make the sliding window stand still or
            move backwards (previously this crashed inside ``range`` or silently
            produced no chunks at all).
    """

    def __init__(self, paragraphs_per_chunk=3, overlap=1):
        if paragraphs_per_chunk < 1:
            raise ValueError("paragraphs_per_chunk must be at least 1")
        if not 0 <= overlap < paragraphs_per_chunk:
            raise ValueError("overlap must satisfy 0 <= overlap < paragraphs_per_chunk")
        self.paragraphs_per_chunk = paragraphs_per_chunk
        self.overlap = overlap

    def split_text(self, text):
        """Split ``text`` into chunks of paragraphs joined by blank lines.

        Paragraphs are the non-empty, stripped pieces between ``"\\n\\n"`` separators.

        Returns:
            A list of chunk strings in document order (empty when ``text`` holds no
            non-blank paragraph).
        """
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
        step = self.paragraphs_per_chunk - self.overlap
        chunks = []
        for start in range(0, len(paragraphs), step):
            # The previous window ended at `start + overlap`; if that already reached
            # the end, this window would only repeat paragraphs emitted before, so it
            # is skipped instead of being stored as a duplicate chunk.
            if start > 0 and start + self.overlap >= len(paragraphs):
                break
            chunks.append("\n\n".join(paragraphs[start:start + self.paragraphs_per_chunk]))
        return chunks

# --- Load & Split PDF ---
loader = PyPDFLoader("Cryptography and Network Security, 3rd Edition, by Behrouz A Forouzan and Depdeep.pdf")
documents = loader.load()

# Paragraph windows of 3 with 1 paragraph shared between neighbours.
splitter = ParagraphSplitter(paragraphs_per_chunk=3, overlap=1)
chunks = []
for doc in tqdm(documents, desc="Chunking"):
    # Copy the source document per chunk so metadata is kept; only the text changes.
    chunks.extend(
        doc.model_copy(update={"page_content": piece})
        for piece in splitter.split_text(doc.page_content)
    )

# --- Embed & Store ---
embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(chunks, embedder)
vectorstore.save_local("chapter_1_cryptography_paragraph_chunking")
# Retriever returning the 5 nearest chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# --- HyDE Chain ---
# Local Ollama chat model; in this cell it backs BOTH the HyDE chain and the
# question/rubric chain defined below.
llm = ChatOllama(model="llama3.1:8b", temperature=0.3)
# HyDE prompt: asks for `num` numbered questions on `topic`. The numbered "1. ..."
# format is what the `\d+\.` regex in build_context_with_hyde extracts.
hyde_prompt = PromptTemplate(
    input_variables=["topic", "num"],
    template="""Generate {num} distinct, high-quality exam-style questions on the topic: "{topic}".\n\nEach question should:\n- Focus on a different concept\n- Be suitable for a university-level exam\n- Avoid repeating comparisons\n\nFormat:\n1. Question one?\n2. Question two?\n3. ..."""
)
hyde_chain = LLMChain(llm=llm, prompt=hyde_prompt)

# --- Rubric Generation Chain ---
# Prompt that turns retrieved context into one question plus a 10-mark rubric. The
# "Question:" / "Rubric:" markers and "- <point> - <marks>" bullets are the format
# that generate_question_and_rubric parses.
question_prompt = PromptTemplate(
    input_variables=["context"],
    template="""
You are an AI question generator for academic exams.

📌 Read the provided academic context.
🎯 Your goal is to:

1. Generate **one** well-formed, exam-style **question** (short-answer or descriptive).
2. Provide a **detailed rubric** with **3 to 5 bullet points** totaling **exactly 10 marks**.
3. 🚫 Do NOT add extra analysis or explanations outside the rubric.

🧠 Important:
- Use only the information within the context.
- Use formal academic language.
- Do NOT refer back to "the passage" or "text above".

⚠️ Strict output format (do NOT change):

Question: <Insert your question here>
Rubric:
- <Point 1> - <marks>
- <Point 2> - <marks>
- <Point 3> - <marks>
[optional: - <Point 4> - <marks>]
[optional: - <Point 5> - <marks>]

---

Context:
{context}
"""
)
# Question/rubric chain; its `llm.model` and `llm.temperature` are used to name the
# output file.
chain = LLMChain(llm=llm, prompt=question_prompt)

# --- Full Function ---
def generate_question_and_rubric(query, chain, retriever, hyde_chain, folder, chapter, filename=None, num_questions=1, prompt_version="v1"):
    """Generate exam questions with marking rubrics for a topic and append them to a JSON file.

    Args:
        query: Topic used to seed the HyDE questions.
        chain: Question/rubric chain; ``chain.invoke({"context": ...})`` must return a
            mapping with the raw answer under ``"text"``. ``chain.llm`` supplies
            ``temperature`` and, optionally, ``model``.
        retriever: Object exposing ``get_relevant_documents(question)``.
        hyde_chain: Chain invoked with ``{"topic", "num"}`` that returns hypothetical
            questions under ``"text"``.
        folder: Output directory (created if missing).
        chapter: Chapter identifier used in the default file name.
        filename: Output file name; derived from model, temperature, chapter and
            prompt version when ``None``.
        num_questions: Number of generation attempts.
        prompt_version: Label stored with each record and used in the file name.

    Returns:
        None. Records ``{"question", "rubric", "prompt_version"}`` are appended to the
        JSON list in ``folder/filename``; everything collected so far is written even
        if a later generation call raises.
    """
    def build_context_with_hyde(query, hyde_chain, retriever, k_per_q=4, num_hyde_qs=5):
        """Retrieve de-duplicated chunk text for HyDE questions derived from ``query``."""
        hyde_response = hyde_chain.invoke({"topic": query, "num": num_hyde_qs})
        raw_text = hyde_response["text"]
        questions = re.findall(r"\d+\.\s*(.+)", raw_text)
        if not questions:
            # Un-numbered output: fall back to one candidate per line, bullets stripped.
            questions = [line.strip("-• ").strip() for line in raw_text.split("\n")]
        questions = [q for q in questions if q.strip()][:num_hyde_qs]
        if not questions:
            # Nothing usable came back: retrieve with the topic itself rather than
            # prompting the generator with an empty context.
            questions = [query]
        print(f"\n🌀 HyDE Questions for topic: '{query}':")
        for q in questions:
            print(f"  • {q}")
        all_docs = []
        for q in questions:
            results = retriever.get_relevant_documents(q)
            all_docs.extend(results[:k_per_q])
        seen, unique_chunks = set(), []
        for doc in all_docs:
            if doc.page_content not in seen:
                seen.add(doc.page_content)
                unique_chunks.append(doc)
        print(f"🔍 Retrieved {len(unique_chunks)} unique chunks from {len(questions)} HyDE questions.")
        return "\n\n".join(doc.page_content for doc in unique_chunks)

    def parse_rubric_points(rubric_block):
        """Extract rubric bullets line by line.

        A bullet in the strict "- <text> - <marks>" form is normalised to
        "<text> - <marks>"; any other "- ..." bullet is kept verbatim. Deciding per
        line matters: applying the strict pattern to the whole block used to drop
        every bullet without a numeric suffix as soon as one bullet had one.
        """
        points = []
        for line in rubric_block.split("\n"):
            strict = re.findall(r"- (.+?) - (\d+)", line)
            if strict:
                points.extend(f"{desc.strip()} - {mark.strip()}" for desc, mark in strict)
                continue
            relaxed = re.search(r"- (.+)", line)
            if relaxed:
                points.append(relaxed.group(1).strip())
        return points

    context = build_context_with_hyde(query, hyde_chain, retriever)
    model_raw = getattr(chain.llm, "model", "unknown-model")
    model_name = re.sub(r'[^a-zA-Z0-9_-]', '_', str(model_raw))
    temp = str(chain.llm.temperature).replace(".", "_")
    chapter_str = f"_chapter{chapter}"
    version_str = f"_{prompt_version}"
    if filename is None:
        filename = f"{model_name}_temp{temp}{chapter_str}{version_str}.json"
    filepath = os.path.join(folder, filename)
    os.makedirs(folder, exist_ok=True)

    if os.path.exists(filepath):
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = []

    try:
        for i in range(num_questions):
            response = chain.invoke({"context": context})
            text = response["text"]

            try:
                # ✅ Check if structure is present
                if "Question:" not in text or "Rubric:" not in text:
                    raise ValueError("Missing required format markers (Question:/Rubric:)")

                question = text.split("Question:")[1].split("Rubric:")[0].strip()
                rubric_block = text.split("Rubric:")[1].strip()
                rubric_points = parse_rubric_points(rubric_block)

                if not question or not rubric_points:
                    raise ValueError("Parsed question or rubric is empty")

            except ValueError as e:
                print(f"⚠️ Format issue on question #{i+1}. Skipped.\n{text}\nError: {e}")
                continue

            result = {
                "question": question,
                "rubric": rubric_points,
                "prompt_version": prompt_version
            }

            data.append(result)
            print(f"\n✅ Q{i+1} Saved to: {filepath}")
            print(f"🧠 Question: {question}")
            print("📏 Rubric:")
            for j, point in enumerate(rubric_points, 1):
                print(f"  {j}. {point}")
    finally:
        # Persist in `finally` so questions already generated survive a failure
        # (network error, interrupt) in a later iteration.
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

# --- 🔥 Execute ---
# Run the full pipeline for one topic: 5 generation attempts, appended to the JSON
# file derived from model, temperature, chapter and prompt_version.
# The "k_per_q_4" / "num_hyde_qs_5" parts of the label mirror the defaults of the
# nested build_context_with_hyde in this cell; they are not passed as arguments here.
# NOTE(review): `folder` is an absolute path on one machine — confirm before reuse.
generate_question_and_rubric(
    query="symmetric encryption",
    chain=chain,
    retriever=retriever,
    hyde_chain=hyde_chain,
    folder=r"C:\Users\dhili\Desktop\SRIP\week2\Dataset",
    chapter=1,
    prompt_version="v3_hyde_para_chunking_k_per_q_4num_hyde_qs_5",
    num_questions=5
)


Chunking: 100%|██████████| 752/752 [00:00<00:00, 183967.14it/s]



🌀 HyDE Questions for topic: 'symmetric encryption':
  • **Key Management**
  • **Data Integrity**
  • **Side-Channel Attacks**
  • **Key Exchange**
  • **Pseudorandom Number Generation**
🔍 Retrieved 18 unique chunks from 5 HyDE questions.
⚠️ Format issue on question #1. Skipped.
This is a collection of text from various chapters in a book on cryptography. I'll provide a summary and answer any specific questions you may have.

**Summary**

The text covers various topics related to cryptography, including:

1. **Symmetric-key encryption**: The text explains the concept of symmetric-key encryption, where both parties use the same key for encryption and decryption.
2. **Asymmetric-key encryption**: The text introduces asymmetric-key encryption, also known as public-key encryption, where each party has a pair of keys: a public key for encryption and a private key for decryption.
3. **Cryptanalysis**: The text defines cryptanalysis as the science and art of breaking secret codes.
4. **Rando