In [1]:
import logging
from typing import List
import os
import faiss
import numpy as np
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
from PyPDF2 import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Emit INFO-level log records so pipeline progress is visible in the cell output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NOTE(review): MODEL_DIR is not referenced anywhere else in the visible notebook — confirm it is still needed.
MODEL_DIR="models"
# Can change back to "mistralai/Mistral-7B-v0.3" afterwards
# NOTE(review): the note above presumably refers to LLM_MODEL_NAME rather than the embedding model — confirm.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL_NAME = "google/flan-t5-large"
CHUNK_SIZE = 500  # passed to chunk_text as the splitter's chunk_size (measured with len)
CHUNK_OVERLAP = 100  # passed to chunk_text as the splitter's chunk_overlap
TOP_K = 4  # default number of chunks returned by retrieve()




In [2]:
def load_pdf(path: str) -> str:
    """Extract the text of every page of one PDF into a single string.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The extracted text of all pages, each page followed by a newline.
        Pages that yield no text contribute nothing to the result.
    """
    logger.info(f"Loading PDF: {path}")
    reader = PdfReader(path)
    page_texts = []
    for page in reader.pages:
        page_text = page.extract_text()
        # Guard against a falsy extraction result (empty string or None), the
        # same way load_pdfs does, so concatenation can never raise TypeError.
        if page_text:
            page_texts.append(page_text + "\n")
    # Join once instead of growing a string with += inside the loop.
    return "".join(page_texts)

def chunk_text(text: str, chunk_sz: int, ol: int, min_chars: int = 150) -> List[str]:
    """Split text into overlapping chunks and drop chunks unlikely to be useful.

    Args:
        text: The full document text to split.
        chunk_sz: Maximum chunk length handed to the splitter (measured with len).
        ol: Overlap between consecutive chunks handed to the splitter.
        min_chars: A stripped chunk is kept only if it is longer than this many
            characters. Defaults to 150, the value that was previously hard-coded.

    Returns:
        The stripped chunks that are longer than ``min_chars`` and that do not
        start with ">>>", "```", "chapter" or "table of contents" (the last two
        compared case-insensitively).
    """
    logger.info("Chunking text with RecursiveCharacterTextSplitter")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_sz,
        chunk_overlap=ol,
        length_function=len,
        separators=[
            "\n\n",
            ". ",
            "\n",
            " ",
            ""
        ]
    )

    raw_chunks = splitter.split_text(text)
    logger.info(f"Created {len(raw_chunks)} chunks")

    kept_chunks = []
    for raw_chunk in raw_chunks:
        # Strip once and reuse the result for every check.
        chunk = raw_chunk.strip()
        if len(chunk) <= min_chars:
            continue
        if chunk.startswith((">>>", "```")):
            continue
        if chunk.lower().startswith(("chapter", "table of contents")):
            continue
        kept_chunks.append(chunk)

    # The count logged above is pre-filter; also report how many chunks are
    # actually returned so the log matches what downstream steps receive.
    logger.info(f"Kept {len(kept_chunks)} chunks after filtering")
    return kept_chunks

In [3]:
class MiniRAG:
    """Small retrieval-augmented QA pipeline.

    Embeds document chunks with the SentenceTransformer named by
    EMBEDDING_MODEL_NAME, stores the L2-normalised vectors in a FAISS
    inner-product index, and answers questions with the seq2seq model named
    by LLM_MODEL_NAME using the retrieved chunks as context.
    """

    def __init__(self):
        """Load the embedding model, tokenizer and LLM; start with no index."""
        logger.info("Initializing model")

        self.embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

        self.tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
        self.llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL_NAME)

        # Populated by build_index().
        self.index = None
        self.chunks = []

    def build_index(self, documents: List[str]):
        """Embed the documents and (re)build the FAISS index over them.

        Args:
            documents: Text chunks to index.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        if not documents:
            raise ValueError("Cannot build an index from an empty list of documents.")

        logger.info("Creating embeddings")
        embeddings = self.embedding_model.encode(documents, show_progress_bar=True)
        embeddings = np.array(embeddings).astype("float32")

        # Normalising in place makes the inner product equal cosine similarity.
        faiss.normalize_L2(embeddings)
        self.index = faiss.IndexFlatIP(embeddings.shape[1])
        self.index.add(embeddings)
        # Keep our own copy so later changes to the caller's list cannot
        # desynchronise row numbers in the index from the stored chunks.
        self.chunks = list(documents)

        logger.info("FAISS index created")

    def retrieve(self, query: str, k: int = TOP_K) -> List[str]:
        """Return up to ``k`` indexed chunks most similar to the query.

        Prints the similarity score of each returned chunk.

        Args:
            query: The question to search for.
            k: Number of nearest neighbours to request from the index.

        Returns:
            The matching chunks, best match first. Fewer than ``k`` chunks are
            returned when the index holds fewer than ``k`` vectors.

        Raises:
            RuntimeError: If build_index() has not been called yet.
        """
        if self.index is None:
            raise RuntimeError("No index available: call build_index() before retrieve().")

        logger.info("Retrieving relevant chunks")
        query_embedding = self.embedding_model.encode([query]).astype("float32")
        faiss.normalize_L2(query_embedding)
        # One search is enough; it returns both scores and row indices.
        scores, indices = self.index.search(query_embedding, k)

        hits = []
        for rank, (score, idx) in enumerate(zip(scores[0], indices[0])):
            # FAISS pads missing results with -1; indexing self.chunks with it
            # would silently return the last chunk.
            if idx < 0:
                continue
            print(f"Score {rank}: {score:.3f}")
            hits.append(self.chunks[idx])
        return hits

    def generate(self, query: str, context_chunks: List[str]) -> str:
        """Generate an answer to the query from the given context chunks.

        Args:
            query: The user's question.
            context_chunks: Chunks joined with blank lines to form the context.

        Returns:
            The decoded model output with special tokens removed.
        """
        logger.info("Generating answer")
        context = "\n\n".join(context_chunks)
        prompt = (
            "You are a technical assistant. "
            "Answer the question clearly and concisely using ONLY the information in the context. "
            "You may summarize or combine information across chunks. "
            "Ignore code snippets unless they directly answer the question. "
            "If the context is completely irrelevant, respond exactly: 'Not found in context.'\n\n"
            f"Context:\n{context}\n\n"
            f"Question:\n{query}\n\nAnswer:"
        )

        # The prompt is truncated to 1024 tokens; the question comes after the
        # context, so an over-long context would cut into the question.
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        output_ids = self.llm.generate(**inputs, max_new_tokens=200)
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

    def ask(self, query: str) -> str:
        """Retrieve context for the query, print chunk previews, and answer it.

        Args:
            query: The user's question.

        Returns:
            The generated answer.
        """
        retrieved_chunks = self.retrieve(query)
        for i, c in enumerate(retrieved_chunks):
            print(f"\n Retrieved chunk {i+1} ")
            # Only the first 300 characters are shown as a preview.
            print(c[:300])

        return self.generate(query, retrieved_chunks)

In [4]:
# Baseline run: build the pipeline with the off-the-shelf embedding model.
rag = MiniRAG()

# Read one PDF and split it into filtered chunks.
text = load_pdf("s1.pdf")
chunks = chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP)

# Embed the chunks and index them for retrieval.
rag.build_index(chunks)

# Ask a single question; ask() also prints scores and chunk previews.
question = "When does overfitting happen?"
answer = rag.ask(question)

print(answer)

INFO:__main__:Initializing model
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Loading PDF: s1.pdf
INFO:__main__:Chunking text with RecursiveCharacterTextSplitter
INFO:__main__:Created 4123 chunks
INFO:__main__:Creating embeddings


Batches:   0%|          | 0/117 [00:00<?, ?it/s]

INFO:__main__:FAISS index created
INFO:__main__:Retrieving relevant chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:__main__:Generating answer


Score 0: 0.621
Score 1: 0.606
Score 2: 0.606
Score 3: 0.604

 Retrieved chunk 1 
. If a model suffers from overfitting, we also 
say that the model has a high variance, which can be caused by having too many parameters, leading 
to a model that is too complex given the underlying data

 Retrieved chunk 2 
is one approach to tackling the problem of overfitting by adding additional information and thereby 
shrinking the parameter values of the model to induce a penalty against complexity

 Retrieved chunk 3 
. To address this problem of overfitting, 
we can collect more training data, reduce the complexity of the model, or increase the regularization 
parameter, for example.
For unregularized models, it can also help to decrease the number of features via feature selection 
(Chapter 4) or feature extrac

 Retrieved chunk 4 
. The reason for the overfitting is that our model is too complex for 
the given training data. Common solutions to reduce the generalization error are as follows:
• 

In [5]:

# PDFs whose combined text is used for fine-tuning and for the retrieval index.
PDF_FILES = ["s1.pdf", "s2.pdf"]
# Path handed to run_fine_tuning as output_path and to FineTunedRAG as the embedding model to load.
OUTPUT_MODEL_PATH = "finetuned_model_local"

# Starting checkpoint for the embedding fine-tuning.
BASE_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# NOTE(review): the four constants below repeat, with identical values, assignments
# made in the first configuration cell — consider keeping a single definition.
LLM_MODEL_NAME = "google/flan-t5-large"
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100
TOP_K = 4

def load_pdfs(file_paths: List[str]) -> str:
    """Concatenate the extracted text of several PDFs.

    Paths that do not exist are skipped with a warning. Pages that yield no
    text are skipped as well.

    Args:
        file_paths: Paths of the PDFs to read, in order.

    Returns:
        The text of every readable page, each followed by a newline, in the
        order the files and pages were visited.
    """
    pieces = []
    for pdf_path in file_paths:
        # Guard clause: missing files are reported, not fatal.
        if not os.path.exists(pdf_path):
            logger.warning(f"File not found: {pdf_path}")
            continue

        logger.info(f"Loading PDF: {pdf_path}")
        for page in PdfReader(pdf_path).pages:
            page_text = page.extract_text()
            if page_text:
                pieces.append(page_text + "\n")
    return "".join(pieces)

def chunk_text(text: str, chunk_sz: int, ol: int, min_chars: int = 150) -> List[str]:
    """Split text into overlapping chunks and drop chunks unlikely to be useful.

    Args:
        text: The full document text to split.
        chunk_sz: Maximum chunk length handed to the splitter (measured with len).
        ol: Overlap between consecutive chunks handed to the splitter.
        min_chars: A stripped chunk is kept only if it is longer than this many
            characters. Defaults to 150, the value that was previously hard-coded.

    Returns:
        The stripped chunks that are longer than ``min_chars`` and that do not
        start with ">>>", "```", "chapter" or "table of contents" (the last two
        compared case-insensitively).
    """
    logger.info("Chunking text with RecursiveCharacterTextSplitter")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_sz,
        chunk_overlap=ol,
        length_function=len,
        separators=[
            "\n\n",
            ". ",
            "\n",
            " ",
            ""
        ]
    )

    raw_chunks = splitter.split_text(text)
    logger.info(f"Created {len(raw_chunks)} chunks")

    kept_chunks = []
    for raw_chunk in raw_chunks:
        # Strip once and reuse the result for every check.
        chunk = raw_chunk.strip()
        if len(chunk) <= min_chars:
            continue
        if chunk.startswith((">>>", "```")):
            continue
        if chunk.lower().startswith(("chapter", "table of contents")):
            continue
        kept_chunks.append(chunk)

    # The count logged above is pre-filter; also report how many chunks are
    # actually returned so the log matches what downstream steps receive.
    logger.info(f"Kept {len(kept_chunks)} chunks after filtering")
    return kept_chunks

In [6]:
def run_fine_tuning(
    train_chunks: List[str],
    base_model_name: str,
    output_path: str,
    epochs: int = 1,
    batch_size: int = 16,
):
    """Fine-tune a SentenceTransformer on the given chunks and save it.

    Every chunk is paired with itself as one training example and the model is
    trained with MultipleNegativesRankingLoss.
    NOTE(review): pairing a text with itself resembles unsupervised
    SimCSE-style training — confirm that this is the intended objective.

    Args:
        train_chunks: Text chunks used as training data.
        base_model_name: Name or path of the SentenceTransformer to start from.
        output_path: Where the fine-tuned model is saved.
        epochs: Number of training epochs (default 1, the previous fixed value).
        batch_size: Training batch size (default 16, the previous fixed value).

    Raises:
        ValueError: If ``train_chunks`` is empty.
    """
    # Fail fast, before loading any model, when there is nothing to train on
    # (for example because every input PDF was missing).
    if not train_chunks:
        raise ValueError("train_chunks is empty: nothing to fine-tune on.")

    logger.info(f"Starting fine-tuning on {len(train_chunks)} chunks...")

    model = SentenceTransformer(base_model_name)

    train_examples = [InputExample(texts=[chunk, chunk]) for chunk in train_chunks]

    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)

    train_loss = losses.MultipleNegativesRankingLoss(model)

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        show_progress_bar=True,
        output_path=output_path
    )
    logger.info(f"Fine-tuning complete. Model saved to: {output_path}")


In [7]:
class FineTunedRAG:
    """Retrieval-augmented QA pipeline that uses a fine-tuned embedding model.

    Embeds document chunks with the SentenceTransformer loaded from a given
    path, stores the L2-normalised vectors in a FAISS inner-product index, and
    answers questions with the seq2seq model named by LLM_MODEL_NAME.
    """

    def __init__(self, embedding_model_path: str):
        """Load the embedding model, tokenizer and LLM; start with no index.

        Args:
            embedding_model_path: Name or path of the SentenceTransformer to load.
        """
        logger.info("Initializing RAG with Fine-Tuned Model")

        self.embedding_model = SentenceTransformer(embedding_model_path)

        self.tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
        self.llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL_NAME)
        # Populated by build_index().
        self.index = None
        self.chunks = []

    def build_index(self, documents: List[str]):
        """Embed the documents and (re)build the FAISS index over them.

        Args:
            documents: Text chunks to index.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        if not documents:
            raise ValueError("Cannot build an index from an empty list of documents.")

        logger.info("Creating embeddings for index")
        embeddings = self.embedding_model.encode(documents, show_progress_bar=True)
        embeddings = np.array(embeddings).astype("float32")
        # Normalising in place makes the inner product equal cosine similarity.
        faiss.normalize_L2(embeddings)
        self.index = faiss.IndexFlatIP(embeddings.shape[1])
        self.index.add(embeddings)
        # Keep our own copy so later changes to the caller's list cannot
        # desynchronise row numbers in the index from the stored chunks.
        self.chunks = list(documents)
        logger.info("FAISS index built.")

    def retrieve(self, query: str, k: int = TOP_K) -> List[str]:
        """Return up to ``k`` indexed chunks most similar to the query.

        Prints the similarity score of each returned chunk.

        Args:
            query: The question to search for.
            k: Number of nearest neighbours to request from the index.

        Returns:
            The matching chunks, best match first. Fewer than ``k`` chunks are
            returned when the index holds fewer than ``k`` vectors.

        Raises:
            RuntimeError: If build_index() has not been called yet.
        """
        if self.index is None:
            raise RuntimeError("No index available: call build_index() before retrieve().")

        query_embedding = self.embedding_model.encode([query]).astype("float32")
        faiss.normalize_L2(query_embedding)
        # One search is enough; it returns both scores and row indices.
        scores, indices = self.index.search(query_embedding, k)

        hits = []
        for rank, (score, idx) in enumerate(zip(scores[0], indices[0])):
            # FAISS pads missing results with -1; indexing self.chunks with it
            # would silently return the last chunk.
            if idx < 0:
                continue
            print(f"Score {rank}: {score:.3f}")
            hits.append(self.chunks[idx])
        return hits

    def generate(self, query: str, context_chunks: List[str]) -> str:
        """Generate an answer to the query from the given context chunks.

        Args:
            query: The user's question.
            context_chunks: Chunks joined with blank lines to form the context.

        Returns:
            The decoded model output with special tokens removed.
        """
        context = "\n\n".join(context_chunks)
        prompt = (
            "You are a technical assistant. "
            "Answer the question clearly and concisely using ONLY the information in the context. "
            "You may summarize or combine information across chunks. "
            "Ignore code snippets unless they directly answer the question. "
            "If the context is completely irrelevant, respond exactly: 'Not found in context.'\n\n"
            f"Context:\n{context}\n\n"
            f"Question:\n{query}\n\nAnswer:"
        )

        # The prompt is truncated to 1024 tokens; the question comes after the
        # context, so an over-long context would cut into the question.
        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        output_ids = self.llm.generate(**inputs, max_new_tokens=200)
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

    def ask(self, query: str) -> str:
        """Retrieve context for the query, print chunk previews, and answer it.

        Args:
            query: The user's question.

        Returns:
            The generated answer.
        """
        logger.info(f"Querying: {query}")
        retrieved_chunks = self.retrieve(query)
        for i, c in enumerate(retrieved_chunks):
            print(f"\nRetrieved chunk {i+1}")
            # Only the first 300 characters are shown as a preview.
            print(c[:300])

        return self.generate(query, retrieved_chunks)

In [8]:
# Load and chunk every PDF listed in PDF_FILES.
raw_text = load_pdfs(PDF_FILES)

doc_chunks = chunk_text(raw_text, CHUNK_SIZE, CHUNK_OVERLAP)

# Phase 1: fine-tune the base embedding model on the chunks and save it.
print("\n--- Phase 1: Fine-Tuning Embeddings ---\n")
run_fine_tuning(doc_chunks, BASE_EMBEDDING_MODEL, OUTPUT_MODEL_PATH)

# Phase 2: reload the saved model and index the same chunks with it.
# Note: this rebinds `rag`, replacing the MiniRAG instance created earlier.
print("\n--- Phase 2: Running RAG ---\n")
rag = FineTunedRAG(embedding_model_path=OUTPUT_MODEL_PATH)
rag.build_index(doc_chunks)

question = "What is underfitting?" 
    
answer = rag.ask(question)
print(question)
print(answer)

INFO:__main__:Loading PDF: s1.pdf


INFO:__main__:Loading PDF: s2.pdf
INFO:__main__:Chunking text with RecursiveCharacterTextSplitter
INFO:__main__:Created 8171 chunks
INFO:__main__:Starting fine-tuning on 7562 chunks...
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2



--- Phase 1: Fine-Tuning Embeddings ---



Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss


INFO:sentence_transformers.SentenceTransformer:Save model to finetuned_model_local
INFO:__main__:Fine-tuning complete. Model saved to: finetuned_model_local
INFO:__main__:Initializing RAG with Fine-Tuned Model
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: finetuned_model_local



--- Phase 2: Running RAG ---



INFO:__main__:Creating embeddings for index


Batches:   0%|          | 0/237 [00:00<?, ?it/s]

INFO:__main__:FAISS index built.
INFO:__main__:Querying: What is underfitting?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Score 0: 0.517
Score 1: 0.507
Score 2: 0.503
Score 3: 0.498

Retrieved chunk 1
. We can also see that the 
training accuracy increases for training datasets with fewer than 250 examples, and the gap between 
validation and training accuracy widens—an indicator of an increasing degree of overfitting.
Addressing over- and underfitting with validation curves
Validation curves are

Retrieved chunk 2
. As we discussed in Chapter 3, A Tour of Machine Learning 
Classifiers Using Scikit-Learn, overfitting means the model fits the parameters too closely with regard 
to the particular observations in the training dataset but does not generalize well to new data; we say 
that the model has a high vari

Retrieved chunk 3
. The reason for the overfitting is that our model is too complex for 
the given training data. Common solutions to reduce the generalization error are as follows:
• Collect more training data
• Introduce a penalty for complexity via regularization
• Choose a simpler model with fe

In [9]:
# Query the fine-tuned pipeline; ask() prints scores and chunk previews.
question = "Explain the concept of overfitting."
answer = rag.ask(question)
print(question)
print(answer)

INFO:__main__:Querying: Explain the concept of overfitting.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Score 0: 0.606
Score 1: 0.601
Score 2: 0.579
Score 3: 0.573

Retrieved chunk 1
. If a model suffers from overfitting, we also 
say that the model has a high variance, which can be caused by having too many parameters, leading 
to a model that is too complex given the underlying data

Retrieved chunk 2
. As we discussed in Chapter 3, A Tour of Machine Learning 
Classifiers Using Scikit-Learn, overfitting means the model fits the parameters too closely with regard 
to the particular observations in the training dataset but does not generalize well to new data; we say 
that the model has a high vari

Retrieved chunk 3
. To address this problem of overfitting, 
we can collect more training data, reduce the complexity of the model, or increase the regularization 
parameter, for example.
For unregularized models, it can also help to decrease the number of features via feature selection 
(Chapter 4) or feature extrac

Retrieved chunk 4
is one approach to tackling the problem of overfitting by

In [10]:
# Query the fine-tuned pipeline; ask() prints scores and chunk previews.
question = "How can overfitting be prevented?"
answer = rag.ask(question)
print(question)
print(answer)

INFO:__main__:Querying: How can overfitting be prevented?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Score 0: 0.701
Score 1: 0.643
Score 2: 0.597
Score 3: 0.593

Retrieved chunk 1
. To address this problem of overfitting, 
we can collect more training data, reduce the complexity of the model, or increase the regularization 
parameter, for example.
For unregularized models, it can also help to decrease the number of features via feature selection 
(Chapter 4) or feature extrac

Retrieved chunk 2
. The reason for the overfitting is that our model is too complex for 
the given training data. Common solutions to reduce the generalization error are as follows:
• Collect more training data
• Introduce a penalty for complexity via regularization
• Choose a simpler model with fewer parameters
• Re

Retrieved chunk 3
. 
Then, to prevent overfitting, we can apply one or multiple regularization schemes to achieve good 
generalization performance on new data, such as the held-out test dataset.
In Chapters 3 and 4, we covered L1 and L2 regularization. Both techniques can prevent or reduce the 
eff

In [11]:
# Query the fine-tuned pipeline; ask() prints scores and chunk previews.
question = "What is the ensemble method?"
answer = rag.ask(question)
print(question)
print(answer)

INFO:__main__:Querying: What is the ensemble method?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Score 0: 0.726
Score 1: 0.657
Score 2: 0.623
Score 3: 0.618

Retrieved chunk 1
. This section will introduce a basic explanation 
of how ensembles work and why they are typically recognized for yielding a good generalization 
performance.
In this chapter, we will focus on the most popular ensemble methods that use the majority voting 
principle. Majority voting simply means th

Retrieved chunk 2
. Depending on the 
technique, the ensemble can be built from different classification algorithms, for example, decision 
trees, support vector machines, logistic regression classifiers, and so on. Alternatively, we can also use 
the same base classification algorithm, fitting different subsets of t

Retrieved chunk 3
. For example, assuming that we col-
lected predictions from 10 experts, ensemble methods would allow us to strategically combine those 
predictions by the 10 experts to come up with a prediction that was more accurate and robust than the 
predictions by each individual expert. As

EVALUATION SCALE (rubric used to score generated answers in the LLM-as-a-judge cells below):

1 = Incorrect / hallucinated;
2 = Partially correct but incomplete or weakly grounded;
3 = Mostly correct and grounded, minor omissions;
4 = Fully correct, complete, and grounded in context.

In [12]:
import re

def llm_as_judge(question: str, answer: str, context: str, tokenizer, llm) -> int:
    """
    Uses the LLM to evaluate the answer on a scale of 1-4.

    Args:
        question: The question that was asked.
        answer: The generated answer to be graded.
        context: The retrieved context the answer should be grounded in.
        tokenizer: Callable tokenizer that also provides ``decode``.
        llm: Model object providing ``generate``.

    Returns:
        The first standalone digit 1-4 found in the model's reply. If the
        reply contains no such digit, 1 (the lowest score) is returned, so the
        result is always within the 1-4 scale.

    NOTE(review): the prompt is truncated to 1024 tokens and the scale and
    output instructions come after the context, so a very long context could
    push them out of the prompt.
    """
    prompt = f"""
Evaluate the following answer on a scale of 1-4 based on the provided context:

Question: {question}

Answer: {answer}

Context: {context}

Evaluation Scale:
1 = Incorrect / hallucinated;
2 = Partially correct but incomplete or weakly grounded;
3 = Mostly correct and grounded, minor omissions;
4 = Fully correct, complete, and grounded in context;

Provide only the number (1, 2, 3, or 4) as your response.
"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    output_ids = llm.generate(**inputs, max_new_tokens=10, do_sample=False)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

    match = re.search(r'\b([1-4])\b', response)
    if match:
        return int(match.group(1))

    # No valid score in the reply. The previous fallback parsed the first
    # token as an integer and could return out-of-scale values such as 0 or
    # 10; treat every unparseable reply as the lowest score instead.
    return 1

# Retrieve and generate separately (instead of rag.ask) so the exact context
# given to the generator can also be given to the judge.
question = "Explain the concept of overfitting"
retrieved_chunks = rag.retrieve(question)
context = "\n\n".join(retrieved_chunks)
answer = rag.generate(question, retrieved_chunks)

# NOTE(review): the same model (rag.llm) both produces and grades the answer.
score = llm_as_judge(question, answer, context, rag.tokenizer, rag.llm)

print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"LLM-as-a-Judge Score: {score}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Score 0: 0.628
Score 1: 0.609
Score 2: 0.600
Score 3: 0.593
Question: Explain the concept of overfitting
Answer: The reason for the overfitting is that our model is too complex for the given training data.
LLM-as-a-Judge Score: 4


In [13]:
question = "How can overfitting be prevented?"

# Retrieve and generate separately so the judge sees the same context.
retrieved_chunks = rag.retrieve(question)
context = "\n\n".join(retrieved_chunks)

answer = rag.generate(question, retrieved_chunks)

# NOTE(review): the same model (rag.llm) both produces and grades the answer.
score = llm_as_judge(question, answer, context, rag.tokenizer, rag.llm)

print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"LLM-as-a-Judge Score: {score}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Score 0: 0.701
Score 1: 0.643
Score 2: 0.597
Score 3: 0.593
Question: How can overfitting be prevented?
Answer: To address this problem of overfitting, we can collect more training data, reduce the complexity of the model, or increase the regularization parameter, for example.
LLM-as-a-Judge Score: 4


In [None]:
# Keywords treated as explicit. The last three entries are derived forms that
# the earlier substring check caught implicitly and whole-word matching would
# otherwise miss.
_EXPLICIT_KEYWORDS = (
    "kill", "murder", "bomb", "drug", "fuck", "shit", "sex", "porn", "nude",
    "ass", "bitch", "bastard", "sexual", "pornography", "nudity",
)

# Match a keyword only as a whole word, optionally followed by a common
# inflection suffix, so "bombs" or "killing" still match but "class",
# "classifier", "skill" or "assess" do not.
_EXPLICIT_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(word) for word in _EXPLICIT_KEYWORDS) + r")"
    r"(?:s|es|ed|ing|er|ers)?\b",
    re.IGNORECASE,
)


def is_explicit(question: str) -> bool:
    """
    Check if the question contains explicit or inappropriate content.

    Matching is case-insensitive and word-based: a keyword must appear as its
    own word (optionally with a common suffix such as "s" or "ing"). Plain
    substring matching is deliberately avoided because it flagged ordinary
    machine-learning vocabulary such as "class" or "classifier".

    Args:
        question: The user's question.

    Returns:
        True if any explicit keyword is found, False otherwise.
    """
    return _EXPLICIT_PATTERN.search(question) is not None

def guarded_ask(rag, question: str, min_score: float = 0.5) -> str:
    """
    Guarded version of ask that checks for explicit content and off-topic questions.

    The question is embedded and searched once. If the best similarity score
    is below ``min_score`` the question is treated as off-topic; otherwise the
    chunks from that same search are passed to ``rag.generate``, so retrieval
    is not repeated.

    Args:
        rag: Pipeline object exposing ``embedding_model``, ``index``,
            ``chunks`` and ``generate`` (as MiniRAG and FineTunedRAG do).
        question: The user's question.
        min_score: Minimum top similarity score required to answer. Defaults
            to 0.5, the value that was previously hard-coded.

    Returns:
        The generated answer, or a message starting with "Warning" when the
        question is empty, flagged as inappropriate, or judged off-topic.

    Raises:
        RuntimeError: If ``rag`` has no index yet.
    """
    if not question or not question.strip():
        return "Warning: Please enter a question."

    if is_explicit(question):
        return "Warning: Inappropriate question detected."

    if rag.index is None:
        raise RuntimeError("No index available: call rag.build_index() first.")

    query_embedding = rag.embedding_model.encode([question]).astype("float32")
    faiss.normalize_L2(query_embedding)
    scores, indices = rag.index.search(query_embedding, TOP_K)
    for rank, score in enumerate(scores[0]):
        print(f"Score {rank}: {score:.3f}")

    if max(scores[0]) < min_score:
        return "Warning: This question appears to be off-topic. I can only answer questions related to machine learning concepts from the provided documents."

    # Reuse the search results instead of calling rag.ask(), which would embed
    # and search for the same question again. FAISS pads missing hits with -1,
    # so those positions are skipped.
    retrieved_chunks = [rag.chunks[idx] for idx in indices[0] if idx >= 0]
    for rank, chunk in enumerate(retrieved_chunks, start=1):
        print(f"\nRetrieved chunk {rank}")
        print(chunk[:300])

    return rag.generate(question, retrieved_chunks)

In [44]:
# Test the guardrail with three questions
# Case 1: on-topic question — expected to pass both guards and be answered.
question1 = "how can overfitting be prevented?"
answer1 = guarded_ask(rag, question1)
print(f"Question: {question1}")
print(f"Answer: {answer1}")
print()

# Case 2: unrelated question — exercises the similarity-threshold guard.
question2 = "who won the 2018 world cup?"
answer2 = guarded_ask(rag, question2)
print(f"Question: {question2}")
print(f"Answer: {answer2}")
print()

# Case 3: contains the keyword "bomb" — exercises the explicit-content guard.
question3 = "how to make a bomb"
answer3 = guarded_ask(rag, question3)
print(f"Question: {question3}")
print(f"Answer: {answer3}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:Received query: how can overfitting be prevented?
INFO:root:Query passed safety guard
INFO:root:Retrieving relevant chunks


Score 0: 0.714
Score 1: 0.660
Score 2: 0.614
Score 3: 0.606


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:Retrieved chunk 1 with score 0.714
INFO:root:Retrieved chunk 2 with score 0.660
INFO:root:Retrieved chunk 3 with score 0.614
INFO:root:Chunk 1 preview:
. To address this problem of overfitting, 
we can collect more training data, reduce the complexity of the model, or increase the regularization 
parameter, for example.
For unregularized models, it c
INFO:root:Chunk 2 preview:
. The reason for the overfitting is that our model is too complex for 
the given training data. Common solutions to reduce the generalization error are as follows:
• Collect more training data
• Intro
INFO:root:Chunk 3 preview:
is one approach to tackling the problem of overfitting by adding additional information and thereby 
shrinking the parameter values of the model to induce a penalty against complexity


Question: how can overfitting be prevented?
Answer: Answer: How can overfitting be prevented? Answer:



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Score 0: 0.213
Score 1: 0.209
Score 2: 0.203
Score 3: 0.191
Question: who won the 2018 world cup?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Score 0: 0.262
Score 1: 0.244
Score 2: 0.243
Score 3: 0.239
Question: how to make a bomb


In [None]:
import tkinter as tk
from tkinter import scrolledtext

def on_ask():
    """Button callback: answer the typed question and show it in the text box.

    Reads the question from the entry widget, runs it through guarded_ask,
    and, unless the result is a warning, keeps only the text after the first
    "Answer:" marker when one is present. The output box is then cleared and
    filled with the question and the answer.
    """
    question = entry.get()
    answer = guarded_ask(rag, question)

    is_warning = answer.startswith("Warning")
    if not is_warning:
        # partition() yields an empty separator when the marker is absent.
        _, marker, tail = answer.partition("Answer:")
        if marker:
            answer = tail.strip()

    output_text.delete(1.0, tk.END)
    output_text.insert(tk.END, f"Question: {question}\n\nAnswer: {answer}")

# Build a minimal Tk window: a prompt label, a one-line entry, an "Ask"
# button wired to on_ask, and a scrollable text area for the result.
root = tk.Tk()
root.title("RAG Question GUI")

tk.Label(root, text="Enter your question:").pack()
entry = tk.Entry(root, width=50)
entry.pack()
tk.Button(root, text="Ask", command=on_ask).pack()
output_text = scrolledtext.ScrolledText(root, width=80, height=20)
output_text.pack()

# Start the Tk event loop.
# NOTE(review): the recorded output of this cell ends with KeyboardInterrupt,
# which suggests the cell keeps running until the loop is stopped — confirm.
root.mainloop()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:Received query: How can overfitting be prevented?
INFO:root:Query passed safety guard
INFO:root:Retrieving relevant chunks


Score 0: 0.714
Score 1: 0.660
Score 2: 0.614
Score 3: 0.606


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:Retrieved chunk 1 with score 0.714
INFO:root:Retrieved chunk 2 with score 0.660
INFO:root:Retrieved chunk 3 with score 0.614
INFO:root:Chunk 1 preview:
. To address this problem of overfitting, 
we can collect more training data, reduce the complexity of the model, or increase the regularization 
parameter, for example.
For unregularized models, it c
INFO:root:Chunk 2 preview:
. The reason for the overfitting is that our model is too complex for 
the given training data. Common solutions to reduce the generalization error are as follows:
• Collect more training data
• Intro
INFO:root:Chunk 3 preview:
is one approach to tackling the problem of overfitting by adding additional information and thereby 
shrinking the parameter values of the model to induce a penalty against complexity


KeyboardInterrupt: 