Title: Generative AI and RAG <br>
Name: Rachael Muriuki <br>
Student ID: CS-DA01-25122 <br>

1. Install required libraries

In [None]:
#Install required libraries
!pip install -q langchain langchain-community transformers sentence-transformers faiss-cpu pypdf

2. Import libraries

In [None]:
# Import the PDF loader, text splitter, embedding model, vector store and LLM classes
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, pipeline

3. Load the PDF document

In [None]:
# Load the report PDF through PyPDFLoader
pdf_path = "The-Sustainable-Development-Goals-Report-2025.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

4. Split the document into chunks

In [None]:
# Break the loaded documents into overlapping text chunks sized for embedding
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = splitter.split_documents(docs)
chunk_count = len(chunks)
print(f"Split into {chunk_count} chunks.")

Split into 561 chunks.


5. Create embeddings and vector store

In [None]:
# Create dense vector embeddings using a pre-trained sentence transformer
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Index the chunk embeddings in a FAISS vector store for similarity search,
# then expose that store through a retriever
vectorstore = FAISS.from_documents(chunks, embeddings)
retriever = vectorstore.as_retriever()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return forward_call(*args, **kwargs)


6. Load the LLM

In [None]:
# Load the FLAN-T5 (large) seq2seq model and its tokenizer from Hugging Face
model_name = "google/flan-t5-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the model and tokenizer in a text2text-generation pipeline
flan_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


7. Define the RAG query function

In [None]:
#Define a function that retrieves context from the vector store and generates an answer
def query_rag(question):
    """Answer *question* with FLAN-T5 using context retrieved from the vector store.

    Args:
        question: Natural-language question. It is used both as the
            retrieval query and inside the generation prompt.

    Returns:
        The ``generated_text`` of the first result returned by
        ``flan_pipeline``.
    """
    # Retrieve the chunks the retriever considers relevant to the question.
    # ``invoke`` replaces ``get_relevant_documents``, which is deprecated in
    # current LangChain releases and raises a deprecation warning when called.
    relevant_docs = retriever.invoke(question)

    # Combine the retrieved chunk texts into a single context string
    context = "\n".join(doc.page_content for doc in relevant_docs)

    # Instruct the model to answer from the retrieved context only
    prompt = (
        "Answer the question using only the context:\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n\n"
        "Answer:"
    )

    # Generate the answer with sampling enabled
    response = flan_pipeline(
        prompt,
        max_new_tokens=200,
        temperature=0.9,
        top_k=50,
        top_p=0.9,
        do_sample=True,
    )

    return response[0]["generated_text"]

8. Run queries using RAG

In [None]:
# Example question: ask the RAG function for a summary of the document
question = "Summarize the key points of this document in a paragraph of 200 words."
rag_answer = query_rag(question)
print(rag_answer)

  relevant_docs = retriever.get_relevant_documents(question)
  return forward_call(*args, **kwargs)


The Sustainable Development Goals Report 2025 is a report by the United Nations Department of Economic and Social Affairs.


9. Compare with generic LLM response (no retrieval)

In [None]:
# Define a function that uses only the question (no context)
def query_no_context(question):
    """Generate an answer from the bare question, with no retrieved context.

    Args:
        question: Text passed unchanged to ``flan_pipeline`` as the prompt.

    Returns:
        The ``generated_text`` of the first pipeline result.
    """
    generation_settings = {
        "max_new_tokens": 200,
        # Creativity control (lower = more deterministic, higher = more diverse)
        "temperature": 0.9,
        # Only sample from the top-k most likely tokens
        "top_k": 50,
        # Nucleus sampling: only sample from tokens with cumulative prob <= top_p
        "top_p": 0.9,
        # Enables sampling (required for temperature/top-k/top-p to work)
        "do_sample": True,
    }
    outputs = flan_pipeline(question, **generation_settings)
    first_result = outputs[0]
    return first_result["generated_text"]

# Ask the same question without RAG, for comparison with the RAG answer
no_rag_answer = query_no_context(question)
print("Without RAG:\n", no_rag_answer)

Without RAG:
 This is a summary of the key points of the report.


10. Save vector store

In [None]:
# Save the FAISS vector store locally under the name "my_faiss_index"
index_name = "my_faiss_index"
vectorstore.save_local(index_name)