In [None]:
# Fully upgrade all core libraries to their latest, compatible versions
!pip install -q -U sentence-transformers transformers accelerate peft PyPDF2 faiss-cpu

In [None]:
# 📁 Upload PDF
from google.colab import files
# Runs Colab's upload helper and keeps its return value in `uploaded`.
# NOTE(review): `uploaded` is not referenced again in the visible cells — the
# PDF is later opened by the hard-coded name
# "GenAI_QA_Project_Interview_Questions.pdf", so the uploaded file presumably
# has to carry exactly that name; confirm.
uploaded = files.upload()

In [None]:
# Suppress warnings from the transformers library
# `logging` is imported in this cell so it runs on a fresh kernel; previously
# the name was only imported by a later cell, which made this line raise
# NameError under Restart & Run All.
import logging

logging.getLogger("transformers").setLevel(logging.ERROR)

In [None]:
# 📄 Read PDF
from PyPDF2 import PdfReader
import logging

def read_pdf(path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        path: Location of the PDF file, passed straight to `PdfReader`.

    Returns:
        The page texts joined with newlines. Pages are separated by a newline
        so the last word of one page is not fused with the first word of the
        next; a page whose extraction yields nothing contributes an empty
        string instead of breaking the join.
    """
    page_texts = [(page.extract_text() or "") for page in PdfReader(path).pages]
    return "\n".join(page_texts)
# File name of the PDF to index (expected in the current working directory)
PDF_PATH = "GenAI_QA_Project_Interview_Questions.pdf"
pdf_text = read_pdf(PDF_PATH)

# ✂️ Chunk the text
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
# CharacterTextSplitter only cuts at its separator ("\n\n" by default), so text
# extracted from a PDF without blank lines comes back as oversized chunks and
# chunk_size is not honoured. RecursiveCharacterTextSplitter falls back to
# progressively finer separators, keeping chunks near the requested size.
chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_text(pdf_text)

# 🧠 Embeddings using MiniLM
from sentence_transformers import SentenceTransformer
import numpy as np
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Ask encode() for a NumPy array directly. Requesting tensors and calling
# .numpy() on each row raises when the model runs on a GPU, because CUDA
# tensors cannot be converted without first being moved to the CPU.
# float32 is enforced because that is the dtype the FAISS index is fed with.
embeddings = np.asarray(embedder.encode(chunks, convert_to_numpy=True), dtype="float32")
# Fail early with a readable message instead of an IndexError further down
assert embeddings.ndim == 2, "no chunk embeddings were produced - is the PDF text empty?"

# 📦 Store in FAISS index
import faiss
# One row per chunk; the second axis is the embedding width the index is sized to
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# 🔍 Retrieve top k chunks
def retrieve_chunks(query, k=3):
    """Return the text of the chunks closest to `query`, joined by spaces.

    Args:
        query: Question text; it is embedded with the module-level `embedder`.
        k: Number of neighbours requested from the module-level FAISS `index`.

    Returns:
        The matching entries of `chunks`, in the order FAISS returns them,
        joined with single spaces. Fewer than `k` chunks are returned when the
        index holds fewer than `k` vectors.
    """
    query_vec = np.asarray(embedder.encode([query]), dtype="float32")
    _, neighbour_ids = index.search(query_vec, k)
    # FAISS pads the result with -1 when it finds fewer than k neighbours.
    # Indexing `chunks` with -1 would silently return the last chunk again,
    # so those placeholder ids are skipped.
    return " ".join(chunks[i] for i in neighbour_ids[0] if i >= 0)

# 💬 Load the Flan-T5 model for answering
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
# A compact checkpoint keeps inference quick; the name is written once so the
# model and its tokenizer always come from the same checkpoint
FLAN_T5_CHECKPOINT = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_T5_CHECKPOINT)
tok = AutoTokenizer.from_pretrained(FLAN_T5_CHECKPOINT)

# 🔁 Ask question
from transformers import pipeline
rag_pipe = pipeline("text2text-generation", model=model, tokenizer=tok)

question = "What are LLMs"

# Input limit used for the prompt (512 tokens for T5)
MAX_INPUT_TOKENS = 512
prompt_template = "Use the following context to answer the question:\n\nContext: {context}\n\nQuestion: {question}"

# Retrieve chunks, then trim the context so that the *whole* prompt
# (instructions + context + question) stays within the limit. Truncating the
# context alone to 512 tokens left no room for the instructions and question.
context = retrieve_chunks(question)
# Tokens taken up by everything in the prompt except the context itself
prompt_overhead = len(tok.encode(prompt_template.format(context="", question=question)))
context_budget = max(1, MAX_INPUT_TOKENS - prompt_overhead)
encoded_context = tok.encode(context, max_length=context_budget, truncation=True, return_tensors="pt")
truncated_context = tok.decode(encoded_context[0], skip_special_tokens=True)

prompt = prompt_template.format(context=truncated_context, question=question)
# Greedy decoding (do_sample=False) makes the answer reproducible across runs;
# sampling without a fixed seed gave a different answer on every execution.
response = rag_pipe(prompt, max_new_tokens=256, do_sample=False)[0]["generated_text"]

# The prompt built above contains no "[/INST]" marker, so the generated text is
# printed as is rather than being split on that marker.
print("📌 Answer:\n", response.strip())

📌 Answer:
 LangChain is a framework to build LLM apps. We used it to handle chaining of components like loaders, chunkers, embeddings, and models.
