In [None]:
# install requirements
!pip install langchain sentence-transformers faiss-cpu pdfplumber langchain-community langchain-ollama

In [None]:
# Make the user's Google Drive visible to this Colab runtime.
from google.colab import drive

# Directory inside the runtime where the Drive contents are attached.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import pdfplumber
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain.docstore.document import Document

# extract text from pdf 
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF into a single string.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        str: The text of each page followed by a newline, in page order.
        Pages for which ``extract_text()`` returns a falsy value (``None`` or
        an empty string) are skipped. The result is ``""`` when no page
        yields text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Build the result with a single join instead of repeated ``+=`` so the
        # cost stays linear in the total text size for large documents.
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)

# split the text into chunks
def split_into_documents(text, chunk_size=500, overlap=100):
    """Break one long string into chunked ``Document`` objects.

    Args:
        text: The full text to split; it is wrapped in a single ``Document``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500.
        overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 100.

    Returns:
        Whatever ``split_documents`` returns for the wrapped text
        (the chunk documents).
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    whole_text = [Document(page_content=text)]
    return chunker.split_documents(whole_text)

# embed documents and build FAISS index
def build_faiss_index(docs, model_name="all-MiniLM-L6-v2"):
    """Embed documents and store them in a FAISS vector store.

    Args:
        docs: Non-empty sequence of documents to index (for example the
            output of ``split_into_documents``).
        model_name: Name of the embedding model handed to
            ``HuggingFaceEmbeddings``. Defaults to "all-MiniLM-L6-v2".

    Returns:
        The vector store created by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    # Fail early with a readable message: there is nothing to index, which
    # typically means no text could be extracted from the source file.
    if not docs:
        raise ValueError("Cannot build a FAISS index from an empty document list.")

    embedder = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(docs, embedding=embedder)

# create RAG pipeline
def create_rag_chain(vectorstore, model_name="mistral", k=3):
    """Build a RetrievalQA chain that answers questions from retrieved chunks.

    Args:
        vectorstore: Vector store exposing ``as_retriever`` (in this notebook,
            the FAISS index returned by ``build_faiss_index``).
        model_name: Name of the Ollama model to query. Defaults to "mistral".
        k: Number of chunks the retriever returns per question. Defaults to 3.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1.")

    # The retriever returns the top-k most relevant chunks via similarity search.
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )

    # Written as adjacent literals so the template keeps its exact text without
    # breaking the function's indentation.
    prompt_template = (
        "Context:\n"
        "{context}\n"
        "\n"
        "Question:\n"
        "{question}\n"
        "\n"
        "Answer:"
    )
    prompt = ChatPromptTemplate.from_template(prompt_template)

    model = OllamaLLM(model=model_name)

    return RetrievalQA.from_chain_type(
        llm=model,
        # "stuff" concatenates all retrieved documents into the prompt as context.
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt},
    )

In [None]:
import logging

# ignore warnings from pdfminer: only messages at ERROR level or above are kept
logging.getLogger("pdfminer").setLevel(logging.ERROR)

# target pdf file path
pdf_path = "/content/drive/MyDrive/rag-exercise/Human-Nutrition-2020-Edition.pdf"

text = extract_text_from_pdf(pdf_path)

# split the text into chunks for embedding
docs = split_into_documents(text)

# build a FAISS index from chunks using huggingface sentence embeddings
vectorstore = build_faiss_index(docs)

# create rag chain (a langchain.chains.retrieval_qa.base.RetrievalQA instance)
rag_chain = create_rag_chain(vectorstore)

question = "Can you explain the topic of potassium imbalances as discussed in this content?"

# Chain.run() is deprecated; invoke() takes the question under RetrievalQA's
# "query" input key and returns a dict whose "result" entry is the answer text.
response = rag_chain.invoke({"query": question})["result"]
print("Answer: ", response)
