In [None]:
!pip install langchain sentence-transformers faiss-cpu pdfplumber langchain-community langchain-ollama

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored there (such as the PDF path used later in this notebook)
# can be opened with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pdfplumber
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain.docstore.document import Document


def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path* as one string.

    Each page that yields text contributes that text followed by a newline;
    pages for which ``extract_text()`` returns nothing (``None`` or an empty
    string) are skipped. The result is ``""`` when no page yields text.
    """
    with pdfplumber.open(pdf_path) as document:
        # Lazily pull the text of each page; the join below consumes the
        # generator while the PDF is still open.
        per_page = (page.extract_text() for page in document.pages)
        return "".join(chunk + "\n" for chunk in per_page if chunk)


def split_into_documents(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping chunks wrapped as ``Document`` objects.

    Args:
        text: The raw text to split.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The list of documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # The splitter operates on Document objects, so wrap the raw text first.
    source_document = Document(page_content=text)
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    return chunker.split_documents([source_document])


def build_faiss_index(docs, model_name="all-MiniLM-L6-v2"):
    """Embed *docs* and return a FAISS vector store containing them.

    Args:
        docs: Documents to index (e.g. the output of
            ``split_into_documents``).
        model_name: Name of the Hugging Face sentence-embedding model passed
            to ``HuggingFaceEmbeddings``. Defaults to the model this notebook
            originally hard-coded.

    Returns:
        The vector store built by ``FAISS.from_documents``.

    Raises:
        ValueError: If *docs* is empty.
    """
    # Fail early with a clear message: with no documents there are no
    # embeddings to build an index from (typically the PDF produced no
    # extractable text), and the failure would otherwise surface deep inside
    # the index construction.
    if not docs:
        raise ValueError(
            "Cannot build a FAISS index from an empty document list; "
            "was any text extracted from the source?"
        )

    embedder = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(docs, embedding=embedder)


def create_rag_chain(vectorstore, model_name="mistral", k=3):
    """Build a ``RetrievalQA`` chain that answers questions from *vectorstore*.

    Args:
        vectorstore: Vector store to retrieve context from; it must provide
            ``as_retriever``.
        model_name: Name of the Ollama model passed to ``OllamaLLM``.
            Defaults to the model this notebook originally hard-coded.
        k: Number of most-similar chunks to retrieve per question.

    Returns:
        A ``RetrievalQA`` chain of type ``"stuff"`` using the prompt below.
    """
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )

    # The prompt exposes exactly the two variables filled in by the chain:
    # the retrieved context and the user's question.
    prompt_template = (
        "Context:\n"
        "{context}\n"
        "\n"
        "Question:\n"
        "{question}\n"
        "\n"
        "Answer:"
    )
    prompt = ChatPromptTemplate.from_template(prompt_template)

    model = OllamaLLM(model=model_name)

    return RetrievalQA.from_chain_type(
        llm=model,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt},
    )

In [None]:

import logging

# Only show pdfminer log records at ERROR level or above; lower-level
# messages are presumably noise emitted while the PDF is being parsed.
logging.getLogger("pdfminer").setLevel(logging.ERROR)

pdf_path = "/content/drive/MyDrive/rag-exercise/Human-Nutrition-2020-Edition.pdf"

# Pipeline: PDF -> raw text -> overlapping chunks -> FAISS index -> QA chain.
text = extract_text_from_pdf(pdf_path)
docs = split_into_documents(text)
vectorstore = build_faiss_index(docs)
rag_chain = create_rag_chain(vectorstore)

question = "Can you explain the topic of potassium imbalances as discussed in this content?"
# Chain.run() is deprecated in favour of invoke(). RetrievalQA reads the
# question from the "query" key and returns the answer under "result".
response = rag_chain.invoke({"query": question})["result"]
print("Answer: ", response)
