# RAG pipeline

In [None]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from dotenv import load_dotenv
import os
import requests
import pandas as pd
import re

## 1 - Load document

In [None]:
# Load the PDF from the hard-coded relative path "paper.pdf".
# `documents` is the input handed to the splitter in step 2.
loader = PyMuPDFLoader("paper.pdf")
documents = loader.load()

## 2 - Split it into Chunks

In [None]:
# Break the loaded documents into overlapping chunks (chunk_size=500,
# chunk_overlap=100); `chunks` is what gets embedded and stored in step 3.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)
chunks = splitter.split_documents(documents)

## 3 - Turn into embeddings and store it into ChromaDB

In [None]:
# This model gave better results (lower distances) than the other lightweight models I tried.
embedder = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

In [None]:
rag_collection = "rag_collection"

# The collection is persisted in ./chroma_db, so it survives between runs.
# Without explicit ids every execution of this cell would add the same chunks
# again under fresh random ids, and the top-k results would fill up with
# duplicates. Position-based ids make a re-run overwrite the existing entries.
# NOTE(review): relies on the Chroma wrapper upserting on matching ids -
# confirm for the installed LangChain version. Entries written by earlier runs
# (random ids) are not removed; delete ./chroma_db once to start clean.
chunk_ids = [f"{rag_collection}-chunk-{position}" for position in range(len(chunks))]

db = Chroma.from_documents(
    chunks,
    embedding=embedder,
    ids=chunk_ids,
    persist_directory="./chroma_db",
    collection_name=rag_collection,
)

## 4 - Query the DB to find the most relevant chunks

In [None]:
# Question for this run; edit the text after the colon to ask something else.
# NOTE(review): the leading "Represent this sentence for retrieving relevant passages: "
# looks like an embedding-model query instruction. The same string is later passed
# unchanged to the API calls as the question - confirm that is intended.
query = "Represent this sentence for retrieving relevant passages: who are the authors of this paper ?" # define relevant query right here

In [None]:
# Expose the vector store as a retriever configured with k=5.
retriever = db.as_retriever(search_kwargs={"k": 5})
# Fetch the chunks for the query; step 5 joins them into the context string.
relevant_chunks = retriever.invoke(query) # top 5 matches (i.e. minimal distance to the query)

## 5 - Combine relevant chunks into context

In [None]:
# Stitch the text of the retrieved chunks together, one chunk per line break.
context = "\n".join(chunk.page_content for chunk in relevant_chunks)

In [None]:
# Bare expression: lets the notebook display the assembled context for inspection.
context

## 6 - connect to the API 

In [None]:
# Load variables from the .env file, then read the API key from the environment.
# API_KEY is None when the variable is not set.
load_dotenv()
API_KEY = os.environ.get("API_KEY")

In [None]:
# Inference endpoints, one per model used below. Change a URL to try another
# model; these were chosen on the free tier, so they are not the strongest models.
API_URL_ROBERTA = "https://api-inference.huggingface.co/models/deepset/roberta-base-squad2"  # model URL
API_URL_MISTRAL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"

# Authorization header sent with every request, built from API_KEY.
# NOTE: if API_KEY is None this becomes the literal string "Bearer None".
headers = {
    "Authorization": f"Bearer {API_KEY}"
}

# different call methods I tried with different models.
def call_llm_api(query, context, api_url=None, timeout=60):
    """Send a context/question prompt to a text-generation endpoint.

    Args:
        query: The question to ask.
        context: Text the model should base its answer on.
        api_url: Endpoint to call. Defaults to ``API_URL_MISTRAL``; the
            original code referenced an undefined ``API_URL`` and therefore
            always raised ``NameError``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under ``"generated_text"`` in the first element of
        the JSON response.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        KeyError, IndexError, TypeError: If the JSON body does not have the
            expected ``[{"generated_text": ...}]`` shape.
    """
    if api_url is None:
        # Resolved at call time so the function can be defined before the constant.
        api_url = API_URL_MISTRAL

    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"
    # requests applies no timeout by default; without one a stalled
    # connection would block the notebook cell indefinitely.
    response = requests.post(
        api_url,
        headers=headers,
        json={"inputs": prompt},
        timeout=timeout,
    )
    # Surface HTTP errors (bad token, model unavailable, ...) as a clear
    # exception instead of an obscure KeyError on the error payload.
    response.raise_for_status()
    return response.json()[0]["generated_text"]

def call_llm_api_mistral(query, context, timeout=60):
    """Ask the Mistral endpoint to complete a context/question prompt.

    Failures are reported with ``print`` and signalled by returning ``None``;
    this function does not raise for network, HTTP-payload, or JSON problems.

    Args:
        query: The question to ask.
        context: Text the model should base its answer on.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``"generated_text"`` value of the first element of the response,
        or ``None`` if the request failed, the API returned an error, or the
        payload had an unexpected shape.
    """
    prompt = f"Context: {context}\n\nQuestion: {query}\nAnswer:"

    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the notebook cell indefinitely.
        response = requests.post(
            API_URL_MISTRAL,
            headers=headers,
            json={"inputs": prompt},
            timeout=timeout,
        )
    except requests.RequestException as e:
        print("Request failed:", e)
        return None

    try:
        data = response.json()
    except ValueError as e:  # body was not valid JSON
        print("Exception during parsing:", e)
        print("Response text:", response.text)
        return None

    # Success shape: a non-empty list whose first item carries the text.
    # The emptiness check avoids an IndexError on `[]`.
    if (
        isinstance(data, list)
        and data
        and isinstance(data[0], dict)
        and "generated_text" in data[0]
    ):
        return data[0]["generated_text"]

    if isinstance(data, dict) and "error" in data:
        print("API Error:", data["error"])
        return None

    print("Unexpected response format:", data)
    return None

def call_llm_api_roberta(query, context, timeout=60):
    """Send a question/context pair to the RoBERTa endpoint.

    Failures are reported with ``print`` and signalled by returning ``None``;
    this function does not raise for network, HTTP-payload, or JSON problems.

    Args:
        query: The question to ask.
        context: Text in which the answer should be found.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``"answer"`` value of the JSON response, or ``None`` if the
        request failed, the API returned an error, or the payload had an
        unexpected shape.
    """
    payload = {
        "inputs": {
            "question": query,
            "context": context
        }
    }

    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the notebook cell indefinitely.
        response = requests.post(
            API_URL_ROBERTA,
            headers=headers,
            json=payload,
            timeout=timeout,
        )
    except requests.RequestException as e:
        print("Request failed:", e)
        return None

    try:
        data = response.json()
    except ValueError as e:  # body was not valid JSON
        print("Exception during parsing:", e)
        print("Response text:", response.text)
        return None

    # Only a JSON object can carry "answer" or "error" keys; anything else
    # (list, string, number) is reported as unexpected below.
    if isinstance(data, dict):
        if "answer" in data:
            return data["answer"]
        if "error" in data:
            print("API Error:", data["error"])
            return None

    print("Unexpected response format:", data)
    return None

In [None]:
# Ask the RoBERTa endpoint; prints the answer, or None if the call failed.
response = call_llm_api_roberta(query=query, context=context)
print(response)

In [None]:
# Same question through the Mistral endpoint. This rebinds `response`, so the
# answer-extraction cells in step 8 read the Mistral output, not the RoBERTa one.
response = call_llm_api_mistral(query=query, context=context)
print(response)

## 7 - make different queries

In [None]:
# Second question: a summary request. `retriever` and `relevant_chunks` are
# rebound here, so the chunks from the previous query are no longer available.
query_1 = "Represent this sentence for retrieving relevant passages: Provide a concise summary of this paper, highlighting its main objectives, key findings, and conclusions." # define relevant query right here
retriever = db.as_retriever(search_kwargs={"k": 5})
relevant_chunks = retriever.invoke(query_1) # top 5 matches (i.e. minimal distance to the query)

In [None]:
# Join the chunks retrieved for query_1 into a single newline-separated context.
context_1 = "\n".join(chunk.page_content for chunk in relevant_chunks)

In [None]:
# Ask Mistral the summary question; prints the generated text, or None on failure.
response_1 = call_llm_api_mistral(query=query_1, context=context_1)
print(response_1)

In [None]:
# Third question: publication date. `retriever` and `relevant_chunks` are
# rebound again for this query.
query_2 = "Represent this sentence for retrieving relevant passages: When was this paper published ?" # define relevant query right here
retriever = db.as_retriever(search_kwargs={"k": 5})
relevant_chunks = retriever.invoke(query_2) # top 5 matches (i.e. minimal distance to the query)

In [None]:
# Join the chunks retrieved for query_2 into a single newline-separated context.
context_2 = "\n".join(chunk.page_content for chunk in relevant_chunks)

In [None]:
# Ask Mistral the publication-date question; prints the generated text, or None on failure.
response_2 = call_llm_api_mistral(query=query_2, context=context_2)
print(response_2)

## 8 - turn responses into a pandas DF

In [None]:
# Extract the text that follows "Answer:" in the Mistral response.
# Other models may format their output differently and need another pattern.

# Captures the rest of the line on which the answer text starts
# ('.' does not match a newline, so a multi-line answer is cut at its first line).
pattern = r"Answer:\s*(.*)"

# call_llm_api_mistral returns None when the call fails; treat that as empty
# text so this cell yields [] instead of raising TypeError inside re.findall.
answer = re.findall(pattern, response or "")

# Display the extracted answers
answer

In [None]:
def _first_answer(text):
    """Return the first capture of `pattern` in `text`, or None.

    None is returned when `text` is empty/None (the API call failed) or when
    the text contains no "Answer:" marker, so one bad response leaves a blank
    cell in the table instead of aborting the whole cell with an exception.
    """
    if not text:
        return None
    found = re.findall(pattern, text)
    return found[0] if found else None


# One row per question asked above, pairing it with the extracted answer.
results = [
    {'question': query, 'answer': _first_answer(response)},
    {'question': query_1, 'answer': _first_answer(response_1)},
    {'question': query_2, 'answer': _first_answer(response_2)},
]

In [None]:
# Build the results table with one labelled row per question.
df = pd.DataFrame(results, index=['author', 'topic', 'date'])

In [None]:
# Bare expression: lets the notebook render the results table.
df

The model performs well when analyzing the paper's topic but is less accurate in identifying author names and publication dates. This discrepancy arises because research papers often include extensive references that list numerous author names and dates, which can confuse the model during extraction. To be fair, it is still able to list the authors accurately but performance could be improved by further processing the paper before making the API call.