## Download/Upload PDF

In [None]:
!pip install tqdm
!pip install PyMuPDF
!pip install chromadb
!pip install bitsandbytes accelerate

Collecting PyMuPDF
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m112.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.6
Collecting chromadb
  Downloading chromadb-1.3.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp

In [None]:
import os
import re
import fitz
import uuid
import textwrap
import requests
import chromadb
import torch, gc
import pandas as pd
from tqdm.auto import tqdm
from spacy.lang.en import English
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig

In [None]:
# Path of the PDF to process. It starts empty; the menu cell below assigns the real value
# (the user's input with ".pdf" appended) before later cells read it.
file_path = ""

In [None]:
def download_pdf(url: str,
                 file_path: str,
                 timeout: float = 30.0) -> str:
  """Download `url` to `file_path` unless a file already exists at that path.

  Args:
    url: Address of the PDF to fetch.
    file_path: Destination path on disk.
    timeout: Seconds to wait for the server before giving up, so a stalled
      connection cannot hang the notebook indefinitely.

  Returns:
    A human-readable status message. Failures (non-200 status or a network
    error) are reported through the message rather than raised.
  """
  if os.path.exists(file_path):
    return ("File already exists.")

  print("-> File doesn't exist, downloading...")
  try:
    response = requests.get(url, timeout=timeout)
  except requests.RequestException as error:
    # Connection errors, timeouts, invalid URLs, ... are reported like a bad status code.
    return (f"-> Unable to download the file: {error}")

  if response.status_code != 200:
    return (f"-> Unable to download the file: {response.status_code}")

  with open(file_path, "wb") as file:
    file.write(response.content)
  return (f"-> File saved successfully with filename: {file_path}")

def upload_pdf(file_path: str) -> str:
  """Report whether a file is present at `file_path`.

  Nothing is uploaded or modified; the function only checks the path and
  returns a status message.
  """
  found = os.path.exists(file_path)
  return "File exists." if found else "File doesn't exist."

if __name__=="__main__":
  # Interactive entry point: lets the user either download a PDF or point at an
  # existing one. In both cases the notebook-level `file_path` is set for later cells.
  print("Menu:")
  print("1. Download PDF")
  print("2. Upload PDF")
  choice = input("Enter your choice: ").strip()

  if choice == "1":
    url = input("Enter the url: ").strip()
    file_path = input("Enter the file path: ").strip()
    if url and file_path:
      # Only add the extension when it is missing, so "paper.pdf" does not become "paper.pdf.pdf".
      if not file_path.lower().endswith(".pdf"):
        file_path += ".pdf"
      print(download_pdf(url, file_path))
    else:
      print("Please provide a valid input.")
  elif choice == "2":
    file_path = input("Enter the file path: ").strip()
    if file_path:
      # Same extension rule as the download branch.
      if not file_path.lower().endswith(".pdf"):
        file_path += ".pdf"
      print(upload_pdf(file_path))
    else:
      print("Please provide a valid input.")

  else:
    print("Wrong Input")

Menu:
1. Download PDF
2. Upload PDF
Enter your choice: 1
Enter the url: https://www.nber.org/system/files/working_papers/w29421/w29421.pdf
Enter the file path: fintech lending
-> File doesn't exist, downloading...
-> File saved successfully with filename: fintech lending.pdf


## Preprocessing

In [None]:
def open_read_pdf(path: str) -> list[dict]:
  """Read a PDF and return the text of every page.

  Args:
    path: Location of the PDF file.

  Returns:
    One dict per page with "page_number" (1-based) and "text" (newlines
    replaced by spaces, surrounding whitespace stripped).
  """
  pdf = fitz.open(path)
  try:
    pages_texts = []
    # Passing the page count lets the progress bar show a real total instead of "0it".
    for page_number, page in tqdm(enumerate(pdf, start=1), total=len(pdf)):
      text = page.get_text().replace("\n", " ").strip()
      pages_texts.append({
          "page_number": page_number,
          "text": text
      })
    return pages_texts
  finally:
    # Release the document even if text extraction fails part-way through.
    pdf.close()

In [None]:
def create_chunks(input_list: list, chunk_size: int) -> list[list[str]]:
    """Split `input_list` into consecutive slices of at most `chunk_size` items.

    Example: 25 items with chunk_size=10 yield slices of 10, 10 and 5 items.
    """
    chunks = []
    for start in range(0, len(input_list), chunk_size):
        chunks.append(input_list[start : start + chunk_size])
    return chunks

In [None]:
def preprocess(nlp: "English", pages_texts: list[dict], min_token_length=None) -> list[dict]:
  """Split page texts into sentence chunks and keep the ones that are long enough.

  Each page dict is updated in place with "sentences" (list of str) and
  "sentences_chunks" (groups of up to 10 sentences).

  Args:
    nlp: Pipeline whose result exposes sentence boundaries through `.sents`.
    pages_texts: Page dicts containing "page_number" and "text".
    min_token_length: Chunks whose estimated token count is not above this
      value are dropped. When omitted, the notebook-level `min_token_length`
      is used if it is defined, otherwise 30.

  Returns:
    One dict per kept chunk with "page_number", "sentence_chunk" and
    "chunk_token_count". An empty list when no chunk qualifies.
  """
  if min_token_length is None:
    # Backward compatible with the earlier behaviour of reading the global directly.
    min_token_length = globals().get("min_token_length", 30)

  pages_chunks = []
  for item in tqdm(pages_texts):
    # Sentences come back as spaCy spans, so convert them to plain strings.
    item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]
    item["sentences_chunks"] = create_chunks(item["sentences"], chunk_size=10)

    for sentence_chunk in item["sentences_chunks"]:
      # Join with a space so neighbouring sentences do not run into each other.
      joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
      joined_sentence_chunk = re.sub(r"\.([A-Z])", r". \1", joined_sentence_chunk) # .A -> . A

      pages_chunks.append({
          "page_number": item["page_number"],
          "sentence_chunk": joined_sentence_chunk,
          # Rough estimate: character count divided by 4 (about 4 characters per token).
          "chunk_token_count": len(joined_sentence_chunk)/4
      })

  # Plain filtering also works when there are no chunks at all (e.g. a PDF without extractable text).
  return [chunk for chunk in pages_chunks if chunk["chunk_token_count"] > min_token_length]

In [None]:
# Chunks whose estimated token count is not above this value are dropped by preprocess().
min_token_length = 30

# English pipeline with the "sentencizer" component added; preprocess() reads the
# sentence boundaries it produces through `.sents`.
nlp = English()
nlp.add_pipe("sentencizer")

# Read every page of the PDF selected in the download/upload cell.
pages_texts = open_read_pdf(file_path)

# Sentence-split, chunk and filter the pages; this list is what gets embedded and stored later.
pages_chunks_over_min_token = preprocess(nlp, pages_texts)

0it [00:00, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

## Embedding Model

In [None]:
# !pip install --upgrade transformers

In [None]:
# Run the embedding model on the GPU when one is available and on the CPU otherwise,
# instead of failing on CPU-only machines.
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2",
                                      device="cuda" if torch.cuda.is_available() else "cpu")

### Extracting first two pages

In [None]:
def extract_first_two_pages_text(pdf_path):
    """Return the text of the first two pages of a PDF (fewer if it is shorter).

    Each page's text is followed by a newline, and the combined string is
    stripped of leading and trailing whitespace before being returned.
    """
    document = fitz.open(pdf_path)

    page_count = len(document)
    if page_count > 2:
        page_count = 2

    pieces = [document[index].get_text("text") + "\n" for index in range(page_count)]

    document.close()
    return "".join(pieces).strip()

## ChromaDB

In [None]:
# Shared Chroma client used by the collection helpers below.
# NOTE(review): chromadb.Client() is presumably the in-memory client, i.e. collections
# would not survive a kernel restart — confirm against the Chroma documentation.
chroma_client = chromadb.Client()

In [None]:
def create_collections():
  """Bind the two notebook-level collections, creating them when they do not exist yet.

  Sets the globals `abstract_collection` ("paper_abstracts") and
  `fulltext_collection` ("paper_fulltexts"), both configured with cosine space.
  Using get-or-create makes the call safe to repeat and safe when only one of
  the two collections already exists.
  """
  global abstract_collection, fulltext_collection
  abstract_collection = chroma_client.get_or_create_collection(name="paper_abstracts", metadata={"hnsw:space": "cosine"})
  fulltext_collection = chroma_client.get_or_create_collection(name="paper_fulltexts", metadata={"hnsw:space": "cosine"})

In [None]:
def delete_collections():
  """Delete the abstract and full-text collections from the shared client.

  Returns a confirmation message once both deletions have been issued.
  """
  for collection_name in ("paper_abstracts", "paper_fulltexts"):
    chroma_client.delete_collection(name=collection_name)
  return "Collections deleted successfully."

In [None]:
def add_pdf_to_parent_collection(pdf_path, parent_collection, embedding_model):
    """Store one entry for the PDF in the parent collection and return its id.

    The entry holds the text of the PDF's first two pages, the embedding of that
    text, and metadata with the PDF path and a freshly generated UUID.
    """
    summary_text = extract_first_two_pages_text(pdf_path)
    summary_vector = embedding_model.encode(summary_text).tolist()

    new_doc_id = str(uuid.uuid4())

    record = {
        "ids": [new_doc_id],
        "embeddings": [summary_vector],
        "metadatas": [{"pdf_link": pdf_path, "doc_id": new_doc_id}],
        "documents": [summary_text],
    }
    parent_collection.add(**record)
    return new_doc_id

def add_pdf_to_child_collection(pdf_path, child_collection, embedding_model, pdf_chunks, doc_id, batch_size=64):
  """Embed the text chunks of a PDF and store them in the child collection.

  Chunks are encoded and inserted in batches rather than one at a time, which
  avoids one model call and one insert per chunk.

  Args:
    pdf_path: Path of the PDF; stored as "pdf_link" in each entry's metadata.
    child_collection: Collection that receives the chunk entries.
    embedding_model: Model whose `encode` result provides `.tolist()`.
    pdf_chunks: Dicts containing "sentence_chunk" and "page_number".
    doc_id: Id of the parent entry, stored in each chunk's metadata.
    batch_size: Number of chunks encoded and inserted per call.

  Raises:
    ValueError: If `batch_size` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be at least 1")

  chunks = list(pdf_chunks)
  # With no chunks the loop body never runs, so nothing is encoded or inserted.
  for start in range(0, len(chunks), batch_size):
    batch = chunks[start : start + batch_size]
    texts = [chunk["sentence_chunk"] for chunk in batch]

    # Encoding a list returns one embedding per text.
    embeddings = embedding_model.encode(texts).tolist()

    child_collection.add(
        ids=[str(uuid.uuid4()) for _ in batch],
        embeddings=embeddings,
        metadatas=[{
            "pdf_link": pdf_path,
            "doc_id": doc_id,
            "page_number": chunk["page_number"]
        } for chunk in batch],
        documents=texts
    )

In [None]:
def add_pdf_to_collections(pdf_path, parent_collection, child_collection, embedding_model, pdf_chunks):
  """Index a PDF in both collections and return the id shared by its entries.

  The parent entry is written first; the id it returns is then attached to
  every chunk written to the child collection.
  """
  parent_doc_id = add_pdf_to_parent_collection(
      pdf_path=pdf_path,
      parent_collection=parent_collection,
      embedding_model=embedding_model,
  )
  add_pdf_to_child_collection(
      pdf_path=pdf_path,
      child_collection=child_collection,
      embedding_model=embedding_model,
      pdf_chunks=pdf_chunks,
      doc_id=parent_doc_id,
  )
  return parent_doc_id

In [None]:
# Names of the collections the client currently holds.
existing = {c.name for c in chroma_client.list_collections()}

# create_collections() also binds the abstract_collection / fulltext_collection globals
# that later cells use.
# NOTE(review): it runs when *either* collection is missing and then requests both —
# confirm this is safe when exactly one of them already exists. When both exist, the
# globals are not rebound here and must still be defined from an earlier run.
if "paper_abstracts" not in existing or "paper_fulltexts" not in existing:
    create_collections()

In [None]:
# Index the selected PDF: one abstract entry plus one entry per text chunk.
# Ids are freshly generated UUIDs, so re-running this cell adds the same PDF again as new entries.
doc_id = add_pdf_to_collections(file_path, abstract_collection, fulltext_collection, embedding_model, pages_chunks_over_min_token)

In [None]:
# def get_wrapped_text(text, wrap_length=80):
#   wrapped_text = textwrap.fill(text, wrap_length)
#   return wrapped_text

In [None]:
# Largest distance a retrieved chunk may have and still be used as context by ask().
# The collections are created with cosine space, so this is a cosine distance.
DISTANCE_THRESHOLD = 0.5

def search(query_text, parent_collection, child_collection, embedding_model,
           n_parent_results=3, n_child_results=5):
  """Two-stage retrieval: pick the closest documents, then the closest chunks within them.

  Args:
    query_text: Natural-language query.
    parent_collection: Collection with one entry per PDF (metadata has "doc_id").
    child_collection: Collection with one entry per text chunk.
    embedding_model: Model whose `encode` result provides `.tolist()`.
    n_parent_results: Number of documents requested from the parent collection.
    n_child_results: Number of chunks requested from the child collection.

  Returns:
    `(parent_results, child_results)` as returned by the collections' `query`.
    When the parent query matches no document, `child_results` is an empty
    result with the same keys ("ids", "metadatas", "documents", "distances").
  """
  query_embedding_list = embedding_model.encode(query_text).tolist()

  parent_results = parent_collection.query(
      query_embeddings=[query_embedding_list],
      n_results=n_parent_results
  )

  relevant_doc_ids = [meta["doc_id"] for meta in parent_results["metadatas"][0]]

  if not relevant_doc_ids:
    # Nothing to filter on: an empty `$in` list is not a usable filter
    # (Chroma is expected to reject it — confirm for the installed version).
    empty_child_results = {"ids": [[]], "metadatas": [[]], "documents": [[]], "distances": [[]]}
    return parent_results, empty_child_results

  child_results = child_collection.query(
      query_embeddings=[query_embedding_list],
      n_results=n_child_results,
      where={"doc_id": {"$in": relevant_doc_ids}}
  )

  return parent_results, child_results

# query_text = "Explain climate finance"
# parent_results, child_results = search(query_text, abstract_collection, fulltext_collection, embedding_model)

### Loading LLM

In [None]:
# Use FlashAttention-2 only when the package is available and the GPU's major compute
# capability is at least 8; otherwise fall back to scaled-dot-product attention.
# The value expected by `from_pretrained` is "flash_attention_2".
if(is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"

model_id = "meta-llama/Llama-3.2-1B-Instruct"

# Decide the target device first so the weights' dtype can match it.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = model_id)

# To remove the warning: Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = model_id,
                                                 # Half precision on GPU; full precision on CPU.
                                                 torch_dtype=torch.float16 if device == "cuda" else torch.float32,
                                                 quantization_config=None,
                                                 low_cpu_mem_usage=False,
                                                 attn_implementation=attn_implementation)

# Use the tokenizer's single pad id for both configs so they agree with each other
# (the model config's eos_token_id may hold several ids — TODO confirm for this checkpoint).
llm_model.config.pad_token_id = tokenizer.pad_token_id
llm_model.generation_config.pad_token_id = tokenizer.pad_token_id

# Assigning the result keeps the cell from printing the whole module tree.
llm_model = llm_model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (ro

In [None]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
  """Build the chat-formatted prompt sent to the LLM.

  The "text" of every context item becomes one "- " bullet, the bullets and the
  query are inserted into a few-shot instruction template, and the result is
  wrapped as a single user message by the notebook-level `tokenizer`'s chat
  template (returned as text, with the generation prompt appended).
  """
  texts = []
  for item in context_items:
    texts.append(item["text"])
  context = "- " + "\n- ".join(texts)

  template = """Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.
\nExample 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
\nExample 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
\nExample 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
\nNow use the following context items to answer the user query:
{context}
\nRelevant passages: <extract relevant passages from the context here>
User query: {query}
Answer:"""

  filled_prompt = template.format(query=query, context=context)

  # The whole filled template is sent as one user turn.
  return tokenizer.apply_chat_template(
      conversation=[{"role": "user", "content": filled_prompt}],
      tokenize=False,
      add_generation_prompt=True,
  )

In [None]:
def _collect_context_items(child_results, max_distance):
    """Turn a child-collection query result into context dicts within `max_distance`."""
    items = []
    for group_idx, result_group in enumerate(child_results["metadatas"]):
        for i, meta in enumerate(result_group):
            if child_results["distances"][group_idx][i] > max_distance:
                continue
            items.append({
                "page_number": meta.get("page_number", "N/A"),
                "pdf_link": meta.get("pdf_link", "Not Found"),
                "text": child_results["documents"][group_idx][i]
            })
    return items


def ask(query: str,
        parent_collection=None,
        child_collection=None,
        embedding_model=None,
        temperature: float = 0.7,
        max_new_tokens: int = 256,
        format_answer_text: bool = True,
        return_answer_only: bool = True):
    """Answer `query` with the LLM, grounded in chunks retrieved from the collections.

    Args:
        query: The user's question.
        parent_collection: Collection with one entry per PDF.
        child_collection: Collection with one entry per text chunk.
        embedding_model: Model used to embed the query.
        temperature: Sampling temperature. Values <= 0 switch to greedy decoding,
            since sampling requires a positive temperature.
        max_new_tokens: Upper bound on the number of generated tokens.
        format_answer_text: When True, return only the generated answer, without
            special tokens and with a leading "Based on the context, the"
            shortened to "The". When False, return the full decoded sequence.
        return_answer_only: When True return just the text, otherwise
            `(text, context_items)`.

    Returns:
        The string "No relevant context found." when no retrieved chunk is within
        DISTANCE_THRESHOLD (regardless of `return_answer_only`); otherwise the
        answer text, or `(answer text, context_items)`.
    """
    parent_results, child_results = search(
        query_text=query,
        parent_collection=parent_collection,
        child_collection=child_collection,
        embedding_model=embedding_model
    )

    context_items = _collect_context_items(child_results, DISTANCE_THRESHOLD)

    if not context_items:
        return "No relevant context found."

    prompt = prompt_formatter(query=query, context_items=context_items)

    # The chat template already contains the special tokens, so none are added again here.
    # The inputs follow the model's actual device instead of re-deriving it.
    model_inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(llm_model.device)

    do_sample = temperature > 0
    generation_kwargs = {"do_sample": do_sample, "max_new_tokens": max_new_tokens}
    if do_sample:
        generation_kwargs["temperature"] = temperature

    outputs = llm_model.generate(**model_inputs, **generation_kwargs)

    if format_answer_text:
        # Slice off the prompt tokens rather than string-replacing the prompt, which
        # depends on the decoded text reproducing the prompt exactly.
        prompt_length = model_inputs["input_ids"].shape[-1]
        output_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        output_text = output_text.replace("Based on the context, the", "The")
    else:
        output_text = tokenizer.decode(outputs[0])

    if return_answer_only:
        return output_text

    return output_text, context_items

In [None]:
# gc.collect()
# torch.cuda.empty_cache()

# Alternative queries are kept for quick switching; exactly one `query` assignment should be active.
# query = "What is climate finance."
# query = "What is fintech lending."
query = "Some gaps in fintech lending."
# query = "Some gaps in climate finance."
print(f"Query: {query}")
# With return_answer_only=False, ask() returns (answer_text, context_items) on success; when no
# chunk passes the distance threshold it returns the string "No relevant context found." instead.
result = ask(query, abstract_collection, fulltext_collection, embedding_model, temperature=0.2, return_answer_only=False)

Query: Some gaps in fintech lending.


In [None]:
# print(result)

In [None]:
# ask() returns a (answer_text, context_items) tuple on success and a plain string
# otherwise, so the type decides what can be printed.
print(f"Query: {query}")
if isinstance(result, tuple):
  answer_text, answer_context_items = result
  # Page and link are reported for the first context item only.
  first_context_item = answer_context_items[0]
  # Single quotes inside the f-string keep this valid on Python versions before 3.12.
  print(f"RAG Answer: {answer_text}\nFound on page number: {first_context_item['page_number']}\nPDF Link: {first_context_item['pdf_link']}")
else:
  print(result)

Query: Some gaps in fintech lending.
RAG Answer: The following passages highlight some gaps in fintech lending:

1. **Insufficient market share**: FinTech lending has not yet gained significant market share in the traditional banking sector, with most of its growth coming from the mortgage market.
2. **Limited growth in non-residential lending**: FinTech lending has not displayed significant growth in non-residential lending, whereas it has gained a strong presence in the mortgage market.
3. **Insufficient data on non-traditional lending models**: There is a lack of academic research on FinTech lending in the buy-now-pay-later (BNPL) segment, despite its growing popularity.
4. **Limited understanding of FinTech lending's comparative advantage**: The key comparative advantage of FinTech lenders in the mortgage market is not yet fully understood, and more research is needed to identify their strengths and weaknesses.
5. **Insufficient market capitalization**: FinTech lending companies li