Reference github repo: https://github.com/felipesanma/pdf-comparison/tree/main?tab=readme-ov-file

# Replacong with open source models

In [1]:
%%writefile requirments.txt

PyPDF2 
sentence-transformers 
faiss-cpu 
langchain 
streamlit 
langchain_community 

Overwriting requirments.txt


In [2]:
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
import streamlit as st
from tqdm.autonotebook import tqdm, trange
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [3]:
def get_text_splitter(pdf_file):
    pdf_reader = PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=0, length_function=len
    )
    result = text_splitter.split_text(text)
    return result

In [4]:
texts = get_text_splitter("data/GB 25991-2010 Automotive Headlamps with LED Light Sources andor LED Modules (1) (2).pdf")

In [5]:
def create_qa_retrievals(pdf_file_list: list):
    qa_retrievals = []
    embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    for pdf in pdf_file_list:
        texts = get_text_splitter(pdf)
        docsearch = FAISS.from_texts(texts = texts, embedding = embedding_model)
        # st.info(f"Saving {pdf} to vector DB")

        # Using HuggingFaceHub for the LLaMA model
        # llm = HuggingFaceHub(model="meta-llama/LLaMA-7b", model_kwargs={"temperature": 0.1})
        llm = HuggingFaceHub(repo_id="facebook/opt-1.3b", model_kwargs={"temperature": 0.1})
        qa_tmp = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=docsearch.as_retriever(
                search_type="similarity", search_kwargs={"k": 2}
            ),
            return_source_documents=True,
        )
        qa_retrievals.append(qa_tmp)

    return qa_retrievals

In [6]:
qa_retrievals = create_qa_retrievals(["data/GB 25991-2010 Automotive Headlamps with LED Light Sources andor LED Modules (1) (2).pdf"])



features type =  <class 'dict'>
features length =  3
features {'input_ids': tensor([[  101,  2795,  1017,  ...,  4618, 23758,   102],
        [  101,  5852,  2005,  ...,     0,     0,     0],
        [  101,  1020,  1012,  ...,     0,     0,     0],
        ...,
        [  101,  4460,  1998,  ...,     0,     0,     0],
        [  101,  1020,  1012,  ...,  3048, 23190,   102],
        [  101,  2031,  3478,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])}
features type =  <class 'dict'>
features length =  3
features {'input_ids': tensor([[  

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.
  warn_deprecated(


ValidationError: 1 validation error for HuggingFaceHub
__root__
  Did not find huggingfacehub_api_token, please add an environment variable `HUGGINGFACEHUB_API_TOKEN` which contains it, or pass `huggingfacehub_api_token` as a named parameter. (type=value_error)

In [None]:
type(qa_retrievals), len(qa_retrievals), type(qa_retrievals[0])

In [None]:
qa_retrievals

In [None]:
def ask_to_all_pdfs_sources(query: str, qa_retrievals):
    responses = []
    progress_text = f"Asking '{query}' to all PDFs"
    total_retrievals = len(qa_retrievals)
    my_bar = st.progress(0, text=progress_text)
    for count, qa in enumerate(qa_retrievals):
        result = qa({"query": query})
        tmp_obj = {
            "query": query,
            "response": result["result"],
            "source_document": result["source_documents"][0]
            .metadata["source"]
            .split("-")[1],
        }
        responses.append(tmp_obj)
        percent_complete = (count + 1) * 100 / total_retrievals
        my_bar.progress(int(percent_complete), text=progress_text)

    return responses

In [None]:
query = "What are regulations on color rendering?"

In [None]:
ask_to_all_pdfs_sources(query, qa_retrievals)

#### Function forward in module ~\Desktop\Document Comparision\lang_venv\Lib\site-packages\sentence_transformers\models\Transformer.py expects a dictionary but getting an str

## Repo code with GPT

In [None]:
# import streamlit as st
# from dotenv import load_dotenv
# from langchain import OpenAI
# from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
# from langchain.document_loaders import UnstructuredPDFLoader
# from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.vectorstores import Chroma
# from PyPDF2 import PdfReader

In [None]:
# def get_text_splitter(pdf_file):
#     pdf_reader = PdfReader(pdf_file)
#     text = ""
#     # st.info(f"Extracting text from PDF {pdf_file.name}")
#     for page in pdf_reader.pages:
#         text += page.extract_text()
#     # st.info(f"Getting Chunks from {pdf_file.name}")
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=1000, chunk_overlap=0, length_function=len
#     )
#     result = text_splitter.split_text(text)
#     return result


# def create_qa_retrievals(pdf_file_list: list, OPENAI_API_KEY):

#     qa_retrievals = []
#     for pdf in pdf_file_list:
#         # st.info(f"Processing {pdf.name}")
#         texts = get_text_splitter(pdf)
#         # st.info(f"Converting PDF {pdf.name} to embedding")
#         docsearch = Chroma.from_texts(
#             texts,
#             OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
#             metadatas=[{"source": f"{i}-{pdf.name}"} for i in range(len(texts))],
#         )
#         st.info(f"Saving {pdf.name} to vector DB")
#         qa_tmp = RetrievalQA.from_chain_type(
#             llm=OpenAI(openai_api_key=OPENAI_API_KEY),
#             chain_type="stuff",
#             retriever=docsearch.as_retriever(
#                 search_type="similarity", search_kwargs={"k": 2}
#             ),
#             return_source_documents=True,
#         )
#         qa_retrievals.append(qa_tmp)

#     return qa_retrievals


# def ask_to_all_pdfs_sources(query: str, qa_retrievals):
#     responses = []
#     progress_text = f"Asking '{query}' to all PDF's"
#     total_retrievals = len(qa_retrievals)
#     my_bar = st.progress(0, text=progress_text)
#     for count, qa in enumerate(qa_retrievals):
#         result = qa({"query": query})
#         tmp_obj = {
#             "query": query,
#             "response": result["result"],
#             "source_document": result["source_documents"][0]
#             .metadata["source"]
#             .split("-")[1],
#         }
#         responses.append(tmp_obj)
#         percent_complete = (count + 1) * 100 / total_retrievals
#         my_bar.progress(int(percent_complete), text=progress_text)

#     return responses