In [14]:
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from langchain_community.llms.huggingface_hub import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import T5Tokenizer
from dotenv import load_dotenv
import os
import re

## Load environment variables, LLM, and embeddings

In [15]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail fast with an actionable message: os.environ only accepts strings, so
# assigning the None that os.getenv() returns for a missing variable would
# raise an opaque "TypeError: str expected, not NoneType".
hf_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if not hf_api_token:
    raise EnvironmentError(
        "HUGGINGFACEHUB_API_TOKEN is not set; define it in the environment or in a .env file."
    )
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_api_token

# llm
llm = HuggingFaceHub(
            repo_id="google/flan-t5-large",
            model_kwargs={"temperature": 0.5, "max_length": 512}
        )

In [16]:
# Sentence-embedding model; passed to the FAISS vector store further down.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")

## Read PDF

In [17]:
def read_pdf(pdf_path):
    """Load a PDF with ``PyPDFLoader`` and return the loaded documents.

    Failures are raised rather than returned as message strings: the cells
    that consume the result index into it and read ``.page_content``, so a
    returned string would only surface later as an unrelated AttributeError.

    Args:
        pdf_path: Path of the PDF file to load.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load()`` returns (the later cells
        treat it as an indexable sequence of objects with ``page_content``).

    Raises:
        FileNotFoundError: Propagated unchanged if the loader raises it.
        RuntimeError: Any other loader failure, chained to the original error.
    """
    try:
        loader = PyPDFLoader(pdf_path)
        return loader.load()
    except FileNotFoundError:
        # Already descriptive; let it propagate as-is.
        raise
    except Exception as e:
        raise RuntimeError(f"An error occurred: {e}") from e

# The PDF is expected at <parent of the working directory>/PDF/AnonCred-RWC.pdf
parent_dir, _ = os.path.split(os.getcwd())
pdf_file = os.path.join(parent_dir, "PDF", "AnonCred-RWC.pdf")

# Load the document; the preprocessing cells below work on `pdf_text`.
pdf_text = read_pdf(pdf_file)

## Preprocess

In [18]:
# Strip Unicode Private Use Area (PUA) characters from text
def remove_pua(text):
    """Return ``text`` with every Private Use Area character deleted.

    Three code-point ranges are removed, one after another:
    the BMP block (U+E000-U+F8FF), Supplementary Private Use Area-A
    (U+F0000-U+FFFFD) and Supplementary Private Use Area-B
    (U+100000-U+10FFFD). All other characters are left untouched.
    """
    pua_patterns = (
        r'[\ue000-\uf8ff]',            # BMP
        r'[\U000f0000-\U000ffffd]',    # Supplementary Area A
        r'[\U00100000-\U0010fffd]',    # Supplementary Area B
    )
    cleaned = text
    for pattern in pua_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned


In [19]:
# Normalise the text of every loaded document in place.
for page in pdf_text:
    content = page.page_content
    content = content.replace("-\n", "")  # re-join words hyphenated across a line break
    content = content.replace("\n", " ")  # turn the remaining line breaks into spaces
    content = remove_pua(content)  # drop Private Use Area characters
    page.page_content = re.sub(r'\s+', ' ', content)  # collapse whitespace runs to one space

In [20]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large", use_fast=True, legacy=False)

# split text into chunks
def flan_t5_len(text):
    """Return how many token ids the FLAN-T5 tokenizer produces for ``text``."""
    return len(tokenizer.encode(text, truncation=False))


# chunk_size / chunk_overlap are measured with flan_t5_len, i.e. in tokenizer
# tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=60,
    length_function=flan_t5_len
)

# split_documents() returns the chunks as a new list; it does not modify its
# argument. The result used to be discarded, so the vector store was built
# from whole pages. Rebind `pdf_text` so the cells below index the chunks.
pdf_text = text_splitter.split_documents(pdf_text)

print("tokenizer done")

tokenizer done


## Build the FAISS vector store

In [21]:
# Build a FAISS vector store from the documents in `pdf_text`, using `embeddings`.
vectorstore = FAISS.from_documents(documents=pdf_text, embedding=embeddings)
vectorstore

<langchain_community.vectorstores.faiss.FAISS at 0x20717778470>

## Question Answering

In [22]:
# Retriever view over the FAISS store: "mmr" search type with k=7.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 7},
)

In [23]:
# Set verbose=True to see more details while the chain runs.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    verbose=False,
)
qa_chain

ConversationalRetrievalChain(verbose=False, combine_docs_chain=StuffDocumentsChain(verbose=False, llm_chain=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=HuggingFaceHub(client=<InferenceClient(model='google/flan-t5-large', timeout=None)>, repo_id='google/flan-t5-large', task='text2text-generation', model_kwargs={'temperature': 0.5, 'max_length': 512}), output_parser=StrOutputParser(), llm_kwargs={}), document_prompt=PromptTemplate(input_variables=['page_content'], input_types={}, partial_variables={}, template='{page_content}'), document_variable_name='context'), question_generator=LLMChain(verbose=False, prompt=PromptTemplate(input_variables=['chat_history', 'question'], input_

In [24]:
# Running list of (question, answer) tuples fed back into the chain on each call.
chat_history = list()


In [25]:
question = "what is the main concept of this paper?"

# Ask the chain, passing the conversation so far alongside the new question.
result = qa_chain.invoke(dict(question=question, chat_history=chat_history))

# Record this exchange so that follow-up questions can refer back to it.
chat_history += [(question, result["answer"])]

result['answer']

'In this paper we describe a practical digital identity project of a global scale, which solves a number of privacy and scalability problems using the concepts of anonymous credentials and permissioned blockchains.'