USE GPU

Goal: implement a simple PDF document search using an open-source generative AI model.

## The Code

### Imports

Create a Hugging Face account and generate an access token; the notebook needs it to call models on the Hugging Face Hub.

In [20]:
from langchain.document_loaders import TextLoader  #for textfiles
from langchain.text_splitter import CharacterTextSplitter #text splitter
from langchain.embeddings import HuggingFaceEmbeddings #for using HugginFace models
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.document_loaders import UnstructuredPDFLoader  #load pdf
from langchain.indexes import VectorstoreIndexCreator #vectorize db index with chromadb
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredURLLoader  #load urls into docoument-loader
from langchain.document_loaders import PyPDFLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
import os
import pandas as pd
# Never store a real access token in the notebook: anyone who can read the file
# can use it. Take the token from the environment and prompt only when it is
# missing, so the secret is never written to disk.
# NOTE(review): the token that was previously hardcoded on this line has been
# exposed and should be revoked in the Hugging Face account settings.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass  # local import: only needed for the interactive fallback

    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

### Load the dataset into a DataFrame

In [21]:
# Download the zip file from the Kaggle website and load the folder into Colab.
def load_dataset(main_folder='../input/celeba-dataset/'):
    """Load the attribute table ``list_attr_celeba.csv`` into a DataFrame.

    The CSV is read from ``main_folder``, its rows are indexed by the
    ``image_id`` column, and every value equal to -1 is replaced by 0.

    Args:
        main_folder: Directory that contains ``list_attr_celeba.csv``. The
            default is the folder the function previously hardcoded, so
            existing ``load_dataset()`` calls keep working.

    Returns:
        pandas.DataFrame: the attribute table indexed by ``image_id`` with
        -1 replaced by 0.
    """
    # os.path.join accepts the folder with or without a trailing separator.
    attr_path = os.path.join(main_folder, 'list_attr_celeba.csv')

    # Chain the steps instead of mutating in place so the function has no
    # hidden state and can be re-run safely.
    df_attr = (
        pd.read_csv(attr_path)
        .set_index('image_id')
        .replace(to_replace=-1, value=0)  # replace -1 by 0
    )

    # Previously the frame was built and then discarded (the function returned
    # None and ended on a no-op `df_attr.shape`); hand it back to the caller.
    return df_attr


### Loading the PDF



*   https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf
*   Example: use the PyPDFLoader from the LangChain library here to load our PDF file





In [22]:
# Location of the PDF to index. The default is the path this notebook was
# written against; set the PDF_PATH environment variable to point at another
# file without editing the cell.
PDF_PATH = os.environ.get(
    "PDF_PATH",
    r"H:\.NCU Assignment Files\AAIES\Lab\70 Toughest interivew questions.pdf",
)

loader = PyPDFLoader(PDF_PATH)
pages = loader.load_and_split()

# Preview only the first Document instead of dumping the whole list, which
# floods the notebook output for a long PDF.
pages[:1]

[Document(page_content='70 TOUGHTEST \nINTERVIEW QUESTIONS \nAND AN SWERS  \nNote s By Neha Malhotra  \n \n1. Tell me about yourself.  \n• Answer: "Certainly, I\'d be happy to. I hold a Bachelor\'s \ndegree in Computer Science and have spent the last six \nyears working in software development. I\'ve had the \nopportunity to work on various projects, from developing \nmobile applications to leading a te am of developers in my \nprevious role at XYZ Company. I\'m known for my \nproblem -solving skills and my ability to work \ncollaboratively with cross -functional teams. Outside of \nwork, I\'m passionate about volunteering for coding boot \ncamps, where I mentor aspiri ng developers."  \n2. Why should we hire you?  \n• Answer: "You should hire me because I bring a unique \ncombination of technical expertise, leadership experience, \nand a proven track record of delivering results. In my \nprevious role at ABC Inc., I not only led a team that \ncompleted a critical project ahead of sche

### Chunking the text



*   https://python.langchain.com/docs/modules/data_connection/document_transformers/
*   Example: use the RecursiveCharacterTextSplitter here to split the data which works by taking a large text and splitting it based on a specified chunk size.





In [23]:
# Split the loaded pages into chunks of at most 1024 characters with a
# 64-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
    # Separators listed from coarsest to finest: blank line, newline, end of
    # sentence, space, and finally the empty string.
    # The sentence separator is a regex lookbehind for ". ". It was previously
    # written as '(?=>\. )', which is a mistyped pattern with an invalid "\."
    # escape in a non-raw string, so it never matched a sentence boundary.
    separators=['\n\n', '\n', r'(?<=\. )', ' ', ''],
    # Without this flag the separators are matched as literal text and the
    # lookbehind above would have no effect.
    is_separator_regex=True,
)
docs = text_splitter.split_documents(pages)

### Storing the Embeddings in a Vector Store:



*    https://python.langchain.com/docs/modules/data_connection/vectorstores/

*   Example: using FAISS. FAISS, short for Facebook AI Similarity Search, is a powerful library designed for efficient searching and clustering of dense vectors.




In [24]:
# Embedding model with the library's default settings; used to vectorise each chunk.
embeddings = HuggingFaceEmbeddings()

# Build the FAISS vector store from the chunked documents and their embeddings.
db = FAISS.from_documents(documents=docs, embedding=embeddings)

### Similarity Search with Open Source Model



*    Connect to the Hugging Face Hub to use the Flan-T5 XXL model (`google/flan-t5-xxl`).
*    Define a host of model settings for the model, such as temperature and max_length.
*    The load_qa_chain function provides a simple method for feeding documents to an LLM





In [25]:
# LLM accessed through the Hugging Face Hub; generation settings are passed
# through model_kwargs.
# NOTE(review): max_length=1000000 looks far larger than any answer this model
# is likely to produce. Confirm the intended limit.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-xxl",
    model_kwargs=dict(temperature=1, max_length=1000000),
)

# Question-answering chain of type "stuff" built on top of the LLM.
chain = load_qa_chain(llm, chain_type="stuff")

### Creating QA Chain and Querying



*    Use `RetrievalQA` to fetch the relevant documents with a retriever and then answer the question with a QA chain based on the retrieved documents.





In [26]:
# Retrieval QA chain: the vector store is exposed as a retriever configured
# with k=3, and the LLM answers from what the retriever returns.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs=dict(k=3)),
)

### Main function to upload a PDF and search over it

In [27]:
def main():
    """Interactively index one PDF and answer one question about it.

    Prompts for the path of a PDF, loads and splits it into overlapping
    chunks, embeds the chunks into a FAISS vector store, then prompts for a
    query and prints the answer returned by a RetrievalQA chain that uses the
    ``google/flan-t5-xxl`` model through the Hugging Face Hub.

    Returns:
        None. The answer is printed to standard output.
    """
    # Paths pasted from a file manager often carry surrounding whitespace or
    # quotes, which would make the loader reject an otherwise valid path.
    pdf_path = input('enter path to your pdf').strip().strip('"\'')
    pages = PyPDFLoader(pdf_path).load_and_split()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=64,
        # Coarsest to finest: blank line, newline, end of sentence, space,
        # empty string. The sentence separator is a lookbehind for ". "; it was
        # previously mistyped as '(?=>\. )' and never matched.
        separators=['\n\n', '\n', r'(?<=\. )', ' ', ''],
        # Needed so the lookbehind is interpreted as a regex, not literal text.
        is_separator_regex=True,
    )
    docs = text_splitter.split_documents(pages)

    # Embed the chunks and index them for similarity search.
    db = FAISS.from_documents(docs, HuggingFaceEmbeddings())

    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 1, "max_length": 1000000},
    )

    # The separate load_qa_chain(...) result was never used, so it was
    # dropped; RetrievalQA.from_chain_type is given chain_type directly.
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_kwargs={"k": 3}),
    )

    query = input('what is your query')
    print(qa.run(query))

In [29]:
# Start the interactive session: main() prompts for a PDF path and a query,
# then prints the answer.
main()

47
