<a href="https://colab.research.google.com/github/Roman-99/PDF-Reader-Base-Code/blob/main/TTU_Capstone_LLM_PDF_reader_base_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#installs
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U langchain
!pip install -q -U accelerate
!pip install -q -U PyPDF2
!pip install -q -U faiss-gpu
!pip install -q -U sentence_transformers
!pip install -q -U pypdf
!pip install -q -U torch

In [None]:
from langchain.document_loaders import PyPDFLoader

#path to the folder that holds the pdfs
path = "/content/drive/MyDrive/Technical Manuals"

#file name of the manual to read from that folder
pdf_name = "AFD-180201-00-5-3.pdf"

#load pdf; the location is built from `path` so the folder is only written once
reader = PyPDFLoader(f"{path}/{pdf_name}")

#read the pdf and split it into documents (each one records its source file and page number)
pages = reader.load_and_split()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

#separators are tried in order: paragraph break, line break, end of sentence, space, single character
#the sentence separator is a regex lookbehind (split right after ". "), so it is written as a raw
#string and is_separator_regex=True is set; without that flag it is matched as literal text
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=15,
    separators=['\n\n', '\n', r'(?<=\. )', ' ', ''],
    is_separator_regex=True
    )

# split pages into text chunks
texts = text_splitter.split_documents(pages)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

#embedding model that turns each text chunk into a vector; it is placed on the GPU
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cuda'},
)

In [None]:
from langchain.vectorstores import FAISS

#embed every text chunk and index the resulting vectors in a FAISS store
vectorstore = FAISS.from_documents(documents=texts, embedding=embeddings)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
import torch

#Hugging Face repo id shared by the model and its tokenizer
#(sharded version of falcon-7b to reduce memory needs)
model_id = "vilsonrodrigues/falcon-7b-instruct-sharded"

#4-bit quantization settings to lower memory needs
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

#LLM Model, loaded with the quantization settings above
#NOTE: trust_remote_code=True allows code shipped in the model repo to be executed
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

#tokenizer that matches the model
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
from transformers import pipeline

#create text-generation pipeline around the already loaded model and tokenizer
pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens=1024,    #upper bound on newly generated tokens per answer
                do_sample=True,         #sample tokens instead of greedy decoding
                top_k=10,               #sample only among the 10 most likely tokens
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                #make the padding token explicit (same id as end-of-sequence) so generation
                #does not have to fall back to it and log a warning on every call
                pad_token_id=tokenizer.eos_token_id
                )

In [None]:
from langchain import HuggingFacePipeline

#wrap the transformers pipeline so it can be used as the llm of a LangChain chain
#NOTE(review): the pipeline is created with do_sample=True; confirm that this
#temperature of 0 is actually forwarded to generation and has the intended effect
llm = HuggingFacePipeline(
    pipeline=pipe,
    model_kwargs={'temperature': 0},
)

In [None]:
from langchain.chains import RetrievalQA

#retriever over the vector store that returns 3 chunks per question
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

#question answer chain: uses the "stuff" chain type with the llm and the retriever,
#and hands back the source documents together with the answer
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=False,
)

In [None]:
#question to be asked
query = "What is a program manager?"

#the pipeline samples its tokens (do_sample=True), so fix the random seeds first
#to get the same answer every time the notebook is re-run
transformers.set_seed(42)

#Send question as a query to qa chain
result = qa({"query": query})

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


In [None]:
#show the raw result dict: the 'query', the generated answer under 'result',
#and the retrieved chunks under 'source_documents'
print(result)

{'query': 'What is a program manager?', 'result': '\nThe person responsible for managing a specific program. The term can apply to a variety of industries.', 'source_documents': [Document(page_content='2.1.1 Program Manager (PM). The PM is responsible for, and with the authority to, accomplish program objectives for', metadata={'source': '/content/drive/MyDrive/Technical Manuals/AFD-180201-00-5-3.pdf', 'page': 22}), Document(page_content='2.1.1 Program Manager (PM) ................................................. 2 - 1', metadata={'source': '/content/drive/MyDrive/Technical Manuals/AFD-180201-00-5-3.pdf', 'page': 2}), Document(page_content='Product Group Manager (PGM) : The program\nmanager for a Product Group. PGMs fulﬁll the', metadata={'source': '/content/drive/MyDrive/Technical Manuals/AFD-180201-00-5-3.pdf', 'page': 165})]}
