In [None]:
## Data Ingestion
from langchain_community.document_loaders import PyPDFLoader, CSVLoader, TextLoader
# Point a PDF loader at the 2023-2024 annual report (path is relative to the
# notebook's working directory).
pdf_loader_annual = PyPDFLoader(
    "SyntheticDatasetRAG/Sample Underwriter/pdf/annual-report-2023-2024.pdf"
)
# Read the PDF's contents into memory as LangChain documents.
pdf_documents_annual = pdf_loader_annual.load()
# Bare expression: lets the notebook display the loaded documents as cell output.
pdf_documents_annual

In [None]:
## Transformation
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded PDF documents into chunks of at most 1000 characters, with
# 100 characters shared between neighbouring chunks so that text cut at a
# chunk boundary still appears intact in one of the two chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
pdf_text_documents = text_splitter.split_documents(pdf_documents_annual)
# Bare expression: preview the first five chunks as the cell output.
pdf_text_documents[:5]

In [None]:
## Vector Embedding and Vector Store
from langchain_core.embeddings import Embeddings
from langchain_community.llms import OpenAI
from transformers import BertTokenizer, BertModel
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import torch


class BertEmbeddings(Embeddings):
    """LangChain embeddings adapter around a Hugging Face BERT encoder.

    Each text is tokenized, passed through the model, and reduced to a single
    vector by averaging the last hidden state over the non-padding tokens.

    Args:
        model: A loaded ``transformers`` BERT model.
        tokenizer: The tokenizer matching ``model``.
        batch_size: Number of texts encoded per forward pass.
    """

    def __init__(self, model, tokenizer, batch_size=16):
        self.model = model
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        # Inference only: switch off dropout so embeddings are deterministic.
        self.model.eval()

    def _embed_batch(self, texts):
        """Return one mean-pooled vector (list of floats) per text in ``texts``."""
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,  # clip inputs longer than the tokenizer's max length
            return_tensors="pt",
        )
        with torch.no_grad():
            hidden = self.model(**encoded).last_hidden_state
        # Zero out padding positions so they do not dilute the average.
        mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        return pooled.tolist()

    def embed_documents(self, texts):
        """Embed a list of document texts, ``batch_size`` texts at a time."""
        vectors = []
        for start in range(0, len(texts), self.batch_size):
            batch = list(texts[start:start + self.batch_size])
            vectors.extend(self._embed_batch(batch))
        return vectors

    def embed_query(self, text):
        """Embed a single query string."""
        return self._embed_batch([text])[0]


# Initialize the BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
# Wrap them in the embeddings interface that the vector store expects
bert_embeddings = BertEmbeddings(model=model, tokenizer=tokenizer)
# Create a FAISS vector store. from_documents embeds every chunk itself, so a
# separate "generate embeddings" pass beforehand would only repeat the work.
faiss_index = FAISS.from_documents(pdf_text_documents, bert_embeddings)
## Querying the FAISS faiss_index
query = "Search query text"
results = faiss_index.similarity_search(query, k=3)  # Returns top 3 results
print(results)
# Save the index to a local folder
faiss_index.save_local("faiss_index")
# Load the index back from that folder. The docstore is stored with pickle, so
# loading must be opted into explicitly; only do this for index folders you
# created yourself (as here), never for files from an untrusted source.
loaded_faiss_index = FAISS.load_local(
    "faiss_index",
    bert_embeddings,
    allow_dangerous_deserialization=True,
)
# Set up the QA system with the FAISS index.
# NOTE: OpenAI() needs the `openai` package and an OPENAI_API_KEY in the environment.
qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(), retriever=faiss_index.as_retriever())
# Ask a question
query = "Give me summary of the Annual report"
# RetrievalQA takes its input under "query" and returns the answer under "result".
answer = qa_chain.invoke({"query": query})["result"]
print(answer)