<a href="https://colab.research.google.com/github/Saksham9804/Projects/blob/main/RAG_Q%26A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing and importing the necessary libraries

In [None]:
# Installing required packages
!pip install langchain langchain-community faiss-cpu sentence-transformers transformers pypdf




Importing Modules

In [None]:
import os
import re

import kagglehub
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Downloading the Dataset and Setting Up Paths

In [None]:
# Download the Kaggle dataset; the returned value is printed and reused below as the PDF directory
path = kagglehub.dataset_download("sakshamtiwari98/market-analysis")
print("Path to dataset files:", path)

# Use the download location as the directory the PDFs are read from
pdf_dir = path

# File names of the specific PDFs to process (joined onto pdf_dir when they are loaded)

pdf_files = [
    "Attention is all you need.pdf",
    "BERT Pre training of Deep Bidirectional Transformers for.pdf",
    "Contrastive Language-Image Pre-Training with.pdf",
    "Language Models are Few-Shot Learners.pdf",
    "LLaMA Open and Efficient Foundation Language Models.pdf"
]

Path to dataset files: /kaggle/input/market-analysis


# Loading and Chunking PDF Documents into Pieces

In [None]:
def load_and_chunk_pdfs(files, dir_path, chunk_size=1000, chunk_overlap=150):
    """Load the given PDF files and split their contents into overlapping chunks.

    Args:
        files: Iterable of PDF file names, resolved relative to ``dir_path``.
        dir_path: Directory that contains the PDF files.
        chunk_size: ``chunk_size`` handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` handed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``split_documents`` applied to everything that was loaded.

    Raises:
        ValueError: If nothing could be loaded (no files given, or none of them exist).
    """
    loaded_docs = []

    for name in files:
        pdf_path = os.path.join(dir_path, name)

        if os.path.isfile(pdf_path):
            # Read the PDF and append whatever the loader produced
            loaded_docs += PyPDFLoader(pdf_path).load()
        else:
            # Missing files are reported and skipped rather than aborting the whole run
            print(f"File not found: {pdf_path}")

    # Fail early instead of building an empty index further down the notebook
    if len(loaded_docs) == 0:
        raise ValueError("No documents were loaded. Please check your PDF files.")

    # Overlapping chunks keep some shared context between neighbouring pieces
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return chunker.split_documents(loaded_docs)

Loading and chunking the PDF files

In [None]:
# Load the listed PDFs from pdf_dir and split them with the function's default chunk settings
docs_split = load_and_chunk_pdfs(pdf_files, pdf_dir)

# Removing metadata like filenames or page info to prevent leakage in output

In [None]:
# Give every chunk a fresh, empty metadata dict so none of the original metadata is shown later
for chunk in docs_split:
    chunk.metadata = dict()


# Creating Vector Store for Retrieval

Initializing the embedding model, creating the FAISS vector store, saving the vector index, and returning the vector store instance

In [None]:
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS

def create_vector_store(docs_split, embedding_model_name="all-MiniLM-L6-v2", save_path="vector_db_index"):
    """Build a FAISS vector store from the document chunks, save it locally, and return it.

    Args:
        docs_split: Document chunks to index.
        embedding_model_name: Model name passed to ``SentenceTransformerEmbeddings``.
        save_path: Location passed to ``save_local`` for persisting the index.

    Returns:
        The vector store created by ``FAISS.from_documents``.
    """
    embedder = SentenceTransformerEmbeddings(model_name=embedding_model_name)
    index = FAISS.from_documents(docs_split, embedder)
    # Persist the index as a side effect before handing it back
    index.save_local(save_path)
    return index

# Build (and save) the vector store from the chunks using the default embedding model and save path
vector_db = create_vector_store(docs_split)

Loading the language model

In [None]:
from transformers import pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

def load_llm(model_name="google/flan-t5-base", max_new_tokens=256):
    """Create a text2text-generation pipeline and wrap it for use with LangChain.

    Args:
        model_name: Model identifier passed to ``transformers.pipeline``.
        max_new_tokens: ``max_new_tokens`` setting passed to the pipeline.

    Returns:
        A ``HuggingFacePipeline`` wrapping the created pipeline.
    """
    generator = pipeline(
        "text2text-generation",
        model=model_name,
        max_new_tokens=max_new_tokens,
    )
    # The wrapper exposes the pipeline through LangChain's LLM interface
    return HuggingFacePipeline(pipeline=generator)

# Load the language model used for answering questions (default model name and max_new_tokens)
llm = load_llm()


Device set to use cpu


# Building the RAG Chain

In [None]:
from langchain.chains import RetrievalQA

def build_rag_chain(vector_db, llm, k=5):
    """Assemble a RetrievalQA chain on top of the vector store and the language model.

    Args:
        vector_db: Vector store providing ``as_retriever``.
        llm: Language model handed to the chain.
        k: Value passed as ``k`` in the retriever's ``search_kwargs``.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type`` with ``chain_type="stuff"``
        and ``return_source_documents=True``.
    """
    # Retriever configured with the requested number of results
    top_k_retriever = vector_db.as_retriever(search_kwargs=dict(k=k))

    # 'stuff' chain type; source documents are returned alongside the answer
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=top_k_retriever,
        return_source_documents=True,
    )

# Construct the RAG question-answering chain from the vector store and the loaded model (default k)
rag_chain = build_rag_chain(vector_db, llm)


# Asking Questions

Defining how the answer is displayed when a question is asked.

In [None]:
def ask_question(question, chain=None, blank_lines=50):
    """Send a question through the RAG chain, print the answer one sentence per line, return the result.

    Args:
        question: The question text; must contain at least one non-whitespace character.
        chain: Object with an ``invoke({"query": ...})`` method returning a mapping that has a
            ``"result"`` entry. When omitted, the notebook-level ``rag_chain`` is used.
        blank_lines: Number of newline characters printed after the answer as spacing.

    Returns:
        Whatever ``chain.invoke`` returned, unchanged.

    Raises:
        ValueError: If ``question`` is empty or contains only whitespace.
    """
    if not question or not question.strip():
        raise ValueError("Question must not be empty.")

    # Fall back to the notebook-level chain only when the caller did not supply one
    active_chain = rag_chain if chain is None else chain
    result = active_chain.invoke({"query": question})

    # Printing the question entered
    print(f"Question: {question}\n")
    print("Answers: \n")

    # Split only after sentence-ending punctuation that is followed by whitespace, so decimals
    # such as "1.5" stay intact and no extra "." is appended to a fragment.
    # Abbreviations followed by a space (e.g. "et al. ") are still treated as sentence ends.
    answer_text = result['result'].strip()
    for sentence in re.split(r"(?<=[.!?])\s+", answer_text):
        if sentence:  # Avoid printing empty lines
            print(sentence)
    print("\n" * blank_lines)  # Adding spacing after the answer
    return result

Ask a question based on the loaded PDF files

In [None]:
# Bind the returned value to a name: a bare call as the cell's last expression makes the notebook
# echo the entire result dict underneath the printed answer. The result stays available for inspection.
qa_result = ask_question(input("Enter your question: "))

Enter your question: give the summary for LLaMA Open and Efficient Foundation Language Models 
Question: give the summary for LLaMA Open and Efficient Foundation Language Models 

Answers: 

LLaMA: Open and Efficient Foundation Language Models Hugo Touvron, Thibaut Lavril, Gautier Izacard , Xavier Martinet Marie-Anne Lachaux, Timothee Lacroix, Baptiste Rozière, Naman Goyal Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin Edouard Grave, Guillaume Lample Meta AI Abstract We introduce LLaMA, a collection of founda- tion language models ranging from 7B to 65B parameters.
We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly avail- able datasets exclusively, without resorting to proprietary and inaccessible datasets.
We hope that releasing these models to the research community will accelerate the development of large language models, and help efforts to improve their robustness and mitigate known issues such as 

{'query': 'give the summary for LLaMA Open and Efficient Foundation Language Models ',
 'result': 'LLaMA: Open and Efficient Foundation Language Models Hugo Touvron, Thibaut Lavril, Gautier Izacard , Xavier Martinet Marie-Anne Lachaux, Timothee Lacroix, Baptiste Rozière, Naman Goyal Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin Edouard Grave, Guillaume Lample Meta AI Abstract We introduce LLaMA, a collection of founda- tion language models ranging from 7B to 65B parameters. We train our models on trillions of tokens, and show that it is possible to train state-of-the-art models using publicly avail- able datasets exclusively, without resorting to proprietary and inaccessible datasets. We hope that releasing these models to the research community will accelerate the development of large language models, and help efforts to improve their robustness and mitigate known issues such as toxicity and bias. Additionally, we observed like Chung et al. (2022) that finetuning these 