# Summarizing Multiple PDFs

In [None]:
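# !pip install langchain
# !pip install langchain-community
# !pip install langchain-groq
# !pip install pypdf
# !pip install chromadb
# !pip install transformers
# !pip install sentence-transformers
# !pip install python-dotenv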

In [20]:
# Importing the dependencies
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain_groq import ChatGroq
from langchain import PromptTemplate
import glob
import os

In [21]:
# python-dotenv reads KEY=value pairs from a local .env file
from dotenv import load_dotenv

# Pull the .env entries into the process environment before the key lookup
load_dotenv()

# Groq-hosted chat model shared by the summarization functions in this notebook
llm = ChatGroq(
    model_name="llama-3.1-70b-versatile",
    temperature=0.2,
    groq_api_key=os.environ.get("GROQ_API_KEY"),
)
def summarize_pdfs_from_folder(pdfs_folder):
    """Summarize every PDF found directly inside ``pdfs_folder``.

    Each PDF is loaded and split with ``PyPDFLoader`` and passed through a
    ``map_reduce`` summarization chain built on the module-level ``llm``.
    Every summary is printed as soon as it is produced.

    Args:
        pdfs_folder: Path of the directory to scan for ``*.pdf`` files.
            Sub-directories are not searched.

    Returns:
        A list with one summary per PDF, ordered by file path.
    """
    # The chain does not depend on the file being processed, so build it once
    # instead of once per PDF.
    chain = load_summarize_chain(llm, chain_type="map_reduce")

    summaries = []
    # glob returns matches in arbitrary, OS-dependent order; sorting keeps the
    # output (and the file written from it) reproducible between runs.
    for pdf_file in sorted(glob.glob(os.path.join(pdfs_folder, "*.pdf"))):
        docs = PyPDFLoader(pdf_file).load_and_split()
        # invoke() replaces the deprecated Chain.run(); the summarize chain
        # exposes its result under the "output_text" key.
        summary = chain.invoke({"input_documents": docs})["output_text"]
        print("Summary for: ", pdf_file)
        print(summary)
        print("\n")
        summaries.append(summary)

    return summaries

In [22]:
def custom_summary(pdf_folder, custom_prompt):
    """Summarize every PDF in ``pdf_folder`` using a caller-supplied instruction.

    ``custom_prompt`` is placed in front of a template containing the
    ``{text}`` placeholder and a trailing ``SUMMARY:`` cue. The resulting
    prompt is used for both the map and the combine step of a ``map_reduce``
    summarization chain built on the module-level ``llm``.

    Args:
        pdf_folder: Path of the directory to scan for ``*.pdf`` files.
            Sub-directories are not searched.
        custom_prompt: Instruction text prepended to the template.
            NOTE(review): it is concatenated into the template string, so
            literal ``{``/``}`` inside it would presumably be parsed as
            template variables -- confirm before passing such text.

    Returns:
        A list with one summary per PDF, ordered by file path.
    """
    # The prompt and the chain are identical for every file, so build them once
    # up front rather than once per PDF.
    prompt_template = custom_prompt + """

        {text}

        SUMMARY:"""
    prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
    chain = load_summarize_chain(
        llm, chain_type="map_reduce", map_prompt=prompt, combine_prompt=prompt
    )

    summaries = []
    # glob returns matches in arbitrary, OS-dependent order; sorting keeps the
    # result reproducible between runs.
    for pdf_file in sorted(glob.glob(os.path.join(pdf_folder, "*.pdf"))):
        docs = PyPDFLoader(pdf_file).load_and_split()
        # invoke() replaces the deprecated direct call on the chain object.
        result = chain.invoke({"input_documents": docs})
        summaries.append(result["output_text"])

    return summaries

In [None]:
# Summarize every PDF in ./pdfs; each summary is also printed as it is produced
summaries = summarize_pdfs_from_folder("./pdfs")

In [None]:
# CUSTOM_PROMPT = "Write a concise summary of the following paper with this structure: Problem being solved; Approach; Main results; Main Discussion Points"
# custom_summaries = custom_summary("./pdfs", custom_prompt=CUSTOM_PROMPT)
# # Save all summaries into one .txt file
# with open("custom_summaries.txt", "w") as f:
#     for summary in custom_summaries:
#         f.write(summary + "\n"*3)

In [24]:
# Save all summaries into one .txt file, each followed by three newlines.
# The encoding is set explicitly so non-ASCII characters in the generated text
# are written the same way on every platform instead of depending on (or
# failing under) the locale's default encoding.
with open("summaries.txt", "w", encoding="utf-8") as f:
    for summary in summaries:
        f.write(summary + "\n" * 3)

# Querying Multiple PDFs

In [65]:
from langchain.indexes import VectorstoreIndexCreator
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

In [None]:
# Load the embeddings model (BAAI/bge-small-en); it is handed to the
# vector store index creator further down
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en")

In [None]:
# Load the PDFs in the pdfs directory
loader = PyPDFDirectoryLoader("./pdfs/")
# `docs` holds everything the loader returned; uncomment the print to inspect it
docs = loader.load()
# print(docs)

In [None]:
# Create the vector store index with the Hugging Face embeddings.
# The PDFs were already loaded into `docs` by the loader cell, so index those
# documents directly rather than letting from_loaders() parse every file again.
index = VectorstoreIndexCreator(embedding=embeddings).from_documents(docs)
# print(index)

In [None]:
query = "What are the key trends on LLM from 2018 to 2023?"
# Invoke the index vector store using the query.
# The Groq model is passed explicitly: without `llm`, current LangChain
# releases refuse the call, and older ones fall back to an OpenAI model that
# this notebook never configures.
index.query(query, llm=llm)