In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import AzureChatOpenAI
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import os

load_dotenv()

True

In [11]:
# Azure Keys
azure_endpoint: str = os.environ.get('AZURE_ENDPOINT')
azure_openai_api_key: str = os.environ.get('AZURE_OPENAI_API_KEY')
azure_openai_api_version: str = os.environ.get('SUMMARY_AZURE_OPENAI_API_VERSION')
azure_deployment: str = os.environ.get('SUMMARY_AZURE_DEPLOYMENT')
print(azure_deployment)

None


In [4]:
llm = AzureChatOpenAI(
    api_version=azure_openai_api_version,
    azure_endpoint = azure_endpoint,
    deployment_name= azure_deployment,
    model = azure_deployment
)

In [None]:
#pdf loader
loader = PyPDFLoader("../input/Zipse_Speech.pdf")

pages = loader.load()
print(pages[1].metadata)

{'producer': 'Acrobat Distiller 9.5.5 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2013-09-10T10:49:51-04:00', 'author': 'INDIADEL', 'moddate': '2013-09-10T10:49:51-04:00', 'title': 'Document', 'source': '../input/nehru_speech.pdf', 'total_pages': 8, 'page': 1, 'page_label': '2'}


In [6]:
#document splitter
splitter = RecursiveCharacterTextSplitter(
  chunk_size = 10000,
  chunk_overlap = 500
)

chunks = splitter.split_documents(pages)

print(len(chunks))

8


In [7]:
#custom prompts for each steps
map_prompt_template = """
                      Write a summary of this chunk of text that includes the main points and any important details.
                      Use only the context, do not make up information
                      {text}
                      """

map_prompt = PromptTemplate(template=map_prompt_template, input_variables=["text"])

combine_prompt_template = """
                      Write a concise summary of the following text delimited by triple backquotes.
                      Use only the context for your answers, do not make up information
                      Return your response in only 5 bullet points which covers the key points of the text.
                      ```{text}```
                      BULLET POINT SUMMARY:
                      """

combine_prompt = PromptTemplate(
    template=combine_prompt_template, input_variables=["text"]
)

In [8]:
#initialise the map reduce technique

chain = load_summarize_chain(
  llm=llm,
  chain_type='map_reduce',
  map_prompt=map_prompt,
  combine_prompt=combine_prompt,
  verbose=False
)

In [None]:
#summarise the chunk
def run_summary():
  summary = chain.run(chunks)
  return summary