PDF Summarization


In [None]:
!python --version

In [None]:
# !pip install pandas
# !pip install langchain
# !pip install pypdf
# !pip install openai
# !pip install tiktoken
# pip install python-dotenv
# pip install reportlab


In [2]:
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.llms import OpenAI
import os
import pandas as pd
from pathlib import Path as p
import re
import time

In [7]:
from dotenv import load_dotenv
load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv("API_KEY")

In [8]:
openai_llm = OpenAI(temperature=0.25)
# openai_llm = OpenAI(model="babbage-002", temperature=0.25)

In [9]:
pdf_loader = PyPDFLoader('crime-and-punishment.pdf')

In [10]:
pages = pdf_loader.load_and_split()

In [11]:
# Removed last summary
pages = pages[:743]

In [12]:
pages[0]

Document(page_content='Download free eBooks of classic literature, books and \nnovels at Planet eBook. Subscribe to our free eBooks blog \nand email newsletter.Crime and Punishment\nBy Fyodor Dostoevsky', metadata={'source': 'crime-and-punishment.pdf', 'page': 0})

Pre Processing

In [15]:
removed_string = ["Free eBooks at Planet eBook.comIlya", "Crime and Punishment", "Free eBooks at Planet eBook.com", "\n", "\x18"]

In [16]:
def remove_strings(text, strings_to_remove):
    # print(text, strings_to_remove)
    for string in strings_to_remove:
        text = text.replace(string, '')
    return text

In [17]:
for i,_ in enumerate(pages):
    pages[i].page_content = remove_strings(pages[i].page_content, removed_string)
    

Number of words

In [18]:
total_word_count = 0
for page in pages:
  total_word_count = total_word_count + len(page.page_content.split(' '))
print('Number of words: ', total_word_count)

Number of words:  198282


Summarization Prompt

In [19]:
prompt_template = """
Write a summary of this chunk of text that includes the important details
```{text}```
"""
prompt = PromptTemplate(template=prompt_template, input_variables=['text'])

In [20]:
refine_prompt_template = """
Write a summary of following text delimited by triple quote backquotes
Return your response which covers the main story event of the text. 
```{text}```
"""
refine_prompt = PromptTemplate(template=refine_prompt_template, input_variables=['text'])

In [21]:
refine_chain = load_summarize_chain(
    llm=openai_llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps= True
)

Chunks of text


In [22]:
class New_Book:
    def __init__(self, page_content, book_name='crime-and-punishment.pdf', page_number=0):
        self.page_content = page_content
        self.metadata = {
            "source": book_name,
            "page": page_number
        }

In [23]:
new_book_page = []
current_page_number, new_page_number, limit = 0, 0, 8 # 9 pages -> approx. 3.5k tokens
while(current_page_number <= (len(pages) + limit)):
    counter = 0
    text = ''
    
    while (counter < limit) and (current_page_number + counter < len(pages)):
        # print(counter)
        text += pages[current_page_number + counter].page_content
        counter += 1
     
    new_book_page.append(New_Book(text, page_number=new_page_number))
    new_page_number += 1
    current_page_number += limit

In [24]:
# Shorten Story Book Content
new_book_page = new_book_page[:50] + new_book_page[85:90]

Summarization

In [26]:
summary_dict_data = {}

In [None]:
iter_number = 0
index_number = 0
while(iter_number < len(new_book_page)):
    print("Index Number: " ,index_number)
    refine_chain_outputs = refine_chain({'input_documents': new_book_page[iter_number:iter_number+2]})
    time.sleep(60)
    summary_dict_data[index_number] = refine_chain_outputs

    iter_number = iter_number + 2
    index_number += 1 


In [65]:
# Appending all the summaries
summaries = []
for idx_i in range(len(summary_dict_data)):
    for idx_j,val in enumerate(summary_dict_data[idx_i]['intermediate_steps']):
        summaries.append(val)

In [70]:
final_data = []
for doc, out in zip(new_book_page, summaries):
  output = {}
  output['file_name'] = p(doc.metadata["source"]).stem
  output['file_type'] = p(doc.metadata["source"]).suffix
  output['page_number'] = doc.metadata["page"]
  output['chunks'] = doc.page_content
  output['concise_summary'] = out
  final_data.append(output)

In [71]:
summary_df = pd.DataFrame.from_dict(final_data)
summary_df = summary_df.sort_values(
    by=["file_name", "page_number"]
)

In [79]:
summary_df.tail(5)

Unnamed: 0,file_name,file_type,page_number,chunks,concise_summary
50,crime-and-punishment,.pdf,85,1 Intense disgust drew him away from Svidrigaï...,\nRaskolnikov is shocked to find out that Svid...
51,crime-and-punishment,.pdf,86,we’ve talked of this more than once before. I...,\nDounia is trying to save her brother from be...
52,crime-and-punishment,.pdf,87,Chapter VIHe spent that evening till ten o’cl...,\nSvidrigailov spends the evening going to low...
53,crime-and-punishment,.pdf,88,0 his attention. The murmur had not ceased fro...,\nSvidrigailov enters a room and hears someone...
54,crime-and-punishment,.pdf,89,1 to control them. But now she quite gave up a...,\nRaskolnikov visits his mother and sister in ...


Saving summary to pdf

In [75]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet

def create_pdf_with_text(title, summaries, output_filename):
    doc = SimpleDocTemplate(output_filename, pagesize=letter)
    styles = getSampleStyleSheet()
    story = []

    # Title
    title_paragraph = Paragraph(title, styles["Title"])
    story.append(title_paragraph)

    for text in summaries:
        # Add a summary paragraph
        summary_paragraph = Paragraph(text, styles["Normal"])
        story.append(summary_paragraph)

        # Add a new line break (spacer) after each summary
        story.append(Spacer(1, 12))  # Adjust the second argument for spacing

    # Build the PDF document
    doc.build(story)


In [76]:
title = "Summary: Crime and Punishment"
output_file = "summary.pdf"
create_pdf_with_text(title, summaries, output_file)
print("File Updated")

File Updated
