In [390]:
from langchain.document_loaders import PyPDFLoader
import re
from collections import defaultdict

In [303]:
# Load the book
loader = PyPDFLoader("pdf/crime-and-punishment.pdf")
pages = loader.load()

In [304]:
print (f"This book has total of {len(pages)} pages.")

This book has total of 767 pages.


In [305]:
# First page has unnecessary content
pages = pages[1:]

In [306]:
# Define a list of texts to remove
texts_to_remove = ['Crime and Punishment', 'Free eBooks at Planet eBook.com', '']

# Define a pattern for page numbers and unwanted text
pattern_page_numbers = "^\d+"
pattern_unwanted_text = '|'.join(texts_to_remove)


regex_page_numbers = re.compile(pattern_page_numbers)
regex_unwanted_text = re.compile(pattern_unwanted_text, re.IGNORECASE)

In [307]:
for page in pages:
    page.page_content = regex_unwanted_text.sub('', page.page_content)
    page.page_content = page.page_content.strip()
    page.page_content = regex_page_numbers.sub('', page.page_content)

In [308]:
import copy
pages_copy = copy.deepcopy(pages)

In [309]:
preprocessed_pages = []

for i, page in enumerate(pages_copy):
    # Strip the page content of any leading or trailing whitespace
    page.page_content = page.page_content.strip()
    # If the page has less than 100 characters and if it is not start with Part
    if len(page.page_content) < 100 and not(page.page_content.startswith("Part") or page.page_content.startswith("PART") or page.page_content.startswith("Epilogue") ) :
        # If the preprocessed list is not empty, get the last page and add the current page to it
        if preprocessed_pages:
            last_page = preprocessed_pages.pop()
            last_page.page_content += page.page_content
            preprocessed_pages.append(last_page)
            
    else:
        preprocessed_pages.append(page)

In [310]:
print(preprocessed_pages[609].page_content)

Part VI


In [311]:
print(preprocessed_pages[16].page_content)

Chapter II
Raskolnikov was not used to crowds, and, as we said be -
fore, he avoided society of every sort, more especially 
of late. But now all at once he felt a desire to be with other 
people. Something new seemed to be taking place within 
him, and with it he felt a sort of thirst for company. He was 
so weary after a whole month of concentrated wretchedness 
and gloomy excitement that he longed to rest, if only for a 
moment, in some other world, whatever it might be; and, in 
spite of the filthiness of the surroundings, he was glad now 
to stay in the tavern.
The master of the establishment was in another room, but 
he frequently came down some steps into the main room, 
his jaunty, tarred boots with red turn-over tops coming 
into view each time before the rest of his person. He wore 
a full coat and a horribly greasy black satin waistcoat, with 
no cravat, and his whole face seemed smeared with oil like 
an iron lock. At the counter stood a boy of about fourteen, 
and there wa

In [312]:
translators_preface = preprocessed_pages[0].page_content + preprocessed_pages[1].page_content + preprocessed_pages[2].page_content

In [313]:
epilogue_content = preprocessed_pages[-24:]
epilogue_content = [page.page_content for page in epilogue_content]
epilogue = ''.join(epilogue_content)

In [316]:
# Initialize an empty dictionary to store the output
output = {}

# Initialize an empty list to store the current chapter pages
chapter_pages = []

# Initialize a variable to store the current part name
part_name = None

# Initialize a variable to store the current chapter name
chapter_name = None

for page in preprocessed_pages[:-24]:
    # Get the page content as a string
    page_content = page.page_content
    # Check if the page content starts with "Part"
    if page_content.startswith("Part"):
        # If there is a previous part name and chapter name, add them and their pages to the output
        if part_name and chapter_name:
            output[part_name].append([chapter_name, chapter_pages])
        
        # Update the current part name with the page content
        part_name = page_content
        
        # Reset the chapter name and the chapter pages list
        chapter_name = None
        chapter_pages = []
        
        # Create a new key for the current part name in the output dictionary
        output[part_name] = []
    
    # Check if the page content starts with "Chapter"
    elif page_content.startswith("Chapter"):
        # If there is a previous chapter name, add it and its pages to the output
        if chapter_name:
            output[part_name].append([chapter_name, chapter_pages])
        
        # Update the current chapter name with the page content
        chapter_name = page_content
        
        # Reset the chapter pages list
        chapter_pages = []
    
    # Otherwise, the page content is part of the current chapter
    else:
        # Add the page content to the chapter pages list
        chapter_pages.append(page_content)

# Add the last part name and chapter name and their pages to the output
if part_name and chapter_name:
    output[part_name].append([chapter_name, chapter_pages])

In [317]:
for part, chapters in output.items():
    # Loop over the chapters list
    for i, chapter in enumerate(chapters):
        # Get the chapter name and pages
        chapter_name = chapter[0]
        chapter_pages = chapter[1]
        # Format the chapter name and pages as a single string
        chapter_string = chapter_name + " ".join(chapter_pages)
        # Assign the chapter string back to the output
        output[part][i] = chapter_string

In [318]:
output.keys()

dict_keys(['Part I', 'Part II', 'Part III', 'Part IV', 'Part V', 'Part VI'])

In [319]:
print(output['Part VI'][7])

Chapter VIII
When he went into Sonia’s room, it was already getting 
dark. All day Sonia had been waiting for him in ter -
rible anxiety. Dounia had been waiting with her. She had 
come to her that morning, remembering Svidrigaïlov’s 
words that Sonia knew. We will not describe the conver -
sation and tears of the two girls, and how friendly they 
became. Dounia gained one comfort at least from that in -
terview, that her brother would not be alone. He had gone 
to her, Sonia, first with his confession; he had gone to her 
for human fellowship when he needed it; she would go with 
him wherever fate might send him. Dounia did not ask, but 
she knew it was so. She looked at Sonia almost with rever -
ence and at first almost embarrassed her by it. Sonia was 
almost on the point of tears. She felt herself, on the contrary, 
hardly worthy to look at Dounia. Dounia’s gracious image 
when she had bowed to her so attentively and respectfully at 
their first meeting in Raskolnikov’s room had re

translators_preface \
epilogue \
output

In [429]:
import openai
from dotenv import load_dotenv
import os
import re

load_dotenv()

True

In [430]:
openai_api_key = os.getenv("OPENAI_API_KEY")

In [431]:
openai_api_key

'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'

In [416]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

In [417]:
print(translators_preface)

Translator’s Preface
A few words about Dostoevsky himself may help the Eng -
lish reader to understand his work.
Dostoevsky was the son of a doctor. His parents were 
very hard- working and deeply religious people, but so poor 
that they lived with their five children in only two rooms. 
The father and mother spent their evenings in reading aloud 
to their children, generally from books of a serious charac -
ter.
Though always sickly and delicate Dostoevsky came out 
third in the final examination of the Petersburg school of 
Engineering. There he had already begun his first work, 
‘Poor Folk.’
This story was published by the poet Nekrassov in his 
review and was received with acclamations. The shy, un -
known youth found himself instantly something of a 
celebrity. A brilliant and successful career seemed to open 
before him, but those hopes were soon dashed. In 1849 he 
was arrested.
Though neither by temperament nor conviction a revolu -
tionist, Dostoevsky was one of a little group

In [432]:
# Define LLM
llm = OpenAI(temperature=0, openai_api_key="")

In [433]:
num_tokens = llm.get_num_tokens(translators_preface)

print (f"This translators_preface has {num_tokens} tokens in it")

This translators_preface has 1199 tokens in it


In [450]:
1199//2

599

In [464]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=800, chunk_overlap=200)
docs = text_splitter.create_documents([translators_preface])

In [465]:
num_docs = len(docs)

num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)

print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 8 documents and the first one has 193 tokens


In [466]:
# Define prompt
prompt_template = """Write a concise summary of the following text delimited by triple backquotes.
              Return your response which covers the key points of the text.
              ```{text}```
              Concise SUMMARY:
  """

prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

In [467]:
# Define StuffDocumentsChain
stuff_chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)

In [468]:
print(stuff_chain.run(docs))


Dostoevsky was the son of a poor doctor and his parents read to their children from books of a serious character. He was successful in his studies and his first work, ‘Poor Folk’, was published and received with acclamations. However, he was arrested and sentenced to death for taking part in conversations against the censorship. His sentence was commuted to hard labour in Siberia, where he developed epilepsy. After four years, he was allowed to return to Russia and started two journals, both of which were forbidden by the Censorship. He was weighed down by debt and wrote at a heart-breaking speed. In his later years, he was comforted by his second wife. He made a famous speech at the unveiling of the monument to Pushkin in Moscow and was followed to the grave by a vast multitude of mourners. He is still the most widely read writer in Russia.
