In [156]:
from langchain.document_loaders import PyPDFLoader
import re
from collections import defaultdict

### Loading and Preprocessing the Book


In [157]:
loader = PyPDFLoader("pdf/crime-and-punishment.pdf")
pages = loader.load()

In [158]:
print (f"This book has total of {len(pages)} pages.")

This book has total of 767 pages.


In [159]:
# First page has unnecessary content
pages = pages[1:]

__The first page of the book contains only the book title and the author's name, so I have omitted this page.__


In [160]:
# Define a list of texts to remove
texts_to_remove = ['Crime and Punishment', 'Free eBooks at Planet eBook.com', '']

# Define a pattern for page numbers and unwanted text
pattern_page_numbers = "^\d+"
pattern_unwanted_text = '|'.join(texts_to_remove)

# Compile regex patterns
regex_page_numbers = re.compile(pattern_page_numbers)
regex_unwanted_text = re.compile(pattern_unwanted_text, re.IGNORECASE)

In [161]:
for page in pages:
    page.page_content = regex_unwanted_text.sub('', page.page_content)
    page.page_content = page.page_content.strip()
    page.page_content = regex_page_numbers.sub('', page.page_content)

__Removing unwanted text, including specified book titles and characters, from the pages using Python's regex patterns.__

In [162]:
# Creating deep copy of the pages
import copy
pages_copy = copy.deepcopy(pages)

In [163]:
# Create an empty list to store preprocessed pages
preprocessed_pages = []

# Iterate through each page in the copy of the original pages
for i, page in enumerate(pages_copy):
    # Strip the page content of any leading or trailing whitespace
    page.page_content = page.page_content.strip()
    # Check if the page has less than 100 characters and does not start with specific keywords
    if len(page.page_content) < 100 and not(page.page_content.startswith("Part") or page.page_content.startswith("PART") or page.page_content.startswith("Epilogue") ) :
        # If the preprocessed list is not empty, combine the current page with the last page in the list
        if preprocessed_pages:
            last_page = preprocessed_pages.pop()
            last_page.page_content += page.page_content
            preprocessed_pages.append(last_page)
    else:
        preprocessed_pages.append(page)

**1.** This Python code performs preprocessing on a list of pages, aiming to combine short pages with the previous one if certain conditions are met. The `preprocessed_pages` list is initialized to store the final processed pages.

**2.** The `strip()` method is used to remove any leading or trailing whitespace from the page content.

**3.** Check if the page has less than 100 characters and does not start with specific keywords ("Part," "PART," or "Epilogue"), as it indicates that the page might be a continuation of the previous part of the book (The book contains 6 parts).

**4.** If the conditions are met, it combines the current page with the last page in the `preprocessed_pages` list (if not empty).

**5.** Otherwise, the page is added to the `preprocessed_pages` list as is.


In [164]:
print(preprocessed_pages[609].page_content)

Part VI


In [165]:
print(preprocessed_pages[16].page_content)

Chapter II
Raskolnikov was not used to crowds, and, as we said be -
fore, he avoided society of every sort, more especially 
of late. But now all at once he felt a desire to be with other 
people. Something new seemed to be taking place within 
him, and with it he felt a sort of thirst for company. He was 
so weary after a whole month of concentrated wretchedness 
and gloomy excitement that he longed to rest, if only for a 
moment, in some other world, whatever it might be; and, in 
spite of the filthiness of the surroundings, he was glad now 
to stay in the tavern.
The master of the establishment was in another room, but 
he frequently came down some steps into the main room, 
his jaunty, tarred boots with red turn-over tops coming 
into view each time before the rest of his person. He wore 
a full coat and a horribly greasy black satin waistcoat, with 
no cravat, and his whole face seemed smeared with oil like 
an iron lock. At the counter stood a boy of about fourteen, 
and there wa

In [166]:
# Initialize an empty dictionary to store the parts_and_chapters
parts_and_chapters = {}

# Initialize an empty list to store the current chapter pages
chapter_pages = []

# Initialize a variable to store the current part name
part_name = None

# Initialize a variable to store the current chapter name
chapter_name = None

for page in preprocessed_pages[:-24]:
    # Get the page content as a string
    page_content = page.page_content
    # Check if the page content starts with "Part"
    if page_content.startswith("Part"):
        # If there is a previous part name and chapter name, add them and their pages to the output
        if part_name and chapter_name:
            parts_and_chapters[part_name].append([chapter_name, chapter_pages])
        
        # Update the current part name with the page content
        part_name = page_content
        
        # Reset the chapter name and the chapter pages list
        chapter_name = None
        chapter_pages = []
        
        # Create a new key for the current part name in the parts_and_chapters dictionary
        parts_and_chapters[part_name] = []
    
    # Check if the page content starts with "Chapter"
    elif page_content.startswith("Chapter"):
        # If there is a previous chapter name, add it and its pages to the output
        if chapter_name:
            parts_and_chapters[part_name].append([chapter_name, chapter_pages])
        
        # Update the current chapter name with the page content
        chapter_name = page_content
        
        # Reset the chapter pages list
        chapter_pages = []
    
    # Otherwise, the page content is part of the current chapter
    else:
        # Add the page content to the chapter pages list
        chapter_pages.append(page_content)

# Add the last part name and chapter name and their pages to the output
if part_name and chapter_name:
    parts_and_chapters[part_name].append([chapter_name, chapter_pages])

In [167]:
for part, chapters in parts_and_chapters.items():
    # Loop over the chapters list
    for i, chapter in enumerate(chapters):
        # Get the chapter name and pages
        chapter_name = chapter[0]
        chapter_pages = chapter[1]
        # Format the chapter name and pages as a single string
        chapter_string = chapter_name + " ".join(chapter_pages)
        # Assign the chapter string back to the output
        parts_and_chapters[part][i] = chapter_string

**1.** This Python code processes a list of preprocessed pages to organize and extract parts and chapters, creating the structured dictionary `parts_and_chapters` that associates each part with its respective chapters and their corresponding pages.

**2.** Initialization involves an empty dictionary (`parts_and_chapters`), an empty list (`chapter_pages`), and variables (`part_name` and `chapter_name`) to track the current part and chapter names.

**3.** The main loop iterates through preprocessed pages, excluding the last 24 pages, which represent the epilogue of the book. It identifies parts and chapters based on keywords ("Part" or "Chapter") at the beginning of each page content.

**4.** If the page content starts with "Part," the current part is updated, and associated chapters are reset. A new key is created in the output dictionary for the current part.

**5.** If the page content starts with "Chapter," the current chapter is updated, and the chapter pages list is reset.

**6.** Pages not indicating a part or chapter are considered part of the current chapter and added to the `chapter_pages` list.

**7.** At each part or chapter change, the current information is added to the output dictionary.

**8.** The last part, chapter, and associated pages are added to the output.

The resulting `parts_and_chapters` dictionary provides a structured representation of the book's content, facilitating a more organized and accessible layout for further analysis or presentation in a notebook.



In [168]:
# Analyzing the `parts_and_chapters` Dictionary
parts_and_chapters.keys()

dict_keys(['Part I', 'Part II', 'Part III', 'Part IV', 'Part V', 'Part VI'])

In [169]:
# Example: Retrieving and analyzing Part I
Part1 = parts_and_chapters['Part I']
print(f"Part I of the book has {len(Part1)} chapters")

Part I of the book has 7 chapters


In [173]:
# First chapter of last part of the book
print(parts_and_chapters['Part VI'][0])

Chapter I
A strange period began for Raskolnikov: it was as though 
a fog had fallen upon him and wrapped him in a dreary 
solitude from which there was no escape. Recalling that pe -
riod long after, he believed that his mind had been clouded 
at times, and that it had continued so, with intervals, till 
the final catastrophe. He was convinced that he had been 
mistaken about many things at that time, for instance as to 
the date of certain events. Anyway, when he tried later on to 
piece his recollections together, he learnt a great deal about 
himself from what other people told him. He had mixed up 
incidents and had explained events as due to circumstanc -
es which existed only in his imagination. At times he was 
a prey to agonies of morbid uneasiness, amounting some -
times to panic. But he remembered, too, moments, hours, 
perhaps whole days, of complete apathy, which came upon 
him as a reaction from his previous terror and might be 
compared with the abnormal insensibility, s

### Summarization

In [176]:
import openai
from dotenv import load_dotenv
import os
import re

load_dotenv()

True

In [177]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chat_models import ChatAnthropic
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain

In [178]:
api_key = os.getenv("API_KEY")

__Translators Preface Summary__

In [179]:
# First 3 pages are the translators preface
translators_preface = preprocessed_pages[0].page_content + preprocessed_pages[1].page_content + preprocessed_pages[2].page_content

In [180]:
print(translators_preface)

Translator’s Preface
A few words about Dostoevsky himself may help the Eng -
lish reader to understand his work.
Dostoevsky was the son of a doctor. His parents were 
very hard- working and deeply religious people, but so poor 
that they lived with their five children in only two rooms. 
The father and mother spent their evenings in reading aloud 
to their children, generally from books of a serious charac -
ter.
Though always sickly and delicate Dostoevsky came out 
third in the final examination of the Petersburg school of 
Engineering. There he had already begun his first work, 
‘Poor Folk.’
This story was published by the poet Nekrassov in his 
review and was received with acclamations. The shy, un -
known youth found himself instantly something of a 
celebrity. A brilliant and successful career seemed to open 
before him, but those hopes were soon dashed. In 1849 he 
was arrested.
Though neither by temperament nor conviction a revolu -
tionist, Dostoevsky was one of a little group

In [57]:
# Define LLM
llm = ChatAnthropic(model_name="claude-2", temperature=0)
# llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0)

In [58]:
num_tokens = llm.get_num_tokens(translators_preface)

print (f"This translators_preface has {num_tokens} tokens in it")

This translators_preface has 1082 tokens in it


In [59]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=1200, chunk_overlap=100)
translators_preface_docs = text_splitter.create_documents([translators_preface])

In [60]:
num_docs = len(translators_preface_docs)

num_tokens_first_doc = llm.get_num_tokens(translators_preface_docs[0].page_content)

print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 4 documents and the first one has 281 tokens


In [61]:
# Define prompt
prompt_template = """
        Generate a comprehensive and concise summary covering all crucial elements from the following 
        Translator's Preface text delimited by triple backquotes. Return your response which covers the
        key points of the text.
        
        ```{text}```
        
        Comprehensive SUMMARY:
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

In [59]:
# Define StuffDocumentsChain
stuff_chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)

In [60]:
summarized_translators_preface = stuff_chain.run(translators_preface_docs)

In [61]:
import textwrap

In [62]:
summarized_translators_preface = textwrap.fill(summarized_translators_preface, 
                             width=100,
                             break_long_words=False,
                             replace_whitespace=False)

In [63]:
print(summarized_translators_preface)

This Translator's Preface provides a brief overview of the life and experiences of Fyodor
Dostoevsky, shedding light on the influences that shaped his work. Dostoevsky came from a poor but
deeply religious family and showed promise as a writer from a young age. However, his hopes for a
successful career were dashed when he was arrested in 1849 for his association with a group of young
men involved in revolutionary activities. He was initially sentenced to death but had his sentence
commuted to hard labor. The intense suffering he endured during this time left a lasting impact on
his writing, as he frequently explored themes of suffering and cruelty. Dostoevsky also suffered
from epilepsy, which further contributed to his experiences of hardship. Despite facing numerous
challenges, he continued to write and gained recognition as a prominent Russian writer. He died in
1881, leaving behind a legacy as one of Russia's most widely read authors.


### Part 1 

In [67]:
parts_and_chapters.keys()

dict_keys(['Part I', 'Part II', 'Part III', 'Part IV', 'Part V', 'Part VI'])

In [181]:
Part1 = parts_and_chapters['Part I']
print(f"Part 1 has {len(Part1)} chapters")

Part 1 has 7 chapters


In [None]:
part1 = "".join(Part1)
print(part1)

In [70]:
num_tokens = llm.get_num_tokens(part1)

print (f"The part1 of book has {num_tokens} tokens in it")

The part1 of book has 48940 tokens in it


In [71]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500)
part1_docs = text_splitter.create_documents([part1])

In [72]:
num_docs = len(part1_docs)

num_tokens_first_doc = llm.get_num_tokens(part1_docs[0].page_content)
print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 21 documents and the first one has 2443 tokens


In [70]:
# Define prompt
prompt_template = """
        Given a text delimited by triple backquotes that contains part1 of the crime and punishment book, which is divided 
        into multiple chapters, generate a summary of part1 that covers all the key points and aspects of the story. 
        The summary should be written in a clear and engaging way.

        ```{text}```

        Summary of Part1:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = (
    "Your job is to produce a final summary\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary"
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
    "If the context isn't useful, return the original summary."
)
refine_prompt = PromptTemplate.from_template(refine_template)

In [71]:
refine_chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    input_key="input_documents",
    output_key="output_text",
)

In [None]:
refine_outputs = refine_chain({"input_documents": part1_docs})

In [92]:
print(refine_outputs["output_text"])

### Part 2

In [73]:
parts_and_chapters.keys()

dict_keys(['Part I', 'Part II', 'Part III', 'Part IV', 'Part V', 'Part VI'])

In [182]:
Part2 = parts_and_chapters['Part II']
print(f"Part 2 has {len(Part2)} chapters")

Part 2 has 7 chapters


In [None]:
part2 = "".join(Part2)
print(part2)

In [77]:
num_tokens = llm.get_num_tokens(part2)

print (f"The part2 of book has {num_tokens} tokens in it")

The part2 of book has 56958 tokens in it


In [79]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500)
part2_docs = text_splitter.create_documents([part2])

In [80]:
num_docs = len(part2_docs)

num_tokens_first_doc = llm.get_num_tokens(part2_docs[0].page_content)
print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 23 documents and the first one has 2537 tokens


In [81]:
# Define prompt
prompt_template = """
        Given a text delimited by triple backquotes that contains part2 of the crime and punishment book, which is divided 
        into multiple chapters, generate a summary of part2 that covers all the key points and aspects of the story. 
        The summary should be written in a clear and engaging way.

        ```{text}```

        Summary of Part1:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = (
    "Your job is to produce a final summary\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary"
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
    "If the context isn't useful, return the original summary."
)
refine_prompt = PromptTemplate.from_template(refine_template)

In [82]:
refine_chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    input_key="input_documents",
    output_key="output_text",
)

In [None]:
refine_outputs = refine_chain({"input_documents": part2_docs})
print(refine_outputs["output_text"])

### Part 3

In [83]:
parts_and_chapters.keys()

dict_keys(['Part I', 'Part II', 'Part III', 'Part IV', 'Part V', 'Part VI'])

In [183]:
Part3 = parts_and_chapters['Part III']
print(f"Part 3 has {len(Part3)} chapters")

Part 3 has 6 chapters


In [None]:
part3 = "".join(Part3)
print(part3)

In [92]:
num_tokens = llm.get_num_tokens(part3)
print (f"The part3 of book has {num_tokens} tokens in it")

The part3 of book has 44944 tokens in it


In [93]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500)
part3_docs = text_splitter.create_documents([part3])

In [94]:
num_docs = len(part3_docs)

num_tokens_first_doc = llm.get_num_tokens(part3_docs[0].page_content)
print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 19 documents and the first one has 2720 tokens


In [95]:
# Define prompt
prompt_template = """
        Given a text delimited by triple backquotes that contains part1 of the crime and punishment book, which is divided 
        into multiple chapters, generate a summary of part1 that covers all the key points and aspects of the story. 
        The summary should be written in a clear and engaging way.

        ```{text}```

        Summary of Part1:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = (
    "Your job is to produce a final summary\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary"
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
    "If the context isn't useful, return the original summary."
)
refine_prompt = PromptTemplate.from_template(refine_template)

In [96]:
refine_chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    input_key="input_documents",
    output_key="output_text",
)

In [None]:
refine_outputs = refine_chain({"input_documents": part3_docs})
print(refine_outputs["output_text"])

### Part 4

In [97]:
parts_and_chapters.keys()

dict_keys(['Part I', 'Part II', 'Part III', 'Part IV', 'Part V', 'Part VI'])

In [184]:
Part4 = parts_and_chapters['Part IV']
print(f"Part 4 has {len(Part4)} chapters")

Part 4 has 6 chapters


In [None]:
part4 = "".join(Part4)
print(part4)

In [102]:
num_tokens = llm.get_num_tokens(part4)

print (f"The part4 of book has {num_tokens} tokens in it")

The part4 of book has 41729 tokens in it


In [103]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500)
part4_docs = text_splitter.create_documents([part4])

In [104]:
num_docs = len(part4_docs)

num_tokens_first_doc = llm.get_num_tokens(part4_docs[0].page_content)
print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 17 documents and the first one has 2587 tokens


In [107]:
# Define prompt
prompt_template = """
        Given a text delimited by triple backquotes that contains part1 of the crime and punishment book, which is divided 
        into multiple chapters, generate a summary of part1 that covers all the key points and aspects of the story. 
        The summary should be written in a clear and engaging way.

        ```{text}```

        Summary of Part1:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = (
    "Your job is to produce a final summary\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary"
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
    "If the context isn't useful, return the original summary."
)
refine_prompt = PromptTemplate.from_template(refine_template)

In [106]:
refine_chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    input_key="input_documents",
    output_key="output_text",
)

In [None]:
refine_outputs = refine_chain({"input_documents": part4_docs})
print(refine_outputs["output_text"])

### Part 5

In [108]:
parts_and_chapters.keys()

dict_keys(['Part I', 'Part II', 'Part III', 'Part IV', 'Part V', 'Part VI'])

In [185]:
Part5 = parts_and_chapters['Part V']
print(f"Part 5 has {len(Part5)} chapters")

Part 5 has 5 chapters


In [None]:
part5 = "".join(Part5)
print(part5)

In [111]:
num_tokens = llm.get_num_tokens(part5)

print (f"The part5 of book has {num_tokens} tokens in it")

The part5 of book has 41495 tokens in it


In [112]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500)
part5_docs = text_splitter.create_documents([part5])

In [113]:
num_docs = len(part5_docs)

num_tokens_first_doc = llm.get_num_tokens(part5_docs[0].page_content)
print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 17 documents and the first one has 2428 tokens


In [114]:
# Define prompt
prompt_template = """
        Given a text delimited by triple backquotes that contains part1 of the crime and punishment book, which is divided 
        into multiple chapters, generate a summary of part1 that covers all the key points and aspects of the story. 
        The summary should be written in a clear and engaging way.

        ```{text}```

        Summary of Part1:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = (
    "Your job is to produce a final summary\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary"
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
    "If the context isn't useful, return the original summary."
)
refine_prompt = PromptTemplate.from_template(refine_template)

In [115]:
refine_chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    input_key="input_documents",
    output_key="output_text",
)

In [None]:
refine_outputs = refine_chain({"input_documents": part5_docs})
print(refine_outputs["output_text"])

### Part 6

In [116]:
parts_and_chapters.keys()

dict_keys(['Part I', 'Part II', 'Part III', 'Part IV', 'Part V', 'Part VI'])

In [186]:
Part6 = parts_and_chapters['Part VI']
print(f"Part 6 has {len(Part6)} chapters")

Part 6 has 8 chapters


In [None]:
part6 = "".join(Part6)
print(part6)

In [119]:
num_tokens = llm.get_num_tokens(part6)

print (f"The part6 of book has {num_tokens} tokens in it")

The part6 of book has 51935 tokens in it


In [120]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500)
part6_docs = text_splitter.create_documents([part6])

In [121]:
num_docs = len(part6_docs)

num_tokens_first_doc = llm.get_num_tokens(part6_docs[0].page_content)
print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 22 documents and the first one has 2496 tokens


In [122]:
# Define prompt
prompt_template = """
        Given a text delimited by triple backquotes that contains part1 of the crime and punishment book, which is divided 
        into multiple chapters, generate a summary of part1 that covers all the key points and aspects of the story. 
        The summary should be written in a clear and engaging way.

        ```{text}```

        Summary of Part1:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = (
    "Your job is to produce a final summary\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary"
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
    "If the context isn't useful, return the original summary."
)
refine_prompt = PromptTemplate.from_template(refine_template)

In [123]:
refine_chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    input_key="input_documents",
    output_key="output_text",
)

In [None]:
refine_outputs = refine_chain({"input_documents": part6_docs})
print(refine_outputs["output_text"])

### Epilogue

In [None]:
# The epilogue starts from the last 24 pages of the book to the end.
epilogue_content = preprocessed_pages[-24:]
epilogue_content = [page.page_content for page in epilogue_content]
epilogue = ''.join(epilogue_content)
print(epilogue)

In [127]:
num_tokens = llm.get_num_tokens(epilogue)

print(f"This epilogue has {num_tokens} tokens in it")

This epilogue has 8485 tokens in it


In [128]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500)
epilogue_docs = text_splitter.create_documents([epilogue])

In [129]:
num_docs = len(epilogue_docs)

num_tokens_first_doc = llm.get_num_tokens(epilogue_docs[0].page_content)
print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 4 documents and the first one has 2320 tokens


In [130]:
# Define prompt
prompt_template = """
        Given a text delimited by triple backquotes that contains part1 of the crime and punishment book, which is divided 
        into multiple chapters, generate a summary of part1 that covers all the key points and aspects of the story. 
        The summary should be written in a clear and engaging way.

        ```{text}```

        Summary of Part1:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

refine_template = (
    "Your job is to produce a final summary\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary"
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
    "If the context isn't useful, return the original summary."
)
refine_prompt = PromptTemplate.from_template(refine_template)

In [131]:
refine_chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    input_key="input_documents",
    output_key="output_text",
)

In [None]:
refine_outputs = refine_chain({"input_documents": epilogue_docs})
print(refine_outputs["output_text"])