### Task-1 Summarization of 'Crime and Punishment' Book

In [52]:
#Necessary Imports
import os
import nltk,string,re
import openai
from fpdf import FPDF
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter
from langchain.chains.llm import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain_openai import ChatOpenAI

In [53]:
load_dotenv()

True

### Preprocess The Text

In [54]:
# Download NLTK punkt package
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shaheer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [55]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove the non-text elements (URLs, email addresses)
    text = re.sub(r'\S*@\S*\s?', '', text)
    text = re.sub(r'http\S+', '', text)
    # Remove the punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Fix common OCR errors
    text = re.sub(r'\x18', '', text)
    text = re.sub(r'\n', ' ', text)
    return text

### Loading the Book

In [56]:
#This function will extract the text from the book
def extract_text_from_book(file_path):
    text = ""
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    for page in pages:
        page.page_content = preprocess_text(page.page_content)
    return pages  ##Preprocessed text will be returned

In [57]:
file_path= r'C:\Users\Shaheer\Desktop\Cogent-Labs-Assessment\Task-1\crime-and-punishment.pdf'  #file path of the book
book_text = extract_text_from_book(file_path)

In [77]:
book_text[:5] 

[Document(metadata={'source': 'C:\\Users\\Shaheer\\Desktop\\Cogent-Labs-Assessment\\Task-1\\crime-and-punishment.pdf', 'page': 0}, page_content='download free ebooks of classic literature books and  novels at planet ebook subscribe to our free ebooks blog  and email newslettercrime and punishment by fyodor dostoevsky '),
 Document(metadata={'source': 'C:\\Users\\Shaheer\\Desktop\\Cogent-Labs-Assessment\\Task-1\\crime-and-punishment.pdf', 'page': 1}, page_content='crime and punishment translator’s preface a few words about dostoevsky himself may help the eng  lish reader to understand his work dostoevsky was the son of a doctor his parents were  very hard working and deeply religious people but so poor  that they lived with their five children in only two rooms  the father and mother spent their evenings in reading aloud  to their children generally from books of a serious charac  ter though always sickly and delicate dostoevsky came out  third in the final examination of the petersburg

In [59]:
print('total documents created from book:',len(book_text)) 

total documents created from book: 767


### Converting the Documents into Chunks

In [60]:
# RecursiveCharacterTextSplitter is used as it breaks long text into coherent, manageable chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=4096,
    chunk_overlap=500,
    length_function=len
)

In [61]:
# Split the text into chunks
chunks = splitter.split_documents(book_text)

In [76]:
#Printing the chunks
chunks[:5]

[Document(metadata={'source': 'C:\\Users\\Shaheer\\Desktop\\Cogent-Labs-Assessment\\Task-1\\crime-and-punishment.pdf', 'page': 0}, page_content='download free ebooks of classic literature books and  novels at planet ebook subscribe to our free ebooks blog  and email newslettercrime and punishment by fyodor dostoevsky'),
 Document(metadata={'source': 'C:\\Users\\Shaheer\\Desktop\\Cogent-Labs-Assessment\\Task-1\\crime-and-punishment.pdf', 'page': 1}, page_content='crime and punishment translator’s preface a few words about dostoevsky himself may help the eng  lish reader to understand his work dostoevsky was the son of a doctor his parents were  very hard working and deeply religious people but so poor  that they lived with their five children in only two rooms  the father and mother spent their evenings in reading aloud  to their children generally from books of a serious charac  ter though always sickly and delicate dostoevsky came out  third in the final examination of the petersburg 

### Map Reduce approach to Summarize the Book

In [68]:
#LLM
llm = ChatOpenAI(temperature=0)
# Map
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the main themes 
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [69]:
# Reduce
reduce_template = """The following is a set of summaries extracted from each chapter of the book :
{docs}
Please distill these into a comprehensive final summary, ensuring that all the main themes are interconnected and logically organized. The summary should include:

1. A brief overview of each chapter, highlighting the main ideas, key points, and significant arguments.
2. Identification and explanation of the primary themes and concepts discussed throughout the book, with examples and explanations.
3. A summary of the author's perspective and point of view, including main arguments and unique insights.
4. A critical analysis of the book's content, discussing strengths and weaknesses of the arguments presented.
5. Key takeaways and reflections on the overall message and relevance to the reader.

Present the summary as a detailed narrative, highlighting the key points in a cohesive manner. Aim for a summary length of 6-7 pages.

Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

In [70]:
# Run chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

In [71]:
# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

In [72]:
result = map_reduce_chain.invoke(chunks)

In [None]:
print(result["output_text"])

In the interconnected world of the provided documents, a rich tapestry of themes emerges, weaving together the complexities of human relationships, the consequences of actions, and the depths of emotional turmoil. At the heart of this narrative lies the theme of crime and punishment, where characters grapple with the weight of their actions and the repercussions that follow. This theme is intertwined with the exploration of guilt, remorse, and redemption, as individuals navigate the moral dilemmas that arise from their choices.

The narrative begins with a focus on family relationships and love, as seen in the heartfelt letter from Pulcheria Raskolnikov to her son Rodya. The deep love and concern expressed in the letter highlight the strong bond between family members, setting the stage for the exploration of familial dynamics and conflicts that unfold throughout the text. Pulcheria Raskolnikov's mention of praying and faith in the mercy of their creator introduces themes of faith and 

### Converting the Summary into PDF

In [74]:
# The generated book summary
summary = result["output_text"]
book_title = "Crime and Punishment"
author_name = "Fyodor Dostoevsky"

# Create instance of FPDF class
pdf = FPDF()

# Function to draw borders on the entire page
def draw_borders():
    pdf.set_line_width(0.5)
    pdf.rect(10, 10, 190, 277)  # Draw a rectangle (borders) from (10,10) to (200,287)

# Add a page and draw borders
pdf.add_page()
draw_borders()

# Set title font and size
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, 'Book Summary', 0, 1, 'C')

# Add a line break
pdf.ln(10)

# Add book title
pdf.set_font('Arial', 'B', 14)
pdf.cell(0, 10, f'Title: {book_title}', 0, 1, 'L')

# Add author name
pdf.set_font('Arial', 'B', 14)
pdf.cell(0, 10, f'Author: {author_name}', 0, 1, 'L')

# Add another line break
pdf.ln(10)

# Set summary content font and size
pdf.set_font('Arial', '', 13)

# Split the summary into chunks that fit into one page
lines = pdf.multi_cell(0, 10, summary, border=0, split_only=True)

# Add the lines to the PDF
for i, line in enumerate(lines):
    if pdf.get_y() > 265:  # Check if the page end is reached
        pdf.add_page()
        draw_borders()
        pdf.set_y(20)  # Reset the y position
    pdf.multi_cell(0, 10, line, border=0)

# Add footer to the last page
pdf.set_y(-15)
pdf.set_font('Arial', 'I', 8)
pdf.cell(0, 10, f'Page {pdf.page_no()}', 0, 0, 'C')

# Output the PDF to a file
output_pdf_path = "Book-Summary-Crime-and-Punishment.pdf"
pdf.output(output_pdf_path)

''