In [2]:
# Install PyTorch
!pip install torch



In [3]:
# Install transformers
!pip install transformers



In [4]:
# Install sentencepiece 
!pip install sentencepiece



In [19]:
!pip install PyMuPDF



In [6]:
!pip install tqdm



In [11]:
# Importing dependencies from transformers
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [12]:
# Load tokenizer 
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

In [13]:
# Load model 
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
from transformers import pipeline
from bs4 import BeautifulSoup
import requests

In [2]:
summarizer = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.





In [15]:
# URL = "https://pmc.ncbi.nlm.nih.gov/articles/PMC6696419/"
URL = "https://hackernoon.com/will-the-game-stop-with-gamestop-or-is-this-just-the-beginning-2j1x32aa"

r = requests.get(URL)
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all(['h1', 'p'])
text = [result.text for result in results]
ARTICLE = ' '.join(text)
ARTICLE

'Will The Game Stop with Gamestop Or Is This Just The Beginning? The GameStop squeeze on short-sellers is an extraordinary event in markets, where at face value, retail traders and investors have worked together in an attempt to put some of the largest wall street institutions out of business. The events can be interpreted with many viable lenses and there are ironies baked in that are pure serendipity. There has been a centrally controlled game in the global financial system in which insiders benefited while outsiders got hurt that comes to a head with a company called GameStop. The broking firm of most of the retail side of this warfare ‘RobinHood’ is literally stealing from its poor, retail investors to give to its rich, capital backers. One of the historical realities of this game has been that macro-investing – the sages of not only portfolio management, but often also sophisticated social and cultural figures – have had a hard time making money in markets now for decades. With go

In [3]:
import fitz

# Function to read and extract text from a large PDF file
def read_pdf(file_path, chunk_size=1000):
    """
    Reads a PDF file in chunks and extracts the text.
    
    :param file_path: str, path to the PDF file
    :param chunk_size: int, the number of pages to read at a time (for memory efficiency)
    :return: str, concatenated text from the PDF
    """
    # Open the PDF
    pdf_document = fitz.open(file_path)
    total_pages = pdf_document.page_count
    all_text = []

    # Loop through the PDF in chunks to minimize memory usage
    for start_page in range(0, total_pages, chunk_size):
        end_page = min(start_page + chunk_size, total_pages)
        
        # Extract text from the current chunk of pages
        chunk_text = ""
        for page_num in range(start_page, end_page):
            page = pdf_document.load_page(page_num)
            chunk_text += page.get_text("text")
        
        all_text.append(chunk_text)

        # Optionally, print progress
        print(f"Processed pages {start_page + 1} to {end_page}...")

    pdf_document.close()

    # Return concatenated text
    return "\n".join(all_text)

# Example usage
file_path = "harry-potter-sorcerers-stone.pdf"
pdf_text = read_pdf(file_path)

# Optionally, save the extracted text to a file
with open("output.txt", "w", encoding="utf-8") as f:
    f.write(pdf_text)

# Print a small preview of the extracted text
print("Preview of extracted text:\n", pdf_text)


Processed pages 1 to 221...
Preview of extracted text:
 HP 1 - Harry Potter and the
Sorcerer's Stone
Harry Potter and the Sorcerer's Stone
 
 
Harry Potter
&
The Sorcerer’s Stone
 
 
by J.K. Rowling
 
 
 
 
 
HP 1 - Harry Potter and the
Sorcerer's Stone
CHAPTER ONE
 
THE BOY WHO LIVED
 
      M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last people
you’d expect to be involved in anything strange or mysterious, because they just
didn’t hold with such nonsense.
      Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did have a
very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the
usual amount of neck, which came in very useful as she spent so much of her
time craning over garden fences, spying on the neighbors. The Dursleys had a
small son called Dudley and in their opinion there was no f

In [4]:
max_chunk = 500
pdf_text = pdf_text.replace('.', '.<eos>')
pdf_text = pdf_text.replace('?', '?<eos>')
pdf_text = pdf_text.replace('!', '!<eos>')
sentences = pdf_text.split('<eos>')
current_chunk = 0 
chunks = []
for sentence in sentences:
    if len(chunks) == current_chunk + 1: 
        if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
            chunks[current_chunk].extend(sentence.split(' '))
        else:
            current_chunk += 1
            chunks.append(sentence.split(' '))
    else:
        print(current_chunk)
        chunks.append(sentence.split(' '))

for chunk_id in range(len(chunks)):
    chunks[chunk_id] = ' '.join(chunks[chunk_id])
0
len(chunks)

0


194

In [5]:
### res = summarizer(chunks, max_length=120, min_length=30, do_sample=False)
# res[0]



from tqdm import tqdm  # Import tqdm for the progress bar

# Function to process and summarize text chunks with a progress bar
def summarize_with_progress_bar(chunks, summarizer, max_length=120, min_length=30):
    """
    Summarizes a list of text chunks with a progress bar.
    
    :param chunks: List of text chunks to summarize
    :param summarizer: Summarizer model (e.g., Hugging Face pipeline)
    :param max_length: Maximum length of summary
    :param min_length: Minimum length of summary
    :return: List of summaries
    """
    summaries = []
    
    # Use tqdm to show a progress bar while processing chunks
    for chunk in tqdm(chunks, desc="Summarizing", unit="chunk"):
        summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=False)
        summaries.append(summary[0]['summary_text'])
    
    return summaries

# Example usage
# Assuming `summarizer` is already initialized (e.g., using Hugging Face's pipeline)
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Summarize with progress bar
summaries = summarize_with_progress_bar(chunks, summarizer)

text = ' '.join([summ['summary_text'] for summ in summaries])

# Print the summary
print("Summary:", text)


Summarizing: 100%|██████████| 194/194 [48:50<00:00, 15.10s/chunk] 


TypeError: string indices must be integers

In [9]:
text = ' '.join([summ for summ in summaries])
text
# print(summaries)



In [10]:
with open('booksummary.txt', 'w') as f:
    f.write(text)

UnicodeEncodeError: 'charmap' codec can't encode character '\ufffd' in position 1849: character maps to <undefined>

In [17]:
# Create tokens - number representation of our text
tokens = tokenizer(pdf_text, truncation=True, padding="longest", return_tensors="pt")
# Input tokens
tokens

{'input_ids': tensor([[ 4100,   305,   233,  5849, 10173,   111,   109, 79643,   131,   116,
          4040,  5849, 10173,   111,   109, 79643,   131,   116,  4040,  5849,
         10173,   259,   139, 79643,   123,   116,  4040,   141,   907,   107,
          1028,   107, 42728,  4100,   305,   233,  5849, 10173,   111,   109,
         79643,   131,   116,  4040,   110, 77196,  9459,  1835,   110, 52921,
         16101, 16387,   470,   627,  6114,   107,   111,  5390,   107, 18694,
         14364,   108,   113,   344,   541,   108, 20204,   261,   144,  4010,
           108,   195,  2038,   112,   416,   120,   157,   195,  2475,  1644,
           108,  2041,   119,   221,   249,   107,   322,   195,   109,   289,
           200,   119,   123,   252,  1337,   112,   129,  1065,   115,   742,
          4768,   132,  8287,   108,   262,   157,   188,   595,   123,   144,
          1137,   122,   253, 19120,   107,  1263,   107, 18694, 14364,   140,
           109,  1758,   113,   114,  

In [18]:
# Summarize 
summary = model.generate(**tokens)
# Output summary tokens
summary[0]
# Decode summary
tokenizer.decode(summary[0])

'<pad>There was nothing strange or mysterious about the Dursleys.</s>'