In [None]:
# Use the %pip magic so packages are installed into the running kernel's
# environment, and pin transformers to the version this notebook was run with
# so a fresh "Restart & Run All" reproduces the recorded results.
%pip install transformers==4.46.2
%pip install torch

Collecting transformers
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.46.2-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-

In [None]:
# %pip targets the running kernel's environment; the version is pinned to the
# one recorded in this cell's output for reproducibility.
%pip install rouge_score==0.1.2

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=6f06253cd5f1085395516861ecc385245a722b6dc7522020910c2d68a1c163c1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from transformers import pipeline
from typing import Optional, List
import requests
import re

class TextSummarizer:
    """Download, clean, chunk and summarize a Project Gutenberg text.

    Summaries are produced by a Hugging Face ``summarization`` pipeline that
    is created in ``__init__``; the remaining methods are plain string
    utilities that do not touch the model.
    """

    # Seconds to wait on the HTTP request in fetch_text() before giving up, so
    # a stalled connection cannot hang the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    # A sentence boundary is whitespace that directly follows '.', '!' or '?'.
    # Splitting on the whitespace keeps the original punctuation attached.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    def __init__(self, model_name: str = "facebook/bart-large-cnn"):
        """Initialize the model pipeline.

        Args:
            model_name: Model identifier handed to ``pipeline``.
        """
        print(f"Initializing {model_name} model...")
        self.summarizer = pipeline(
            "summarization",
            model=model_name,
            device=-1  # Use CPU
        )

    def fetch_text(self, url: str) -> Optional[str]:
        """Fetch text from URL.

        Args:
            url: Address of the document to download.

        Returns:
            The response body as text, or ``None`` if the request failed
            (network error, timeout or non-success HTTP status).
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching text: {e}")
            return None

    def preprocess_text(self, text: str) -> str:
        """Clean and preprocess the text.

        Strips the Project Gutenberg header/footer (when the markers are
        present) and collapses every run of whitespace into a single space.

        Args:
            text: Raw document text.

        Returns:
            The cleaned text with no leading or trailing whitespace.
        """
        start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
        end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

        if start_marker in text:
            text = text.split(start_marker, 1)[1]
            # The marker line continues with "<TITLE> ***"; drop the rest of
            # that line so the residue does not leak into the first chunk.
            _, newline, remainder = text.partition('\n')
            if newline:
                text = remainder
        if end_marker in text:
            text = text.split(end_marker, 1)[0]

        # '\s+' also matches '\r\n', so one substitution normalizes everything.
        return re.sub(r'\s+', ' ', text).strip()

    def split_into_chunks(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into chunks, trying to break at sentence boundaries.

        Sentences keep their original terminating punctuation. A chunk never
        exceeds ``chunk_size`` characters (joining spaces included) unless a
        single sentence is itself longer than ``chunk_size``; such a sentence
        becomes a chunk of its own.

        Args:
            text: Text to split.
            chunk_size: Target maximum number of characters per chunk.

        Returns:
            The chunks in document order; an empty list for blank input.
        """
        sentences = [s for s in self._SENTENCE_BOUNDARY.split(text.strip()) if s]

        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            # Count the space that ' '.join() inserts before this sentence.
            added_length = len(sentence) + (1 if current_chunk else 0)

            if current_chunk and current_length + added_length > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = len(sentence)
            else:
                current_chunk.append(sentence)
                current_length += added_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def generate_summary(self, text: str, max_length: int = 150, min_length: int = 50) -> str:
        """Generate summary for a chunk of text.

        Args:
            text: Text to summarize.
            max_length: Upper bound forwarded to the pipeline.
            min_length: Lower bound forwarded to the pipeline.

        Returns:
            The summary text, or an empty string when the input is blank or
            the pipeline raised an error (the error is printed).
        """
        if not text or not text.strip():
            return ""

        try:
            summary = self.summarizer(
                text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                # Guard against a single over-long sentence exceeding the
                # model's maximum input length.
                truncation=True
            )
            return summary[0]['summary_text']
        except Exception as e:
            print(f"Error generating summary: {e}")
            return ""

def main(
    url: str = "https://www.gutenberg.org/cache/epub/345/pg345.txt",
    max_chunks: int = 5,
    output_path: str = 'dracula_summary.txt',
):
    """Summarize the first chunks of Dracula and save the results to a file.

    Args:
        url: Location of the plain-text book to download.
        max_chunks: Maximum number of leading chunks to summarize.
        output_path: File the summaries are written to (UTF-8).
    """
    # Initialize summarizer
    summarizer = TextSummarizer()

    # Fetch and preprocess text
    print("Fetching Dracula text...")
    text = summarizer.fetch_text(url)

    if not text:
        print("Failed to fetch text. Exiting.")
        return

    print("Preprocessing text...")
    processed_text = summarizer.preprocess_text(text)

    # Split into chunks and generate summaries
    print("\nGenerating summaries...")
    selected_chunks = summarizer.split_into_chunks(processed_text)[:max_chunks]
    # Report progress against the number of chunks actually processed, which
    # may be smaller than max_chunks for a short text.
    total = len(selected_chunks)
    summaries = []

    for i, chunk in enumerate(selected_chunks, 1):
        print(f"\nProcessing chunk {i}/{total}:")
        summary = summarizer.generate_summary(chunk)
        if summary:  # an empty string signals a failed summary; skip it
            summaries.append(summary)
            print(f"Generated summary: {summary}")

    # Save results
    print("\nSaving results...")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("Dracula - Chapter Summaries\n")
        f.write("=" * 80 + "\n\n")
        for i, summary in enumerate(summaries, 1):
            f.write(f"Chunk {i} Summary:\n")
            f.write("-" * 40 + "\n")
            f.write(summary + "\n\n")

    print(f"Done! Results saved to '{output_path}'")

# Entry point: run the summarization workflow when this code is executed
# directly (as it is when the notebook cell runs) rather than imported.
if __name__ == "__main__":
    main()

Initializing facebook/bart-large-cnn model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Fetching Dracula text...
Preprocessing text...

Generating summaries...

Processing chunk 1/5:
Generated summary: DRACULA by Bram Stoker is published by Country Life Press, Garden City, N.Y. The book includes the diary of Dr. Jonathan Harker and letters from Mina Murray to Lucy Westenra. Dr. Seward’s Diary is also included.

Processing chunk 2/5:
Generated summary: Dr. Seward’s Diary, spoken by Van Helsing. Mina Harker's Journal. All needless matters have been eliminated. There is throughout no statement of past things wherein memory may err. All the records chosen are exactly contemporary, given from the standpoints.

Processing chunk 3/5:
Generated summary: Buda-Pesth seems a wonderful place, from the glimpse which I got of it from the train and the little I could walk through the streets. The impression I had was that we were leaving the West and entering the East; the most western of splendid bridges over the Danube, which is here of noble width and depth.

Processing chunk 4/5:
Ge

In [None]:
from transformers import pipeline
from typing import Optional, List, Dict
import requests
import re
from rouge_score import rouge_scorer

class TextSummarizer:
    """Download, clean, chunk, summarize and ROUGE-score a Gutenberg text.

    Summaries are produced by a Hugging Face ``summarization`` pipeline and
    scored with a ``rouge_scorer.RougeScorer``; both are created in
    ``__init__``. The remaining methods are plain string utilities.
    """

    # Seconds to wait on the HTTP request in fetch_text() before giving up, so
    # a stalled connection cannot hang the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    # A sentence boundary is whitespace that directly follows '.', '!' or '?'.
    # Splitting on the whitespace keeps the original punctuation attached.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    def __init__(self, model_name: str = "facebook/bart-large-cnn"):
        """Initialize the model pipeline and ROUGE scorer.

        Args:
            model_name: Model identifier handed to ``pipeline``.
        """
        print(f"Initializing {model_name} model...")
        self.summarizer = pipeline(
            "summarization",
            model=model_name,
            device=-1  # Use CPU
        )
        self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def fetch_text(self, url: str) -> Optional[str]:
        """Fetch text from URL.

        Args:
            url: Address of the document to download.

        Returns:
            The response body as text, or ``None`` if the request failed
            (network error, timeout or non-success HTTP status).
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error fetching text: {e}")
            return None

    def preprocess_text(self, text: str) -> str:
        """Clean and preprocess the text.

        Strips the Project Gutenberg header/footer (when the markers are
        present) and collapses every run of whitespace into a single space.

        Args:
            text: Raw document text.

        Returns:
            The cleaned text with no leading or trailing whitespace.
        """
        start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
        end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

        if start_marker in text:
            text = text.split(start_marker, 1)[1]
            # The marker line continues with "<TITLE> ***"; drop the rest of
            # that line so the residue does not leak into the first chunk.
            _, newline, remainder = text.partition('\n')
            if newline:
                text = remainder
        if end_marker in text:
            text = text.split(end_marker, 1)[0]

        # '\s+' also matches '\r\n', so one substitution normalizes everything.
        return re.sub(r'\s+', ' ', text).strip()

    def split_into_chunks(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into chunks, trying to break at sentence boundaries.

        Sentences keep their original terminating punctuation. A chunk never
        exceeds ``chunk_size`` characters (joining spaces included) unless a
        single sentence is itself longer than ``chunk_size``; such a sentence
        becomes a chunk of its own.

        Args:
            text: Text to split.
            chunk_size: Target maximum number of characters per chunk.

        Returns:
            The chunks in document order; an empty list for blank input.
        """
        sentences = [s for s in self._SENTENCE_BOUNDARY.split(text.strip()) if s]

        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            # Count the space that ' '.join() inserts before this sentence.
            added_length = len(sentence) + (1 if current_chunk else 0)

            if current_chunk and current_length + added_length > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = len(sentence)
            else:
                current_chunk.append(sentence)
                current_length += added_length

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def generate_summary(self, text: str, max_length: int = 150, min_length: int = 50) -> str:
        """Generate summary for a chunk of text.

        Args:
            text: Text to summarize.
            max_length: Upper bound forwarded to the pipeline.
            min_length: Lower bound forwarded to the pipeline.

        Returns:
            The summary text, or an empty string when the input is blank or
            the pipeline raised an error (the error is printed).
        """
        if not text or not text.strip():
            return ""

        try:
            summary = self.summarizer(
                text,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                # Guard against a single over-long sentence exceeding the
                # model's maximum input length.
                truncation=True
            )
            return summary[0]['summary_text']
        except Exception as e:
            print(f"Error generating summary: {e}")
            return ""

    def calculate_rouge_scores(self, reference: str, summary: str) -> Dict[str, float]:
        """Calculate ROUGE scores for a summary against its reference text.

        Args:
            reference: Text the summary is compared against (passed to the
                scorer as its first argument).
            summary: Generated summary (passed as the second argument).

        Returns:
            A dict with keys ``'rouge1'``, ``'rouge2'`` and ``'rougeL'``, each
            holding the ``fmeasure`` of the corresponding score.
        """
        scores = self.scorer.score(reference, summary)
        return {metric: scores[metric].fmeasure for metric in ('rouge1', 'rouge2', 'rougeL')}

def main(
    url: str = "https://www.gutenberg.org/cache/epub/345/pg345.txt",
    max_chunks: int = 5,
    output_path: str = 'dracula_summary_with_rouge.txt',
):
    """Summarize the first chunks of Dracula, score them with ROUGE and save.

    Each summary is scored against the chunk it was generated from.

    Args:
        url: Location of the plain-text book to download.
        max_chunks: Maximum number of leading chunks to summarize.
        output_path: File the summaries and scores are written to (UTF-8).
    """
    # (result key, label used in the console and file output)
    metrics = (('rouge1', 'ROUGE-1'), ('rouge2', 'ROUGE-2'), ('rougeL', 'ROUGE-L'))

    # Initialize summarizer
    summarizer = TextSummarizer()

    # Fetch and preprocess text
    print("Fetching Dracula text...")
    text = summarizer.fetch_text(url)

    if not text:
        print("Failed to fetch text. Exiting.")
        return

    print("Preprocessing text...")
    processed_text = summarizer.preprocess_text(text)

    # Split into chunks and generate summaries
    print("\nGenerating summaries and calculating ROUGE scores...")
    selected_chunks = summarizer.split_into_chunks(processed_text)[:max_chunks]
    # Report progress against the number of chunks actually processed, which
    # may be smaller than max_chunks for a short text.
    total = len(selected_chunks)
    summaries = []
    rouge_scores = []

    for i, chunk in enumerate(selected_chunks, 1):
        print(f"\nProcessing chunk {i}/{total}:")
        summary = summarizer.generate_summary(chunk)
        if summary:  # an empty string signals a failed summary; skip it
            summaries.append(summary)
            # Calculate ROUGE scores
            scores = summarizer.calculate_rouge_scores(chunk, summary)
            rouge_scores.append(scores)
            print(f"Generated summary: {summary}")
            print("ROUGE Scores:")
            for key, label in metrics:
                print(f"{label}: {scores[key]:.4f}")

    # Save results
    print("\nSaving results...")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("Dracula - Chapter Summaries with ROUGE Scores\n")
        f.write("=" * 80 + "\n\n")
        for i, (summary, scores) in enumerate(zip(summaries, rouge_scores), 1):
            f.write(f"Chunk {i} Summary:\n")
            f.write("-" * 40 + "\n")
            f.write(summary + "\n\n")
            f.write("ROUGE Scores:\n")
            for key, label in metrics[:-1]:
                f.write(f"{label}: {scores[key]:.4f}\n")
            last_key, last_label = metrics[-1]
            f.write(f"{last_label}: {scores[last_key]:.4f}\n\n")

        # Calculate and write average ROUGE scores. When every summary failed
        # there is nothing to average; skip the section instead of dividing
        # by zero.
        if rouge_scores:
            f.write("\nAverage ROUGE Scores:\n")
            f.write("-" * 40 + "\n")
            for key, label in metrics:
                average = sum(score[key] for score in rouge_scores) / len(rouge_scores)
                f.write(f"Average {label}: {average:.4f}\n")
        else:
            print("No summaries were generated; average ROUGE scores omitted.")

    print(f"Done! Results saved to '{output_path}'")

# Entry point: run the summarization-and-scoring workflow when this code is
# executed directly (as it is when the notebook cell runs) rather than imported.
if __name__ == "__main__":
    main()

Initializing facebook/bart-large-cnn model...
Fetching Dracula text...
Preprocessing text...

Generating summaries and calculating ROUGE scores...

Processing chunk 1/5:
Generated summary: DRACULA by Bram Stoker is published by Country Life Press, Garden City, N.Y. The book includes the diary of Dr. Jonathan Harker and letters from Mina Murray to Lucy Westenra. Dr. Seward’s Diary is also included.
ROUGE Scores:
ROUGE-1: 0.3077
ROUGE-2: 0.1456
ROUGE-L: 0.2404

Processing chunk 2/5:
Generated summary: Dr. Seward’s Diary, spoken by Van Helsing. Mina Harker's Journal. All needless matters have been eliminated. There is throughout no statement of past things wherein memory may err. All the records chosen are exactly contemporary, given from the standpoints.
ROUGE Scores:
ROUGE-1: 0.3886
ROUGE-2: 0.3445
ROUGE-L: 0.3886

Processing chunk 3/5:
Generated summary: Buda-Pesth seems a wonderful place, from the glimpse which I got of it from the train and the little I could walk through the streets