In [None]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import numpy as np
import re
import nltk

# Ensure you have the necessary NLTK data
nltk.download('stopwords')

# Load the summarization pipeline once and share it between both stages.
# Both names point at the same "facebook/bart-large-cnn" checkpoint, and every
# call site passes its own generation settings (max_length, min_length,
# do_sample), so building a second pipeline would only load an identical copy
# of the model weights into memory.
initial_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
refinement_summarizer = initial_summarizer

# Function to extract text from a webpage
def extract_text_from_url(url: str, timeout: float = 10.0) -> "str | None":
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The text of every ``<p>`` tag joined by single spaces, or ``None``
        when the request fails or the server answers with a status code
        other than 200. In the failure case a message is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Covers connection errors, timeouts and malformed URLs.
        print(f"Error occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"Error occurred: Failed to fetch the webpage, status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(para.get_text() for para in soup.find_all('p'))

# Function to preprocess text
def preprocess_text(text: str) -> str:
    """Normalise whitespace and drop punctuation from *text*.

    Every run of whitespace is first collapsed to a single space, then every
    character that is neither a word character nor whitespace is removed, and
    finally leading/trailing whitespace is stripped.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    without_punctuation = re.sub(r'[^\w\s]', '', single_spaced)
    return without_punctuation.strip()

# Function to handle large texts by chunking
def chunk_text(text: str, chunk_size: int = 1024) -> list:
    """Split *text* into pieces of at most *chunk_size* characters.

    Chunk boundaries are moved back to the nearest preceding whitespace so
    that words are not cut in half. A single word longer than *chunk_size*
    is still hard-split, because no whitespace is available inside it.
    Whitespace at chunk edges is stripped and empty chunks are dropped.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of non-empty string chunks, in order of appearance.

    Raises:
        ValueError: If *chunk_size* is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        # Only look for a better cut point when the window ends mid-word.
        if end < total and not text[end].isspace() and not text[end - 1].isspace():
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            # cut == start means the window holds one unbroken word: hard-split.
            if cut > start:
                end = cut
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end
    return chunks

# Function to summarize a chunk of text
def summarize_chunk(chunk: str) -> str:
    """Summarize one chunk of text with the first-stage summarizer.

    The length bounds are capped by the tokenized input length so that a
    short chunk (for example the tail of a document) is not asked for a
    summary longer than itself. Inputs of at least 150 tokens keep the
    default bounds of ``max_length=150`` and ``min_length=50``.

    Args:
        chunk: The text to summarize.

    Returns:
        The generated summary text, or an empty string when *chunk* is blank.
    """
    if not chunk or not chunk.strip():
        return ""

    # Count tokens the same way the pipeline will (special tokens included).
    token_count = len(initial_summarizer.tokenizer.encode(chunk, truncation=True))
    max_len = max(1, min(150, token_count))
    min_len = min(50, max_len // 2)

    summary = initial_summarizer(
        chunk,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        truncation=True,  # guard against inputs longer than the model limit
    )
    return summary[0]['summary_text']

# Function to perform refinement summarization
def refine_summary(initial_summaries: list) -> str:
    """Condense the per-chunk summaries into one final summary.

    The summaries are joined with spaces and passed through the refinement
    summarizer. Truncation is enabled because the concatenation of many chunk
    summaries can exceed the model's maximum input length. The length bounds
    are capped by the tokenized input length; inputs of at least 200 tokens
    keep the default bounds of ``max_length=200`` and ``min_length=100``.

    Args:
        initial_summaries: Summary strings produced for the individual chunks.

    Returns:
        The refined summary text, or an empty string when there is nothing
        to summarize.
    """
    combined_summary = ' '.join(s for s in initial_summaries if s).strip()
    if not combined_summary:
        return ""

    # Count tokens the same way the pipeline will (special tokens included).
    token_count = len(
        refinement_summarizer.tokenizer.encode(combined_summary, truncation=True)
    )
    max_len = max(1, min(200, token_count))
    min_len = min(100, max_len // 2)

    refined_summary = refinement_summarizer(
        combined_summary,
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        truncation=True,
    )
    return refined_summary[0]['summary_text']

# Main function to handle the process
def hierarchical_summarization(url: str):
    """Run the two-stage summarization for the page at *url*.

    The page text is extracted, cleaned and chunked; every chunk is
    summarized, and the chunk summaries are then refined into one summary.

    Returns:
        ``{"initial_summaries": [...], "final_summary": "..."}`` on success,
        or ``{"error": "..."}`` when no text could be extracted.
    """
    page_text = extract_text_from_url(url)
    if not page_text:
        return {"error": "Failed to extract text from the URL."}

    pieces = chunk_text(preprocess_text(page_text))

    # Stage 1: one summary per chunk.
    per_chunk = list(map(summarize_chunk, pieces))

    # Stage 2: fold the chunk summaries into a single summary.
    overall = refine_summary(per_chunk)

    return {
        "initial_summaries": per_chunk,
        "final_summary": overall
    }

# Example usage
if __name__ == "__main__":
    # Ask for a URL, summarize it, and print either the error or the results.
    url = input("Enter a webpage URL to summarize: ").strip()
    result = hierarchical_summarization(url)

    if "error" not in result:
        print("\nInitial Summaries:")
        for i, summary in enumerate(result["initial_summaries"], start=1):
            print(f"Chunk {i}: {summary}")

        print("\nFinal Summary:")
        print(result["final_summary"])
    else:
        print(result["error"])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Enter a webpage URL to summarize: https://www.bbc.com/news/articles/cvglrrz95zzo


Your max_length is set to 150, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)



Initial Summaries:
Chunk 1: US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russianbacked media outlets which have sought to covertly undermine democracy in the United States. RT livestreamed Mr Blinken's remarks on X and declared it the USs latest conspiracy theory.
Chunk 2: Mr Blinken also accused RT of running online fundraisers to purchase body armour sniper rifles drones and other equipment for Russian soldiers fighting in Ukraine. The network he said has also sought to influence Moldovas politics in coordination with Russian intelligence ahead of presidential elections in October 2024.
Chunk 3: The announcement is part of a suite of actions the US government has taken against Russian state media as the 2024 election approaches. Mr Blinken emphasised that the sanctions were not related to the content of the outlets reporting and he affirmed the USs support for independent journalism.
Chunk 4

In [None]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e72c3810acc4825471ded48d763360da70b9973381373a1306253f1ce2c3c709
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from rouge_score import rouge_scorer

def evaluate_rouge(reference_summary, generated_summary):
    """Score *generated_summary* against *reference_summary* with ROUGE.

    ROUGE-1, ROUGE-2 and ROUGE-L are computed with stemming enabled. The
    scorer's result is returned unchanged.
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(
        reference_summary, generated_summary
    )

# Example usage
# Both summaries are hard-coded literals; generated_summary is pasted text,
# not read from the `result` of hierarchical_summarization().
reference_summary ="""U.S. Secretary of State Antony Blinken announced sanctions against RT, denouncing it as a "de facto arm of Russia's intelligence apparatus" that insidiously undermines U.S. democracy. He accused RT of engaging in covert influence operations and blatantly supporting Russian military efforts in Ukraine. In a weak attempt to deflect accountability, RT dismissed these grave allegations as mere conspiracy theories. These sanctions are part of a broader and urgent crackdown on Russian state media, which poses a serious threat as the 2024 elections approach."""
generated_summary = """US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russianbacked media outlets which have sought to covertly undermine democracy in the United States. Mr Blinken also accused RT of running online fundraisers to purchase body armour sniper rifles drones and other equipment for Russian soldiers fighting in Ukraine. The network he said has also sought to influence Moldovas politics in coordination with Russian intelligence ahead of presidential elections in October 2024."""

# Score the generated text against the reference and show all three metrics.
rouge_scores = evaluate_rouge(reference_summary, generated_summary)
print("ROUGE Scores:", rouge_scores)


ROUGE Scores: {'rouge1': Score(precision=0.41379310344827586, recall=0.4186046511627907, fmeasure=0.41618497109826585), 'rouge2': Score(precision=0.11627906976744186, recall=0.11764705882352941, fmeasure=0.11695906432748537), 'rougeL': Score(precision=0.26436781609195403, recall=0.26744186046511625, fmeasure=0.26589595375722536)}


In [None]:
!pip install nltk




In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_bleu(reference_summary, generated_summary):
    """Return the sentence-level BLEU score of a generated summary.

    Both texts are tokenized by splitting on whitespace. The reference is
    wrapped in a list because ``sentence_bleu`` takes a list of references.
    Smoothing method 4 is applied.
    """
    hypothesis_tokens = generated_summary.split()
    reference_token_lists = [reference_summary.split()]
    return sentence_bleu(
        reference_token_lists,
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

# Example usage
# Both summaries are hard-coded literals; generated_summary is pasted text,
# not read from the `result` of hierarchical_summarization().
reference_summary ="""U.S. Secretary of State Antony Blinken announced sanctions against RT, denouncing it as a "de facto arm of Russia's intelligence apparatus" that insidiously undermines U.S. democracy. He accused RT of engaging in covert influence operations and blatantly supporting Russian military efforts in Ukraine. In a weak attempt to deflect accountability, RT dismissed these grave allegations as mere conspiracy theories. These sanctions are part of a broader and urgent crackdown on Russian state media, which poses a serious threat as the 2024 elections approach."""
generated_summary = """US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russianbacked media outlets which have sought to covertly undermine democracy in the United States. Mr Blinken also accused RT of running online fundraisers to purchase body armour sniper rifles drones and other equipment for Russian soldiers fighting in Ukraine. The network he said has also sought to influence Moldovas politics in coordination with Russian intelligence ahead of presidential elections in October 2024."""
# Compute and display the smoothed sentence-level BLEU score.
bleu_score = evaluate_bleu(reference_summary, generated_summary)
print("BLEU Score:", bleu_score)


BLEU Score: 0.08583620742303912


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Ensure you have downloaded the VADER lexicon, which the
# SentimentIntensityAnalyzer used below relies on.
nltk.download('vader_lexicon')

# Function to analyze sentiment using VADER
def analyze_sentiment_vader(text):
    """Return VADER polarity scores for *text*.

    A new ``SentimentIntensityAnalyzer`` is created on every call and the
    result of its ``polarity_scores`` method is returned unchanged.
    """
    return SentimentIntensityAnalyzer().polarity_scores(text)

# Example summaries
# Both summaries are hard-coded literals; generated_summary is pasted text,
# not read from the `result` of hierarchical_summarization().
reference_summary ="""U.S. Secretary of State Antony Blinken announced sanctions against RT, denouncing it as a "de facto arm of Russia's intelligence apparatus" that insidiously undermines U.S. democracy. He accused RT of engaging in covert influence operations and blatantly supporting Russian military efforts in Ukraine. In a weak attempt to deflect accountability, RT dismissed these grave allegations as mere conspiracy theories. These sanctions are part of a broader and urgent crackdown on Russian state media, which poses a serious threat as the 2024 elections approach."""
generated_summary = """US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russianbacked media outlets which have sought to covertly undermine democracy in the United States. Mr Blinken also accused RT of running online fundraisers to purchase body armour sniper rifles drones and other equipment for Russian soldiers fighting in Ukraine. The network he said has also sought to influence Moldovas politics in coordination with Russian intelligence ahead of presidential elections in October 2024."""

# Analyze sentiments
# Each text is scored independently so the two results can be compared.
generated_sentiment = analyze_sentiment_vader(generated_summary)
reference_sentiment = analyze_sentiment_vader(reference_summary)

# Print both score dictionaries for a side-by-side comparison.
print(f"Generated Summary Sentiment: {generated_sentiment}")
print(f"Reference Summary Sentiment: {reference_sentiment}")


Generated Summary Sentiment: {'neg': 0.074, 'neu': 0.864, 'pos': 0.063, 'compound': 0.0}
Reference Summary Sentiment: {'neg': 0.189, 'neu': 0.705, 'pos': 0.106, 'compound': -0.7906}


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
