In [None]:
import logging
import requests
import re
import numpy as np
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import spacy
import plotly.graph_objects as go
import time
from functools import lru_cache

# Set up logging for better debugging and information tracking.
# Records at INFO level and above are emitted, each prefixed with a timestamp
# and its level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load pre-trained models once at module level so the functions below can
# share them instead of reloading on every call.
logging.info("Loading pre-trained models...")
# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Sentiment classifier and its tokenizer; both come from the same checkpoint
# name so the vocabulary matches the model.
sentiment_model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
sentiment_tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
# spaCy pipeline used by extract_entities for named-entity recognition.
nlp = spacy.load("en_core_web_sm")

# Function to preprocess text
def preprocess_text(text: str) -> str:
    """Normalize raw text by dropping punctuation and collapsing whitespace.

    Every character that is neither a word character nor whitespace is
    removed, each run of whitespace is then collapsed to a single space, and
    leading/trailing whitespace is trimmed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; an empty string when the input holds nothing but
        punctuation and whitespace.
    """
    logging.info("Preprocessing text...")
    # Strip punctuation *before* collapsing whitespace: removing a
    # free-standing symbol (e.g. the dash in "a - b") after the collapse
    # would leave a double space behind.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Function to handle large texts by chunking
def chunk_text(text: str, chunk_size: int = 1024) -> list:
    """Split text into consecutive chunks of at most ``chunk_size`` characters.

    Chunks end on a whitespace boundary whenever the current window contains
    one, so words are not cut in half. A single word longer than
    ``chunk_size`` is still split at the size limit. Concatenating the
    returned chunks reproduces ``text`` exactly.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of string chunks in their original order (empty for empty
        input).

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    logging.info("Chunking text into manageable pieces...")
    chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        if end < total and not text[end].isspace():
            # The window would end mid-word: back up to just after the last
            # whitespace character inside the window, if there is one.
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            if cut > start:
                end = cut
        chunks.append(text[start:end])
        start = end
    return chunks

# Function to extract text from a webpage
@lru_cache(maxsize=100)
def extract_text_from_url(url: str, timeout: float = 10.0) -> "str | None":
    """Download a web page and return the text of its ``<p>`` elements.

    Results are memoized per ``(url, timeout)`` by ``lru_cache``.
    NOTE(review): a failed fetch returns ``None`` and that value is cached
    too; call ``extract_text_from_url.cache_clear()`` before retrying a URL
    that failed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The paragraph texts joined by single spaces, or ``None`` if the
        request failed (network error, timeout, or HTTP error status).
    """
    logging.info("Fetching content from URL: %s", url)
    # Keep the try body limited to the calls that can raise RequestException.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error("Error occurred while fetching the webpage: %s", e)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(para.get_text() for para in soup.find_all('p'))

# Function to analyze sentiment
def analyze_sentiment(text: str) -> str:
    """Classify the sentiment of ``text`` with the module-level model.

    The text is tokenized with truncation enabled, run through
    ``sentiment_model``, and the index of the highest logit is mapped onto a
    five-step label scale.

    Args:
        text: The text to classify.

    Returns:
        One of "Very Negative", "Negative", "Neutral", "Positive",
        "Very Positive", or the string "Error" if any step raised.
    """
    try:
        logging.info("Analyzing sentiment...")
        encoded = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        logits = sentiment_model(**encoded).logits
        # Detach from the autograd graph before converting to NumPy.
        scores = logits.detach().numpy().flatten()
        best_index = np.argmax(scores)
        scale = ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"]
        return scale[best_index]
    except Exception as err:
        logging.error(f"Error during sentiment analysis: {err}")
        return "Error"

# Function to extract named entities
def extract_entities(text: str):
    """Run the module-level spaCy pipeline and collect its named entities.

    Args:
        text: The text to scan.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in document order.
    """
    logging.info("Extracting named entities...")
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found

# Function to summarize a chunk of text
def summarize_chunk(chunk: str) -> str:
    """Summarize one chunk of text with the module-level summarizer.

    Chunks that are empty or already shorter than the minimum summary length
    are not sent to the model: forcing ``min_length`` tokens out of a tiny
    input makes the model pad the output with unrelated text.

    Args:
        chunk: The text to summarize.

    Returns:
        The summary text; the stripped chunk itself when it has at most 50
        whitespace-separated words; an empty string for blank input; or a
        fixed error message if summarization raised.
    """
    max_length = 150
    min_length = 50

    # Word count is a cheap lower-bound proxy for the model's token count.
    word_count = len(chunk.split())
    if word_count == 0:
        return ""
    if word_count <= min_length:
        # Nothing to condense: return the text unchanged instead of asking
        # for a summary longer than the input.
        return chunk.strip()

    try:
        logging.info("Summarizing a chunk of text...")
        summary = summarizer(
            chunk,
            # Never request a summary longer than the input; word_count is
            # above min_length here, so max_length stays above min_length.
            max_length=min(max_length, word_count),
            min_length=min_length,
            do_sample=False,
        )
        return summary[0]['summary_text']
    except Exception as e:
        logging.error(f"Error during summarization: {e}")
        return "An error occurred during summarization."

# Function to visualize sentiment distribution
def visualize_sentiment(sentiment: str):
    """Show a pie chart with the given sentiment label counted once.

    Args:
        sentiment: One of "Very Negative", "Negative", "Neutral", "Positive",
            "Very Positive". Any other value (for example the "Error"
            sentinel that ``analyze_sentiment`` returns on failure) is logged
            and skipped instead of raising ``KeyError``.

    Returns:
        None. The figure is displayed as a side effect.
    """
    logging.info("Visualizing sentiment distribution...")
    sentiment_labels = ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"]
    if sentiment not in sentiment_labels:
        logging.warning("Cannot visualize unknown sentiment label: %r", sentiment)
        return

    # One observation: the matching label gets a count of 1, the rest 0.
    sentiment_counts = {label: int(label == sentiment) for label in sentiment_labels}
    fig = go.Figure(data=[go.Pie(labels=list(sentiment_counts.keys()), values=list(sentiment_counts.values()))])
    fig.update_layout(title="Sentiment Distribution")
    fig.show()

# Main function to handle the process
def generate_summary_and_analysis(url: str):
    """Fetch a page, then summarize it, score its sentiment and list entities.

    The results are printed, the sentiment is visualized, and the same
    results are returned.

    Args:
        url: Address of the web page to process.

    Returns:
        A dict with keys "summary", "sentiment" and "entities", or a dict
        with a single "error" key when no text could be extracted.
    """
    raw_text = extract_text_from_url(url)
    # Guard clause: a failed fetch (None) or an empty page ends here.
    if not raw_text:
        return {"error": "Failed to extract text from the URL."}

    cleaned = preprocess_text(raw_text)

    # Summarize chunk by chunk and stitch the partial summaries together.
    full_summary = ' '.join(map(summarize_chunk, chunk_text(cleaned)))

    sentiment = analyze_sentiment(cleaned)
    entities = extract_entities(cleaned)

    # Display results
    print("\nSummary:", full_summary)
    print("\nSentiment:", sentiment)
    print("\nNamed Entities:", entities)

    # Visualize sentiment
    visualize_sentiment(sentiment)

    return {"summary": full_summary, "sentiment": sentiment, "entities": entities}

# Example usage
if __name__ == "__main__":
    # Ask for the page to process; surrounding whitespace is trimmed.
    url = input("Enter a webpage URL to summarize and analyze: ").strip()
    # Time the whole fetch + analysis run (wall-clock seconds).
    start_time = time.time()
    result = generate_summary_and_analysis(url)
    end_time = time.time()

    # A result dict with an "error" key signals that no text was extracted;
    # the elapsed time is only reported for successful runs.
    if "error" in result:
        print(result["error"])
    else:
        print(f"\nProcessing Time: {end_time - start_time:.2f} seconds")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Enter a webpage URL to summarize and analyze: https://www.bbc.com/news/articles/cvglrrz95zzo


Your max_length is set to 150, but your input_length is only 24. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)



Summary: US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russianbacked media outlets which have sought to covertly undermine democracy in the United States. RT livestreamed Mr Blinken's remarks on X and declared it the USs latest conspiracy theory. Mr Blinken also accused RT of running online fundraisers to purchase body armour sniper rifles drones and other equipment for Russian soldiers fighting in Ukraine. The network he said has also sought to influence Moldovas politics in coordination with Russian intelligence ahead of presidential elections in October 2024. The announcement is part of a suite of actions the US government has taken against Russian state media as the 2024 election approaches. Mr Blinken emphasised that the sanctions were not related to the content of the outlets reporting and he affirmed the USs support for independent journalism. CNN.com will feature iReporter photos in a w


Processing Time: 119.11 seconds


In [None]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=caf0f5147b111eb030efcc4e698037e31bf7f2281af6d03aa6fec78e198f9cea
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from rouge_score import rouge_scorer

def evaluate_rouge(reference_summary, generated_summary):
    """Score a generated summary against a reference with ROUGE-1/2/L.

    Stemming is enabled in the scorer.

    Args:
        reference_summary: The reference (target) summary text.
        generated_summary: The summary text being evaluated.

    Returns:
        The scorer's result keyed by 'rouge1', 'rouge2' and 'rougeL'.
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(
        reference_summary, generated_summary
    )

# Example usage: compare a hand-written reference summary with the summary
# text produced by the pipeline run above.
reference_summary ="""U.S. Secretary of State Antony Blinken announced sanctions against RT, denouncing it as a "de facto arm of Russia's intelligence apparatus" that insidiously undermines U.S. democracy. He accused RT of engaging in covert influence operations and blatantly supporting Russian military efforts in Ukraine. In a weak attempt to deflect accountability, RT dismissed these grave allegations as mere conspiracy theories. These sanctions are part of a broader and urgent crackdown on Russian state media, which poses a serious threat as the 2024 elections approach."""
generated_summary = """US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russianbacked media outlets which have sought to covertly undermine democracy in the United States. RT livestreamed Mr Blinken's remarks on X and declared it the USs latest conspiracy theory. Mr Blinken also accused RT of running online fundraisers to purchase body armour sniper rifles drones and other equipment for Russian soldiers fighting in Ukraine. The network he said has also sought to influence Moldovas politics in coordination with Russian intelligence ahead of presidential elections in October 2024. The announcement is part of a suite of actions the US government has taken against Russian state media as the 2024 election approaches. Mr Blinken emphasised that the sanctions were not related to the content of the outlets reporting and he affirmed the USs support for independent journalism. """
# The reference is passed first and the generated text second.
rouge_scores = evaluate_rouge(reference_summary, generated_summary)
print("ROUGE Scores:", rouge_scores)


ROUGE Scores: {'rouge1': Score(precision=0.31788079470198677, recall=0.5581395348837209, fmeasure=0.4050632911392405), 'rouge2': Score(precision=0.11333333333333333, recall=0.2, fmeasure=0.1446808510638298), 'rougeL': Score(precision=0.2119205298013245, recall=0.37209302325581395, fmeasure=0.270042194092827)}


In [None]:
!pip install nltk




In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_bleu(reference_summary, generated_summary):
    """Compute a smoothed sentence-level BLEU score for a generated summary.

    Both texts are tokenized by splitting on whitespace; the single
    reference is wrapped in a list because ``sentence_bleu`` takes a list of
    references.

    Args:
        reference_summary: The reference summary text.
        generated_summary: The summary text being evaluated.

    Returns:
        The BLEU score computed with ``SmoothingFunction().method4``.
    """
    candidate_tokens = generated_summary.split()
    reference_tokens = reference_summary.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

# Example usage: the same reference/generated pair as the ROUGE cell,
# redefined here so this cell can run on its own.
reference_summary ="""U.S. Secretary of State Antony Blinken announced sanctions against RT, denouncing it as a "de facto arm of Russia's intelligence apparatus" that insidiously undermines U.S. democracy. He accused RT of engaging in covert influence operations and blatantly supporting Russian military efforts in Ukraine. In a weak attempt to deflect accountability, RT dismissed these grave allegations as mere conspiracy theories. These sanctions are part of a broader and urgent crackdown on Russian state media, which poses a serious threat as the 2024 elections approach."""
generated_summary = """US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russianbacked media outlets which have sought to covertly undermine democracy in the United States. RT livestreamed Mr Blinken's remarks on X and declared it the USs latest conspiracy theory. Mr Blinken also accused RT of running online fundraisers to purchase body armour sniper rifles drones and other equipment for Russian soldiers fighting in Ukraine. The network he said has also sought to influence Moldovas politics in coordination with Russian intelligence ahead of presidential elections in October 2024. The announcement is part of a suite of actions the US government has taken against Russian state media as the 2024 election approaches. Mr Blinken emphasised that the sanctions were not related to the content of the outlets reporting and he affirmed the USs support for independent journalism. """
bleu_score = evaluate_bleu(reference_summary, generated_summary)
print("BLEU Score:", bleu_score)


BLEU Score: 0.05947482282374942


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Ensure the VADER lexicon is available locally before
# SentimentIntensityAnalyzer is instantiated below.
nltk.download('vader_lexicon')

# Function to analyze sentiment using VADER
def analyze_sentiment_vader(text):
    """Return VADER polarity scores for ``text``.

    A fresh ``SentimentIntensityAnalyzer`` is created on every call.

    Args:
        text: The text to score.

    Returns:
        The result of ``SentimentIntensityAnalyzer.polarity_scores`` for the
        text.
    """
    return SentimentIntensityAnalyzer().polarity_scores(text)

# Example summaries: the same reference/generated pair used in the ROUGE and
# BLEU cells, redefined here so this cell can run on its own.
reference_summary ="""U.S. Secretary of State Antony Blinken announced sanctions against RT, denouncing it as a "de facto arm of Russia's intelligence apparatus" that insidiously undermines U.S. democracy. He accused RT of engaging in covert influence operations and blatantly supporting Russian military efforts in Ukraine. In a weak attempt to deflect accountability, RT dismissed these grave allegations as mere conspiracy theories. These sanctions are part of a broader and urgent crackdown on Russian state media, which poses a serious threat as the 2024 elections approach."""
generated_summary = """US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russianbacked media outlets which have sought to covertly undermine democracy in the United States. RT livestreamed Mr Blinken's remarks on X and declared it the USs latest conspiracy theory. Mr Blinken also accused RT of running online fundraisers to purchase body armour sniper rifles drones and other equipment for Russian soldiers fighting in Ukraine. The network he said has also sought to influence Moldovas politics in coordination with Russian intelligence ahead of presidential elections in October 2024. The announcement is part of a suite of actions the US government has taken against Russian state media as the 2024 election approaches. Mr Blinken emphasised that the sanctions were not related to the content of the outlets reporting and he affirmed the USs support for independent journalism. """

# Analyze sentiments of both texts so their polarity scores can be compared.
generated_sentiment = analyze_sentiment_vader(generated_summary)
reference_sentiment = analyze_sentiment_vader(reference_summary)

print(f"Generated Summary Sentiment: {generated_sentiment}")
print(f"Reference Summary Sentiment: {reference_sentiment}")


Generated Summary Sentiment: {'neg': 0.065, 'neu': 0.881, 'pos': 0.054, 'compound': -0.1779}
Reference Summary Sentiment: {'neg': 0.189, 'neu': 0.705, 'pos': 0.106, 'compound': -0.7906}


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
