In [None]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline, BertTokenizer, BertForSequenceClassification
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords

# Ensure you have the necessary NLTK data
# (the English stop-word list that perform_topic_modeling reads through
# stopwords.words('english')).
nltk.download('stopwords')

# Load the summarization pipeline
# Created once at module level and reused by summarize_text().
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Load the sentiment analysis model
# The tokenizer and the classifier are loaded from the same checkpoint name and
# are used together in analyze_sentiment().
# NOTE(review): analyze_sentiment maps the five output classes to
# "Very Negative" .. "Very Positive"; presumably that matches the checkpoint's
# class order -- confirm against the model card.
tokenizer = BertTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
sentiment_model = BertForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

# Function to extract text from a webpage
def extract_text_from_url(url: str, timeout: float = 10.0) -> "str | None":
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The paragraph texts joined by single spaces, or ``None`` when the
        request fails, the response status is not 200, or parsing raises.
        The failure reason is printed instead of being raised, so callers
        only need to check for a falsy result.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch the webpage, status code: {response.status_code}")
        soup = BeautifulSoup(response.content, 'html.parser')
        return ' '.join(para.get_text() for para in soup.find_all('p'))
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and signal
        # failure with None rather than crashing the caller.
        print(f"Error occurred: {e}")
        return None

# Function to analyze sentiment
def analyze_sentiment(text: str) -> str:
    """Classify the overall sentiment of *text* by majority vote over chunks.

    The text is cut into pieces of 512 characters (characters, not tokens);
    each piece is classified by the module-level ``tokenizer`` /
    ``sentiment_model`` pair and the most frequent label wins.

    Args:
        text: The text to classify. Must be non-empty.

    Returns:
        One of "Very Negative", "Negative", "Neutral", "Positive",
        "Very Positive". When several labels are equally frequent, the one
        that occurred first in the text is returned, so the result is
        deterministic.

    Raises:
        ValueError: If *text* is empty (there is nothing to vote on).
    """
    if not text:
        raise ValueError("Cannot analyze sentiment of empty text.")

    # Imported locally so the block stays self-contained.
    from collections import Counter

    import torch

    sentiment_labels = ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"]
    # Split text into chunks if it's too long
    max_length = 512
    chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]

    sentiments = []
    # Inference only: no_grad avoids building an autograd graph per chunk.
    with torch.no_grad():
        for chunk in chunks:
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True)
            outputs = sentiment_model(**inputs)
            scores = outputs.logits.detach().numpy().flatten()
            sentiments.append(sentiment_labels[int(np.argmax(scores))])

    # Aggregate sentiments. Counter.most_common keeps first-seen order among
    # equal counts, unlike max(set(...)), whose tie-break depends on set
    # iteration order.
    return Counter(sentiments).most_common(1)[0][0]

# Function to summarize text
def summarize_text(text: str) -> str:
    """Return an abstractive summary of *text* from the module-level pipeline.

    Over-long input is truncated by the pipeline's tokenizer
    (``truncation=True``) rather than by slicing the string. The model's input
    limit is counted in tokens, so cutting at 1024 characters threw away most
    of the context the model could have used.

    Args:
        text: The text to summarize.

    Returns:
        The generated summary string (between 50 and 150 generated tokens,
        greedy/beam decoding without sampling).
    """
    summary = summarizer(
        text,
        max_length=150,
        min_length=50,
        do_sample=False,
        truncation=True,  # clip to the model's maximum input length in tokens
    )
    return summary[0]['summary_text']

# Function to perform topic modeling
def perform_topic_modeling(text: str, n_top_words: int = 10) -> list:
    """Fit a 3-topic LDA model on *text* and return each topic's top keywords.

    Args:
        text: The text to model. English stop words are removed first.
        n_top_words: How many of the highest-weighted words to report per
            topic.

    Returns:
        A list with one string per topic; each string holds up to
        ``n_top_words`` keywords, highest weight first, joined by ", ".

    Raises:
        ValueError: If ``n_top_words`` is less than 1.
    """
    if n_top_words < 1:
        raise ValueError("n_top_words must be at least 1.")

    vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
    # NOTE(review): the model is fit on a single document, which gives LDA
    # little to separate topics with; consider passing sentences or
    # paragraphs as separate documents.
    X = vectorizer.fit_transform([text])
    lda = LatentDirichletAllocation(n_components=3, random_state=42)
    lda.fit(X)

    feature_names = np.array(vectorizer.get_feature_names_out())
    # argsort is ascending, so reverse each row before taking the first
    # n_top_words columns. The previous `[:, :-1]` slice returned the whole
    # vocabulary except the single strongest word.
    top_indices = np.argsort(lda.components_, axis=1)[:, ::-1][:, :n_top_words]
    return [", ".join(feature_names[row]) for row in top_indices]

# Function to personalize summary based on user interest
def personalize_summary(summary: str, interest: str) -> str:
    """Label *summary*, flagging it when it mentions the user's interest.

    Args:
        summary: The summary text.
        interest: Keyword the user cares about. Matching is a
            case-insensitive substring test; surrounding whitespace is
            ignored.

    Returns:
        "Interest Match: <keyword>\\nSummary: <summary>" when the keyword
        occurs in the summary, otherwise "Summary: <summary>". A blank
        keyword never counts as a match (an empty string is a substring of
        everything, which previously produced a bogus "Interest Match: ").
    """
    keyword = interest.strip()
    if keyword and keyword.lower() in summary.lower():
        return f"Interest Match: {keyword}\nSummary: {summary}"
    return f"Summary: {summary}"

# Main function to handle the process
def summarize_news(url: str, user_interest: str):
    """Run the full pipeline for one article URL.

    Fetches the article text, then computes its sentiment, a summary, and
    topic keywords, and finally tags the summary with the user's interest.

    Returns:
        ``{"summary", "sentiment", "topics"}`` on success, or
        ``{"error": ...}`` when no text could be extracted from the URL.
    """
    article = extract_text_from_url(url)
    if not article:
        return {"error": "Failed to extract text from the URL."}

    # Same order as the individual steps are documented: sentiment,
    # summary, topics, then personalization of the summary.
    mood = analyze_sentiment(article)
    digest = summarize_text(article)
    themes = perform_topic_modeling(article)
    tailored = personalize_summary(digest, user_interest)

    return {"summary": tailored, "sentiment": mood, "topics": themes}

# Example usage
# Ask for the article and the keyword, then run the whole pipeline once.
url = input("Enter a news URL to summarize: ").strip()
user_interest = input("Enter your interest keyword: ").strip()
result = summarize_news(url, user_interest)

if "error" in result:
    print(result["error"])
else:
    # result["summary"] already starts with its own "Summary:" or
    # "Interest Match:" label, so print it as-is; adding another "Summary:"
    # prefix produced a doubled label in the output.
    print("\n" + result["summary"])
    print("\nSentiment:", result["sentiment"])
    print("\nMain Topics:", result["topics"])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

Enter a news URL to summarize: https://www.bbc.com/news/articles/cvglrrz95zzo
Enter your interest keyword: sanctions

Summary: Interest Match: sanctions
Summary: US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russian-backed media outlets which have sought to covertly "undermine democracy in the United States" RT live-streamed Mr Blinken's remarks and declared it the "US's latest conspiracy theory"

Sentiment: Very Negative

Main Topics: ['us, rt, russian, state, said, russia, blinken, influence, media, mr, intelligence, 2024, elections, bbc, government, also, sought, sanctions, part, election, states, content, external, president, friday, presidential, broadcaster, covert, foreign, undermine, accused, time, top, trying, department, two, journalism, network, new, reserved, reporting, refresher, related, purchase, reporters, read, remarks, profession, 2016, played, press, maria, military, ministry,

In [None]:
!pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=2910f5fd34a9fa11153553c2f306cc2931c1c4e3790d9bc98461d94fddcd12fd
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from rouge_score import rouge_scorer

def evaluate_rouge(reference_summary, generated_summary):
    """Score a generated summary against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Stemming is enabled. The result is whatever ``RougeScorer.score`` returns
    for the (reference, generated) pair, keyed by metric name.
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(
        reference_summary, generated_summary
    )

# Example usage
# reference_summary is the text the generated summary is scored against.
reference_summary = "U.S. Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT, labeling it as a de facto arm of Russia intelligence apparatus involved in undermining U.S. democracy. He accused RT of integrating cyber operational units linked to Russian intelligence and engaging in covert influence operations and military procurement across various regions, including attempts to sway elections in Moldova. In response, RT dismissed the accusations as a conspiracy theory, while U.S. officials highlighted the networks role in raising funds for military supplies for Russian troops in Ukraine. Blinken emphasized that the sanctions were not a critique of journalism, reaffirming U.S. support for independent media, and stated that covert influence activities are not journalism. This announcement is part of broader actions against Russian state media as the 2024 election approaches."
# generated_summary repeats the summarizer output printed by the first cell.
# NOTE(review): the triple-quoted literal ends right after `theory`, so the
# final quotation inside it has no closing quote character.
generated_summary = """US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russian-backed media outlets which have sought to covertly "undermine democracy in the United States" RT live-streamed Mr Blinken's remarks and declared it the "US's latest conspiracy theory"""
# Reference first, generated second -- the order evaluate_rouge declares.
rouge_scores = evaluate_rouge(reference_summary, generated_summary)
print("ROUGE Scores:", rouge_scores)


ROUGE Scores: {'rouge1': Score(precision=0.7321428571428571, recall=0.29927007299270075, fmeasure=0.4248704663212435), 'rouge2': Score(precision=0.3090909090909091, recall=0.125, fmeasure=0.17801047120418848), 'rougeL': Score(precision=0.4642857142857143, recall=0.1897810218978102, fmeasure=0.2694300518134715)}


In [None]:
!pip install nltk




In [None]:
!pip install pycocoevalcap


Collecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycocoevalcap
Successfully installed pycocoevalcap-1.2


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_bleu(reference_summary, generated_summary):
    """Return the sentence-level BLEU score of a generated summary.

    Both texts are tokenized by whitespace; the single reference is wrapped
    in a list as ``sentence_bleu`` expects, and smoothing method 4 is applied.
    """
    reference_tokens = reference_summary.split()
    candidate_tokens = generated_summary.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

# Example usage
# Same reference/generated pair as in the ROUGE cell, redefined here so this
# cell does not rely on variables from the ROUGE cell.
reference_summary = "U.S. Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT, labeling it as a de facto arm of Russia intelligence apparatus involved in undermining U.S. democracy. He accused RT of integrating cyber operational units linked to Russian intelligence and engaging in covert influence operations and military procurement across various regions, including attempts to sway elections in Moldova. In response, RT dismissed the accusations as a conspiracy theory, while U.S. officials highlighted the networks role in raising funds for military supplies for Russian troops in Ukraine. Blinken emphasized that the sanctions were not a critique of journalism, reaffirming U.S. support for independent media, and stated that covert influence activities are not journalism. This announcement is part of broader actions against Russian state media as the 2024 election approaches."
generated_summary = """US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russian-backed media outlets which have sought to covertly "undermine democracy in the United States" RT live-streamed Mr Blinken's remarks and declared it the "US's latest conspiracy theory"""
# Reference first, generated second -- the order evaluate_bleu declares.
bleu_score = evaluate_bleu(reference_summary, generated_summary)
print("BLEU Score:", bleu_score)


BLEU Score: 0.06644389190938313


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Ensure you have downloaded the VADER lexicon
# (presumably the data SentimentIntensityAnalyzer loads; the call runs on every
# execution of this cell).
nltk.download('vader_lexicon')

# Function to analyze sentiment using VADER
def analyze_sentiment_vader(text):
    """Return the VADER polarity scores for *text*.

    A fresh ``SentimentIntensityAnalyzer`` is built for each call and its
    ``polarity_scores`` result is returned unchanged.
    """
    return SentimentIntensityAnalyzer().polarity_scores(text)

# Example summaries
# NOTE: this reference text differs from the one used in the ROUGE/BLEU cells
# (it is shorter and worded differently).
reference_summary ="""U.S. Secretary of State Antony Blinken announced sanctions against RT, labeling it a "de facto arm of Russia's intelligence apparatus" that actively undermines U.S. democracy. He accused RT of engaging in covert influence operations and directly supporting Russian military efforts in Ukraine. In response, RT dismissed these serious allegations as mere conspiracy theories, attempting to deflect accountability. These sanctions are part of a broader and necessary crackdown on Russian state media ahead of the critical 2024 elections."""
generated_summary = """US Secretary of State Antony Blinken has announced new sanctions against the Russian media channel RT. He said RT is part of a network of Russian-backed media outlets which have sought to covertly "undermine democracy in the United States" RT live-streamed Mr Blinken's remarks and declared it the "US's latest conspiracy theory"""

# Analyze sentiments
# Each call returns the score mapping from analyze_sentiment_vader; the two
# results are printed below for comparison.
generated_sentiment = analyze_sentiment_vader(generated_summary)
reference_sentiment = analyze_sentiment_vader(reference_summary)

print(f"Generated Summary Sentiment: {generated_sentiment}")
print(f"Reference Summary Sentiment: {reference_sentiment}")


Generated Summary Sentiment: {'neg': 0.099, 'neu': 0.851, 'pos': 0.05, 'compound': -0.4215}
Reference Summary Sentiment: {'neg': 0.131, 'neu': 0.747, 'pos': 0.121, 'compound': 0.0258}


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
