In [1]:
!pip install nltk newspaper3k scikit-learn pandas matplotlib wordcloud seaborn networkx

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.2-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Co

In [2]:
import requests
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
import numpy as np
import networkx as nx
from newspaper import Article
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
import seaborn as sns
import logging
import time
import re

In [3]:
class NewsArticleSummarizer:
     def __init__(self):

        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('wordnet')
        nltk.download('averaged_perceptron_tagger')

        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = RegexpTokenizer(r'\w+')


        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

     def fetch_article(self, url):

        try:
            article = Article(url)
            article.download()
            article.parse()

            return {
                'title': article.title,
                'text': article.text,
                'publish_date': article.publish_date,
                'authors': article.authors,
                'top_image': article.top_image
            }
        except Exception as e:
            self.logger.error(f"Error fetching article: {str(e)}")
            return None

     def preprocess_text(self, text):


        text = text.lower()


        text = re.sub(r'[^a-zA-Z\s]', '', text)


        words = word_tokenize(text)


        processed_words = [
            self.lemmatizer.lemmatize(word)
            for word in words
            if word not in self.stop_words and len(word) > 2
        ]

        return ' '.join(processed_words)

     def generate_summary(self, text, num_sentences=5):

        sentences = sent_tokenize(text)


        similarity_matrix = self.build_similarity_matrix(sentences)


        nx_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(nx_graph)


        ranked_sentences = sorted(
            ((scores[i], sentence) for i, sentence in enumerate(sentences)),
            reverse=True
        )

        summary = ' '.join(sentence for _, sentence in ranked_sentences[:num_sentences])
        return summary

     def build_similarity_matrix(self, sentences):

        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 != idx2:
                    similarity_matrix[idx1][idx2] = self.sentence_similarity(
                        sentences[idx1],
                        sentences[idx2]
                    )

        return similarity_matrix

     def sentence_similarity(self, sent1, sent2):

        words1 = [word.lower() for word in word_tokenize(sent1)]
        words2 = [word.lower() for word in word_tokenize(sent2)]

        all_words = list(set(words1 + words2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        for word in words1:
            if word not in self.stop_words:
                vector1[all_words.index(word)] += 1

        for word in words2:
            if word not in self.stop_words:
                vector2[all_words.index(word)] += 1

        return 1 - cosine_distance(vector1, vector2)

     def extract_keywords(self, text, top_n=10):

        vectorizer = TfidfVectorizer(
            max_features=100,
            stop_words='english'
        )


        tfidf_matrix = vectorizer.fit_transform([text])


        feature_names = vectorizer.get_feature_names_out()
        scores = tfidf_matrix.toarray()[0]


        keyword_scores = list(zip(feature_names, scores))
        keyword_scores.sort(key=lambda x: x[1], reverse=True)

        return keyword_scores[:top_n]

     def generate_visualizations(self, text, title):


        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white'
        ).generate(text)


        words = word_tokenize(text)
        word_freq = Counter(words)


        plt.figure(figsize=(15, 10))


        plt.subplot(2, 1, 1)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud')


        plt.subplot(2, 1, 2)
        top_words = dict(sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10])
        sns.barplot(x=list(top_words.values()), y=list(top_words.keys()))
        plt.title('Top 10 Most Frequent Words')

        plt.tight_layout()
        plt.savefig(f"{title.replace(' ', '_')}_analysis.png")
        plt.close()

     def analyze_article(self, url, summary_sentences=5):
        start_time = time.time()
        self.logger.info(f"Starting analysis for URL: {url}")

        article_data = self.fetch_article(url)
        if not article_data:
            return None


        processed_text = self.preprocess_text(article_data['text'])

        results = {
            'title': article_data['title'],
            'original_length': len(article_data['text']),
            'summary': self.generate_summary(article_data['text'], summary_sentences),
            'keywords': self.extract_keywords(processed_text),
            'publish_date': article_data['publish_date'],
            'authors': article_data['authors']
        }


        self.generate_visualizations(processed_text, article_data['title'])

        self.logger.info(f"Analysis completed in {time.time() - start_time:.2f} seconds")
        return results

In [4]:
def main():

    summarizer = NewsArticleSummarizer()
    url = "https://www.newyorker.com/magazine/2024/07/08/the-boys-season-4-review"

    results = summarizer.analyze_article(url)
    if results:
        print(f"\nArticle Title: {results['title']}")
        print(f"\nAuthors: {', '.join(results['authors'])}")
        print(f"Published: {results['publish_date']}")
        print(f"\nOriginal Length: {results['original_length']} characters")
        print("\nSummary:")
        print(results['summary'])
        print("\nTop Keywords:")
        for keyword, score in results['keywords']:
            print(f"- {keyword}: {score:.4f}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.



Article Title: “The Boys” Gets Too Close for Comfort

Authors: Inkoo Kang
Published: 2024-07-08 00:00:00

Original Length: 7641 characters

Summary:
For Homelander, fatherhood poses fresh, even poignant, challenges. But, perhaps because of the tightrope Kripke has long been able to walk, “The Boys” has been a bona-fide hit for Prime Video, with a final season still to come, two spinoffs (including the teen-oriented “Gen V”), and more in development. In this season, the supe is reunited with Ryan (Cameron Crovetti), the son who’s been kept from him for years—one blessed, and cursed, with supernatural abilities of his own. Yet Homelander, who’s loath to leave him with a “shithole country,” doesn’t know how else to demonstrate his love. As another character puts it, “If Ryan becomes like Homelander, that’s the end of the world.”


Top Keywords:
- homelander: 0.4914
- season: 0.3024
- boy: 0.2268
- seven: 0.1890
- character: 0.1512
- come: 0.1512
- kripke: 0.1512
- like: 0.1512
- people: 