In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from bs4 import BeautifulSoup
import requests
import heapq

# Fetch the NLTK data this script relies on: the 'punkt' and 'punkt_tab'
# tokenizer resources and the 'stopwords' corpus. The calls only need to
# do real work the first time they run on a machine.
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

def fetch_article_text(url, timeout=10):
    """Fetch a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The text of every ``<p>`` element joined with single spaces, or
        ``None`` when the request fails (connection error, timeout, or an
        HTTP error status). The failure reason is printed.
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            # Without a timeout an unresponsive server blocks forever.
            timeout=timeout,
        )
        # Treat 4xx/5xx answers as failures so an error page is never
        # returned as if it were the article.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching article: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Only paragraph elements are kept; other markup is ignored.
    return ' '.join(para.get_text() for para in soup.find_all('p'))

def summarize_text(text, num_sentences=5):
    """Summarize text by extracting its highest-scoring sentences.

    Each sentence is scored by summing the document-wide frequency of
    every non-stopword, alphanumeric token it contains. The
    ``num_sentences`` best-scoring sentences are selected and returned in
    the order they appear in ``text``.

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined with single spaces. If ``text`` has
        fewer than ``num_sentences`` sentences it is returned unchanged.
    """
    sentences = sent_tokenize(text)

    if len(sentences) < num_sentences:
        return text  # Nothing to shorten

    # Document-wide frequencies of the content words
    stop_words = set(stopwords.words('english'))
    word_frequencies = FreqDist(
        word
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    )

    # Score by sentence position rather than by sentence text, so that a
    # sentence occurring twice is not merged into one entry with a
    # double-counted score.
    sentence_scores = {}
    for index, sentence in enumerate(sentences):
        hits = [
            word_frequencies[word]
            for word in word_tokenize(sentence.lower())
            if word in word_frequencies
        ]
        # Sentences without any content word are never candidates.
        if hits:
            sentence_scores[index] = sum(hits)

    top_indices = heapq.nlargest(
        num_sentences,
        sentence_scores,
        key=sentence_scores.get
    )

    # Emit the chosen sentences in document order so the summary reads
    # coherently instead of being ordered by score.
    return ' '.join(sentences[index] for index in sorted(top_indices))

# Demo entry point: summarize a single hard-coded article
if __name__ == "__main__":
    # Replace this address with the article that should be summarized
    article_url = "https://en.wikipedia.org/wiki/Natural_language_processing"

    print(f"Fetching article from: {article_url}")
    article_text = fetch_article_text(article_url)

    # Handle the failure case first (None or empty text), then summarize
    if not article_text:
        print("Failed to fetch article text.")
    else:
        print("\nGenerating summary...")
        summary = summarize_text(article_text, num_sentences=5)

        print("\n=== Article Summary ===", summary, sep="\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Fetching article from: https://en.wikipedia.org/wiki/Natural_language_processing

Generating summary...

=== Article Summary ===
As an example, George Lakoff offers a methodology to build natural language processing (NLP) algorithms through the perspective of cognitive science, along with the findings of cognitive linguistics,[50] with two defining aspects:
 Ties with cognitive linguistics are part of the historical heritage of NLP, but they have been less frequently addressed since the statistical turn during the 1990s. [57] Likewise, ideas of cognitive NLP are inherent to neural models multimodal NLP (although rarely made explicit)[58] and developments in artificial intelligence, specifically tools and technologies using large language model approaches[59] and new directions in artificial general intelligence based on the free energy principle[60] by British neuroscientist and theoretician at University College London Karl J. Friston. Nevertheless, approaches to develop cognitive mod