In [None]:
# Install necessary packages
!pip install nltk transformers torch requests

# Download necessary NLTK data properly
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')  # Extra safe
nltk.download('maxent_ne_chunker')            # Extra safe
nltk.download('words')                        # Extra safe




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
import requests
from html.parser import HTMLParser
import re
from collections import Counter
from transformers import pipeline

# --- Scrape and Parse Paragraphs Manually ---
class ParagraphExtractor(HTMLParser):
    """Collect the visible text of every ``<p>`` element fed to the parser.

    After ``feed()`` the ``data`` attribute holds one string per paragraph
    that contained non-whitespace text. Text split across inline markup
    (``<a>``, ``<b>``, ...) is stitched back together before whitespace is
    normalised, so ``<p>the <a>attack</a>, which</p>`` yields
    ``"the attack, which"`` rather than separate fragments that a caller
    would join into ``"the attack , which"``.
    """

    def __init__(self):
        super().__init__()
        self.recording = False    # True while inside an open <p> element
        self.data = []            # one cleaned string per non-empty paragraph
        self._fragments = []      # raw text pieces of the paragraph being read
        self._has_entry = False   # True once the open paragraph owns data[-1]

    def handle_starttag(self, tag, attrs):
        """Start a fresh paragraph on ``<p>``; treat ``<br>`` as a word break."""
        if tag == 'p':
            self.recording = True
            self._fragments = []
            self._has_entry = False
        elif tag == 'br' and self.recording:
            # Without this, "line one<br>line two" would fuse into one word.
            self._fragments.append(' ')

    def handle_endtag(self, tag):
        """Stop recording when the paragraph closes."""
        if tag == 'p':
            self.recording = False

    def handle_data(self, data):
        """Append ``data`` to the open paragraph and refresh its entry in ``data``."""
        if not self.recording:
            return
        self._fragments.append(data)
        # Join first, then collapse whitespace: stripping each fragment on its
        # own would lose the information about where spaces really were.
        text = ' '.join(''.join(self._fragments).split())
        if not text:
            return
        if self._has_entry:
            # Update in place so ``data`` is always current, even when the
            # closing </p> is missing and no end-tag callback ever fires.
            self.data[-1] = text
        else:
            self.data.append(text)
            self._has_entry = True

def scrape_and_extract_paragraphs(url, timeout=10):
    """Download ``url`` and return the text of its ``<p>`` elements as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; without
            one, an unresponsive server would block the notebook indefinitely.

    Returns:
        The strings collected by ``ParagraphExtractor``, joined by single spaces.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other failure raised by ``requests``
            (connection error, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of summarising the body of an error page.
    response.raise_for_status()

    parser = ParagraphExtractor()
    parser.feed(response.text)
    parser.close()  # process any text the parser is still buffering
    return ' '.join(parser.data)

# --- Extractive Summarization without NLTK ---
def simple_sentence_tokenize(text):
    """Split ``text`` into sentences with a regular expression.

    A boundary is any run of whitespace (spaces, tabs, newlines) that directly
    follows ``.``, ``!`` or ``?``. The terminator stays attached to its
    sentence. This is a heuristic: abbreviations such as "Dr. Smith" are split.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty sentence strings; ``[]`` for empty or
        whitespace-only input.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    # Drop the '' that re.split produces for empty input.
    return [piece for piece in pieces if piece]

def simple_word_tokenize(text):
    """Return the lower-cased word tokens of ``text`` in order of appearance.

    A token is a maximal run of ``\\w`` characters (letters, digits,
    underscore); punctuation and whitespace act as separators.
    """
    token_pattern = re.compile(r'\b\w+\b')
    lowered = text.lower()
    return [match.group() for match in token_pattern.finditer(lowered)]

# Basic English stop words, defined by hand; built once instead of on every call.
_STOP_WORDS = frozenset({
    'the', 'a', 'an', 'in', 'on', 'at', 'for', 'of', 'to', 'and', 'but', 'if', 'or',
    'because', 'as', 'until', 'while', 'by', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'from', 'up', 'down',
    'out', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
    'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
    'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just',
})


def summarize_text_simple(text, num_sentences=5):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Each sentence is scored by summing, over its words, how often that word
    occurs in the whole text (stop words count as zero). The ``num_sentences``
    best sentences are kept and returned in the order they appear in ``text``
    so the summary reads naturally.

    Args:
        text: The document to summarise.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined by single spaces; ``''`` when no
        sentence scores above zero or ``num_sentences`` is not positive.
    """
    if num_sentences <= 0:
        return ''

    sentences = simple_sentence_tokenize(text)
    word_freq = Counter(
        word for word in simple_word_tokenize(text) if word not in _STOP_WORDS
    )

    scored = []   # (score, position, sentence)
    seen = set()
    for position, sent in enumerate(sentences):
        # Score each distinct sentence once: repeated boilerplate must not have
        # its score multiplied, nor appear twice in the summary.
        if sent in seen:
            continue
        seen.add(sent)
        # Counter returns 0 for words it never counted (i.e. stop words).
        score = sum(word_freq[word] for word in simple_word_tokenize(sent))
        if score:
            scored.append((score, position, sent))

    # Highest score first; ties go to the sentence that appears earlier.
    best = sorted(scored, key=lambda item: (-item[0], item[1]))[:num_sentences]
    # Restore document order for readability.
    best.sort(key=lambda item: item[1])
    return ' '.join(sent for _, _, sent in best)

# --- Abstractive Summarization (Transformers) ---
# Loaded once at cell execution; the captured output shows this downloads the
# model files on first use.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_text_abstractive(text, max_tokens=1024):
    """Summarise ``text`` with the module-level ``summarizer`` pipeline.

    The input is limited to ``max_tokens`` *tokens* (including the special
    tokens the tokenizer adds), measured with the pipeline's own tokenizer.
    Slicing the string to ``max_tokens`` characters instead would discard
    most of the text the model could have read.

    Args:
        text: The document to summarise.
        max_tokens: Upper bound on the number of input tokens given to the model.

    Returns:
        The generated summary, or ``""`` when ``text`` is empty or only
        whitespace.
    """
    text = text.strip().replace("\n", " ")
    if not text:
        return ""

    tokenizer = summarizer.tokenizer
    # Leave room for the special tokens added when the pipeline re-encodes the text.
    budget = max(1, max_tokens - tokenizer.num_special_tokens_to_add())
    token_ids = tokenizer.encode(
        text, add_special_tokens=False, truncation=True, max_length=budget
    )
    text = tokenizer.decode(token_ids, skip_special_tokens=True)

    # truncation=True is a safety net in case max_tokens exceeds what the
    # model itself accepts.
    summary = summarizer(
        text, max_length=130, min_length=30, do_sample=False, truncation=True
    )
    return summary[0]['summary_text']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
if __name__ == "__main__":
    # Article used for the demonstration run.
    url = "https://www.indiatoday.in/india/story/no-politics-no-talk-of-statehood-at-the-cost-of-26-lives-omar-abdullah-in-jk-assembly-on-pahalgam-attack-2716201-2025-04-28"

    article_text = scrape_and_extract_paragraphs(url)

    # Preview: only the first 500 characters of the scraped text.
    print("\n Original Text Preview:")
    print(article_text[:500])

    # Print each heading before computing its summary, then the summary itself.
    for heading, summarize in (
        ("\n Extractive Summary:", summarize_text_simple),
        ("\n Abstractive Summary:", summarize_text_abstractive),
    ):
        print(heading)
        print(summarize(article_text))


 Original Text Preview:
Listen to Story Jammu and Kashmir Chief Minister Omar Abdullah on Monday ruled out politicising the Pahalgam terror attack , which claimed 26 lives, saying he would not demand statehood for the Union Territory over the dead bodies of innocent civilians. Expressing deep grief, the National Conference leader said that, while the restoration of statehood remains an important goal, he would defer raising the demand for another time and not make human lives a political bargaining chip. "We are not in

 Extractive Summary:
While the National Conference has continued to press for the restoration of statehood, Omar Abdullah made it clear that political aspirations should not come at the cost of mourning lives lost to terrorism The National Conference leader strongly condemned the terror attack and said, "Today, we want to strongly condemn the Pahalgam terror attack." He also read out the names of the victims in the Assembly, saying the safety and protection of the tour

In [None]:
'''
Sites to try:

(news article) https://www.indiatoday.in/india/story/government-writes-to-bbc-india-head-strong-sentiments-pahalgam-terror-attack-reporting-ib-ministry-watchlist-2716097-2025-04-28
(wikipedia) https://en.wikipedia.org/wiki/Animal
(news page) https://economictimes.indiatimes.com/?from=mdr
(rocket fuel blog) https://rexarc.com/blog/a-quick-guide-to-rocket-fuel/

'''