In [2]:
!pip install --upgrade nltk transformers requests beautifulsoup4
import nltk

# Fetch the tokenizer models and stop-word list used by the code below.
# Recent NLTK releases load word/sentence tokenizer data from 'punkt_tab'
# while older ones use 'punkt'; because the pip line above upgrades NLTK,
# request both so tokenization works on either side of that change.
# (nltk.download reports an unknown resource and returns False instead of
# raising, so the extra request is harmless on old versions.)
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

from bs4 import BeautifulSoup
import requests
from transformers import pipeline
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string

class WebContentAgent:
    """Fetch a web page and derive keywords and a summary from its text.

    The text of every ``<p>`` element is joined into one string. Keywords are
    the most frequent non-stop-word tokens. Short texts are summarized with a
    Hugging Face summarization pipeline; long texts are summarized by picking
    the sentences whose words are most frequent in the document.
    """

    # Texts with fewer tokens than this go straight to the model.
    SHORT_TEXT_WORDS = 500
    # Fraction of sentences kept by the extractive (long-text) summary.
    TOP_SENTENCE_RATIO = 0.3
    # Number of characters kept when falling back to plain truncation.
    FALLBACK_CHARS = 500

    def __init__(self):
        # Use a smaller model that's more stable for Colab
        self.summarizer = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=-1
        )

    def process_url(self, url):
        """Download *url* and return a result dict.

        On success the dict has ``url``, ``keywords``, ``summary`` and
        ``status == 'success'``. On any failure it has ``url``,
        ``status == 'error'`` and ``message``; nothing is raised.
        """
        try:
            # Step 1: Fetch web content
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            # Step 2: Parse content
            soup = BeautifulSoup(response.text, 'html.parser')
            text = ' '.join(p.get_text() for p in soup.find_all('p'))

            # Collapse all runs of whitespace; empty means nothing readable.
            cleaned_text = ' '.join(text.split())
            if not cleaned_text:
                failure = self._error_result(url, 'No readable text found')
                # Keep the historical 'error' key for callers that read it.
                failure['error'] = 'No readable text found'
                return failure

            # Step 3: Process content
            return {
                'url': url,
                'keywords': self._extract_keywords(cleaned_text),
                'summary': self._safe_summarize(cleaned_text),
                'status': 'success'
            }

        except Exception as e:
            # Top-level boundary: report every failure (network, HTTP status,
            # parsing, NLP) to the caller as data instead of raising.
            return self._error_result(url, str(e))

    @staticmethod
    def _error_result(url, message):
        """Build the uniform failure dict returned by ``process_url``."""
        return {
            'url': url,
            'status': 'error',
            'message': message
        }

    def _content_words(self, tokens):
        """Return lowercase, purely alphabetic, non-stop-word tokens in order."""
        # Build the stop-word set once per call; membership tests are then O(1)
        # instead of re-reading the corpus list for every token.
        stop_words = set(stopwords.words('english'))
        return [
            token.lower() for token in tokens
            if token.isalpha() and token.lower() not in stop_words
        ]

    def _extract_keywords(self, text, num_keywords=10):
        """Return up to *num_keywords* most frequent content words of *text*."""
        words = self._content_words(word_tokenize(text))
        return [word for word, _ in nltk.FreqDist(words).most_common(num_keywords)]

    def _truncate(self, text):
        """Fallback 'summary': the leading characters of *text* plus an ellipsis."""
        return text[:self.FALLBACK_CHARS] + "..."

    @staticmethod
    def _top_sentence_indices(sentence_scores, total_sentences, ratio=0.3):
        """Return indices of the best-scoring sentences, in document order.

        Keeps ``int(total_sentences * ratio)`` entries of *sentence_scores*
        (a mapping of sentence index to score); ties keep the mapping's
        insertion order because the sort is stable.
        """
        limit = int(total_sentences * ratio)
        ranked = sorted(
            sentence_scores.items(),
            key=lambda item: item[1],
            reverse=True
        )[:limit]
        return sorted(index for index, _ in ranked)

    def _safe_summarize(self, text, target_summary_length=150):
        """Summarize *text*, never raising on model failure.

        Texts under ``SHORT_TEXT_WORDS`` tokens are passed to the model with
        ``max_length=target_summary_length``; if the model call fails the text
        is truncated instead. Longer texts get an extractive summary made of
        the top-scoring sentences in their original order; if that summary is
        50 characters or shorter the truncated text is returned.
        """
        tokens = word_tokenize(text)  # tokenized once, reused below

        # If text is short, summarize directly
        if len(tokens) < self.SHORT_TEXT_WORDS:
            try:
                summary = self.summarizer(
                    text,
                    max_length=target_summary_length,
                    min_length=30,
                    do_sample=False
                )
                return summary[0]['summary_text']
            except Exception:
                # Best effort: a model failure degrades to truncation, while
                # KeyboardInterrupt/SystemExit still propagate.
                return self._truncate(text)

        # For long texts: extract most important sentences
        sentences = sent_tokenize(text)
        word_freq = nltk.FreqDist(self._content_words(tokens))

        # A sentence's score is the summed document frequency of its content
        # words; sentences without any such word get no entry at all.
        sentence_scores = {}
        for index, sentence in enumerate(sentences):
            matched = [
                word_freq[word]
                for word in word_tokenize(sentence.lower())
                if word in word_freq
            ]
            if matched:
                sentence_scores[index] = sum(matched)

        chosen = self._top_sentence_indices(
            sentence_scores, len(sentences), self.TOP_SENTENCE_RATIO
        )
        summary = ' '.join(sentences[index] for index in chosen)

        return summary if len(summary) > 50 else self._truncate(text)

# Demo run (written for Colab): summarize a Wikipedia article
agent = WebContentAgent()
result = agent.process_url("https://en.wikipedia.org/wiki/Mukesh_Ambani")

if result.get('status') == 'success':
    print("\nKeywords:", result['keywords'])
    print("\nSummary:", result['summary'])
else:
    # Failure details may be under 'message' or, for the no-readable-text
    # case, under 'error'; check both so the output is never "Error: None".
    print("Error:", result.get('message') or result.get('error'))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Device set to use cpu



Keywords: ['ambani', 'india', 'reliance', 'mumbai', 'world', 'richest', 'billion', 'million', 'mukesh', 'indian']

Summary: Mukesh Dhirubhai Ambani (born 19 April 1957) is an Indian billionaire and businessman who is the chairman and managing director of Reliance Industries. [8][9][10][11][12] In October 2024, Ambani was ranked 1st on the Forbes list of India's 100 richest tycoons, with a net worth of $119.5 billion. [13] Ambani was born on 19 April 1957 in the British Crown colony of Aden (present-day Yemen) into a Gujarati Hindu family to Dhirubhai Ambani and Kokilaben Ambani. [14] Ambani lived only briefly in Yemen because his father decided to move back to India in 1958 to start a trading business that focused on spices and textiles. [18] The family's financial status slightly improved when they moved to India but Ambani still lived in a communal society, used public transportation, and never received an allowance. [19] Dhirubhai later purchased a 14-floor apartment block called '