In [15]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
from transformers import pipeline, AutoTokenizer


In [16]:
# Function to fetch webpage content and extract text
def fetch_webpage_text(url, timeout=10):
    """Download a webpage and return the text of its ``<p>`` elements.

    Failures are reported through the return value rather than raised, so
    callers must inspect the returned string.

    Args:
        url: Address of the page. Must be a string starting with
            ``http://`` or ``https://``.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled server would
            block the call indefinitely.

    Returns:
        The non-empty paragraph texts joined by single spaces and stripped,
        or one of these messages:

        * ``"Invalid URL. ..."`` when ``url`` is not an http(s) string,
        * ``"No textual content found on the webpage."`` when no paragraph
          text was extracted,
        * ``"Error fetching the webpage: ..."`` when the request failed.
    """
    # Guard the type as well: a non-string has no usable ``startswith``.
    if not isinstance(url, str) or not url.startswith(("http://", "https://")):
        return "Invalid URL. Please ensure the URL starts with 'http://' or 'https://'."

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        soup = BeautifulSoup(response.text, 'html.parser')

        # Keep the text of every <p> element that is not empty.
        paragraph_texts = (p.get_text() for p in soup.find_all('p'))
        text = ' '.join(t for t in paragraph_texts if t).strip()
        if not text:
            return "No textual content found on the webpage."
        return text
    except requests.exceptions.RequestException as e:
        return f"Error fetching the webpage: {e}"


In [17]:
# Function to clean and preprocess the extracted text
def preprocess_text(text):
    """Normalize whitespace and drop every non-ASCII character from ``text``.

    The text is split on whitespace, each word is reduced to its ASCII
    characters, and words that become empty are discarded before the words
    are re-joined with single spaces. Filtering per word (instead of after
    joining) prevents a removed standalone symbol such as an em dash from
    leaving a double or leading space behind.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string: ASCII only, single-space separated, with no
        leading or trailing whitespace.
    """
    ascii_words = (
        # 'ignore' silently drops characters that cannot be encoded as ASCII.
        word.encode("ascii", "ignore").decode("ascii")
        for word in text.split()
    )
    return ' '.join(word for word in ascii_words if word)


In [18]:
# Function to split text into chunks based on tokenization limits
def split_text_into_tokenized_chunks(text, tokenizer, max_tokens=1024):
    """Split ``text`` on word boundaries into chunks of at most ``max_tokens`` tokens.

    Words are added to the current chunk one at a time. After each addition
    the whole chunk is re-tokenized, and once the count exceeds
    ``max_tokens`` the chunk is closed without the new word, which then
    starts the next chunk.

    Args:
        text: The string to split; words are separated on whitespace.
        tokenizer: Callable that takes a string and returns a mapping whose
            ``'input_ids'`` entry has one item per token.
        max_tokens: Maximum token count allowed per chunk.

    Returns:
        A list of chunk strings, each made of whole words joined by single
        spaces. Empty input yields an empty list. A single word that on its
        own exceeds ``max_tokens`` cannot be split here and is returned as
        its own (oversized) chunk.
    """
    chunks = []
    current_words = []

    for word in text.split():
        current_words.append(word)
        token_count = len(tokenizer(' '.join(current_words))['input_ids'])
        # Only flush when there are earlier words to flush: if the new word
        # alone is over the limit, closing the chunk here would emit an
        # empty string as a chunk.
        if token_count > max_tokens and len(current_words) > 1:
            current_words.pop()  # the new word belongs to the next chunk
            chunks.append(' '.join(current_words))
            current_words = [word]

    if current_words:
        chunks.append(' '.join(current_words))  # trailing partial chunk

    return chunks


In [19]:
# Function to truncate a summary to a specific word limit while preserving sentences
def truncate_summary_to_word_limit(summary, word_limit=200):
    """Shorten ``summary`` to at most ``word_limit`` words, ending on a sentence.

    Args:
        summary: The text to shorten.
        word_limit: Maximum number of whitespace-separated words to keep.

    Returns:
        ``summary`` unchanged when it already fits. Otherwise the first
        ``word_limit`` words joined by single spaces, cut back to the last
        complete sentence. A sentence end is a ``.``, ``!`` or ``?``
        (optionally followed by closing quotes/brackets) that is followed
        by a space or the end of the text, so a decimal point such as the
        one in ``3.5`` is not mistaken for one. If no sentence end exists,
        the hard word-limit truncation is returned.
    """
    words = summary.split()
    if len(words) <= word_limit:
        return summary  # already within the limit

    truncated = ' '.join(words[:word_limit])
    terminators = '.!?'
    closers = '"\')]'
    end = len(truncated)

    # Walk backwards to find the last terminator that really ends a sentence.
    for index in range(end - 1, -1, -1):
        if truncated[index] not in terminators:
            continue
        stop = index + 1
        # Keep closing quotes/brackets attached to the sentence they close.
        while stop < end and truncated[stop] in closers:
            stop += 1
        if stop == end or truncated[stop] == ' ':
            return truncated[:stop]

    return truncated  # no sentence boundary: fall back to hard truncation


In [20]:
# Function to summarize the text using a pre-trained model
def summarize_text(text, tokenizer, max_length=200, min_length=100, final_word_limit=300,
                   summarizer=None):
    """Summarize ``text`` chunk by chunk and return one combined summary.

    The text is split with ``split_text_into_tokenized_chunks``, each chunk
    is summarized separately, the partial summaries are joined with spaces,
    and the result is shortened with ``truncate_summary_to_word_limit``.

    Args:
        text: The text to summarize.
        tokenizer: Tokenizer used to size the chunks; called with a string
            and expected to return a mapping with an ``'input_ids'`` entry.
        max_length: Upper bound passed to the summarizer for each chunk.
        min_length: Lower bound passed to the summarizer for each chunk.
        final_word_limit: Word limit applied to the combined summary.
        summarizer: Optional, already constructed summarization pipeline.
            When omitted, a ``facebook/bart-large-cnn`` pipeline is created
            on every call, which is slow; pass one in to reuse it.

    Returns:
        The combined, truncated summary, or a string starting with
        ``"Error during summarization: "`` if any step raised.
    """
    try:
        if summarizer is None:
            summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        chunks = split_text_into_tokenized_chunks(text, tokenizer)

        # Count tokens once per chunk; the counts are reused below.
        print("\nDebug: Tokenized chunk sizes:")
        token_counts = []
        for chunk in chunks:
            token_count = len(tokenizer(chunk)['input_ids'])
            print(f"Chunk token count: {token_count}")
            token_counts.append(token_count)

        summaries = []
        for chunk, token_count in zip(chunks, token_counts):
            if token_count <= min_length:
                # The chunk is already no longer than the shortest summary
                # we would request; forcing the model to emit at least
                # min_length tokens would only pad it, so keep it verbatim.
                summaries.append(chunk)
                continue
            # Never ask for a summary longer than the chunk itself.
            chunk_max_length = min(max_length, token_count)
            result = summarizer(chunk, max_length=chunk_max_length, min_length=min_length,
                                do_sample=False)
            summaries.append(result[0]['summary_text'])

        # Combine summaries into one and truncate to fit the final word limit
        combined_summary = ' '.join(summaries)
        return truncate_summary_to_word_limit(combined_summary, word_limit=final_word_limit)
    except Exception as e:
        # Best effort: report the failure as text so the caller can print it.
        return f"Error during summarization: {e}"


In [21]:
# Main function to orchestrate the summarization process
def main():
    """Prompt for a URL, fetch the page, and print a summary of its text.

    Steps: read the URL from standard input, download the page with
    ``fetch_webpage_text``, print a short preview, clean the text with
    ``preprocess_text``, load the ``facebook/bart-large-cnn`` tokenizer and
    print the result of ``summarize_text``. If the page could not be
    fetched or contained no text, the reason is printed and nothing is
    summarized.
    """
    # Input the URL
    url = input("Enter the URL of the webpage to summarize: ").strip()
    print("\nFetching webpage content...\n")

    # Fetch webpage content
    webpage_text = fetch_webpage_text(url)

    # fetch_webpage_text reports every failure as a message string, and only
    # one of those messages starts with "Error". Check all of them so an
    # invalid URL or an empty page is not fed to the summarizer as content.
    failure_prefixes = ("Error", "Invalid URL", "No textual content found")
    if webpage_text.startswith(failure_prefixes):
        print(webpage_text)
        return

    # Debugging: Print extracted text length
    print(f"\nExtracted text length: {len(webpage_text.split())} words\n")
    print("Extracted text preview:\n", webpage_text[:500], "\n...")  # Show first 500 characters

    # Preprocess text
    clean_text = preprocess_text(webpage_text)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

    # Generate summary
    print("\nGenerating summary...\n")
    summary = summarize_text(clean_text, tokenizer, final_word_limit=300)
    print("\nSummarized Text:\n")
    print(summary)

# Run the summarizer interactively: main() prompts for a URL with input()
# and prints the progress messages and the summary to the cell output.
main()



Fetching webpage content...


Extracted text length: 11322 words

Extracted text preview:
 Starbucks Corporation is an American multinational chain of coffeehouses and roastery reserves headquartered in Seattle, Washington. It was founded in 1971 by Jerry Baldwin, Zev Siegl, and Gordon Bowker at Seattle's Pike Place Market initially as a coffee bean wholesaler. Starbucks was converted into a coffee shop serving espresso-based drinks under the ownership of Howard Schultz, who was chief executive officer from 1986 to 2000 and led the aggressive expansion of the franchise across the West 
...

Generating summary...



Device set to use cpu



Debug: Tokenized chunk sizes:
Chunk token count: 1024
Chunk token count: 1024
Chunk token count: 1024
Chunk token count: 1024
Chunk token count: 1024
Chunk token count: 1024
Chunk token count: 1024
Chunk token count: 1024
Chunk token count: 1024
Chunk token count: 1024
Chunk token count: 1018
Chunk token count: 1024
Chunk token count: 1024
Chunk token count: 1024
Chunk token count: 967

Summarized Text:

Starbucks is an American multinational chain of coffeehouses and roastery reserves headquartered in Seattle, Washington. It was founded in 1971 by Jerry Baldwin, Zev Siegl, and Gordon Bowker at Seattle's Pike Place Market. As of November 2022, the company had 35,711 stores in 80 countries, 15,873 of which were located in the United States. Starbucks serves hot and cold drinks, whole-bean coffee, micro-ground instant coffee, espresso, caffe latte, full and loose-leaf teas, juices, Frappuccino beverages, pastries, and snacks. Depending on the country, most locations provide free Wi-Fi I