<a href="https://colab.research.google.com/github/Samarth1642002/TEXT_Summarization_threads/blob/main/webpage_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import threading
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from heapq import nlargest
import time

# Load the spaCy pipeline named "en_core_web_sm" once at module level; the
# summarization code below calls this shared object to parse page text.
nlp = spacy.load("en_core_web_sm")
# spaCy's English stop words copied into a list; tokens whose lowercase text
# appears here are left out of the word-frequency counts.
stopwords = list(STOP_WORDS)
# Additional lowercase words that are excluded from the word-frequency counts
# in the same way as stop words.
cue_phrases = ["incidentally", "example", "anyway", "furthermore","according","first", "second", "then", "now", "thus", "moreover", "therefore", "hence", "lastly", "finally", "summary"]


In [None]:
def get_webpage_text(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of every ``<p>`` element, in document order, joined with
        single spaces.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive host,
    # which would block the calling thread forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of returning the text of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # Join with a space so the last word of one paragraph is not glued to the
    # first word of the next; join also avoids repeated string concatenation.
    return " ".join(para.get_text() for para in soup.find_all('p'))

In [None]:
def summarize_webpage(url, *, num_sentences=10, max_sentence_words=25):
    """Print an extractive, frequency-based summary of the page at *url*.

    Each word that is not a stop word, cue phrase, punctuation mark or
    whitespace token is counted, and the counts are normalized by the highest
    count. A sentence's score is the sum of the normalized counts of its
    tokens. The highest-scoring sentences are joined with spaces and printed.

    Args:
        url: Address of the page to summarize.
        num_sentences: Maximum number of sentences in the summary.
        max_sentence_words: Sentences with this many or more space-separated
            pieces are never selected.

    Returns:
        None. The summary, or an error message if anything fails, is printed
        to standard output.
    """
    try:
        webpage_text = get_webpage_text(url)
        docx = nlp(webpage_text)

        # Build the exclusion set once so every token costs a single hash
        # lookup instead of a scan through two lists.
        excluded = set(stopwords)
        excluded.update(cue_phrases)

        word_frequencies = {}
        for token in docx:
            # Punctuation and whitespace tokens are not words; counting them
            # would let commas and newlines dominate the normalization.
            if token.is_punct or token.is_space:
                continue
            lowered = token.text.lower()
            if lowered not in excluded:
                word_frequencies[lowered] = word_frequencies.get(lowered, 0) + 1

        if not word_frequencies:
            # Report an explicit reason rather than the opaque error max()
            # raises on an empty sequence.
            raise ValueError("no summarizable text found on the page")

        maximum_frequency = max(word_frequencies.values())
        word_frequencies = {
            word: count / maximum_frequency
            for word, count in word_frequencies.items()
        }

        sentence_scores = {}
        for sent in docx.sents:
            # The length filter depends only on the sentence, so evaluate it
            # once per sentence rather than once per token.
            if len(sent.text.split(' ')) >= max_sentence_words:
                continue
            for token in sent:
                weight = word_frequencies.get(token.text.lower())
                if weight is not None:
                    sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

        summarized_sentences = nlargest(
            num_sentences, sentence_scores, key=sentence_scores.get
        )
        final_summary = ' '.join([w.text for w in summarized_sentences])
        print(f"Summary for URL {url}:\n{final_summary}\n", flush=True)
    except Exception as e:
        # This function is used as a thread target; report the failure for
        # this URL instead of letting the exception escape the worker.
        print(f"An error occurred for URL {url}: {e}\n", flush=True)


In [None]:


# Script entry point: summarize several pages concurrently, one thread per
# URL, and report the total elapsed time.
if __name__ == '__main__':
    urls = [
        "https://en.wikipedia.org/wiki/History_of_India",
        "https://en.wikipedia.org/wiki/India",
        "https://en.wikipedia.org/wiki/Machine_learning",
        "https://en.wikipedia.org/wiki/Big_data",
        "https://en.wikipedia.org/wiki/Data_science",
    ]

    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()
    threads = []

    for url in urls:
        # summarize_webpage prints its own result, so the thread needs no
        # return channel; it takes the URL as its only positional argument.
        thread = threading.Thread(target=summarize_webpage, args=(url,))
        thread.start()
        threads.append(thread)

    # Wait for every worker before reporting the total time.
    for thread in threads:
        thread.join()

    print(f"Total execution time: {time.perf_counter() - start_time} seconds")