<a href="https://colab.research.google.com/github/Sidhtang/bert-project/blob/main/website_summariser_using_bart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install tensorflow
!pip install requests
!pip install bs4
!pip install lxml
!pip install streamlit

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2
Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[

In [2]:
from transformers import pipeline
import requests
from bs4 import BeautifulSoup
import re
import streamlit as st

In [3]:

def extractText(url, timeout=10):
    """Download a web page and return the text of its headings and paragraphs.

    Only ``h1``-``h6`` and ``p`` elements are considered. Any element whose
    text mentions one of the boilerplate keywords ('disclaimer', 'cookie',
    'privacy policy', matched case-insensitively) is dropped.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The kept elements' text joined with single spaces, with runs of blank
        lines collapsed to one newline. If the server answers with a status
        code other than 200, the literal string "Error in response" is
        returned instead.

    Raises:
        Exceptions raised by ``requests.get`` (for example on connection
        failure or timeout) are not caught here and propagate to the caller.
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return "Error in response"

    soup = BeautifulSoup(response.text, 'lxml')
    excludeList = ('disclaimer', 'cookie', 'privacy policy')

    kept = []
    for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
        # Extract the text once and reuse it for both the filter and the output.
        content = element.get_text()
        lowered = content.lower()
        if any(keyword in lowered for keyword in excludeList):
            continue
        kept.append(content)

    text = " ".join(kept)
    # Collapse blank (or whitespace-only) lines into a single line break.
    return re.sub(r'\n\s*\n', '\n', text)

In [4]:
def splitTextIntoChunks(text, chunk_size=1024):
  """Cut ``text`` into consecutive slices of at most ``chunk_size`` characters.

  The slices are taken by character offset, in order; every slice except
  possibly the last one is exactly ``chunk_size`` characters long. An empty
  ``text`` yields an empty list.
  """
  offsets = range(0, len(text), chunk_size)
  return [text[begin:begin + chunk_size] for begin in offsets]

In [11]:
# Summarization pipelines that have already been built, keyed by model name.
_SUMMARIZER_CACHE = {}


def _getSummarizer(model_name="facebook/bart-large-cnn"):
  """Return the summarization pipeline for ``model_name``, building it at most once."""
  if model_name not in _SUMMARIZER_CACHE:
    _SUMMARIZER_CACHE[model_name] = pipeline("summarization", model=model_name)
  return _SUMMARIZER_CACHE[model_name]


def summarize(text, chunk_size=1024, chunk_summary_size=128):
  """Summarize ``text`` chunk by chunk and join the partial summaries.

  The text is split with ``splitTextIntoChunks`` and each chunk is summarized
  on its own with the ``facebook/bart-large-cnn`` pipeline.

  Args:
    text: The text to summarize.
    chunk_size: Maximum number of characters per chunk.
    chunk_summary_size: ``max_length`` handed to the summarizer for each
      chunk. For a chunk with fewer characters than this value, half the
      chunk's character count is used instead.

  Returns:
    The chunk summaries joined with single spaces. Empty input yields an
    empty string.
  """
  summaries = []
  for chunk in splitTextIntoChunks(text, chunk_size):
    # Whitespace-only chunks carry nothing to summarize.
    if not chunk.strip():
      continue

    size = chunk_summary_size
    if len(chunk) < chunk_summary_size:
      size = len(chunk) // 2

    # A chunk this short would ask the model for a zero-length summary;
    # keep it verbatim instead.
    if size < 1:
      summaries.append(chunk.strip())
      continue

    # The model is loaded lazily and only once: building the pipeline is by
    # far the most expensive step and callers invoke summarize() repeatedly.
    # truncation=True guards against a chunk that tokenizes to more tokens
    # than the model accepts (chunks are measured in characters, not tokens).
    result = _getSummarizer()(chunk, max_length=size, truncation=True)
    summaries.append(result[0]["summary_text"])

  return " ".join(summaries)

In [13]:
# Smoke test of the pipeline on one sample article: fetch the page text with
# extractText, then summarize it.
url = 'https://www.geeksforgeeks.org/next-sentence-prediction-using-bert/?ref=lbp'

# NOTE: extractText returns the string "Error in response" on a non-200 reply,
# in which case that string is what gets summarized below.
text = extractText(url)
# Last expression of the cell, so the summary string is shown as the cell output.
summarize(text)



'BERT stands for Bidirectional Representation for Transformers. It was proposed by researchers at Google Research in 2018. BERT is trained on a variety of different tasks to improve the language understanding of the model. In this article, we will discuss the tasks under the next sentence prediction for BERT. BERT architecture 3rd type. In the above architecture, the [CLS] token is the first token in the input. This means an input sentence is coming. The [SEP] represents the separation between the different inputs. Here, the inputs sentence are tokenized according to BERT vocab. '

In [None]:
# Streamlit front end: ask for a URL, extract the page text, summarize it, then
# summarize the summary once more to condense it.
# NOTE(review): Streamlit widgets are expected to render only when this code is
# executed as a script through `streamlit run`, not as a plain notebook cell —
# confirm how this cell is meant to be launched.
st.title("Website Summarizer")

url = st.text_input("Enter the website URL")

if st.button("Summarize"):
    # Ignore surrounding whitespace so a blank-looking entry counts as empty.
    target_url = url.strip()
    if target_url:
        # Placeholder reused for the progress messages of each stage.
        info_text = st.empty()
        try:
            info_text.info("Extracting text from the website...")
            article = extractText(target_url)
            # extractText signals a non-200 reply with a sentinel string; do
            # not feed that (or an empty page) into the summarizer.
            if article == "Error in response" or not article.strip():
                st.error("An error occurred. Please check the URL or try again later.")
            else:
                info_text.info("Summarizing the text...")
                summarized = summarize(article)
                info_text.info("Adding final touches...")
                finalSummary = summarize(summarized)

                st.header("Summarized Text")
                st.write(finalSummary)
        except Exception as e:
            st.error("An error occurred. Please check the URL or try again later.")
            # Surface the cause so a failure can actually be diagnosed.
            st.caption(f"Details: {type(e).__name__}: {e}")
        finally:
            # Always remove the progress message, including on failure.
            info_text.empty()
    else:
        st.warning("Please enter a valid website URL.")