# Summarize a URL and provide the summary as text and as a speech (audio) file

# TODO: robots.txt


Note: Web scraping might be against the terms of service of some websites. Make sure you have permission to scrape a website and always respect robots.txt and other rules that a website may have around web scraping.


pip install gtts
pip install sumy
pip install gensim
pip install beautifulsoup4
pip install requests

In [1]:
from gtts import gTTS
import IPython.display as ipd

# The text you want to convert to speech.
# Adjacent string literals are concatenated by the parser, so every fragment
# carries its own single trailing space (backslash continuations inside one
# literal make it easy to lose or double the separating spaces).
text = (
    "Ferdowsi's Shahnameh, "
    "due to its special and unique place in the literary and "
    "cultural history of Iran, has always been present "
    "and influential in various fields and fields of life and thought of Iranians."
)

# Language in which you want to convert.
# gTTS picks a regional accent through the Google domain (`tld`) instead of
# regional language codes such as 'en-us': 'en' with tld='com' is the
# American accent.
language = 'en'

# Passing the text and language to the engine; slow=False requests
# normal-speed speech
tts = gTTS(text=text, lang=language, tld='com', slow=False)

# Saving the converted audio in an mp3 file named 'welcome.mp3'
tts.save("welcome.mp3")

# Play the saved file (kept as the last expression of the cell so that the
# notebook renders the audio player)
ipd.Audio("welcome.mp3", autoplay=True)


In [4]:

# URL of the article you want to summarize,
# e.g. https://en.wikipedia.org/wiki/Shiraz
# Surrounding whitespace from a copy/paste is stripped so that the value can
# be passed to the HTTP client as-is.
url = input('Enter your URL: ').strip()

# NLTK (Natural Language Toolkit)
NLTK is a Python package for natural language processing. It provides a set of tools and resources for working with human language data, including tokenization, stemming, tagging, parsing, and semantic reasoning.

In [None]:
import nltk
# Download NLTK's 'punkt' resource.
# NOTE(review): presumably this is the sentence-tokenizer data needed by
# sumy's Tokenizer("english") in the cells below — confirm.
nltk.download('punkt')

In [5]:
# Summarize the text of a URL
from bs4 import BeautifulSoup
import requests
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

def fetch_data_from_url(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        The text of every ``<p>`` tag on the page, joined with single spaces.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    # Fetch the content from the URL
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of silently "summarizing" an
    # error page (or ending up with an empty article).
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Assuming the article text is inside <p> HTML tags
    article_text = ' '.join(p.text for p in soup.find_all('p'))

    return article_text

def summarize_text(text, sentence_count=5):
    """Return an LSA summary of *text* as one space-separated string.

    The text is parsed with sumy's English tokenizer, ``LsaSummarizer`` is
    asked for ``sentence_count`` sentences, and the selected sentences are
    converted to ``str`` and joined with single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    selected = LsaSummarizer()(document, sentence_count)
    return ' '.join(map(str, selected))


# Download the page, show its full text, then show the condensed version
article_text = fetch_data_from_url(url)
print("Original Article:", article_text, sep="\n")

summary = summarize_text(article_text)
print("\n\nSummarized Article:", summary, sep="\n")


Original Article:



Summarized Article:



In [None]:

# url = input('Inter your URL')

from bs4 import BeautifulSoup
import requests
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from gtts import gTTS
import IPython.display as ipd

def fetch_data_from_url(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; prevents
            the call from blocking forever on an unresponsive host.

    Returns:
        The text of all ``<p>`` tags, joined with single spaces.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    article_text = ' '.join(p.text for p in soup.find_all('p'))
    return article_text


def summarize_text(text, sentence_count=5):
    """Return an LSA summary of *text* as a list of sentence strings.

    The text is parsed with sumy's English tokenizer and ``LsaSummarizer``
    is asked for ``sentence_count`` sentences; each selected sentence is
    converted to ``str``.
    """
    tokenizer = Tokenizer("english")
    document = PlaintextParser.from_string(text, tokenizer).document
    lsa = LsaSummarizer()
    return list(map(str, lsa(document, sentence_count)))


def text_to_speech(text, *, language='en', filename="summary.mp3"):
    """Convert *text* to speech, save it as an MP3 and return a player.

    Args:
        text: The text to speak. Must contain at least one non-blank
            character.
        language: gTTS language code used for synthesis.
        filename: Path of the MP3 file that is written.

    Returns:
        An ``IPython.display.Audio`` object for ``filename`` with autoplay
        enabled.

    Raises:
        ValueError: If *text* is empty or only whitespace (for example when
            no article text could be extracted from the page).
    """
    # Reject empty input up front with a clear message instead of letting it
    # fail inside the speech engine.
    if not text or not text.strip():
        raise ValueError("No text to convert to speech")

    tts = gTTS(text=text, lang=language, slow=False)
    tts.save(filename)
    return ipd.Audio(filename, autoplay=True)

# Fetch, summarize, and read aloud
# (`url` is not assigned in this cell; it is expected to come from the
# earlier input cell.)
article_text = fetch_data_from_url(url)
summary_sentences = summarize_text(article_text)
print("Summarized Article:")
# Print one summary sentence per line
for sentence in summary_sentences:
    print(sentence)

# Speak the whole summary as a single utterance. text_to_speech returns the
# audio object; presumably keeping this call as the cell's last expression is
# what makes the notebook display the player.
text_to_speech(" ".join(summary_sentences))

In [None]:
import base64
import io

import dash
import dash_core_components as dcc
import dash_html_components as html
import requests
from bs4 import BeautifulSoup
from dash.dependencies import Input, Output, State
from gtts import gTTS
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lsa import LsaSummarizer

# Define functions
def fetch_data_from_url(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            slow host cannot block the calling request handler indefinitely.

    Returns:
        The text of all ``<p>`` tags, joined with single spaces.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Treat HTTP error responses as failures instead of parsing them.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    article_text = ' '.join(p.text for p in soup.find_all('p'))
    return article_text

def summarize_text(text, sentence_count=5):
    """Summarize *text* with LSA and return the chosen sentences as strings.

    Parsing uses sumy's English tokenizer; ``LsaSummarizer`` is asked for
    ``sentence_count`` sentences and each result is converted with ``str``.
    """
    parsed = PlaintextParser.from_string(text, Tokenizer("english"))
    summarize = LsaSummarizer()
    sentences = []
    for sentence in summarize(parsed.document, sentence_count):
        sentences.append(str(sentence))
    return sentences

def text_to_speech(text):
    """Synthesize *text* with gTTS and return it as a base64 ``data:`` URI.

    The MP3 is generated into an in-memory buffer rather than a fixed file
    in the working directory, so simultaneous requests cannot overwrite each
    other's audio and no write access to the working directory is needed.

    Args:
        text: The text to speak (English).

    Returns:
        A string of the form ``data:audio/mpeg;base64,<payload>`` that can be
        used directly as the ``src`` of an audio element.
    """
    buffer = io.BytesIO()
    gTTS(text=text, lang='en', slow=False).write_to_fp(buffer)
    encoded_audio = base64.b64encode(buffer.getvalue()).decode()
    return f"data:audio/mpeg;base64,{encoded_audio}"

# Create Dash app
app = dash.Dash(__name__)

# App layout: a URL field, a trigger button and the audio player.
# The player is rendered visibly (with controls): its style is never updated
# after the page loads, so hiding it with display:none would leave the
# generated speech with no way to be played.
app.layout = html.Div([
    dcc.Input(id='url-input', type='text', placeholder='Enter URL'),
    html.Button('Fetch and Summarize', id='fetch-button'),
    html.Audio(id='audio-output', controls=True),
])

# Callback
@app.callback(
    Output('audio-output', 'src'),
    Input('fetch-button', 'n_clicks'),
    [State('url-input', 'value')]
)
def fetch_and_summarize(n_clicks, url):
    """Summarize the page at *url* and return its speech as an audio source.

    Triggered by clicks on the fetch button; the URL box is read as state, so
    typing in it does not trigger the callback on its own.

    Args:
        n_clicks: Click count of the fetch button (``None`` before the first
            click).
        url: Current content of the URL input (``None`` while empty).

    Returns:
        A ``data:audio/mpeg;base64,...`` URI for the audio element, or
        ``None`` when there is nothing to play (no click yet, no URL, or the
        fetch/summarize/synthesis step failed).
    """
    # No click yet, or nothing usable typed in: skip the network round trip
    # instead of letting the HTTP client fail on a missing URL.
    if n_clicks is None or not url or not url.strip():
        return None

    try:
        article_text = fetch_data_from_url(url.strip())
        summary_sentences = summarize_text(article_text)
        summary_text = " ".join(summary_sentences)
        audio_src = text_to_speech(summary_text)
        return audio_src
    except Exception as e:
        # Top-level UI boundary: report the problem and leave the player
        # empty rather than crashing the callback.
        print(f"Error: {e}")  # Print error for debugging
        return None  # Handle error gracefully in the app

# Run the app
# The server is started only when this file is executed as a script, not
# when it is imported.
# NOTE(review): debug=True presumably turns on Dash's development tooling —
# confirm it is switched off for anything other than local use.
if __name__ == '__main__':
    app.run_server(debug=True)
