In [11]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
import re

In [12]:
# Download the NLTK data packages this notebook relies on. The captured output
# below shows nltk.download() reporting "already up-to-date" when a package is
# present, so re-running this cell is cheap.
nltk.download('punkt')  # presumably the tokenizer models behind word_tokenize — confirm
nltk.download('averaged_perceptron_tagger')  # presumably the tagger model behind pos_tag — confirm
# NOTE(review): nothing visible in this notebook reads the 'words' corpus, while
# `stopwords` is imported without a matching download — confirm which corpus is intended.
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [13]:
# POS tags treated as nouns by analyze_text.
_NOUN_TAGS = frozenset({'NN', 'NNS', 'NNP', 'NNPS'})

# A token that is a number from start to end: digits, optionally continued by
# ','- or '.'-separated digit groups (e.g. "2024", "1,000", "3.5").
_NUMBER_RE = re.compile(r'\d+(?:[.,]\d+)*')


def analyze_text(text):
    """Tokenize and POS-tag ``text`` and summarise its nouns, numbers and tags.

    Args:
        text: The string to analyse.

    Returns:
        A 4-tuple ``(tagged_tokens, nouns, numbers, pos_tag_count)``:

        * ``tagged_tokens`` - the ``(word, tag)`` pairs returned by ``pos_tag``.
        * ``nouns`` - words whose tag is one of NN, NNS, NNP, NNPS, in order.
        * ``numbers`` - tokens that consist entirely of a number such as
          "2024", "1,000" or "3.5", in order.
        * ``pos_tag_count`` - plain dict mapping each tag to how many tokens
          carry it, keyed in first-seen order.
    """
    # Tokenize the text
    tokens = word_tokenize(text)

    # Generate POS tags
    tagged_tokens = pos_tag(tokens)

    # Extract nouns
    nouns = [word for word, tag in tagged_tokens if tag in _NOUN_TAGS]

    # Extract numbers. fullmatch (not match) is deliberate: match only anchors
    # at the start, so it would also accept tokens that merely begin with a
    # digit, such as "19th" or "3D".
    numbers = [word for word in tokens if _NUMBER_RE.fullmatch(word)]

    # Count POS tags
    pos_tag_count = {}
    for _, tag in tagged_tokens:
        pos_tag_count[tag] = pos_tag_count.get(tag, 0) + 1

    return tagged_tokens, nouns, numbers, pos_tag_count

In [14]:
!pip install requests beautifulsoup4




In [15]:
import requests
from bs4 import BeautifulSoup

def extract_text_from_url(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an unresponsive
            host, which would hang the notebook cell.

    Returns:
        On HTTP 200, the text of every ``<p>`` element joined with single
        spaces. On any other status code, the string
        ``"Failed to retrieve content. Status code: <code>"``; callers must
        check for this message before treating the result as article text.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            (e.g. connection failure or timeout).
    """
    # Send a GET request to the webpage
    response = requests.get(url, timeout=timeout)

    # Bail out early if the request was not successful
    if response.status_code != 200:
        return f"Failed to retrieve content. Status code: {response.status_code}"

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Paragraph tags are a common way to get at article body text
    return ' '.join(para.get_text() for para in soup.find_all('p'))

# URL of the webpage to analyze
url = 'https://www.jesuits.global/2024/08/19/the-dialogue-between-faith-and-science-in-the-spirit-of-laudato-si/'

# Extract text from the URL
# NOTE(review): on a non-200 response extract_text_from_url returns an error
# message string rather than article text, so `text` would then be analyzed as
# if it were the article — check it before running the analysis cell.
text = extract_text_from_url(url)

# Print the extracted text (or handle it as needed)
# NOTE(review): this prints the whole article into the cell output.
print(text)


By József Benedek; Gábor Nevelős,
SJ | Hungarian Province
[From “Jesuits 2024 - The Society of Jesus in the world”] The Faludi Ferenc Jesuit Academy launched in
2022 a new dialogue between faith and science through a series of eight
“mirror” conferences held between January and June. The novelty of this
dialogue is that it brought to the same table representatives of religious
institutions and men and women with a scientific background to debate on
selected topics relevant both for the protection of creation and sustainable
development. The following topics were selected for social reflection and
debate: partnership and dialogue, green economy, sustainable lifestyle, climate
change, poverty, sustainable communities, environmental change, social justice.
The original approach of these series of mirror conferences, entitled “Forum
for an Integral Ecology,” was embedded in the format of all the conferences, so
that each selected topic was addressed by two specialists, one presenting the
r

In [16]:
# Analyze the scraped text. analyze_text returns a 4-tuple, unpacked here as:
# the (word, tag) pairs, the noun words, the tokens picked out by its digit
# regex, and a dict mapping each POS tag to its count.
tagged_tokens, nouns, numbers, pos_tag_count = analyze_text(text)

In [17]:
# Show each result under its heading. sep="\n" puts the value on the line after
# the heading; the leading "\n" in the later headings leaves a blank line
# between sections.
print("POS Tagged Tokens:", tagged_tokens, sep="\n")
print("\nNouns:", nouns, sep="\n")
print("\nNumbers:", numbers, sep="\n")
print("\nPOS Tag Count:", pos_tag_count, sep="\n")

POS Tagged Tokens:
[('By', 'IN'), ('József', 'NNP'), ('Benedek', 'NNP'), (';', ':'), ('Gábor', 'NNP'), ('Nevelős', 'NNP'), (',', ','), ('SJ', 'NNP'), ('|', 'NNP'), ('Hungarian', 'NNP'), ('Province', 'NNP'), ('[', 'NN'), ('From', 'NNP'), ('“', 'NNP'), ('Jesuits', 'NNP'), ('2024', 'CD'), ('-', ':'), ('The', 'DT'), ('Society', 'NNP'), ('of', 'IN'), ('Jesus', 'NNP'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('”', 'NN'), (']', 'VBP'), ('The', 'DT'), ('Faludi', 'NNP'), ('Ferenc', 'NNP'), ('Jesuit', 'NNP'), ('Academy', 'NNP'), ('launched', 'VBN'), ('in', 'IN'), ('2022', 'CD'), ('a', 'DT'), ('new', 'JJ'), ('dialogue', 'NN'), ('between', 'IN'), ('faith', 'NN'), ('and', 'CC'), ('science', 'NN'), ('through', 'IN'), ('a', 'DT'), ('series', 'NN'), ('of', 'IN'), ('eight', 'CD'), ('“', 'NNS'), ('mirror', 'NN'), ('”', 'JJ'), ('conferences', 'NNS'), ('held', 'VBD'), ('between', 'IN'), ('January', 'NNP'), ('and', 'CC'), ('June', 'NNP'), ('.', '.'), ('The', 'DT'), ('novelty', 'NN'), ('of', 'IN'), ('