In [3]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import requests
from bs4 import BeautifulSoup

# Tokenizer data used by sent_tokenize / word_tokenize. Recent NLTK releases load
# their Punkt models from 'punkt_tab' rather than 'punkt', so fetch both here;
# otherwise this cell only works if another cell already downloaded 'punkt_tab'.
nltk.download('punkt')
nltk.download('punkt_tab')

# Three-sentence demo paragraph fed to the tokenizers in the __main__ section below.
sample_paragraph = (
    "Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence. "
    "It focuses on the interaction between humans and computers using natural language. "
    "Tokenization is one of the fundamental steps in NLP, where text is broken down into sentences or words."
)

def tokenize_sentences(paragraph):
    """Split ``paragraph`` into sentences using NLTK's ``sent_tokenize``."""
    sentence_list = sent_tokenize(paragraph)
    return sentence_list

def tokenize_words(paragraph):
    """Split ``paragraph`` into word and punctuation tokens using NLTK's ``word_tokenize``."""
    token_list = word_tokenize(paragraph)
    return token_list

def clean_text(text):
    """Lowercase ``text`` and drop everything except ASCII letters, digits and whitespace."""
    lowered = text.lower()
    # Anything outside the allowed character class is deleted, not replaced.
    return re.compile(r'[^a-zA-Z0-9\s]').sub('', lowered)

def extract_emails(text):
    """Return the email-like substrings found in ``text``, in order of appearance."""
    # The pattern has no capture groups, so each match's full text is the address.
    return [hit.group(0) for hit in re.finditer(r'[\w\.-]+@[\w\.-]+\.\w{2,}', text)]

def fetch_webpage_title(url, timeout=10):
    """Fetch a webpage and return the text of its <title> element.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The title text with surrounding whitespace stripped. On any failure
        (request error, HTTP error status, missing <title>) a string starting
        with "Error fetching webpage title: " is returned instead; this
        function does not raise.
    """
    try:
        # Without an explicit timeout, requests can wait indefinitely on an
        # unresponsive server and stall the whole notebook.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        title_tag = soup.title
        if title_tag is None:
            # Report the real cause instead of a "'NoneType' object has no attribute" message.
            return "Error fetching webpage title: page has no <title> element"

        # get_text() also handles a <title> with nested markup, where .string is None.
        return title_tag.get_text().strip()
    except Exception as e:
        # Deliberate best-effort: callers get an error string rather than an exception.
        return f"Error fetching webpage title: {e}"

if __name__ == "__main__":
    # 1) Sentence tokenization, listed with a 1-based index.
    sentences = tokenize_sentences(sample_paragraph)
    print("Sentences:")
    for idx, sentence in enumerate(sentences, start=1):
        print(f"{idx}: {sentence}")

    print("\n")

    # 2) Word tokenization of the same paragraph.
    words = tokenize_words(sample_paragraph)
    print("Words:", words, sep="\n")

    print("\n")

    # 3) Text cleaning on a short sample containing punctuation and digits.
    test_text = 'Hello, World! Welcome to NLP 101.'
    cleaned_text = clean_text(test_text)
    print("Cleaned Text:", cleaned_text, sep="\n")

    print("\n")

    # 4) Email extraction from a sentence containing two addresses.
    email_text = 'Contact us at support@example.com and sales@example.org.'
    emails = extract_emails(email_text)
    print("Extracted Emails:", emails, sep="\n")

    print("\n")

    # 5) Title lookup for a live page (needs network access).
    url = 'https://example.com'
    webpage_title = fetch_webpage_title(url)
    print("Webpage Title:", webpage_title, sep="\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Sentences:
1: Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.
2: It focuses on the interaction between humans and computers using natural language.
3: Tokenization is one of the fundamental steps in NLP, where text is broken down into sentences or words.


Words:
['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence', '.', 'It', 'focuses', 'on', 'the', 'interaction', 'between', 'humans', 'and', 'computers', 'using', 'natural', 'language', '.', 'Tokenization', 'is', 'one', 'of', 'the', 'fundamental', 'steps', 'in', 'NLP', ',', 'where', 'text', 'is', 'broken', 'down', 'into', 'sentences', 'or', 'words', '.']


Cleaned Text:
hello world welcome to nlp 101


Extracted Emails:
['support@example.com', 'sales@example.org']


Webpage Title:
Example Domain


In [2]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import requests
from bs4 import BeautifulSoup

# Fetch the NLTK tokenizer resources 'punkt' and 'punkt_tab'.
nltk.download('punkt')
nltk.download('punkt_tab')  # last expression in the cell, so its return value is shown as the cell output

# NOTE(review): placeholder left over from editing — no further code follows in this cell.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True