In [1]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/binodrai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/binodrai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/binodrai/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/binodrai/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
import re
import nltk
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Initialize the Porter Stemmer and WordNet Lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Sample corpus
sentencescombined_sentences = ["I love this product! It's amazing.",
                               "<html><head><title>Title</title></head><body><p>This is a <b>sample</b> paragraph.</p></body></html>",
                               "He bought 123 apples and 456 oranges.",
                               "Running is a great way to stay fit.",
                               "The quick brown fox jumps over the lazy dog.",
                               "She sells sea shells by the sea shore.", 
                               "COVID-19 has impacted global economies significantly.",
                               "The new iPhone 13 features a sleek design and powerful performance.",
                               "Artificial intelligence and machine learning are transforming industries.",
                               "HTML, CSS, and JavaScript are essential technologies for web development." ]

# Function to preprocess text
def preprocess_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Apply stemming
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    # Apply lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in filtered_tokens]
    return {
        'original': text,
        'tokens': tokens,
        'filtered_tokens': filtered_tokens,
        'stemmed_tokens': stemmed_tokens,
        'lemmatized_tokens': lemmatized_tokens
    }

# Function to get the part of speech tag for lemmatization
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Process each document in the corpus
for document in sentencescombined_sentences:
    result = preprocess_text(document)
    print("Original Document:", document)
    print("Text After HTML Removal:", result['original'])
    print("Tokens:", result['tokens'])
    print("Filtered Tokens:", result['filtered_tokens'])
    print("Stemmed Tokens:", result['stemmed_tokens'])
    print("Lemmatized Tokens:", result['lemmatized_tokens'])
    print("-" * 50)

Original Document: I love this product! It's amazing.
Text After HTML Removal: i love this product its amazing
Tokens: ['i', 'love', 'this', 'product', 'its', 'amazing']
Filtered Tokens: ['love', 'product', 'amazing']
Stemmed Tokens: ['love', 'product', 'amaz']
Lemmatized Tokens: ['love', 'product', 'amaze']
--------------------------------------------------
Original Document: <html><head><title>Title</title></head><body><p>This is a <b>sample</b> paragraph.</p></body></html>
Text After HTML Removal: titlethis is a sample paragraph
Tokens: ['titlethis', 'is', 'a', 'sample', 'paragraph']
Filtered Tokens: ['titlethis', 'sample', 'paragraph']
Stemmed Tokens: ['titlethi', 'sampl', 'paragraph']
Lemmatized Tokens: ['titlethis', 'sample', 'paragraph']
--------------------------------------------------
Original Document: He bought 123 apples and 456 oranges.
Text After HTML Removal: he bought  apples and  oranges
Tokens: ['he', 'bought', 'apples', 'and', 'oranges']
Filtered Tokens: ['bought', 