In [11]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shaonsikder/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shaonsikder/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shaonsikder/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shaonsikder/nltk_data...


True

In [22]:
# Sample text
text = "Hello! This is an example sentence. It contains some numbers like 123 and HTML tags <b>like this</b>."

### HTML tags removal:
*Strips out HTML markup from text. This is crucial when processing web-scraped data to get clean, readable text.*

In [23]:
# 1. HTML tags removal
def remove_html_tags(text):
    return re.sub(r'<[^>]+>', '', text)

print("\n1. HTML tags removal:")
print(remove_html_tags(text))


1. HTML tags removal:
Hello! This is an example sentence. It contains some numbers like 123 and HTML tags like this.


### Punctuation removal:
*Eliminates punctuation marks from the text. This can help reduce noise in the data and simplify further processing.*

In [14]:
# 2. Punctuation removal
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)
print("\n2. Punctuation removal:")
print(remove_punctuation(text))


2. Punctuation removal:
Hello This is an example sentence It contains some numbers like 123 and HTML tags blike thisb


### Numbers removal:
*Removes numerical digits from the text. This is often done when numbers aren't relevant to the analysis or to reduce vocabulary size.*

In [16]:
# 3. Numbers removal
def remove_numbers(text):
    return re.sub(r'\d+', '', text)
print("\n3. Numbers removal:")
print(remove_numbers(text))


3. Numbers removal:
Hello! This is an example sentence. It contains some numbers like  and HTML tags <b>like this</b>.


### Stemming:
*Reduces words to their root or base form. For example, "running" becomes "run". It's a crude heuristic process that chops off word endings.*

In [17]:
# 4. Stemming
def stem_words(text):
    ps = PorterStemmer()
    words = word_tokenize(text)
    return [ps.stem(word) for word in words]

print("\n4. Stemming:") # Definition of stemming: https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html
print(stem_words(text))


4. Stemming:
['hello', '!', 'thi', 'is', 'an', 'exampl', 'sentenc', '.', 'it', 'contain', 'some', 'number', 'like', '123', 'and', 'html', 'tag', '<', 'b', '>', 'like', 'thi', '<', '/b', '>', '.']


### Lemmatization:
*Similar to stemming, but aims to return the base or dictionary form of a word. It's more sophisticated than stemming and uses linguistic knowledge to achieve a proper root word.*

In [18]:
# 5. Lemmatization
def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    return [lemmatizer.lemmatize(word) for word in words]

print("\n5. Lemmatization:") # Definition of lemmatization: https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html
print(lemmatize_words(text))


5. Lemmatization:
['Hello', '!', 'This', 'is', 'an', 'example', 'sentence', '.', 'It', 'contains', 'some', 'number', 'like', '123', 'and', 'HTML', 'tag', '<', 'b', '>', 'like', 'this', '<', '/b', '>', '.']


### POS tagging:
*Assigns part-of-speech tags (like noun, verb, adjective) to each word in the text. This provides grammatical information that can be useful for further analysis.*

In [19]:
# 6. POS tagging
def pos_tag_text(text):
    words = word_tokenize(text)
    return pos_tag(words)

print("\n6. POS tagging:")
print(pos_tag_text(text))


6. POS tagging:
[('Hello', 'NN'), ('!', '.'), ('This', 'DT'), ('is', 'VBZ'), ('an', 'DT'), ('example', 'NN'), ('sentence', 'NN'), ('.', '.'), ('It', 'PRP'), ('contains', 'VBZ'), ('some', 'DT'), ('numbers', 'NNS'), ('like', 'IN'), ('123', 'CD'), ('and', 'CC'), ('HTML', 'NNP'), ('tags', 'VBP'), ('<', 'NNP'), ('b', 'NN'), ('>', 'NN'), ('like', 'IN'), ('this', 'DT'), ('<', 'NNP'), ('/b', 'NNP'), ('>', 'NNP'), ('.', '.')]


### Stopwords removal:
*Filters out common words that often don't carry much meaning, such as "the", "is", "at". This helps focus on the more important words in the text.*

In [20]:
# 7. Stopwords removal
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [word for word in words if word.lower() not in stop_words]
print("\n7. Stopwords removal:")
print(remove_stopwords(text))

### Tokenization:
*Breaks down text into individual words or subwords. It's the process of splitting a string of text into smaller units called tokens.*

In [25]:
# 8. Tokenization
def tokenize(text):
    return word_tokenize(text)

print("8. Tokenization:")
print(tokenize(text))

8. Tokenization:
['Hello', '!', 'This', 'is', 'an', 'example', 'sentence', '.', 'It', 'contains', 'some', 'numbers', 'like', '123', 'and', 'HTML', 'tags', '<', 'b', '>', 'like', 'this', '<', '/b', '>', '.']
