In [None]:
# Setup: bring in NLTK and its sentence tokenizer, then download the
# 'punkt_tab' data package (the log output shows this is a no-op when the
# package is already up to date).
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Three-sentence sample paragraph used for the sentence-splitting demo.
text = (
    "The day is very sunny, also has a soft wind. "
    "It is autumn, and I love the session. "
    "Unlike summer it is not hard, neither harsh like winter."
)

In [None]:
# Split the paragraph into sentences and list them with 1-based numbering.
sentences = sent_tokenize(text)

for position, sentence in enumerate(sentences, start=1):
    print(f"Sentence {position}: {sentence}")

Sentence 1: The day is very sunny, also has a soft wind.
Sentence 2: It is autumn, and I love the session.
Sentence 3: Unlike summer it is not hard, neither harsh like winter.


# Punctuation removal and lowercasing

In [None]:
# Remove URLs, punctuation and special characters: they add noise and
# complexity when training a model.

import re
from nltk.tokenize import word_tokenize
nltk.download('punkt')

text = "Hello! Check out my website: http://example.com. It's awesome! #excited @user $100."
url_pattern = r"http\S+|www\S+"
punctuation_pattern = r"[^a-zA-Z0-9\s]"

# Step 1: drop URLs. This must run first, while the ':', '/' and '.'
# characters that make a URL recognisable are still in the string.
cleaned_text = re.sub(url_pattern, '', text)

# Step 2: drop punctuation from the URL-free text. Each step feeds the next;
# running this on the raw `text` would discard step 1 and leave the URL
# behind as the junk token "httpexamplecom".
cleaned_text = re.sub(punctuation_pattern, '', cleaned_text)

# Step 3: collapse the whitespace gaps left by the removals above.
cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()

# Step 4: lowercase the words.
cleaned_text = cleaned_text.lower()

tokens = word_tokenize(cleaned_text)
print(cleaned_text)
print(tokens)

hello check out my website httpexamplecom its awesome excited user 100
['hello', 'check', 'out', 'my', 'website', 'httpexamplecom', 'its', 'awesome', 'excited', 'user', '100']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# POS tagging

Part-of-speech (POS) tagging labels each token with its grammatical category (noun, verb, adjective, ...).

In [None]:
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# Fetch the tagger data package before tagging.
nltk.download('averaged_perceptron_tagger_eng')

sentence = (
    "The world is full of bullshit and toxic people, don't wanna make the place better. "
    "Yeah!! I will stop them anyway."
)

# Tokenize first; pos_tag then pairs every token with its tag.
word_tokens = word_tokenize(sentence)
pos_tagged_words = pos_tag(word_tokens)

# Bare final expression so the notebook renders the tagged pairs.
pos_tagged_words

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


[('The', 'DT'),
 ('world', 'NN'),
 ('is', 'VBZ'),
 ('full', 'JJ'),
 ('of', 'IN'),
 ('bullshit', 'NN'),
 ('and', 'CC'),
 ('toxic', 'JJ'),
 ('people', 'NNS'),
 (',', ','),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('wan', 'VB'),
 ('na', 'RB'),
 ('make', 'VB'),
 ('the', 'DT'),
 ('place', 'NN'),
 ('better', 'RBR'),
 ('.', '.'),
 ('Yeah', 'UH'),
 ('!', '.'),
 ('!', '.'),
 ('I', 'PRP'),
 ('will', 'MD'),
 ('stop', 'VB'),
 ('them', 'PRP'),
 ('anyway', 'RB'),
 ('.', '.')]

# Stopwords removal

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Lowercase the sentence up front (presumably so tokens match the casing of
# the stopword list), then tokenize it.
sentence = (
    "The world is full of bullshit and toxic people, don't wanna make the place better. "
    "Yeah!! I will stop them anyway."
).lower()
word_tokens = word_tokenize(sentence)

# English stopwords, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not stopwords, preserving their order.
filtered_tokens = list(filter(lambda token: token not in stop_words, word_tokens))
filtered_tokens

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['world',
 'full',
 'bullshit',
 'toxic',
 'people',
 ',',
 "n't",
 'wan',
 'na',
 'make',
 'place',
 'better',
 '.',
 'yeah',
 '!',
 '!',
 'stop',
 'anyway',
 '.']

# Text normalization

- Reduces noise by mapping irregular variants of a word (elongated spellings, slang) to a single canonical form.

In [None]:
# Sample text
text = "I loveeeeee this! Do you luv it too?"

# Define normalization rules
def normalize(text):
    """Return a lowercased copy of ``text`` with elongations and slang normalized.

    Steps, applied in order:
      1. Lowercase the whole string.
      2. Collapse any run of three or more identical characters to a single
         character (``loveeeeee`` -> ``love``).
      3. Replace the standalone slang word ``luv`` with ``love``.

    Only runs of three or more are collapsed, so ordinary double letters
    ("too", "good", "will") are left intact.

    Known limitation: an elongated double letter collapses to a single one
    (``coool`` -> ``col``), so a spell-checking pass may still be needed.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    # Convert to lowercase
    text = text.lower()
    # Collapse elongated characters; {2,} means "the captured character
    # repeated at least twice more", i.e. a run of length >= 3.
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    # Replace slang (e.g., luv -> love); \b keeps words like "luvs" untouched.
    text = re.sub(r'\bluv\b', 'love', text)
    return text

# Run the sample through the normalizer, then split the result into tokens.
normalized_text = normalize(text)
tokens = word_tokenize(normalized_text)

# Show both stages: the normalized string first, its tokens second.
print("Normalized Text:", normalized_text)
print("Tokenized Text:", tokens)

Normalized Text: i love this! do you love it to?
Tokenized Text: ['i', 'love', 'this', '!', 'do', 'you', 'love', 'it', 'to', '?']


# Stemming

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

text = "I am running in the park and feeling troubled."

# Tokenize the sentence, then map every token to its Porter stem.
stemmer = PorterStemmer()
words = word_tokenize(text)
stem_words = list(map(stemmer.stem, words))
print(stem_words)

['i', 'am', 'run', 'in', 'the', 'park', 'and', 'feel', 'troubl', '.']


# Lemmatization

In [None]:
import spacy

# Load the spaCy pipeline named 'en_core_web_sm' and run it over the sample.
nlp = spacy.load('en_core_web_sm')

text = "The quick brown foxes are jumping over the lazy dogs."
doc = nlp(text)

# Read the lemma attribute off every token in the processed document.
lemmatized_tokens = [token.lemma_ for token in doc]
print(lemmatized_tokens)

['the', 'quick', 'brown', 'fox', 'be', 'jump', 'over', 'the', 'lazy', 'dog', '.']
