In [None]:
import nltk
from nltk.tokenize import (
    PunktSentenceTokenizer, TreebankWordTokenizer, word_tokenize, RegexpTokenizer
)
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer, RegexpStemmer
from nltk.corpus import stopwords
import string

In [None]:
# Download every NLTK resource this notebook relies on, in one place.
# "punkt_tab" is listed here because word_tokenize is called further down and
# needs it; fetching it with the other resources keeps all setup in this cell.
NLTK_RESOURCES = (
    "punkt",
    "punkt_tab",
    "wordnet",
    "omw-1.4",
    "stopwords",
    "averaged_perceptron_tagger",
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Sample text used as input by the tokenizer cells that follow.
# The two sentences are stored separately and joined without a separator;
# the first piece already ends with the space that separates them.
paragraph = "".join(
    [
        "Finance refers to monetary resources and to the study and discipline of money, currency, assets and liabilities. ",
        "As a subject of study, it is related to but distinct from economics, which is the study of the production, distribution, and consumption of goods and services.",
    ]
)


In [None]:
# Split the paragraph into sentences with a Punkt sentence tokenizer.
# NOTE(review): the tokenizer is constructed without any training text, so it
# is presumably not the pretrained English model — confirm that is intended
# before reusing it on text containing abbreviations.
punkt_tokenizer = PunktSentenceTokenizer()
sentences = punkt_tokenizer.tokenize(paragraph)
print("\nTokenization (Punkt Sentence Tokenizer):", sentences, sep="\n")


Tokenization (Punkt Sentence Tokenizer):
['Finance refers to monetary resources and to the study and discipline of money, currency, assets and liabilities.', 'As a subject of study, it is related to but distinct from economics, which is the study of the production, distribution, and consumption of goods and services.']


In [None]:
# Treebank Word Tokenizer
# TreebankWordTokenizer expects text that has already been split into
# sentences. Run on the whole paragraph it leaves the first sentence's last
# token as 'liabilities.' (period still attached), so each Punkt sentence is
# tokenized on its own and the per-sentence token lists are flattened.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = [
    token
    for sentence_text in sentences
    for token in treebank_tokenizer.tokenize(sentence_text)
]
print("\nTokenization (Treebank Word Tokenizer):")
print(treebank_tokens)



Tokenization (Treebank Word Tokenizer):
['Finance', 'refers', 'to', 'monetary', 'resources', 'and', 'to', 'the', 'study', 'and', 'discipline', 'of', 'money', ',', 'currency', ',', 'assets', 'and', 'liabilities.', 'As', 'a', 'subject', 'of', 'study', ',', 'it', 'is', 'related', 'to', 'but', 'distinct', 'from', 'economics', ',', 'which', 'is', 'the', 'study', 'of', 'the', 'production', ',', 'distribution', ',', 'and', 'consumption', 'of', 'goods', 'and', 'services', '.']


In [None]:
# word_tokenize needs the 'punkt_tab' resource, so fetch it before tokenizing.
# nltk is already imported in the notebook's first cell; it is not re-imported here.
nltk.download("punkt_tab")

# Word-level tokenization of the whole paragraph.
word_tokens = word_tokenize(paragraph)
print("\nTokenization (Word Tokenizer):")
print(word_tokens)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Tokenization (Word Tokenizer):
['Finance', 'refers', 'to', 'monetary', 'resources', 'and', 'to', 'the', 'study', 'and', 'discipline', 'of', 'money', ',', 'currency', ',', 'assets', 'and', 'liabilities', '.', 'As', 'a', 'subject', 'of', 'study', ',', 'it', 'is', 'related', 'to', 'but', 'distinct', 'from', 'economics', ',', 'which', 'is', 'the', 'study', 'of', 'the', 'production', ',', 'distribution', ',', 'and', 'consumption', 'of', 'goods', 'and', 'services', '.']


In [None]:
# Regular-expression tokenizer: the pattern \w+ matches runs of word
# characters, so the recorded result contains no punctuation tokens.
regexp_tokenizer = RegexpTokenizer(r"\w+")
regexp_tokens = regexp_tokenizer.tokenize(paragraph)
print("\nTokenization (RegExp Tokenizer):", regexp_tokens, sep="\n")



Tokenization (RegExp Tokenizer):
['Finance', 'refers', 'to', 'monetary', 'resources', 'and', 'to', 'the', 'study', 'and', 'discipline', 'of', 'money', 'currency', 'assets', 'and', 'liabilities', 'As', 'a', 'subject', 'of', 'study', 'it', 'is', 'related', 'to', 'but', 'distinct', 'from', 'economics', 'which', 'is', 'the', 'study', 'of', 'the', 'production', 'distribution', 'and', 'consumption', 'of', 'goods', 'and', 'services']


In [None]:

# Sentence segmentation: show the Punkt sentences one per line, numbered from 1.
print("\nSentence Segmentation:")
numbered_sentences = [
    f"Sentence {number}: {text}" for number, text in enumerate(sentences, start=1)
]
for line in numbered_sentences:
    print(line)



Sentence Segmentation:
Sentence 1: Finance refers to monetary resources and to the study and discipline of money, currency, assets and liabilities.
Sentence 2: As a subject of study, it is related to but distinct from economics, which is the study of the production, distribution, and consumption of goods and services.


In [None]:
# Lemmatization with the WordNet lemmatizer.
# Tokens are lower-cased first, then lemmatize() is called with the token only
# (no part-of-speech argument).
# NOTE(review): in the recorded output 'refers' stays unchanged and 'as'
# becomes 'a' — pass a POS tag if verb or function-word handling matters.
lemmatizer = WordNetLemmatizer()
lowercased_tokens = [token.lower() for token in word_tokens]
lemmatized_words = list(map(lemmatizer.lemmatize, lowercased_tokens))
print("\nLemmatization (WordNet Lemmatizer):", lemmatized_words, sep="\n")


Lemmatization (WordNet Lemmatizer):
['finance', 'refers', 'to', 'monetary', 'resource', 'and', 'to', 'the', 'study', 'and', 'discipline', 'of', 'money', ',', 'currency', ',', 'asset', 'and', 'liability', '.', 'a', 'a', 'subject', 'of', 'study', ',', 'it', 'is', 'related', 'to', 'but', 'distinct', 'from', 'economics', ',', 'which', 'is', 'the', 'study', 'of', 'the', 'production', ',', 'distribution', ',', 'and', 'consumption', 'of', 'good', 'and', 'service', '.']


In [None]:
# Stem the word tokens with three different stemmers and print each result.

# Porter stemmer
porter_stemmer = PorterStemmer()
porter_stemmed_words = list(map(porter_stemmer.stem, word_tokens))
print("\nStemming (Porter Stemmer):", porter_stemmed_words, sep="\n")

# Snowball stemmer, configured for English
snowball_stemmer = SnowballStemmer("english")
snowball_stemmed_words = list(map(snowball_stemmer.stem, word_tokens))
print("\nStemming (Snowball Stemmer):", snowball_stemmed_words, sep="\n")

# Regular-expression stemmer: removes a trailing "ing", "ed" or "ly".
# NOTE(review): min=4 is presumably the shortest word length that gets
# stemmed — confirm against the RegexpStemmer documentation.
regexp_stemmer = RegexpStemmer("ing$|ed$|ly$", min=4)
regexp_stemmed_words = list(map(regexp_stemmer.stem, word_tokens))
print("\nStemming (RegExp Stemmer):", regexp_stemmed_words, sep="\n")



Stemming (Porter Stemmer):
['financ', 'refer', 'to', 'monetari', 'resourc', 'and', 'to', 'the', 'studi', 'and', 'disciplin', 'of', 'money', ',', 'currenc', ',', 'asset', 'and', 'liabil', '.', 'as', 'a', 'subject', 'of', 'studi', ',', 'it', 'is', 'relat', 'to', 'but', 'distinct', 'from', 'econom', ',', 'which', 'is', 'the', 'studi', 'of', 'the', 'product', ',', 'distribut', ',', 'and', 'consumpt', 'of', 'good', 'and', 'servic', '.']

Stemming (Snowball Stemmer):
['financ', 'refer', 'to', 'monetari', 'resourc', 'and', 'to', 'the', 'studi', 'and', 'disciplin', 'of', 'money', ',', 'currenc', ',', 'asset', 'and', 'liabil', '.', 'as', 'a', 'subject', 'of', 'studi', ',', 'it', 'is', 'relat', 'to', 'but', 'distinct', 'from', 'econom', ',', 'which', 'is', 'the', 'studi', 'of', 'the', 'product', ',', 'distribut', ',', 'and', 'consumpt', 'of', 'good', 'and', 'servic', '.']

Stemming (RegExp Stemmer):
['Finance', 'refers', 'to', 'monetary', 'resources', 'and', 'to', 'the', 'study', 'and', 'discip

In [None]:
# Stopword removal
# Drop English stopwords and punctuation-only tokens from the lemmatized words.
# Punctuation is detected character by character against a set: a plain
# `word not in string.punctuation` is a substring test on one long string, so
# multi-character punctuation tokens such as "``", "''" or "--" would be kept.
stop_words = set(stopwords.words("english"))
punctuation_chars = set(string.punctuation)
filtered_words = [
    word
    for word in lemmatized_words
    if word not in stop_words
    and not all(char in punctuation_chars for char in word)
]
print("\nStopword Removal Result:")
print(filtered_words)

# Stemmer examples: for each stemmer, print a heading and then the stem of a
# few hand-picked words (Porter first, then Snowball).
stemmer_examples = (
    ("\nPorter Stemmer Examples:", porter_stemmer, ["play", "playing", "played", "plays"]),
    ("\nSnowball Stemmer Examples:", snowball_stemmer, ["Excellent", "service"]),
)
for heading, stemmer, words in stemmer_examples:
    print(heading)
    for word in words:
        print(f"Original: {word} -> Stemmed: {stemmer.stem(word)}")



Stopword Removal Result:
['finance', 'refers', 'monetary', 'resource', 'study', 'discipline', 'money', 'currency', 'asset', 'liability', 'subject', 'study', 'related', 'distinct', 'economics', 'study', 'production', 'distribution', 'consumption', 'good', 'service']

Porter Stemmer Examples:
Original: play -> Stemmed: play
Original: playing -> Stemmed: play
Original: played -> Stemmed: play
Original: plays -> Stemmed: play

Snowball Stemmer Examples:
Original: Excellent -> Stemmed: excel
Original: service -> Stemmed: servic


In [None]:
# RegExp tokenizer example: the pattern accepts apostrophes as well as word
# characters, so contractions such as "Let's" stay in one token.
print("\nRegExp Tokenizer Examples:")
text_example = "Let's see how it's working."
regexp_tokenizer_advanced = RegexpTokenizer("[\\w']+")
contraction_tokens = regexp_tokenizer_advanced.tokenize(text_example)
print(contraction_tokens)

# Treebank tokenizer example on a short string containing two sentences.
print("\nTreebank Word Tokenizer Example:")
treebank_example_tokens = treebank_tokenizer.tokenize("Why blood? Same blood.")
print(treebank_example_tokens)


RegExp Tokenizer Examples:
["Let's", 'see', 'how', "it's", 'working']

Treebank Word Tokenizer Example:
['Why', 'blood', '?', 'Same', 'blood', '.']
