In [63]:
!pip install spacy
!pip install gensim



In [64]:
import nltk
from nltk.tokenize import (word_tokenize,
                          sent_tokenize,
                          TreebankWordTokenizer,
                          WordPunctTokenizer,
                          TweetTokenizer,
                          MWETokenizer)

## White Space Tokenizer

In [77]:
text = "I hope I'd get better at coding and land a decent job. Oh no! Anyways. 😗 🧐"
print("Tokenize Text: {}".format(text.split(" ")))
print("Lenght: {}".format(len(text.split(" "))))

Tokenize Text: ['I', 'hope', "I'd", 'get', 'better', 'at', 'coding', 'and', 'land', 'a', 'decent', 'job.', 'Oh', 'no!', 'Anyways.', '😗', '🧐']
Lenght: 17


In [79]:
from nltk.tokenize import WhitespaceTokenizer
WT = WhitespaceTokenizer()
print("Tokenized Words: {}".format(WT.tokenize(text)))

Tokenized Words: ['I', 'hope', "I'd", 'get', 'better', 'at', 'coding', 'and', 'land', 'a', 'decent', 'job.', 'Oh', 'no!', 'Anyways.', '😗', '🧐']
Length: 17


## Words and Sentence Tokenizer

**Rules**
1. Words - Break text based on whitespaces and punctuation.<br>
2. Sentence - Break text based on punctuation.

In [75]:
text = "I hope I'd get better at coding and land a decent job. Oh no! Anyways. 😗 🧐"
print("Tokenized Words: {}".format(word_tokenize(text)))
print("Length: {}".format(len(word_tokenize(text))))

Tokenized Words: ['I', 'hope', 'I', "'d", 'get', 'better', 'at', 'coding', 'and', 'land', 'a', 'decent', 'job', '.', 'Oh', 'no', '!', 'Anyways', '.', '😗', '🧐']
Length: 21


In [66]:
print("Tokenized Sentence: {}".format(sent_tokenize(text)))
print("Lenght: {}".format(len(sent_tokenize(text))))

Tokenized Sentence: ["I hope I'd get better at coding and land a decent job.", 'Oh no!', 'Anyways.', '😗🧐']
Lenght: 4


## Punctuation Tokenizer

**Rules**
1. Punctuation: Splits almost all special symbols and treat them as separate units.

In [67]:
print("Tokenized Punctuation: {}".format(WordPunctTokenizer().tokenize(text)))
print("Length: {}".format(len(WordPunctTokenizer().tokenize(text))))

Tokenized Punctuation: ['I', 'hope', 'I', "'", 'd', 'get', 'better', 'at', 'coding', 'and', 'land', 'a', 'decent', 'job', '.', 'Oh', 'no', '!', 'Anyways', '.', '😗🧐']
Length: 21


## Treebank Word Tokenizer

**Rules**
1. Treebank: Uses regular expressions to tokenize text.<br><br>
Regular Expressions: A filter that describes a set of strings that matches the pattern.

In [68]:
print("Tokenized Treebank: {}".format(TreebankWordTokenizer().tokenize(text)))
print("Length: {}".format(len(TreebankWordTokenizer().tokenize(text))))

Tokenized Treebank: ['I', 'hope', 'I', "'d", 'get', 'better', 'at', 'coding', 'and', 'land', 'a', 'decent', 'job.', 'Oh', 'no', '!', 'Anyways.', '😗🧐']
Length: 18


## Tweet Tokenizer

**Rules**
1. Tweet - Considers Emoji/Unicodes as different tokens.

In [69]:
text = "Don't take everything seriously on the internet 😗🧐"
print("Tokenized Tweet: {}".format(TweetTokenizer().tokenize(text)))
print("Length: {}".format(len(TweetTokenizer().tokenize(text))))

Tokenized Tweet: ["Don't", 'take', 'everything', 'seriously', 'on', 'the', 'internet', '😗', '🧐']
Length: 9


## MWET Tokenizer

MWET - Multi-Word Expression Tokenizer<br><br>
**Rules**
1. MWET - Allows the user to enter multiple word expressions before using the tokenizer on the text.

In [84]:
text = "Transformers 2 is the best Transformer movie!"
print("Tokenized MWET: {}".format(MWETokenizer().tokenize(WT.tokenize(text))))
print("Length: {}".format(len(MWETokenizer().tokenize((text)))))

Tokenized MWET: ['Transformers', '2', 'is', 'the', 'best', 'Transformer', 'movie!']
Length: 45


In [71]:
tokenizer = MWETokenizer()
tokenizer.add_mwe(('Transformers', '2'))
print("Tokenized MWET: {}".format(tokenizer.tokenize(word_tokenize(text))))
print("Lenght: {}".format(len(tokenizer.tokenize(word_tokenize(text)))))

Tokenized MWET: ['Transformers_2', 'is', 'the', 'best', 'Transformer', 'movie', '!']
Lenght: 7


## Gensim Word Tokenizer

In [72]:
from gensim.utils import tokenize
text = "I hope I'd get better at coding and land a decent job. Oh no! Anyways."
print("Tokenized Gensim Word: {}".format(list(tokenize(text))))
print("Length: {}".format(len(list(tokenize(text)))))

Tokenized Gensim Word: ['I', 'hope', 'I', 'd', 'get', 'better', 'at', 'coding', 'and', 'land', 'a', 'decent', 'job', 'Oh', 'no', 'Anyways']
Length: 16


## Porter Stemmer

In [85]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenizStemminge

**1. Individual Words**

In [86]:
ps = PorterStemmer()
 
# choose some words to be stemmed
words = ["program", "programs", "programmer", "programming", "programmers"]
 
for w in words:
    print(w, " : ", ps.stem(w))

program  :  program
programs  :  program
programmer  :  programm
programming  :  program
programmers  :  programm


**2. Sentences**

In [87]:
sentence = "Programmers program with programming languages"
words = word_tokenize(sentence)
 
for w in words:
    print(w, " : ", ps.stem(w))

Programmers  :  programm
program  :  program
with  :  with
programming  :  program
languages  :  languag


**3. Stemming in Sentences**

In [89]:
from functools import reduce

sentence = "Programmers program with programming languages"
words = word_tokenize(sentence)
 
# using reduce to apply stemmer to each word and join them back into a string
stemmed_sentence = reduce(lambda x, y: x + " " + ps.stem(y), words, "")
 
print(stemmed_sentence)

 programm program with program languag


## Lemmatization

Involves methods to identify and transform words into their base or root forms.<br><br>
**1. Rule-Based**<br>
Word: “walked”<br>
Rule Application: Remove “-ed”<br>
Result: “walk"<br><br>
**2. Dictionary-Based**<br>
‘running’ -> ‘run’<br>
‘better’ -> ‘good’<br>
‘went’ -> ‘go’<br><br>
**3. Machine Learning-Based**<br>
Machine learning-based lemmatization leverages computational models to automatically learn the relationships between words and their base forms. Unlike rule-based or dictionary-based approaches, machine learning models, such as neural networks or statistical models, are trained on large text datasets to generalize patterns in language.

In [90]:
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()
 
print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))
 
# a denotes adjective in "pos"
print("better :", lemmatizer.lemmatize("better", pos="a"))

rocks : rock
corpora : corpus
better : good
