In [1]:
!pip install nltk



In [2]:
import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import MWETokenizer

from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [3]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [4]:
text = "Hello! I am learning NLP using NLTK. I love machine learning, AI, and data science. :) #NLPRocks"

# Whitespace
ws = WhitespaceTokenizer()
print("Whitespace:", ws.tokenize(text))

# Punctuation
print("WordPunct:", wordpunct_tokenize(text))

# Treebank
tree = TreebankWordTokenizer()
print("Treebank:", tree.tokenize(text))

# Tweet
tweet = TweetTokenizer()
print("Tweet:", tweet.tokenize(text))

# MWE
mwe = MWETokenizer([('machine','learning'), ('data','science')])
print("MWE:", mwe.tokenize(text.split()))

# ---------------- STEMMING ----------------

words = ["running", "runs", "studies", "studying"]

porter = PorterStemmer()
print("Porter:", [porter.stem(w) for w in words])

snow = SnowballStemmer("english")
print("Snowball:", [snow.stem(w) for w in words])

# ---------------- LEMMATIZATION ----------------

lemmatizer = WordNetLemmatizer()
words2 = ["running", "better", "studies", "cars"]

print("Lemmatization:", [lemmatizer.lemmatize(w) for w in words2])


Whitespace: ['Hello!', 'I', 'am', 'learning', 'NLP', 'using', 'NLTK.', 'I', 'love', 'machine', 'learning,', 'AI,', 'and', 'data', 'science.', ':)', '#NLPRocks']
WordPunct: ['Hello', '!', 'I', 'am', 'learning', 'NLP', 'using', 'NLTK', '.', 'I', 'love', 'machine', 'learning', ',', 'AI', ',', 'and', 'data', 'science', '.', ':)', '#', 'NLPRocks']
Treebank: ['Hello', '!', 'I', 'am', 'learning', 'NLP', 'using', 'NLTK.', 'I', 'love', 'machine', 'learning', ',', 'AI', ',', 'and', 'data', 'science.', ':', ')', '#', 'NLPRocks']
Tweet: ['Hello', '!', 'I', 'am', 'learning', 'NLP', 'using', 'NLTK', '.', 'I', 'love', 'machine', 'learning', ',', 'AI', ',', 'and', 'data', 'science', '.', ':)', '#NLPRocks']
MWE: ['Hello!', 'I', 'am', 'learning', 'NLP', 'using', 'NLTK.', 'I', 'love', 'machine', 'learning,', 'AI,', 'and', 'data', 'science.', ':)', '#NLPRocks']
Porter: ['run', 'run', 'studi', 'studi']
Snowball: ['run', 'run', 'studi', 'studi']
Lemmatization: ['running', 'better', 'study', 'car']
