In [1]:
pip install nltk



Perform tokenization (Whitespace, Punctuation-based, Treebank, Tweet, MWE) using the NLTK library.
Use the Porter stemmer and the Snowball stemmer for stemming. Use any technique for lemmatization.

In [2]:
# Sample inputs for the notebook: sentence1 is fed to the tokenizers,
# sentence2 to the stemmers and the lemmatizer.
sentence1, sentence2 = (
    "This is a sentence",
    "This is another big sentence",
)

In [3]:
import nltk
from nltk.tokenize import (
    word_tokenize,
    wordpunct_tokenize,
    TreebankWordTokenizer,
    TweetTokenizer,
    MWETokenizer
)

# word_tokenize relies on the Punkt sentence-splitting data. Older NLTK
# releases load it from the 'punkt' package, newer releases look for
# 'punkt_tab' instead; fetch both so this cell runs on a fresh kernel
# whichever NLTK version the install cell pulled in.
nltk.download('punkt')
nltk.download('punkt_tab')

# Whitespace: plain split on runs of whitespace.
print(f'Whitespace tokenization = {sentence1.split()}')

# Punctuation-based: alphabetic runs and punctuation runs become separate tokens.
print(f'Punctuation-based tokenization = {wordpunct_tokenize(sentence1)}')

# MWE: re-joins registered multi-word expressions in an already tokenized list.
# NOTE(review): ('Martha', 'Jones') does not occur in sentence1, so nothing is
# merged here; register an expression that appears in the input to see an effect.
mwe_tokenizer = MWETokenizer()
mwe_tokenizer.add_mwe(('Martha', 'Jones'))
print(f'Multi-word expression (MWE) tokenization = {mwe_tokenizer.tokenize(word_tokenize(sentence1))}')

# Tweet: tokenizer aimed at social-media text.
tweet_tokenizer = TweetTokenizer()
print(f'Tweet-rules based tokenization = {tweet_tokenizer.tokenize(sentence1)}')

# Treebank: Penn Treebank tokenization conventions.
treebank_tokenizer = TreebankWordTokenizer()
print(f'Default/Treebank tokenization = {treebank_tokenizer.tokenize(sentence1)}')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Whitespace tokenization = ['This', 'is', 'a', 'sentence']
Punctuation-based tokenization = ['This', 'is', 'a', 'sentence']
Multi-word expression (MWE) tokenization = ['This', 'is', 'a', 'sentence']
Tweet-rules based tokenization = ['This', 'is', 'a', 'sentence']
Default/Treebank tokenization = ['This', 'is', 'a', 'sentence']


In [4]:
from nltk.stem import PorterStemmer

# Porter stemming of sentence2: tokenize it, then stem every token.
stemmer = PorterStemmer()
token = word_tokenize(sentence2)
stem_words = list(map(stemmer.stem, token))

# One row per token: the original word padded to 13 columns, then its stem.
for original, stemmed in zip(token, stem_words):
    print(original.ljust(13), '-->', '\t', stemmed)

This          --> 	 thi
is            --> 	 is
another       --> 	 anoth
big           --> 	 big
sentence      --> 	 sentenc


In [5]:
from nltk.stem.snowball import SnowballStemmer

# Snowball stemming of sentence2. Unlike the Porter stemmer, the Snowball
# stemmer is constructed for a specific language.
snow_stemmer = SnowballStemmer(language='english')
token = word_tokenize(sentence2)
stem_words = list(map(snow_stemmer.stem, token))

# One row per token: the original word padded to 13 columns, then its stem.
for original, stemmed in zip(token, stem_words):
    print(original.ljust(13), '-->', '\t', stemmed)

This          --> 	 this
is            --> 	 is
another       --> 	 anoth
big           --> 	 big
sentence      --> 	 sentenc


In [6]:
from nltk.stem import WordNetLemmatizer

# The lemmatizer looks words up in the WordNet corpus, so fetch it first.
nltk.download('wordnet')

# Lemmatize every token of sentence2. No part-of-speech tag is passed, so the
# lemmatizer falls back to its default for each word.
lemmatizer = WordNetLemmatizer()
token = word_tokenize(sentence2)
lemmatized_output = list(map(lemmatizer.lemmatize, token))

# One row per token: the original word padded to 13 columns, then its lemma.
for original, lemma in zip(token, lemmatized_output):
    print(original.ljust(13), '-->', '\t', lemma)

[nltk_data] Downloading package wordnet to /root/nltk_data...


This          --> 	 This
is            --> 	 is
another       --> 	 another
big           --> 	 big
sentence      --> 	 sentence
