In [11]:
!pip install nltk



In [12]:
import nltk

# Fetch the NLTK data packages used in this notebook: tokenizer models
# (punkt, punkt_tab) and WordNet data (wordnet, omw-1.4).
for package in ("punkt", "punkt_tab", "wordnet"):
    nltk.download(package)

# Kept as the cell's final expression so its return value is still displayed.
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [13]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize, TreebankWordTokenizer, TweetTokenizer, MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [14]:
# Sample paragraph shared by every tokenization / stemming / lemmatization
# cell below. Adjacent string literals are joined into one string.
text = (
    "RDR2 is an open world action RPG game set in 1800s era "
    "when there were no cars and minimal electricity. "
    "The main protagonist's name is Arthur Morgan. "
    "I love this game!"
)

In [15]:
# Baseline tokenization: split on runs of whitespace only, so punctuation
# stays attached to the neighbouring word (e.g. 'electricity.', 'game!').
whitespace_tokens = text.split()
whitespace_tokens

['RDR2',
 'is',
 'an',
 'open',
 'world',
 'action',
 'RPG',
 'game',
 'set',
 'in',
 '1800s',
 'era',
 'when',
 'there',
 'were',
 'no',
 'cars',
 'and',
 'minimal',
 'electricity.',
 'The',
 'main',
 "protagonist's",
 'name',
 'is',
 'Arthur',
 'Morgan.',
 'I',
 'love',
 'this',
 'game!']

In [16]:
# Separate punctuation from words: every punctuation run becomes its own
# token, so "protagonist's" is emitted as 'protagonist', "'", 's'.
punctuation_tokens = wordpunct_tokenize(text)
punctuation_tokens

['RDR2',
 'is',
 'an',
 'open',
 'world',
 'action',
 'RPG',
 'game',
 'set',
 'in',
 '1800s',
 'era',
 'when',
 'there',
 'were',
 'no',
 'cars',
 'and',
 'minimal',
 'electricity',
 '.',
 'The',
 'main',
 'protagonist',
 "'",
 's',
 'name',
 'is',
 'Arthur',
 'Morgan',
 '.',
 'I',
 'love',
 'this',
 'game',
 '!']

In [17]:
# Penn Treebank-style tokenization: the clitic "'s" is split off as one token.
# NOTE(review): only the last sentence's final punctuation is separated here
# ('electricity.' and 'Morgan.' keep their period) -- presumably the tokenizer
# expects one sentence at a time; confirm against the NLTK docs.
treebank_tokenizer = TreebankWordTokenizer()
treebank_tokens = treebank_tokenizer.tokenize(text)
treebank_tokens

['RDR2',
 'is',
 'an',
 'open',
 'world',
 'action',
 'RPG',
 'game',
 'set',
 'in',
 '1800s',
 'era',
 'when',
 'there',
 'were',
 'no',
 'cars',
 'and',
 'minimal',
 'electricity.',
 'The',
 'main',
 'protagonist',
 "'s",
 'name',
 'is',
 'Arthur',
 'Morgan.',
 'I',
 'love',
 'this',
 'game',
 '!']

In [18]:
# Tokenizer designed for social-media text, used here with default settings.
# On this input it keeps "protagonist's" whole but splits 'RDR2' into
# 'RDR' and '2'.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)
tweet_tokens

['RDR',
 '2',
 'is',
 'an',
 'open',
 'world',
 'action',
 'RPG',
 'game',
 'set',
 'in',
 '1800s',
 'era',
 'when',
 'there',
 'were',
 'no',
 'cars',
 'and',
 'minimal',
 'electricity',
 '.',
 'The',
 'main',
 "protagonist's",
 'name',
 'is',
 'Arthur',
 'Morgan',
 '.',
 'I',
 'love',
 'this',
 'game',
 '!']

In [19]:
# Multi-word-expression tokenization: re-join selected token sequences into a
# single token using `separator`. The expressions must actually occur in
# `text` (matching is on exact, case-sensitive tokens); otherwise the result
# is identical to plain word_tokenize output.
mwe = MWETokenizer([('open', 'world'), ('Arthur', 'Morgan')], separator='_')
mwe_tokens = mwe.tokenize(word_tokenize(text))
mwe_tokens

['RDR2',
 'is',
 'an',
 'open_world',
 'action',
 'RPG',
 'game',
 'set',
 'in',
 '1800s',
 'era',
 'when',
 'there',
 'were',
 'no',
 'cars',
 'and',
 'minimal',
 'electricity',
 '.',
 'The',
 'main',
 'protagonist',
 "'s",
 'name',
 'is',
 'Arthur_Morgan',
 '.',
 'I',
 'love',
 'this',
 'game',
 '!']

In [20]:
# Porter stemming of each word_tokenize token. The stems are lowercased and
# need not be real words (e.g. 'this' -> 'thi', 'electricity' -> 'electr').
porter = PorterStemmer()
porter_stems = list(map(porter.stem, word_tokenize(text)))
porter_stems

['rdr2',
 'is',
 'an',
 'open',
 'world',
 'action',
 'rpg',
 'game',
 'set',
 'in',
 '1800',
 'era',
 'when',
 'there',
 'were',
 'no',
 'car',
 'and',
 'minim',
 'electr',
 '.',
 'the',
 'main',
 'protagonist',
 "'s",
 'name',
 'is',
 'arthur',
 'morgan',
 '.',
 'i',
 'love',
 'thi',
 'game',
 '!']

In [21]:
# Snowball (English) stemming of each word_tokenize token. Compared with the
# Porter output, 'this' and '1800s' are left intact here.
snowball = SnowballStemmer("english")
snowball_stems = list(map(snowball.stem, word_tokenize(text)))
snowball_stems

['rdr2',
 'is',
 'an',
 'open',
 'world',
 'action',
 'rpg',
 'game',
 'set',
 'in',
 '1800s',
 'era',
 'when',
 'there',
 'were',
 'no',
 'car',
 'and',
 'minim',
 'electr',
 '.',
 'the',
 'main',
 'protagonist',
 "'s",
 'name',
 'is',
 'arthur',
 'morgan',
 '.',
 'i',
 'love',
 'this',
 'game',
 '!']

In [22]:
# WordNet lemmatization of each word_tokenize token. Case is preserved and
# results are dictionary words ('cars' -> 'car').
# No part-of-speech tag is passed to lemmatize(), so verb forms such as
# 'is' and 'were' come back unchanged in the output.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, word_tokenize(text)))
lemmatized_words

['RDR2',
 'is',
 'an',
 'open',
 'world',
 'action',
 'RPG',
 'game',
 'set',
 'in',
 '1800s',
 'era',
 'when',
 'there',
 'were',
 'no',
 'car',
 'and',
 'minimal',
 'electricity',
 '.',
 'The',
 'main',
 'protagonist',
 "'s",
 'name',
 'is',
 'Arthur',
 'Morgan',
 '.',
 'I',
 'love',
 'this',
 'game',
 '!']