In [2]:
! pip install nltk



In [26]:
# NLTK setup: import the toolkit and the two tokenizers, then fetch data packages
import nltk                                               # natural language toolkit
from nltk.tokenize import sent_tokenize, word_tokenize    # corpus -> documents, document -> words

# required NLTK data packages, downloaded in this order
for package in ('punkt', 'wordnet'):
    nltk.download(package)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Sample corpus: four physical lines joined by newlines (punctuation and
# apostrophes are intentionally irregular to exercise the tokenizers).
corpus = (
    "Contrary to popular belief!, Lorem Ipsum is not simply random text.\n"
    "It has root's in a piece of classical Latin literature from 45 BC,\n"
    "making it over 2000 years old!. Richard's McClintock, a Latin professor at\n"
    "Hampden-Sydney in Virginia, looked up one of the more obscure Latin words."
)

print(corpus)

Contrary to popular belief!, Lorem Ipsum is not simply random text.
It has root's in a piece of classical Latin literature from 45 BC,
making it over 2000 years old!. Richard's McClintock, a Latin professor at
Hampden-Sydney in Virginia, looked up one of the more obscure Latin words.


In [11]:
# sentence tokenization: split the corpus into documents (one per sentence)
documents = sent_tokenize(corpus)
print(documents)
print()

# show every document on its own, labelled with a 1-based counter
# (enumerate replaces the hand-maintained `num` counter)
for num, sentence in enumerate(documents, start=1):
    print(f'document-{num}:', sentence)

['Contrary to popular belief!, Lorem Ipsum is not simply random text.', "It has root's in a piece of classical Latin literature from 45 BC,\nmaking it over 2000 years old!.", "Richard's McClintock, a Latin professor at\nHampden-Sydney in Virginia, looked up one of the more obscure Latin words."]

document-1: Contrary to popular belief!, Lorem Ipsum is not simply random text.
document-2: It has root's in a piece of classical Latin literature from 45 BC,
making it over 2000 years old!.
document-3: Richard's McClintock, a Latin professor at
Hampden-Sydney in Virginia, looked up one of the more obscure Latin words.


In [12]:
# Word tokenization: turn each document into a list of word-level tokens
# (plain loop over the documents, one printed list per document).
for doc in documents:
    words = word_tokenize(doc)
    print(words)

# Observation: a hyphenated word is kept as one token (e.g. 'Hampden-Sydney')

['Contrary', 'to', 'popular', 'belief', '!', ',', 'Lorem', 'Ipsum', 'is', 'not', 'simply', 'random', 'text', '.']
['It', 'has', 'root', "'s", 'in', 'a', 'piece', 'of', 'classical', 'Latin', 'literature', 'from', '45', 'BC', ',', 'making', 'it', 'over', '2000', 'years', 'old', '!', '.']
['Richard', "'s", 'McClintock', ',', 'a', 'Latin', 'professor', 'at', 'Hampden-Sydney', 'in', 'Virginia', ',', 'looked', 'up', 'one', 'of', 'the', 'more', 'obscure', 'Latin', 'words', '.']


In [13]:
# wordpunct_tokenize: tried as a way to split hyphenated words,
# which word_tokenize keeps as a single token
from nltk.tokenize import wordpunct_tokenize

for doc in documents:
    pieces = wordpunct_tokenize(doc)
    print(pieces)

# Observation: the hyphen is now split off ('Hampden', '-', 'Sydney'), but
# adjacent punctuation marks stay glued together as one token ('!,' and '!.')

['Contrary', 'to', 'popular', 'belief', '!,', 'Lorem', 'Ipsum', 'is', 'not', 'simply', 'random', 'text', '.']
['It', 'has', 'root', "'", 's', 'in', 'a', 'piece', 'of', 'classical', 'Latin', 'literature', 'from', '45', 'BC', ',', 'making', 'it', 'over', '2000', 'years', 'old', '!.']
['Richard', "'", 's', 'McClintock', ',', 'a', 'Latin', 'professor', 'at', 'Hampden', '-', 'Sydney', 'in', 'Virginia', ',', 'looked', 'up', 'one', 'of', 'the', 'more', 'obscure', 'Latin', 'words', '.']


In [16]:
# TreebankWordTokenizer: tried as a way to get each punctuation mark
# as its own token
from nltk.tokenize import TreebankWordTokenizer

tokens = TreebankWordTokenizer()   # NOTE: holds the tokenizer object, not a list of tokens
for doc in documents:
    treebank_words = tokens.tokenize(doc)
    print(treebank_words)

['Contrary', 'to', 'popular', 'belief', '!', ',', 'Lorem', 'Ipsum', 'is', 'not', 'simply', 'random', 'text', '.']
['It', 'has', 'root', "'s", 'in', 'a', 'piece', 'of', 'classical', 'Latin', 'literature', 'from', '45', 'BC', ',', 'making', 'it', 'over', '2000', 'years', 'old', '!', '.']
['Richard', "'s", 'McClintock', ',', 'a', 'Latin', 'professor', 'at', 'Hampden-Sydney', 'in', 'Virginia', ',', 'looked', 'up', 'one', 'of', 'the', 'more', 'obscure', 'Latin', 'words', '.']


STEMMING - To find the root word

In [21]:
# Sample words shared by the stemming and lemmatization cells
# (spelling is kept exactly as used in the recorded outputs).
word_list = [
    'Programming', 'Programs',
    'Acheive', 'Acheiving',
    'enjoyment', 'eagerly', 'enjoying', 'enjoyed',
    'History', 'Historical',
    'eating', 'eaten',
    'orderly',
]

In [22]:
# Porter stemming: print every sample word next to its stem
from nltk.stem import PorterStemmer

porter = PorterStemmer()
stemmed = porter.stem              # bound method: word -> stem
for word in word_list:
    stem = stemmed(word)
    print(word, '=', stem)


# Observation: not every stem is a real root word
# (e.g. 'eagerly' -> 'eagerli', 'History' -> 'histori', 'eaten' stays 'eaten')
# Author's note: we can use PorterStemmer for natural language understanding
# (Siri, Alexa, Google Assistant)

Programming = program
Programs = program
Acheive = acheiv
Acheiving = acheiv
enjoyment = enjoy
eagerly = eagerli
enjoying = enjoy
enjoyed = enjoy
History = histori
Historical = histor
eating = eat
eaten = eaten
orderly = orderli


In [23]:
# Snowball stemming (English): print every sample word next to its stem
from nltk.stem import SnowballStemmer

snowball = SnowballStemmer('english')
snow_ball = snowball.stem          # bound method: word -> stem
for word in word_list:
    stem = snow_ball(word)
    print(word, '=', stem)

# Observation: here 'eagerly' and 'orderly' reduce to 'eager' and 'order'
# Author's note: we can use the Snowball stemmer when we want to exclude
# the NLU consideration

Programming = program
Programs = program
Acheive = acheiv
Acheiving = acheiv
enjoyment = enjoy
eagerly = eager
enjoying = enjoy
enjoyed = enjoy
History = histori
Historical = histor
eating = eat
eaten = eaten
orderly = order


LEMMATIZATION

In [32]:
# WordNet lemmatization: print every sample word next to its lemma
from nltk.stem import WordNetLemmatizer

lemmas = WordNetLemmatizer().lemmatize   # bound method: (word, pos) -> lemma

# pos - Part Of Speech tag handed to the lemmatizer:
#   n - noun, v - verb, a - adjective, r - adverb, s - satellite adjective
for word in word_list:
    lemma = lemmas(word, pos='n')
    print(word, '=', lemma)

# NOTE(review): in the recorded output every word comes back unchanged with
# pos='n'. Presumably the capitalised spelling and the noun tag prevent a
# match -- confirm, and consider lower-casing / choosing pos per word.

Programming = Programming
Programs = Programs
Acheive = Acheive
Acheiving = Acheiving
enjoyment = enjoyment
eagerly = eagerly
enjoying = enjoying
enjoyed = enjoyed
History = History
Historical = Historical
eating = eating
eaten = eaten
orderly = orderly


In [33]:
# lemmatize a single word as a verb (pos='v')
verb_lemma = lemmas('goes', pos='v')
print(verb_lemma)

go
