# Tokenization - the process of converting a sequence of text into smaller parts, known as tokens. These tokens can be as small as characters or as long as words.

In [None]:
! pip install nltk



In [None]:
import nltk

In [None]:
import nltk
# Fetch the 'punkt' tokenizer package into the local nltk_data directory; the
# tokenization cells that follow are run only after this download.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
# Sample text shared by the tokenization examples: three sentences, the last
# one without closing punctuation.
text=" Hi john , How are you doing? I will be travelling to your city. Lets catchup"

# Split the text into sentences; as the cell's last expression the resulting
# list is displayed.
sent_tokenize(text)


[' Hi john , How are you doing?',
 'I will be travelling to your city.',
 'Lets catchup']

In [None]:
# Split the sample text defined in the previous cell into word and
# punctuation tokens.
word_tokenize(text)

['Hi',
 'john',
 ',',
 'How',
 'are',
 'you',
 'doing',
 '?',
 'I',
 'will',
 'be',
 'travelling',
 'to',
 'your',
 'city',
 '.',
 'Lets',
 'catchup']

# Stemming - a text preprocessing technique used in natural language processing (NLP) to reduce words to their root or base form. The resulting stem is not always a dictionary word.

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# All three inflections reduce to the same stem. The stem comes back in
# lowercase, so each line prints "play" (not "Play").
for word in ("Playing", "Plays", "Played"):
    print(stemmer.stem(word))

# Prints "increas": the stem is a truncated form rather than a dictionary
# word, which is the main drawback of stemming as a normalization step.
print(stemmer.stem("increases"))

play
play
play
increas


# Lemmatization - a text normalization technique in NLP that reduces any form of a word to its base (dictionary) form, called its lemma

In [None]:
import nltk
# Fetch the 'wordnet' package ahead of the WordNet-based lemmatizer used in
# the next cell.
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.stem import WordNetLemmatizer

# Lemmatizer instance reused for both examples below.
lemm = WordNetLemmatizer()

# (word, part-of-speech) pairs: "n" is the part of speech lemmatize() assumes
# when none is given, "v" requests the verb reading of "running".
for word, pos in (("increases", "n"), ("running", "v")):
    print(lemm.lemmatize(word, pos=pos))

increase
run


In [None]:
import nltk
# Fetch the averaged-perceptron tagger package ahead of the pos_tag call in
# the next cell.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
from nltk.tag import pos_tag

# Same sample sentence as in the tokenization cells, redefined here.
text=" Hi john , How are you doing? I will be travelling to your city. Lets catchup"

# word_tokenize is not imported in this cell; it comes from the earlier
# `from nltk.tokenize import ...` cell, which must have been run first.
tokens=word_tokenize(text)
# Pair every token with a part-of-speech tag; the list of (token, tag) tuples
# is displayed as the cell result.
pos_tag(tokens)

[('Hi', 'NNP'),
 ('john', 'NN'),
 (',', ','),
 ('How', 'WRB'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('doing', 'VBG'),
 ('?', '.'),
 ('I', 'PRP'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('travelling', 'VBG'),
 ('to', 'TO'),
 ('your', 'PRP$'),
 ('city', 'NN'),
 ('.', '.'),
 ('Lets', 'VBZ'),
 ('catchup', 'JJ')]

# Synonyms of a particular word

In [None]:
from nltk.corpus import wordnet

# Look up every WordNet synset for 'good'. The result mixes synsets named
# after 'good' itself with synsets named after other words (e.g. 'commodity',
# 'beneficial'), which is what makes it usable as a synonym lookup.
wordnet.synsets('good')

[Synset('good.n.01'),
 Synset('good.n.02'),
 Synset('good.n.03'),
 Synset('commodity.n.01'),
 Synset('good.a.01'),
 Synset('full.s.06'),
 Synset('good.a.03'),
 Synset('estimable.s.02'),
 Synset('beneficial.s.01'),
 Synset('good.s.06'),
 Synset('good.s.07'),
 Synset('adept.s.01'),
 Synset('good.s.09'),
 Synset('dear.s.02'),
 Synset('dependable.s.04'),
 Synset('good.s.12'),
 Synset('good.s.13'),
 Synset('effective.s.04'),
 Synset('good.s.15'),
 Synset('good.s.16'),
 Synset('good.s.17'),
 Synset('good.s.18'),
 Synset('good.s.19'),
 Synset('good.s.20'),
 Synset('good.s.21'),
 Synset('well.r.01'),
 Synset('thoroughly.r.02')]

In [None]:
from nltk.corpus import wordnet

# Look up the synsets for 'computer'; the recorded result has two entries.
wordnet.synsets('computer')

[Synset('computer.n.01'), Synset('calculator.n.01')]

# Ngrams

In [22]:
from nltk.util import bigrams,trigrams,ngrams

In [23]:
# Sample sentence for the n-gram examples (same text as in the earlier cells).
text=" Hi john , How are you doing? I will be travelling to your city. Lets catchup"
# Tokenize through the top-level nltk namespace; `nltk` itself must already
# have been imported by an earlier cell.
quotes_token=nltk.word_tokenize(text)
# Display the token list that the bigram/trigram cells consume.
quotes_token

['Hi',
 'john',
 ',',
 'How',
 'are',
 'you',
 'doing',
 '?',
 'I',
 'will',
 'be',
 'travelling',
 'to',
 'your',
 'city',
 '.',
 'Lets',
 'catchup']

In [24]:
# Adjacent token pairs, built with the `bigrams` helper imported from
# nltk.util and materialised into a list so they can be displayed.
quotes_bigrams = list(bigrams(quotes_token))
quotes_bigrams

[('Hi', 'john'),
 ('john', ','),
 (',', 'How'),
 ('How', 'are'),
 ('are', 'you'),
 ('you', 'doing'),
 ('doing', '?'),
 ('?', 'I'),
 ('I', 'will'),
 ('will', 'be'),
 ('be', 'travelling'),
 ('travelling', 'to'),
 ('to', 'your'),
 ('your', 'city'),
 ('city', '.'),
 ('.', 'Lets'),
 ('Lets', 'catchup')]

In [26]:
# Runs of three consecutive tokens, built with the `trigrams` helper imported
# from nltk.util and materialised into a list so they can be displayed.
quotes_trigrams = list(trigrams(quotes_token))
quotes_trigrams

[('Hi', 'john', ','),
 ('john', ',', 'How'),
 (',', 'How', 'are'),
 ('How', 'are', 'you'),
 ('are', 'you', 'doing'),
 ('you', 'doing', '?'),
 ('doing', '?', 'I'),
 ('?', 'I', 'will'),
 ('I', 'will', 'be'),
 ('will', 'be', 'travelling'),
 ('be', 'travelling', 'to'),
 ('travelling', 'to', 'your'),
 ('to', 'your', 'city'),
 ('your', 'city', '.'),
 ('city', '.', 'Lets'),
 ('.', 'Lets', 'catchup')]

In [None]:
import nltk
# Repeat of the earlier 'punkt' download; the recorded output reports the
# package as already up-to-date, so nothing new is fetched here.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
from nltk.util import bigrams, trigrams, ngrams

sentence = "I love to play games"
tokens = nltk.word_tokenize(sentence)

# Window size: 2 produces bigrams; a larger value produces longer n-grams.
n = 2
# Emit one n-gram tuple per output line.
print(*ngrams(tokens, n), sep="\n")

('I', 'love')
('love', 'to')
('to', 'play')
('play', 'games')


#Output:

('I', 'love')
('love', 'to')
('to', 'play')
('play', 'games')

In [None]:
import nltk
# Report the installed NLTK version (3.8.1 in the recorded run).
print(nltk.__version__)

3.8.1


In [None]:
import nltk
# Fetch the 'treebank' corpus package.
# NOTE(review): no cell visible in this part of the notebook reads this
# corpus - confirm the download is still needed.
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [1]:
import nltk
# NOTE(review): download() with no package name starts NLTK's interactive
# downloader, which waits for typed commands (see the "Downloader>" prompt in
# the recorded output). That stalls an unattended Restart & Run All; naming
# the package, as the other download cells do, avoids the prompt.
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [ ] averaged_perceptron_tagger Averaged Perceptron Tagger
  [ ] averaged_perceptron_tagger_ru Averaged Perceptron Tagger (Russian)
  [ ] basque_grammars..... Grammars for Basque
  [ ] bcp47............... BCP-47 Language Tags
  [ ] biocreative_ppi..... BioCreAtIvE (Critical Assessment of Information
                           Extraction Systems in Biology)
  [ ] bllip_wsj_no_aux.... BLLIP Parser: WSJ Model
  [ ] book_grammars....... Grammars from NLTK Book
  [ ] brown............... Brown Corpus
  [ ] brown_tei........... Brown Corpus (TEI XML Version)


True

In [6]:
import os
import nltk
import nltk.corpus

In [8]:
import nltk
# Fetch the 'brown' corpus package before importing its reader.
nltk.download('brown')
from nltk.corpus import brown
# The displayed result is a truncated preview: the first few words followed
# by "...".
brown.words()

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [13]:
import nltk
# Fetch the 'gutenberg' corpus package (already present in the recorded run).
nltk.download('gutenberg')
# List the text files available in the corpus.
nltk.corpus.gutenberg.fileids()

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [14]:
# Word-level view of "shakespeare-hamlet.txt", one of the gutenberg file ids.
hamlet=nltk.corpus.gutenberg.words("shakespeare-hamlet.txt")
# Displayed as a truncated preview rather than the full word list.
hamlet

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', ...]

In [16]:
# Show the first 500 tokens on a single line, each followed by one space.
print(*hamlet[:500], end=" ")

[ The Tragedie of Hamlet by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Barnardo and Francisco two Centinels . Barnardo . Who ' s there ? Fran . Nay answer me : Stand & vnfold your selfe Bar . Long liue the King Fran . Barnardo ? Bar . He Fran . You come most carefully vpon your houre Bar . ' Tis now strook twelue , get thee to bed Francisco Fran . For this releefe much thankes : ' Tis bitter cold , And I am sicke at heart Barn . Haue you had quiet Guard ? Fran . Not a Mouse stirring Barn . Well , goodnight . If you do meet Horatio and Marcellus , the Riuals of my Watch , bid them make hast . Enter Horatio and Marcellus . Fran . I thinke I heare them . Stand : who ' s there ? Hor . Friends to this ground Mar . And Leige - men to the Dane Fran . Giue you good night Mar . O farwel honest Soldier , who hath relieu ' d you ? Fra . Barnardo ha ' s my place : giue you goodnight . Exit Fran . Mar . Holla Barnardo Bar . Say , what is Horatio there ? Hor . A peece of him Bar 

In [17]:
# Sample paragraph used as input for the tokenization cells below. The
# triple-quoted literal begins and ends with a newline character.
AI="""
Artificial intelligence (AI), in its broadest sense, is intelligence exhibited by machines, particularly computer systems, as opposed to the natural intelligence of living beings. As a field of research in computer science focusing on the automation of intelligent behavior through machine learning, it develops and studies methods and software which enable machines to perceive their environment and take actions that maximize their chances of achieving defined goals, with the aim of performing tasks typically associated with human intelligence. Such machines may be called AIs.
"""

In [18]:
# Confirm the sample paragraph is a plain str.
type(AI)

str

In [19]:
from nltk.tokenize import word_tokenize


In [21]:
import nltk
# The recorded output shows punkt being unzipped again here, i.e. the package
# was not yet installed in the session this cell ran in.
nltk.download('punkt')
# Tokenize the paragraph; word_tokenize comes from the import in the
# preceding cell.
AI_tokens=word_tokenize(AI)
# Display the resulting token list.
AI_tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Artificial',
 'intelligence',
 '(',
 'AI',
 ')',
 ',',
 'in',
 'its',
 'broadest',
 'sense',
 ',',
 'is',
 'intelligence',
 'exhibited',
 'by',
 'machines',
 ',',
 'particularly',
 'computer',
 'systems',
 ',',
 'as',
 'opposed',
 'to',
 'the',
 'natural',
 'intelligence',
 'of',
 'living',
 'beings',
 '.',
 'As',
 'a',
 'field',
 'of',
 'research',
 'in',
 'computer',
 'science',
 'focusing',
 'on',
 'the',
 'automation',
 'of',
 'intelligent',
 'behavior',
 'through',
 'machine',
 'learning',
 ',',
 'it',
 'develops',
 'and',
 'studies',
 'methods',
 'and',
 'software',
 'which',
 'enable',
 'machines',
 'to',
 'perceive',
 'their',
 'environment',
 'and',
 'take',
 'actions',
 'that',
 'maximize',
 'their',
 'chances',
 'of',
 'achieving',
 'defined',
 'goals',
 ',',
 'with',
 'the',
 'aim',
 'of',
 'performing',
 'tasks',
 'typically',
 'associated',
 'with',
 'human',
 'intelligence',
 '.',
 'Such',
 'machines',
 'may',
 'be',
 'called',
 'AIs',
 '.']