In [1]:
import nltk

In [2]:
#TOKENIZE
# Sentence-level tokenization: split the sample request into its sentences.
# word_tokenize is imported here as well because later cells use it.
from nltk.tokenize import sent_tokenize, word_tokenize

text = "Get me flight to Milan next week. I want the cheapest"
sents = sent_tokenize(text)  # list with one string per sentence
print(sents)

['Get me flight to Milan next week.', 'I want the cheapest']


In [3]:
#TOKENIZE
# Word-level tokenization of the same text; punctuation marks come back
# as tokens of their own (see the '.' in the output).
words = word_tokenize(text)
print(words)

['Get', 'me', 'flight', 'to', 'Milan', 'next', 'week', '.', 'I', 'want', 'the', 'cheapest']


In [4]:
#STOPWORDS REMOVAL
# Build the set of tokens to discard: NLTK's English stopwords plus every
# ASCII punctuation character (the tokenizer emits punctuation as tokens).
from nltk.corpus import stopwords  # stopword lists for several languages
from string import punctuation     # string of ASCII punctuation characters

customStopWords = set(stopwords.words('english')) | set(punctuation)

In [5]:
# Keep only the tokens of the sentence that are not in the stopword set.
# NOTE: the membership test is case-sensitive and NLTK's English stopword
# list is lowercase, which is why the capitalised 'I' survives the filter.
wordsWOStopwords = [
    token
    for token in word_tokenize(text)
    if token not in customStopWords
]
print(wordsWOStopwords)

['Get', 'flight', 'Milan', 'next', 'week', 'I', 'want', 'cheapest']


In [6]:
#N-GRAMS
# Count the bigrams (pairs of adjacent tokens) in the stopword-filtered words.
from nltk.collocations import *  # supplies BigramCollocationFinder used below
bigrams_measures = nltk.collocations.BigramAssocMeasures()  # association measures; not used in this cell
finder = BigramCollocationFinder.from_words(wordsWOStopwords)  # construct bigrams from the list of words
# Bigrams and their frequencies, most frequent on top. Sorting the raw
# (bigram, count) items would order them alphabetically by bigram and ignore
# the count, so sort by descending count and break ties alphabetically to
# keep the listing deterministic.
sorted(finder.ngram_fd.items(), key=lambda item: (-item[1], item[0]))

[(('Get', 'flight'), 1),
 (('I', 'want'), 1),
 (('Milan', 'next'), 1),
 (('flight', 'Milan'), 1),
 (('next', 'week'), 1),
 (('want', 'cheapest'), 1),
 (('week', 'I'), 1)]

In [7]:
#STEMMING for different morphological forms of the same word
# The Lancaster stemmer maps 'closed', 'closing' and 'close' to the same
# stem ('clos'), as the output shows; it also lowercases every token.
from nltk.stem.lancaster import LancasterStemmer

text2 = "Mary closed on closing night when she was in the ood to close"
st = LancasterStemmer()
stemmedWords = list(map(st.stem, word_tokenize(text2)))  # stem token by token
print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'ood', 'to', 'clos']


In [8]:
# Stem a second sentence. The tokenizer splits the contraction 'wanna'
# into 'wan' + 'na' before stemming, as the output shows.
from nltk.stem.lancaster import LancasterStemmer

text3 = "I wanna fly with a night flight"
st = LancasterStemmer()
stemmedWords = [st.stem(token) for token in word_tokenize(text3)]
print(stemmedWords)

['i', 'wan', 'na', 'fly', 'with', 'a', 'night', 'flight']


In [9]:
#PARTS OF SPEECH
# Tag every token of text2 with its part of speech; the result is a list of
# (token, tag) pairs and is displayed as the cell's output.
nltk.pos_tag(
    word_tokenize(text2)
)

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('ood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB')]

In [10]:
#WORD SENSE DISAMBIGUATION
# List every WordNet sense of the ambiguous word 'bass' with its definition.
from nltk.corpus import wordnet as wn  # WordNet is a lexicon (vocabulary)

# Each synset is one single definition (sense) of the word.
for ss in wn.synsets('bass'):
    print(ss, ss.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [11]:
# Use the Lesk algorithm to pick the sense of 'bass' that best fits the
# tokenized context sentence (here a musical context).
from nltk.wsd import lesk  # algorithm for word sense disambiguation

sensel = lesk(
    word_tokenize("Sing in a lower tone along with the bass"),
    "bass",
)
print(sensel, sensel.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [12]:
# Same disambiguation with a fishing context: Lesk now selects a fish
# sense of 'bass' instead of a musical one.
sensel2 = lesk(
    word_tokenize("The sea bass was really hard to catch"),
    "bass",
)
print(sensel2, sensel2.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
