In [1]:
import nltk

# Step 1: Tokenizing

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Two-sentence sample passage; it is tokenized here and filtered for stopwords later on.
text = "Mary had a little lamb. Her fleece was as white as snow."

# Sentence-level tokenization: one string per sentence.
sentences = sent_tokenize(text)
print(sentences)

['Mary had a little lamb.', 'Her fleece was as white as snow.']


In [3]:
# Word-level tokenization: one token list per sentence, with punctuation
# marks such as '.' kept as tokens of their own.
words = list(map(word_tokenize, sentences))
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'as', 'white', 'as', 'snow', '.']]


# Step 2: Removing Stopwords

In [4]:
from nltk.corpus import stopwords  # NLTK stopword corpus reader, queried in the following cells
from string import punctuation  # standard-library string of ASCII punctuation characters

# Show the punctuation characters that get filtered out together with the stopwords.
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
# Display NLTK's English stopword list; the entries shown are all lower-case,
# which matters when comparing them against capitalised tokens.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', ... [output truncated]

In [6]:
# One set holding NLTK's English stopwords plus every individual punctuation
# character, so the filtering step needs only a single membership test.
customStopWords = set(stopwords.words('english')).union(punctuation)

In [7]:
wordsMinusStopWords = [word for word in word_tokenize(text) if word not in customStopWords]

In [8]:
print(wordsMinusStopWords)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']


# Step 3: Identifying Grams

In [9]:
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsMinusStopWords)
sorted(finder.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

# Step 4: Stemming & Tagging Parts of Speech

In [10]:
from nltk.stem.lancaster import LancasterStemmer

text2 = "Mary had a little lamb. Her fleece was as white as snow."

# Reduce every token of text2 to its Lancaster stem; the stems come back
# lower-cased and clipped (e.g. 'little' -> 'littl', 'white' -> 'whit').
st = LancasterStemmer()
stemmedWords = list(map(st.stem, word_tokenize(text2)))
print(stemmedWords)

['mary', 'had', 'a', 'littl', 'lamb', '.', 'her', 'fleec', 'was', 'as', 'whit', 'as', 'snow', '.']


In [11]:
# Tag each token of text2 with its part of speech; the cell's value is a
# list of (token, tag) pairs, displayed as the cell output.
nltk.pos_tag(word_tokenize(text2))

[('Mary', 'NNP'),
 ('had', 'VBD'),
 ('a', 'DT'),
 ('little', 'JJ'),
 ('lamb', 'NN'),
 ('.', '.'),
 ('Her', 'PRP$'),
 ('fleece', 'NN'),
 ('was', 'VBD'),
 ('as', 'RB'),
 ('white', 'JJ'),
 ('as', 'IN'),
 ('snow', 'NN'),
 ('.', '.')]

# Step 5: Word Sense Disambiguation

In [19]:
from nltk.corpus import wordnet as wn
# List every WordNet synset (word sense) found for 'blue', one per line,
# followed by that sense's definition.
for ss in wn.synsets('blue'):
    print(ss, ss.definition())

Synset('blue.n.01') blue color or pigment; resembling the color of the clear sky in the daytime
Synset('blue.n.02') blue clothing
Synset('blue.n.03') any organization or party whose uniforms or badges are blue
Synset('blue_sky.n.01') the sky as viewed during daylight
Synset('bluing.n.01') used to whiten laundry or hair or give it a bluish tinge
Synset('amobarbital_sodium.n.01') the sodium salt of amobarbital that is used as a barbiturate; used as a sedative and a hypnotic
Synset('blue.n.07') any of numerous small butterflies of the family Lycaenidae
Synset('blue.v.01') turn blue
Synset('blue.s.01') of the color intermediate between green and violet; having a color similar to that of a clear unclouded sky
Synset('blue.s.02') used to signify the Union forces in the American Civil War (who wore blue uniforms)
Synset('gloomy.s.02') filled with melancholy and despondency
Synset('blasphemous.s.02') characterized by profanity or cursing
Synset('blue.s.05') suggestive of sexual impropriety
Syn... [output truncated]

In [26]:
from nltk.wsd import lesk

# Ask the Lesk algorithm which WordNet sense of 'wicked' best fits the
# sentence. For this input it settles on the "morally bad" sense rather than
# the slang intensifier the sentence actually uses.
sense1 = lesk(
    context_sentence=word_tokenize('This song is wicked good'),
    ambiguous_word='wicked',
)
print(sense1, sense1.definition())

Synset('wicked.a.01') morally bad in principle or practice
