In [1]:
import nltk

In [2]:
# Load Moby Dick from NLTK's Gutenberg corpus as a sequence of word tokens.
md = nltk.corpus.gutenberg.words(fileids="melville-moby_dick.txt")

In [4]:
# Work with just the first 22 tokens (title and etymology heading) while experimenting.
md_22 = md[0:22]

In [5]:
# Print only the purely alphabetic tokens; punctuation and numbers are skipped.
for token in md_22:
    if not token.isalpha():
        continue
    print(token)

Moby
Dick
by
Herman
Melville
ETYMOLOGY
Supplied
by
a
Late
Consumptive
Usher
to
a
Grammar
School


In [6]:
# Print every token lowercased; punctuation and numbers are still included here.
for token in md_22:
    lowered = token.lower()
    print(lowered)

[
moby
dick
by
herman
melville
1851
]
etymology
.
(
supplied
by
a
late
consumptive
usher
to
a
grammar
school
)


In [8]:
# Normalize: keep the alphabetic tokens only, then lowercase them.
norm = list(map(str.lower, filter(str.isalpha, md_22)))

In [9]:
# Display the normalized token list.
norm

['moby',
 'dick',
 'by',
 'herman',
 'melville',
 'etymology',
 'supplied',
 'by',
 'a',
 'late',
 'consumptive',
 'usher',
 'to',
 'a',
 'grammar',
 'school']

In [10]:
# Create a Porter stemmer; it is compared with the Lancaster stemmer and the
# WordNet lemmatizer on the same sample words below.
porter = nltk.stem.PorterStemmer()

In [12]:
# Sample words, arranged as base/variant pairs, used to compare the normalizers.
my_list = [
    "cat", "cats",
    "lie", "lying",
    "run", "running",
    "city", "cities",
    "month", "monthly",
    "woman", "women",
]

In [13]:
# Print the Porter stem of each sample word, one per line.
for token in my_list:
    stemmed = porter.stem(token)
    print(stemmed)

cat
cat
lie
lie
run
run
citi
citi
month
monthli
woman
women


In [14]:
# Create a Lancaster stemmer to compare against the Porter results above.
lancaster = nltk.stem.LancasterStemmer()

In [15]:
# Print the Lancaster stem of each sample word, one per line.
for token in my_list:
    stemmed = lancaster.stem(token)
    print(stemmed)

cat
cat
lie
lying
run
run
city
city
mon
month
wom
wom


In [16]:
# Create a WordNet lemmatizer (author's note: not the best way to normalize).
wnlem = nltk.stem.WordNetLemmatizer()

In [17]:
# Print the WordNet lemma of each sample word (author's note: slow to run).
# lemmatize() is called without a part-of-speech argument; in the recorded
# output the verb forms "lying" and "running" come back unchanged.
# NOTE(review): presumably the default POS is noun — confirm in the NLTK docs.
for token in my_list:
    lemma = wnlem.lemmatize(token)
    print(lemma)

cat
cat
lie
lying
run
running
city
city
month
monthly
woman
woman


In [19]:
# part of speech tagging

In [21]:
# Example sentence for the tokenization and POS-tagging demo.
text = (
    "I walked to the cafe "
    "to buy coffee before work."
)

In [23]:
# Split the sentence into word and punctuation tokens.
tokens = nltk.tokenize.word_tokenize(text)

In [24]:
# Display the tokens; the final period is a separate token.
tokens

['I',
 'walked',
 'to',
 'the',
 'cafe',
 'to',
 'buy',
 'coffee',
 'before',
 'work',
 '.']

In [25]:
# Tag each token with its part of speech; the result is a list of (token, tag) pairs.
nltk.pos_tag(tokens)

[('I', 'PRP'),
 ('walked', 'VBD'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('cafe', 'NN'),
 ('to', 'TO'),
 ('buy', 'VB'),
 ('coffee', 'NN'),
 ('before', 'IN'),
 ('work', 'NN'),
 ('.', '.')]

In [27]:
# Print the reference list of tags with a description and examples for each
# (long output), to decode tags such as PRP, VBD and NN shown above.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [28]:
# Tag the normalized tokens with the coarse "universal" tagset (NOUN, ADP, ...)
# so that nouns can be picked out in the next step.
# NOTE(review): the tagger sees lowercased, punctuation-free input here; consider
# tagging the raw tokens first and normalizing afterwards — verify the effect.
md_tags = nltk.tag.pos_tag(norm, tagset="universal")

In [29]:
# Peek at the first five (word, tag) pairs.
md_tags[:5]

[('moby', 'NOUN'),
 ('dick', 'NOUN'),
 ('by', 'ADP'),
 ('herman', 'NOUN'),
 ('melville', 'NOUN')]

In [30]:
# Keep only the words that were tagged as nouns.
md_nouns = [token for token, tag in md_tags if tag == "NOUN"]

In [31]:
# Show up to the first ten nouns (only nine exist in this 22-token sample).
md_nouns[:10]

['moby',
 'dick',
 'herman',
 'melville',
 'etymology',
 'consumptive',
 'usher',
 'grammar',
 'school']

In [32]:
# Count how often each noun occurs, to find the most commonly used nouns.
nouns_fd = nltk.probability.FreqDist(md_nouns)

In [34]:
# List up to ten (noun, count) pairs, most frequent first.
nouns_fd.most_common(10)

[('moby', 1),
 ('dick', 1),
 ('herman', 1),
 ('melville', 1),
 ('etymology', 1),
 ('consumptive', 1),
 ('usher', 1),
 ('grammar', 1),
 ('school', 1)]

In [36]:
# Words that occur with multiple parts of speech (e.g. over, spoke, answer)

In [38]:
# Load Alice in Wonderland from NLTK's Gutenberg corpus as word tokens.
alice = nltk.corpus.gutenberg.words(fileids="carroll-alice.txt")

In [39]:
# Normalize: keep the alphabetic tokens only, then lowercase them.
alice_norm = list(map(str.lower, filter(str.isalpha, alice)))

In [40]:
# Tag every normalized word with the coarse "universal" tagset.
# NOTE(review): the input is lowercased and punctuation-free, so the tagger has
# no capitalization or sentence-boundary cues — verify whether tagging the raw
# tokens first gives cleaner tags.
alice_tags = nltk.tag.pos_tag(alice_norm, tagset="universal")

In [41]:
# Build a per-word distribution of tags from the (word, tag) pairs:
# indexing by a word yields the counts of each tag it received.
alice_cfd = nltk.probability.ConditionalFreqDist(alice_tags)

In [44]:
# Print the one-line summary, which reports the number of conditions (distinct words).
print(alice_cfd)

<ConditionalFreqDist with 2569 conditions>


In [46]:
# Tag counts for "over".
alice_cfd["over"]

FreqDist({'ADP': 31, 'PRT': 5, 'ADV': 4})

In [47]:
# Tag counts for "spoke".
alice_cfd["spoke"]

FreqDist({'VERB': 16, 'NOUN': 1})

In [48]:
# Tag counts for "answer".
alice_cfd["answer"]

FreqDist({'NOUN': 5, 'VERB': 3, 'ADP': 1})

bryant-stories.txt

In [49]:
# Load the Bryant stories text from NLTK's Gutenberg corpus as word tokens.
stories = nltk.corpus.gutenberg.words(fileids="bryant-stories.txt")

In [50]:
# Tag the raw tokens (original case, punctuation included) with the universal tagset.
tags = nltk.tag.pos_tag(stories, tagset="universal")

In [51]:
# Peek at the first ten (token, tag) pairs.
tags[:10]

[('[', 'NOUN'),
 ('Stories', 'NOUN'),
 ('to', 'PRT'),
 ('Tell', 'VERB'),
 ('to', 'PRT'),
 ('Children', 'NOUN'),
 ('by', 'ADP'),
 ('Sara', 'NOUN'),
 ('Cone', 'NOUN'),
 ('Bryant', 'NOUN')]

In [53]:
# Scan consecutive tagged triples and print every "<noun> or <noun>" phrase.
for first, middle, last in nltk.trigrams(tags):
    is_noun_or_noun = (
        first[1] == "NOUN" and middle[0] == "or" and last[1] == "NOUN"
    )
    if is_noun_or_noun:
        print(" ".join((first[0], middle[0], last[0])))

ship or part
food or water
queens or princesses
rank or wealth


In [54]:
# chunking

In [55]:
# Example sentence for the chunking demo (the trailing space is part of the string).
text = (
    "I will go to the coffee shop in New York "
    "after I get off the jet plane. "
)

In [58]:
# Tokenize the sentence, then tag each token with its part of speech.
text_tag = nltk.tag.pos_tag(nltk.tokenize.word_tokenize(text))

In [59]:
# Display the tagged sentence as (token, tag) pairs.
text_tag

[('I', 'PRP'),
 ('will', 'MD'),
 ('go', 'VB'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('coffee', 'NN'),
 ('shop', 'NN'),
 ('in', 'IN'),
 ('New', 'NNP'),
 ('York', 'NNP'),
 ('after', 'IN'),
 ('I', 'PRP'),
 ('get', 'VBP'),
 ('off', 'IN'),
 ('the', 'DT'),
 ('jet', 'NN'),
 ('plane', 'NN'),
 ('.', '.')]

In [60]:
# Chunk grammar for the regexp chunker: three rules under the label "Chunk",
# each matching a run of one or more consecutive tokens that share a tag
# (NNPS, NNP, or NN).
sequence = '''
            Chunk:
            {<NNPS>+}
            {<NNP>+}
            {<NN>+}'''

In [62]:
# Build a chunk parser from the grammar string defined in `sequence`.
NPChunker = nltk.chunk.RegexpParser(sequence)

In [63]:
# Apply the chunk grammar to the tagged sentence; the result is a tree rooted at S.
result = NPChunker.parse(text_tag)

In [64]:
# Print the tree; matched runs appear as (Chunk ...) subtrees.
print(result)

(S
  I/PRP
  will/MD
  go/VB
  to/TO
  the/DT
  (Chunk coffee/NN shop/NN)
  in/IN
  (Chunk New/NNP York/NNP)
  after/IN
  I/PRP
  get/VBP
  off/IN
  the/DT
  (Chunk jet/NN plane/NN)
  ./.)


In [65]:
# named entity recognition

In [66]:
# text = open("example.txt", encoding="utf-8").read()

In [67]:
# text_tag = nltk.pos_tag(nltk.word_tokenize(text))

In [68]:
# text_ch = nltk.ne_chunk(text_tag)

In [69]:
# for chunk in text_ch:
#     if hasattr(chunk, 'label'):
#         print(chunk.label(), " ".join(c[0] for c in chunk.leaves()))

In [70]:
# Remove stopwords (she, her, you, he, etc.)

In [71]:
# Display the raw Alice token sequence (original case, punctuation included).
alice

['[', 'Alice', "'", 's', 'Adventures', 'in', ...]

In [72]:
# Rebind `alice` to its normalized form: alphabetic tokens only, lowercased
# (the same filtering that produced `alice_norm`).
alice = list(map(str.lower, filter(str.isalpha, alice)))

In [73]:
# Count how often each normalized word occurs.
alice_fd = nltk.probability.FreqDist(alice)

In [75]:
# Take the 100 most frequent (word, count) pairs, then keep just the words.
alice_100 = alice_fd.most_common(100)
common = [token for token, _count in alice_100]

In [76]:
# Display the 100 most frequent words, most frequent first.
common

['the',
 'and',
 'to',
 'a',
 'it',
 'she',
 'i',
 'of',
 'said',
 'you',
 'alice',
 'in',
 'was',
 'that',
 'as',
 'her',
 't',
 'at',
 's',
 'on',
 'all',
 'with',
 'had',
 'but',
 'for',
 'they',
 'so',
 'be',
 'not',
 'very',
 'what',
 'this',
 'little',
 'he',
 'out',
 'is',
 'one',
 'down',
 'up',
 'there',
 'if',
 'his',
 'then',
 'about',
 'no',
 'them',
 'know',
 'like',
 'were',
 'would',
 'went',
 'again',
 'herself',
 'do',
 'have',
 'when',
 'or',
 'could',
 'queen',
 'thought',
 'off',
 'time',
 'how',
 'me',
 'into',
 'see',
 'well',
 'did',
 'm',
 'who',
 'can',
 'king',
 'your',
 'don',
 'now',
 'by',
 'turtle',
 'began',
 'my',
 'its',
 'll',
 'an',
 'way',
 'hatter',
 'mock',
 'quite',
 'gryphon',
 'are',
 'think',
 'just',
 'their',
 'rabbit',
 'much',
 'say',
 'some',
 'first',
 'here',
 'head',
 'go',
 'only']

In [77]:
# Words among the top 100 that are not in NLTK's English stopword list.
descriptive = set(common).difference(nltk.corpus.stopwords.words("english"))

In [78]:
# Display the frequent words that remain once stopwords are removed.
descriptive

{'alice',
 'began',
 'could',
 'first',
 'go',
 'gryphon',
 'hatter',
 'head',
 'king',
 'know',
 'like',
 'little',
 'mock',
 'much',
 'one',
 'queen',
 'quite',
 'rabbit',
 'said',
 'say',
 'see',
 'think',
 'thought',
 'time',
 'turtle',
 'way',
 'well',
 'went',
 'would'}