In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet as wn

In [2]:
# Use path length in the WordNet hierarchy to measure word similarity.
# Words are looked up by sense through their synonym set (synset) name:
# 'deer.n.01' -> word 'deer', n = noun, 01 = synonym set for the first meaning of the word
deer = wn.synset('deer.n.01')
# bare last expression so the notebook displays the synset that was found
deer

Synset('deer.n.01')

In [3]:
# first noun sense of "elk", compared with the deer synset by path similarity
elk = wn.synset('elk.n.01')
deer.path_similarity(elk)

0.5

In [4]:
# same comparison against the first noun sense of "horse"
horse = wn.synset('horse.n.01')
deer.path_similarity(horse)

0.14285714285714285

In [5]:
# Use an information-content (IC) based measure to find word similarity
from nltk.corpus import wordnet_ic
# load the 'ic-brown.dat' information-content file shipped with NLTK
brown_ic = wordnet_ic.ic('ic-brown.dat')
# Lin similarity between deer and elk, using the loaded information content
deer.lin_similarity(elk, brown_ic)

0.8623778273893673

In [6]:
# Lin similarity between deer and horse, using the same information-content data
deer.lin_similarity(horse, brown_ic)

0.7726998936065773

In [7]:
# NLTK collocation finders and association measures
from nltk.collocations import *
# example corpora that accompany the NLTK book (text1 ... text9)
from nltk.book import *
# text1 is "Moby Dick": keep only the purely alphabetic tokens, lower-cased,
# which drops numbers and sentence marks
text = [token.lower() for token in text1 if token.isalpha()]

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [8]:
# association measures for bigrams; .pmi is used as the scoring function below
bigram_measures = nltk.collocations.BigramAssocMeasures()
# collect bigram statistics from the token list `text`
finder = BigramCollocationFinder.from_words(text)
# the 10 best bigrams when scored by PMI (no frequency filter applied yet)
finder.nbest(bigram_measures.pmi,10)

[('accidental', 'advantages'),
 ('adoring', 'cherubim'),
 ('affluent', 'cultivated'),
 ('agassiz', 'imagines'),
 ('agrarian', 'freebooting'),
 ('albert', 'durer'),
 ('alexanders', 'parcelling'),
 ('amphitheatrical', 'heights'),
 ('anacharsis', 'clootz'),
 ('andrew', 'jackson')]

In [9]:
# find all the bigrams with occurrence of at least 10, this modifies our "finder" object
# (the filter is applied in place, so later calls on `finder` only see the filtered bigrams)
finder.apply_freq_filter(10)
# the 10 best of the remaining bigrams, again scored by PMI
finder.nbest(bigram_measures.pmi,10)

[('mrs', 'hussey'),
 ('o', 'clock'),
 ('dough', 'boy'),
 ('cape', 'horn'),
 ('moby', 'dick'),
 ('town', 'ho'),
 ('try', 'works'),
 ('new', 'bedford'),
 ('ha', 'ha'),
 ('king', 'post')]

In [10]:
# Working with Latent Dirichlet Allocation (LDA) in Python
# Several packages are available, such as gensim and lda. Text needs to be
# preprocessed: tokenizing, normalizing (e.g. lower-casing), stopword
# removal, stemming, and then transforming into a (sparse) matrix of
# word (bigram, etc.) occurrences.
# generate a set of preprocessed documents
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.book import *

In [11]:
# number of entries in NLTK's English stopword list
len(stopwords.words('english'))

179

In [12]:
# display the English stopword list itself
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [13]:
# extract just the stemmed words without numbers and sentence marks and make them lower case
p_stemmer = PorterStemmer()
sw = stopwords.words('english')
# Membership tests on the raw list scan its entries for every token;
# a frozenset makes each lookup constant-time. `sw` itself stays a list.
sw_lookup = frozenset(sw)


def stem_content_words(tokens):
    """Return the Porter stems of the alphabetic, non-stopword tokens.

    Args:
        tokens: iterable of string tokens (e.g. one of the nltk.book texts).

    Returns:
        list of str: the lower-cased, stemmed tokens in their original order;
        tokens that are not purely alphabetic, or whose lower-cased form is
        an English stopword, are dropped.
    """
    stems = []
    for token in tokens:
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in sw_lookup:
            stems.append(p_stemmer.stem(lowered))
    return stems


# one preprocessed document per book; doc1..doc5 keep their original names
doc_set = [stem_content_words(book) for book in (text1, text2, text3, text4, text5)]
doc1, doc2, doc3, doc4, doc5 = doc_set

In [14]:
# under Windows this generates a warning
import gensim
from gensim import corpora, models

In [15]:
# build a gensim Dictionary from the five preprocessed documents
# (presumably assigns each distinct stem an integer id - the ids appear in the bag-of-words tuples)
dictionary = corpora.Dictionary(doc_set)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x292336afeb0>

In [16]:
# convert every document to its bag-of-words representation
corpus = [dictionary.doc2bow(stems) for stems in doc_set]

In [17]:
# The corpus holds the 5 documents; each document is a list of
# (feature index, occurrence count) tuples.
# Show the type at each nesting level: corpus -> document -> entry
for obj in (corpus, corpus[0], corpus[0][0]):
    print(type(obj))
# every 2000th entry of the first document as a small sample
print(corpus[0][::2000])

<class 'list'>
<class 'list'>
<class 'tuple'>
[(0, 2), (2000, 1), (4000, 4), (6000, 1), (8000, 97), (10000, 1)]


In [18]:
# let's try 4 topics for our 5 documents
# 50 passes takes quite a while, so let's try fewer
# LDA training is randomized: without a fixed random_state the topics (and
# their numbering) change on every run, so pin the seed for reproducibility
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=4,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

In [19]:
# print all 4 topics, each described by its 10 highest-weighted words
print(ldamodel.print_topics(num_topics=4, num_words=10))

[(0, '0.010*"govern" + 0.010*"nation" + 0.009*"peopl" + 0.007*"us" + 0.007*"state" + 0.005*"upon" + 0.005*"power" + 0.005*"must" + 0.005*"countri" + 0.005*"great"'), (1, '0.014*"whale" + 0.008*"one" + 0.006*"like" + 0.005*"ship" + 0.005*"upon" + 0.005*"ye" + 0.005*"man" + 0.005*"sea" + 0.004*"ahab" + 0.004*"boat"'), (2, '0.027*"unto" + 0.021*"said" + 0.013*"son" + 0.013*"thou" + 0.012*"thi" + 0.012*"shall" + 0.011*"thee" + 0.010*"god" + 0.009*"lord" + 0.009*"father"'), (3, '0.014*"part" + 0.013*"join" + 0.010*"lol" + 0.009*"mr" + 0.009*"elinor" + 0.008*"hi" + 0.008*"could" + 0.007*"would" + 0.007*"mariann" + 0.005*"one"')]
