In [3]:
import nltk
import re
from nltk.corpus import brown
from nltk.collocations import *
from string import punctuation
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from urllib.request import urlopen
from nltk.tag import pos_tag
from nltk import word_tokenize
import string, random
from nltk import bigrams
from nltk import trigrams
stop_words = nltk.corpus.stopwords.words('english')

In [4]:
# Training on all brown sentences, excluding news corpus
brown_tagged_sents = brown.tagged_sents(categories=['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies',
'humor', 'learned', 'lore', 'mystery', 'religion', 'reviews', 'romance',
'science_fiction'])

cooking_action_sents = [[('Strain', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Mix', 'VB'), ('them', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Season', 'VB'), ('them', 'PPS'), ('with', 'IN'), ('pepper', 'NN'), ('.', '.')], 
                        [('Wash', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Chop', 'VB'), ('the', 'AT'), ('greens', 'NNS'), ('.', '.')],
                        [('Slice', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Bake', 'VB'), ('the', 'AT'), ('cake', 'NN'), ('.', '.')],
                        [('Pour', 'VB'), ('into', 'IN'), ('a', 'AT'), ('mold', 'NN'), ('.', '.')],
                        [('Stir', 'VB'), ('the', 'AT'), ('mixture', 'NN'), ('.', '.')],
                        [('Moisten', 'VB'), ('the', 'AT'), ('grains', 'NNS'), ('.', '.')],
                        [('Cook', 'VB'), ('the', 'AT'), ('duck', 'NN'), ('.', '.')],
  
                        [('Drain', 'VB'), ('for', 'IN'), ('one', 'CD'), ('day', 'NN'), ('.', '.')]]

all_tagged_sents = cooking_action_sents + brown_tagged_sents
all_tagged_sents

def create_data_sets():
    size = int(len(all_tagged_sents) * 0.9)
    train_sents = all_tagged_sents[:size]
    test_sents = all_tagged_sents[size:]
    return train_sents, test_sents
train_sents, test_sents = create_data_sets()

def build_backoff_tagger (train_sents):
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    return t2


ngram_tagger = build_backoff_tagger(train_sents)

more_stopwords = ["''", "--","``", "mr.", "mrs.", "n't", "'s", "'i","said"]
my_stopwords = stop_words + more_stopwords

In [5]:
def tokenize_and_tag_text(corpus):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus) # Split text into sentences 
    sents = [nltk.word_tokenize(word.lower()) for word in raw_sents] # tokenize sentences
    tagged_POS_sents = [ngram_tagger.tag(word) for word in sents] # tags sentences
    return tagged_POS_sents

In [6]:
def freq_normed_unigrams(tagged_sents, num_terms=50):
    wnl = WordNetLemmatizer() # to get word stems
        
    normed_tagged_words = [wnl.lemmatize(word[0].lower()) for sent in tagged_sents
                           for word in sent 
                           if word[0].lower() not in nltk.corpus.stopwords.words('english')
                           and word[0] not in punctuation # remove punctuation
                           and word[1].startswith('N')]  #retain only nouns 
   
    top_normed_tagpairs = nltk.FreqDist(normed_tagged_words).most_common(num_terms) #get the num_terms most frequent
    return [word for (word,count) in top_normed_tagpairs] #extract out the words from the pairs

# Frequent Unigram for Brown News Corpus

In [7]:
brown_top_unigrams = freq_normed_unigrams(brown.tagged_sents(categories = 'news'))
print('Printing top 50 unigrams in Brown News Corpus... \n')
print(brown_top_unigrams)


Printing top 50 unigrams in Brown News Corpus... 

['mrs.', 'year', 'state', 'mr.', 'president', 'home', 'school', 'time', 'week', 'day', 'member', 'house', 'city', 'bill', 'committee', 'service', 'government', 'program', 'county', 'game', 'month', 'man', 'university', 'company', 'law', 'car', 'board', 'tax', 'kennedy', 'john', 'night', 'meeting', 'administration', 'court', 'family', 'plan', 'library', 'club', 'sale', 'country', 'u.s.', 'problem', 'party', 'system', 'case', 'people', 'group', 'child', 'yesterday', 'cent']


# Frequent Bigrams and the conditional Freq Dist, given most frequent Unigram- Brown News Corpus
Finding frequent Bigrams by applying conditional freq distribution on each of the top unigram found above.

In [8]:

brown = nltk.corpus.brown
brown_sents = brown.sents(categories='news')

bigram_list = []

# generation function make a choice from the probable continuation words
def generate_model(cfdist, word, num=10):
    for i in range(num):
        print(word, "," ,end=' ')
        if cfdist[word]:
            words = list(cfdist[word])
            word = random.choice(words)
        else:
            return


for sent in brown_sents:
    sent_lower = [w.lower() for w in sent if w not in nltk.corpus.stopwords.words('english') 
                  and w not in punctuation and len(w) > 2]
    bigrams = list(nltk.bigrams(sent_lower))
    bigram_list.extend(bigrams)


cfd = nltk.ConditionalFreqDist(bigram_list)


for word in brown_top_unigrams:
    generate_model(cfd, word)

    
    


mrs. , samuel , goodis , representing , today , part , project , halted , end , seven , year , students , found , mantlepiece , attributed , samuel , mcintyre , salem , special , interest , state , widely , among , stage , filial , piety , totally , devoted , friend , beaten , mr. , pezza , 734 , hartford , avenue , central , figure , prominently , new , zealand , president , nothing , abstention , u.s. , first , jane , drury , lane , already , surpassed , home , run , well , winter , tour , reached , 1-1/2-story , brick , paneled , doors , school , dance , fuhrmann's , 770 , students , junior , year , ordered , university , loath , time , reaction , bitter , winter , long , pass , throw , outside , world , competition , week , world , parolees , mother , young , ladies , waiting , misses , ticker , day , reception , daughter , rhodes , semmes , baker , houston , dr. , michael , walsh , member , nevah , sholom , congregation , house , staff , librarian-board , relationships , two , sto

# Frequent Unigrams with Hyperterms

In [9]:
def categories_from_hypernyms(termlist, num_terms=50):
    
    hypterms = []
    hypterms_dict = defaultdict(list)
    for term in termlist:                  # for each term
        s = wn.synsets(term.lower(), 'n')  # get its nominal synsets
        for syn in s:                      # for each lemma synset
            for hyp in syn.hypernyms():    # It has a list of hypernyms
                hypterms = hypterms + [hyp.name()]      # Extract the hypernym name and add to list
                hypterms_dict[hyp.name()].append(term)  # Extract examples and add them to dict
                
    hypfd = nltk.FreqDist(hypterms)
    for (name, count) in hypfd.most_common(num_terms):
        print (name, '({0})'.format(count))
        print ('\t', ', '.join(set(hypterms_dict[name])))
        print ('\n')

In [12]:
# Frequent Unigrams with Hyperterms for Brown News corpus

brown = nltk.corpus.brown
brown_top_terms = freq_normed_unigrams(brown.tagged_sents(categories = 'news'))

print("PRINTING UNIGRAMS WITH HYPERNYMS FOR BROWN NEWS CORPUS..\n")
categories_from_hypernyms(brown_top_terms,100)


PRINTING UNIGRAMS WITH HYPERNYMS FOR BROWN NEWS CORPUS..

time_period.n.01 (15)
	 year, school, day, night, week, time, month


building.n.01 (6)
	 house, club, library, school


group.n.01 (5)
	 man, people, system


administrative_district.n.01 (5)
	 country, county, state, city


person.n.01 (5)
	 man, party, child, case


unit.n.03 (5)
	 house, home, family, company, member


time_unit.n.01 (4)
	 day, month, night


collection.n.01 (4)
	 library, family, law


educational_institution.n.01 (3)
	 university, school


room.n.01 (3)
	 john, library, court


body.n.02 (3)
	 administration, university, school


male.n.02 (3)
	 man


compartment.n.02 (3)
	 car


social_gathering.n.01 (3)
	 party, meeting, company


legal_document.n.01 (2)
	 law, bill


political_unit.n.01 (2)
	 country, state


game_equipment.n.01 (2)
	 man, game


attribute.n.02 (2)
	 state, time


system.n.04 (2)
	 government, program


association.n.01 (2)
	 club, family


selling.n.01 (2)
	 sale


social_control.n.01 

In [13]:

# Frequent Unigrams with Hyperterms for Oliver Twist corpus

with open('Oliver_Copy.txt', 'r') as text_file:
    oliver_text = text_file.read()
oliver_top_terms = freq_normed_unigrams(tokenize_and_tag_text(oliver_text), 50)


print("PRINTING UNIGRAMS WITH HYPERNYMS FOR OLIVER TWIST CORPUS...\n")
categories_from_hypernyms(oliver_top_terms,100)

PRINTING UNIGRAMS WITH HYPERNYMS FOR OLIVER TWIST CORPUS...

time_period.n.01 (11)
	 life, day, time, night


person.n.01 (7)
	 man, jew, face, child, life, friend


woman.n.01 (5)
	 girl, lady


male.n.02 (4)
	 man, boy


time_unit.n.01 (3)
	 day, night


man.n.01 (3)
	 sir, gentleman, boy


building.n.01 (3)
	 house


female.n.02 (3)
	 girl, woman


being.n.01 (3)
	 life


external_body_part.n.01 (3)
	 face, head


opportunity.n.01 (3)
	 day, room, street


communication.n.02 (2)
	 voice


lover.n.01 (2)
	 dear, girl


cognition.n.01 (2)
	 place, head


condition.n.01 (2)
	 place, way


structure.n.04 (2)
	 head


advocate.n.01 (2)
	 friend, voice


position.n.07 (2)
	 room, way


adult.n.01 (2)
	 man, woman


body_servant.n.01 (2)
	 man, gentleman


manservant.n.01 (2)
	 man, gentleman


title.n.06 (2)
	 mrs., mr.


play.n.08 (2)
	 doctor, house


point.n.02 (2)
	 place


thoroughfare.n.01 (2)
	 street


implementation.n.02 (1)
	 way


group.n.01 (1)
	 man


beginning.n.04 (1)
	 hea

# Chunking using Noun Phrases with prepositions

In [21]:
def chunker(sentence_list):
    cp = nltk.RegexpParser("CHUNK: {<DT>?<NN.*>+<IN><NN.*>+}")  # Noun phrases, ending with nouns including prepositions
    list_words = []
    for sent_no in range(len(sentence_list)):
        tree = cp.parse(sentence_list[sent_no])
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK' and len(subtree) > 2: # retaining chunks with 3 or more words
                for word, tag in subtree.leaves():
                    if tag != 'IN':     # Excluding preposition from found chunks to get frequently gropued words
                        list_words.append(word)
    fd = nltk.FreqDist(list_words)
    most_common = fd.most_common(400)
    return most_common


# Applying Chunking Brown Corpus 

brown = nltk.corpus.brown
brown_sentence_list = brown.tagged_sents(categories = 'news')

brown_most_common = chunker(brown_sentence_list)

print(" PRINTING GIST OF BROWN NEWS CORPUS USING NOUN PHRASE CHUNKING... \n",)

for word, freq in brown_most_common:
    print(' ', word ,',', end="")
    

 PRINTING GIST OF BROWN NEWS CORPUS USING NOUN PHRASE CHUNKING... 

  number ,  years ,  kind ,  board ,  government ,  work ,  state ,  interest ,  use ,  education ,  service ,  program ,  Communist ,  President ,  game ,  members ,  libraries ,  sense ,  group ,  member ,  series ,  mind ,  labor ,  golf ,  need ,  front ,  charge ,  time ,  schools ,  plan ,  aid ,  machinery ,  director ,  hand ,  bonds ,  center ,  race ,  faculty ,  headquarters ,  Sen. ,  health ,  price ,  chairman ,  problem ,  election ,  day ,  State ,  cost ,  lot ,  base ,  farm ,  men ,  governor ,  hundreds ,  administration ,  acts ,  behalf ,  directors ,  excellence ,  lines ,  increase ,  thousands ,  conspiracy ,  couple ,  family ,  art ,  form ,  fire ,  discrimination ,  that ,  methods ,  history ,  Hill ,  control ,  months ,  grants ,  Capitol ,  side ,  families ,  traffic ,  sales ,  Mayor ,  population ,  office ,  Judge ,  field ,  prison ,  degree ,  trade ,  pair ,  support ,  construct

In [15]:
with open('Oliver_Copy.txt', 'r') as text_file:
    oliver_corpus = text_file.read()

In [16]:
oliver_tagged_sent = tokenize_and_tag_text(oliver_corpus)
oliver_most_common = chunker(oliver_tagged_sent)

#Chunking Oliver Twist corpus
print(" PRINTING GIST OF OLIVER CORPUS USING NOUN PHRASE CHUNKING... \n")

for word, freq in oliver_most_common:
    print(' ', word ,',', end="")

 PRINTING GIST OF OLIVER CORPUS USING NOUN PHRASE CHUNKING... 

  's ,  oliver ,  mr. ,  time ,  this ,  bumble ,  sikes ,  fagin ,  life ,  hand ,  look ,  head ,  way ,  couple ,  london ,  place ,  day ,  tears ,  mrs. ,  piece ,  bread ,  house ,  death ,  deal ,  man ,  face ,  side ,  course ,  expression ,  glass ,  pair ,  money ,  night ,  gentleman ,  business ,  sort ,  i ,  paper ,  brownlow ,  air ,  men ,  front ,  thing ,  hour ,  number ,  cry ,  nancy ,  fear ,  state ,  heart ,  home ,  spirits ,  toby ,  boy ,  question ,  mind ,  scrap ,  water ,  society ,  manner ,  street ,  foot ,  help ,  sound ,  noise ,  fire ,  point ,  flight ,  another ,  corney ,  crackit ,  bates ,  charley ,  terror ,  length ,  morning ,  alarm ,  that ,  company ,  exclamations ,  view ,  show ,  noah ,  loss ,  heaven ,  voice ,  office ,  eyes ,  love ,  breath ,  reply ,  ale ,  name ,  action ,  surprise ,  bit ,  word ,  passion ,  order ,  form ,  chitling ,  agony ,  thieves , 

# Collocations-Brown News Corpus
 1. Finding trigrams that occur at least 2 times, retain top 100 according to PMI
 2. Finding bigrams that occur at least 4 times, which are not subsumed by the retained trigrams, ordering by PMI, retaining top 100
 3. Excluding trigrams that begin or end with stopwords, removing stopwords from bigrams

In [17]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = TrigramCollocationFinder.from_words(brown.words(categories = 'news'))

finder.apply_freq_filter(2)

finder.apply_word_filter(lambda w: w in string.punctuation)
finder.apply_word_filter(lambda w: w.lower() in my_stopwords)


trigram_list = finder.nbest(trigram_measures.pmi, 100)
    
trigram_set = set()
for a,b,c in trigram_list:
    trigram_set.add(a.lower())
    trigram_set.add(b.lower())
    trigram_set.add(c.lower())

        
    
bigram_measures = nltk.collocations.BigramAssocMeasures()

finder = BigramCollocationFinder.from_words(brown.words(categories = 'news'))

finder.apply_freq_filter(4)
finder.apply_word_filter(lambda w: w.lower() in trigram_set)
finder.apply_word_filter(lambda w: w in string.punctuation)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in my_stopwords)
bigram_list = finder.nbest(bigram_measures.pmi, 100)
 
    
print('PRINTING GIST OF COLLOCATIONS FOR BROWN NEWS CORPUS.... \n ')

for a, b, c in trigram_list:
    print(' ', a, b, c, ',', end="",)
for a, b in bigram_list:
    print(' ', a, b, ',', end="")

PRINTING GIST OF COLLOCATIONS FOR BROWN NEWS CORPUS.... 
 
  Ku Klux Klan ,  Pinar Del Rio ,  Rural Roads Authority ,  Post Office Box ,  Diversified Growth Stock ,  Patrick's Day Purse ,  Notre Dame Chapter ,  esprit de corps ,  Growth Stock Fund ,  electronic data processing ,  La Dolce Vita ,  oil mill supplies ,  Prince Souvanna Phouma ,  Holy Cross Hospital ,  First Lady Jacqueline ,  Duncan Phyfe furniture ,  Cherry Hill Road ,  Dow Jones industrial ,  Speaker Sam Rayburn ,  J. Clinton Bowman ,  Christian Youth Crusade ,  Stage 1 Residential ,  Big Four summit ,  15 pounds lighter ,  Mile Road East ,  Robert O. Spurdle ,  Philmont Country Club ,  test ban negotiations ,  Gin Supply Co. ,  potato chip industry ,  South Viet Nam ,  $800 billion economy ,  Air Force Academy ,  Armed Services Committee ,  Emory University's charter ,  nuclear test ban ,  Ivan Allen Jr. ,  fire fighters association ,  T. F. Zimmerman ,  Henry Hall Wilson ,  South Park Way ,  total passing yardage ,  c

# Collocations-Oliver Twist
  1. Finding trigrams that occur at least 2 times, retain top 100 according to PMI
  2. Finding bigrams that occur at least 4 times, which are not subsumed by the retained trigrams, ordering by PMI, retaining top 100
  3. Excluding trigrams that begin or end with stopwords, after removing stopwords from bigrams

In [18]:
with open('Oliver_Copy.txt', 'r') as text_file:
    oliver_corpus = text_file.read()
    
oliver_tokens=nltk.word_tokenize(oliver_corpus)


In [19]:
more_stopwords = ["''", "--","``", "mr.", "mrs.", "n't", "'s", "'i","said"]
my_stopwords = stop_words + more_stopwords

In [20]:

trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = TrigramCollocationFinder.from_words(oliver_tokens)

finder.apply_freq_filter(2)


finder.apply_word_filter(lambda w: w in string.punctuation)
finder.apply_word_filter(lambda w: w.lower() in my_stopwords)


trigram_list = finder.nbest(trigram_measures.pmi, 100)

    
trigram_set = set()
for a,b,c in trigram_list:
    trigram_set.add(a.lower())
    trigram_set.add(b.lower())
    trigram_set.add(c.lower())

    
    
bigram_measures = nltk.collocations.BigramAssocMeasures()

finder = BigramCollocationFinder.from_words(oliver_tokens)

finder.apply_freq_filter(4)
finder.apply_word_filter(lambda w: w.lower() in trigram_set)
finder.apply_word_filter(lambda w: w in string.punctuation)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in my_stopwords)
bigram_list = finder.nbest(bigram_measures.pmi, 100)

    
print('PRINTING GIST OF COLLOCATIONS FOR OLIVER TWIST.... \n ')

for a, b, c in trigram_list:
    print(' ', a, b, c, ',', end="",)
for a, b in bigram_list:
    print(' ', a, b, ',', end="")

PRINTING GIST OF COLLOCATIONS FOR OLIVER TWIST.... 
 
  large brass buttons ,  Little Saffron Hill ,  sun shone brightly ,  regular right-down bad ,  right-down bad 'un ,  d 'ye mean ,  D 'ye hear ,  shouting 'Stop thief ,  chairs closer together ,  flash Toby Crackit ,  soft blue eye ,  Master Charles Bates ,  crept softly upstairs ,  hardened little wretch ,  coach rattled away ,  three pound ten ,  sends back word ,  Come home directly ,  murdered woman lay ,  name engraved upon ,  'A porochial life ,  'You may depend ,  'Never say die ,  girl drew closer ,  night afore last ,  may depend upon ,  added Charley Bates ,  white-headed old gentleman ,  turning quickly round ,  Master Charley Bates ,  'We shall see ,  great deal better ,  Oliver lay awake ,  merry old gentleman ,  absent old gentleman ,  inquired Charley Bates ,  last two days ,  yer know yer ,  went away together ,  Oliver felt glad ,  'You 're right ,  replied Master Bates ,  face turned towards ,  'We must know ,  cri