In [1]:
# import the neccessary packages
from collections import defaultdict, Counter, namedtuple

### Working with nested dictionaries

### Problem ###

You are given two lists of sequences: A = tags and B = words. Return a dictionary keyed to each unique value in the first sequence list that counts the number of occurrences of the corresponding value from the second sequence list.
    
For example: if 1244 sequences in the list B contain the word "time" tagged as a NOUN, then you should return a dictionary such that pair_counts[NOUN][time] == 1244  

Note: B is a list of sequences, each sequence comprises the words of a senetence in the corpus. See example below with X (sentences) and Y the corresponding tags.

In [2]:
# consider two tuples, tags and words - easy to work with sample
seqA=('noun', 'pron', 'noun', 'noun', 'verb', 'prep', 'noun', 'noun', 'verb', 'noun', 'verb', 'pron', 'verb', 'verb')
seqB=('house', 'him', 'garden', 'house', 'is', 'on', 'house', 'house', 'is', 'garden', 'does', 'house', 'house', 'house')

In [3]:
# recall that zip() method in Python3 gives a zip object whose elements are pairs
seqAB=zip(seqA, seqB)

# if we want to translate the zip() object in a list
listAB = list(zip(seqA,seqB))
listAB[2] # a list of pairs (tag, word)

('noun', 'garden')

In [4]:
# accessing the elements in a zip()
#for tags, words in zip(seqA, seqB):
#    print(tags, words)

In [5]:
# we can count the occurences of each entry in the list
Counter(listAB) # type is Collections.Counter

Counter({('noun', 'garden'): 2,
         ('noun', 'house'): 4,
         ('prep', 'on'): 1,
         ('pron', 'him'): 1,
         ('pron', 'house'): 1,
         ('verb', 'does'): 1,
         ('verb', 'house'): 2,
         ('verb', 'is'): 2})

In [6]:
# defininng the counter as dictionary with tuple keys
dcnt = dict(Counter(zip(seqA, seqB)))
dcnt.keys()

dict_keys([('verb', 'is'), ('prep', 'on'), ('verb', 'does'), ('pron', 'him'), ('pron', 'house'), ('verb', 'house'), ('noun', 'garden'), ('noun', 'house')])

In [7]:
dcnt.items()

dict_items([(('verb', 'is'), 2), (('prep', 'on'), 1), (('verb', 'does'), 1), (('pron', 'him'), 1), (('pron', 'house'), 1), (('verb', 'house'), 2), (('noun', 'garden'), 2), (('noun', 'house'), 4)])

In [8]:
# one way to use default dictionary, two nested dictionaries
# the count is done in the dictionary 

def pair_counts(seqA, seqB):
    
    nd = defaultdict(lambda: defaultdict(int))
    
    for tag, word in zip(seqA, seqB):
        nd[tag][word] += 1
        # the above means: nd[tag][word] = nd[tag][word] + 1
        
    return nd

#notice the ogly printout
pair_counts(seqA,seqB)

defaultdict(<function __main__.pair_counts.<locals>.<lambda>()>,
            {'noun': defaultdict(int, {'garden': 2, 'house': 4}),
             'prep': defaultdict(int, {'on': 1}),
             'pron': defaultdict(int, {'him': 1, 'house': 1}),
             'verb': defaultdict(int, {'does': 1, 'house': 2, 'is': 2})})

In [9]:
# use defalut dictionary to handle possible missing keys
# here we combine the default dictionary with the Counter method

def pair_counts(seqA, seqB):
    # instatiation of the nested dictionary: default dictionary
    # with a built in dictionary
    dd = defaultdict(dict) 
    # nested dictionary: key = tag, nested key = word, form the pair in zip
    # value is the Counter output
    for pair,count in dict(Counter((zip(seqA, seqB)))).items():
        dd[pair[0]][pair[1]] = count
    return dd
# the printout is much better
pair_counts(seqA, seqB)

defaultdict(dict,
            {'noun': {'garden': 2, 'house': 4},
             'prep': {'on': 1},
             'pron': {'him': 1, 'house': 1},
             'verb': {'does': 1, 'house': 2, 'is': 2}})

### Problem ###

Use the pair_counts() function and the training dataset to find the most frequent class label for each word in the training data, and populate the mfc_table below. The table keys should be words, and the values should be the appropriate tag string.

Create a lookup table mfc_table where mfc_table[word] contains the tag label most frequently assigned to that word.


In [10]:
word_counts = pair_counts(seqB,seqA)

In [11]:
# a solution I found online
mfc_table = {}

for key, value in word_counts.items():
    pos_count = 0
    pos_name = None
    for pos, count in value.items():
        if count > pos_count:
            pos_count = count
            pos_name = pos
    mfc_table[key] = pos_name
mfc_table

{'does': 'verb',
 'garden': 'noun',
 'him': 'pron',
 'house': 'noun',
 'is': 'verb',
 'on': 'prep'}

In [12]:
# this is my solution 

mfc_table1={}
for key, value in word_counts.items():
    ps=Counter(value).most_common(1)
    mfc_table1[key]=ps[0][0]
mfc_table1

{'does': 'verb',
 'garden': 'noun',
 'him': 'pron',
 'house': 'noun',
 'is': 'verb',
 'on': 'prep'}

In [13]:
#tags = [pair[1] for pair in data.training_set.stream()] # this is seqA
#words = [pair[0] for pair in data.training_set.stream()] # this is seqB
#word_counts = pair_counts(words, tags)

#another version of the solution

def get_most_frequent_tag(dict_tags):
    tag, count = Counter(dict_tags).most_common(2)[0]
    return tag

mfc_table2 = {word: get_most_frequent_tag(word_counts[word]) for word in word_counts.keys()}
mfc_table2

{'does': 'verb',
 'garden': 'noun',
 'him': 'pron',
 'house': 'noun',
 'is': 'verb',
 'on': 'prep'}

In [14]:
# Python3 code to demonstrate 
# finding frequency in list of tuples 
# using map() + count() 
  
# initializing list of tuples 
listAB = list(zip(seqA, seqB))
  
# printing the original list 
print ("The original list is : " + str(listAB)) 
  
# using map() + count() 
# finding frequency in list of tuples  
res = list(map(lambda i : i[0], listAB)).count('noun') 
  
# printing result 
print ("The frequency of element is : " + str(res)) 


The original list is : [('noun', 'house'), ('pron', 'him'), ('noun', 'garden'), ('noun', 'house'), ('verb', 'is'), ('prep', 'on'), ('noun', 'house'), ('noun', 'house'), ('verb', 'is'), ('noun', 'garden'), ('verb', 'does'), ('pron', 'house'), ('verb', 'house'), ('verb', 'house')]
The frequency of element is : 6


### Project Scribbles

In [15]:
# this creates an empty  nested dictionary using defaultdict
# disadvantage: types in default dictionary on every line, messy

dd = defaultdict(lambda: defaultdict(int))

# add the first entry as with tag as key1, word as key2
for tag, word in zip(seqA, seqB): #need to write zip(.,.) not use label
        dd[tag][word] += 1
dd

defaultdict(<function __main__.<lambda>()>,
            {'noun': defaultdict(int, {'garden': 2, 'house': 4}),
             'prep': defaultdict(int, {'on': 1}),
             'pron': defaultdict(int, {'him': 1, 'house': 1}),
             'verb': defaultdict(int, {'does': 1, 'house': 2, 'is': 2})})

In [16]:
# this creates an empty  nested dictionary using defaultdict
# disadvantage: types in default dictionary on every line, messy

de = defaultdict(lambda: defaultdict(int))

# add the first entry as with tag as key1, word as key2
for tag, word in zip(seqB, seqA): #need to write zip(.,.) not use label
        de[tag][word] += 1
de

defaultdict(<function __main__.<lambda>()>,
            {'does': defaultdict(int, {'verb': 1}),
             'garden': defaultdict(int, {'noun': 2}),
             'him': defaultdict(int, {'pron': 1}),
             'house': defaultdict(int, {'noun': 4, 'pron': 1, 'verb': 2}),
             'is': defaultdict(int, {'verb': 2}),
             'on': defaultdict(int, {'prep': 1})})

In [17]:
de.values()

dict_values([defaultdict(<class 'int'>, {'pron': 1, 'verb': 2, 'noun': 4}), defaultdict(<class 'int'>, {'noun': 2}), defaultdict(<class 'int'>, {'verb': 2}), defaultdict(<class 'int'>, {'prep': 1}), defaultdict(<class 'int'>, {'pron': 1}), defaultdict(<class 'int'>, {'verb': 1})])

In [18]:
# enumerate method includes an index 

for i, pair in enumerate(dd):
    print(i, pair)

0 pron
1 verb
2 noun
3 prep


In [19]:
D = {'emp1': {'name': 'Bob', 'job': 'Mgr'},
     'emp2': {'name': 'Kim', 'job': 'Dev'},
     'emp3': {'name': 'Sam', 'job': 'Dev'}}

for tag, word in dd.items():
    print("\nthe tag:", tag)
    for key in word:
        print(key + ':', word[key])


the tag: pron
house: 1
him: 1

the tag: verb
house: 2
is: 2
does: 1

the tag: noun
house: 4
garden: 2

the tag: prep
on: 1


In [20]:
people = {1: {'Name': 'John', 'Age': '27', 'Sex': 'Male'},
          2: {'Name': 'Marie', 'Age': '22', 'Sex': 'Female'}}

for p_id, p_info in people.items():
    print("\nPerson ID:", p_id)
    
for key in p_info:
        print(key + ':', p_info[key])


Person ID: 1

Person ID: 2
Age: 22
Name: Marie
Sex: Female


In [21]:
dc = defaultdict(int)

# add the first entry as with tag as key1, word as key2
for tag in zip(seqA, seqB): #need to write zip(.,.) not use label
        dc[tag] += 1
dc

defaultdict(int,
            {('noun', 'garden'): 2,
             ('noun', 'house'): 4,
             ('prep', 'on'): 1,
             ('pron', 'him'): 1,
             ('pron', 'house'): 1,
             ('verb', 'does'): 1,
             ('verb', 'house'): 2,
             ('verb', 'is'): 2})

In [22]:
special_tag='verb'

dlist=defaultdict(int)

for tag, word in zip(seqA, seqB):
    if tag==special_tag:
        dlist[word] +=1
dlist

defaultdict(int, {'does': 1, 'house': 2, 'is': 2})

In [23]:
# program to count the frequency of elements in a list using a dictionary 
  
def CountFrequency(my_list): 
  
    # Creating an empty dictionary  
    freq = {} 
    for item in my_list: 
        if (item in freq): 
            freq[item] += 1
        else: 
            freq[item] = 1
  
    for key, value in freq.items(): 
        print ((key, value))
        
CountFrequency(seqA)

('pron', 2)
('verb', 5)
('noun', 6)
('prep', 1)


In [24]:
# how to create a dictionary from a zip() object
# since the keys are unique, this doesn't work!!!

dict={k:v for k, v in zip(seqA, seqB)}
dict

{'noun': 'garden', 'prep': 'on', 'pron': 'house', 'verb': 'house'}

In [25]:
 # create an empty dictionary
nd = {}

# use the zip() method to combine the two tuples
for tag, word in zip(seqA, seqB):
    # define the default state
    if tag not in nd.keys(): nd[tag] = {} 
    if word not in nd[tag].keys(): nd[tag][word] = 0
    # loop through the entries of the zip() object, count occurences
    nd[tag][word] +=1

# print the nested dictionary
nd


{'noun': {'garden': 2, 'house': 4},
 'prep': {'on': 1},
 'pron': {'him': 1, 'house': 1},
 'verb': {'does': 1, 'house': 2, 'is': 2}}

In [26]:
# interesting way to build a nested dictionary 

# this is our sample data
data = [("Milter", "Miller", 4), ("Milter", "Miler", 4), ("Milter", "Malter", 2)]

# dictionary we want for the result
dictionary = {}

# loop that makes it work
for realName, falseName, position in data:
    dictionary.setdefault(realName, {})[falseName] = position
    
dictionary

{'Milter': {'Malter': 2, 'Miler': 4, 'Miller': 4}}

In [27]:
# another example of computing frequencies

frq={}
for pair in listAB:
    frq[pair] = listAB.count(pair)
frq

{('noun', 'garden'): 2,
 ('noun', 'house'): 4,
 ('prep', 'on'): 1,
 ('pron', 'him'): 1,
 ('pron', 'house'): 1,
 ('verb', 'does'): 1,
 ('verb', 'house'): 2,
 ('verb', 'is'): 2}

In [28]:
# another way to build a dictionary, where values are lists
city_list = [('TX','Austin'), ('TX','Houston'), ('NY','Albany'), ('NY', 'Syracuse'),
             ('NY', 'Buffalo'), ('NY', 'Rochester'), ('TX', 'Dallas'), ('CA','Sacramento'), 
             ('CA', 'Palo Alto'), ('GA', 'Atlanta')]

cities_by_state = defaultdict(list)
for state, city in city_list:
    cities_by_state[state].append(city)

#for state, cities in cities_by_state.iteritems():print state, ', '.join(cities)
cities_by_state

defaultdict(list,
            {'CA': ['Sacramento', 'Palo Alto'],
             'GA': ['Atlanta'],
             'NY': ['Albany', 'Syracuse', 'Buffalo', 'Rochester'],
             'TX': ['Austin', 'Houston', 'Dallas']})

### Problem ###

Return a dictionary keyed to each unique value in the input sequence list that counts the number of occurrences of the value in the sequences list. The sequences collection should be a 2-dimensional array.

For example, if the tag NOUN appears 275558 times over all the input sequences, then you should return a dictionary such that your_unigram_counts[NOUN] == 275558.

In [29]:
dttrain = [["sunny","hot","high","false","no"],
    ["sunny","hot","high","true","no"],
    ["overcast","hot","high","false","yes"]]


In [30]:
def CountFrequency(my_list): 
  
    # Creating an empty dictionary  
    freq = {} 
    for item in my_list: 
        if (item in freq): 
            freq[item] += 1
        else: 
            freq[item] = 1
    return freq
        
CountFrequency(seqA)

{'noun': 6, 'prep': 1, 'pron': 2, 'verb': 5}

In [31]:
seqB


('house',
 'him',
 'garden',
 'house',
 'is',
 'on',
 'house',
 'house',
 'is',
 'garden',
 'does',
 'house',
 'house',
 'house')

In [32]:
seqA

('noun',
 'pron',
 'noun',
 'noun',
 'verb',
 'prep',
 'noun',
 'noun',
 'verb',
 'noun',
 'verb',
 'pron',
 'verb',
 'verb')

In [33]:
import collections
import re

def count_pairs(s):
    """
    Returns a mapping that links each pair of words
    to its number of occurrences.
    """
    words = re.findall('\w+', s.lower())
    pairs = zip(words, words[1:])
    return collections.Counter(pairs)

def print_freqs(s):
    """
    Prints the number of occurrences of word pairs
    from the most common to the least common.
    """
    cnt = count_pairs(s)
    for pair, count in cnt.most_common():
        print(list(pair, count))

In [34]:
pairs = zip(seqA, seqA[1:])
ab=list(pairs)
Counter(ab)

Counter({('noun', 'noun'): 2,
         ('noun', 'pron'): 1,
         ('noun', 'verb'): 3,
         ('prep', 'noun'): 1,
         ('pron', 'noun'): 1,
         ('pron', 'verb'): 1,
         ('verb', 'noun'): 1,
         ('verb', 'prep'): 1,
         ('verb', 'pron'): 1,
         ('verb', 'verb'): 1})

In [35]:
dt = [["sunny","hot","high","false","no"],
    ["sunny","hot","high","true","no"],
    ["overcast","hot","high","false","yes"]]

In [36]:
for entry in dt:
    print(Counter(entry[0]))
    

Counter({'n': 2, 'y': 1, 'u': 1, 's': 1})
Counter({'n': 2, 'y': 1, 'u': 1, 's': 1})
Counter({'o': 1, 'c': 1, 'e': 1, 'r': 1, 't': 1, 'a': 1, 'v': 1, 's': 1})


In [37]:
dt[1]

['sunny', 'hot', 'high', 'true', 'no']

In [38]:
ls=[]
for entry in dt:
    ls.append(entry[0])
Counter(ls)

Counter({'overcast': 1, 'sunny': 2})

In [39]:
begin_list = [entry[0] for entry in dt]
Counter(begin_list)

Counter({'overcast': 1, 'sunny': 2})

In [40]:
begin_list

['sunny', 'sunny', 'overcast']

In [45]:
# this corresponds to dataset.X

smallX = (('eat', 'breakfast', 'at', 'morning','time'), ('take', 'time', 'with','arrow', 'projects'), 
          ('horse', 'riders', 'like', 'the', 'airport'), ('paper', 'flies', 'on', 'hydrogen', 'gas'),
          ('bees','sting', 'like', 'some', 'flies'), ('beans', 'soil', 'an', 'iron', 'grill'), 
          ('flies', 'smell', 'an', 'arrow', 'drink'), 
          ('people', 'like', 'an', 'army', 'arrow'), ('dinner', 'time', 'flies', 'all', 'day'),
          ('horse', 'flies', 'time', 'morning', 'rays'))# this corresponds to dataset.Y

In [47]:
# this corresponds to dataset.Y

smallY = [('VB', 'NN', 'IN','NN','NN'), ('VB', 'NN', 'IN','NN', 'NN'),
          ('NN', 'NN', 'VB','DT', 'NN'), ('NN', 'VB', 'IN','NN', 'NN'),
          ('NN', 'VB', 'IN','DT', 'NN'), ('NN', 'VB', 'DT','NN', 'NN'),
         ('NN', 'VB', 'DT','NN', 'NN'), ('NN', 'VB', 'DT','NN', 'NN'),
          ('NN', 'NN', 'VB','DT', 'NN'), ('NN', 'NN', 'VB','NN', 'NN')]

### Implement HMM Tagger

In [41]:
import warnings
warnings.filterwarnings('ignore')

In [42]:
# Jupyter "magic methods" -- only need to be run once per kernel restart
%load_ext autoreload
%aimport helpers, tests
%autoreload 1

In [43]:
# import python modules -- this cell needs to be run again if you make changes to any of the files
import matplotlib.pyplot as plt
import numpy as np

from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict
from helpers import show_model, Dataset
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

In [60]:
data_small=Dataset("brown_small.txt", "brown-universal.txt")

print("There are {} sentences in the corpus.".format(len(data_small)))

print(len(data_small))


There are 57340 sentences in the corpus.
57340


In [59]:
# use Dataset.stream() (word, tag) samples for the entire corpus

print("\nStream (word, tag) pairs:\n")
for i, pair in enumerate(data_small):
    print("\t", pair)


Stream (word, tag) pairs:

	 (('eat', 'breakfast', 'at', 'morning', 'time'), ('VB', 'NN', 'IN', 'NN', 'NN'))
	 (('take', 'time', 'with', 'arrow', 'projects'), ('VB', 'NN', 'IN', 'NN', 'NN'))
	 (('horse', 'riders', 'like', 'the', 'airport'), ('NN', 'NN', 'VB', 'DT', 'NN'))
	 (('paper', 'flies', 'on', 'hydrogen', 'gas'), ('NN', 'VB', 'IN', 'NN', 'NN'))
	 (('bees', 'sting', 'like', 'some', 'flies'), ('NN', 'VB', 'IN', 'DT', 'NN'))
	 (('beans', 'soil', 'an', 'iron', 'grill'), ('NN', 'VB', 'DT', 'NN', 'NN'))
	 (('flies', 'smell', 'an', 'arrow', 'drink'), ('NN', 'VB', 'DT', 'NN', 'NN'))


In [61]:
nested = {'noun':{'house': 12, 'car': 2, 'elephant': 4}, 'verb': {'is':10, 'was':2}}

In [62]:
nested.keys()

dict_keys(['verb', 'noun'])

In [63]:
nested.items()

dict_items([('verb', {'was': 2, 'is': 10}), ('noun', {'car': 2, 'house': 12, 'elephant': 4})])

In [64]:
nested.values()

dict_values([{'was': 2, 'is': 10}, {'car': 2, 'house': 12, 'elephant': 4}])

In [72]:
for key, value in nested.items():
    print(value.keys())

dict_keys(['was', 'is'])
dict_keys(['car', 'house', 'elephant'])


In [76]:
for item1, item2 in nested.items():
    print(item2)
    print
    print(item1)

{'was': 2, 'is': 10}
verb
{'car': 2, 'house': 12, 'elephant': 4}
noun


In [77]:
for item in nested:
    print(item)

verb
noun


In [79]:
for item in nested.items():
    print(item)

('verb', {'was': 2, 'is': 10})
('noun', {'car': 2, 'house': 12, 'elephant': 4})


In [81]:
for key, value in nested.items():
    print(list(value.values()))

[2, 10]
[2, 12, 4]


In [85]:
small_stream = list(zip(smallX,smallY))

In [86]:
tags = [pair[1] for pair in small_stream]

In [88]:
smallY

[('VB', 'NN', 'IN', 'NN', 'NN'),
 ('VB', 'NN', 'IN', 'NN', 'NN'),
 ('NN', 'NN', 'VB', 'DT', 'NN'),
 ('NN', 'VB', 'IN', 'NN', 'NN'),
 ('NN', 'VB', 'IN', 'DT', 'NN'),
 ('NN', 'VB', 'DT', 'NN', 'NN'),
 ('NN', 'VB', 'DT', 'NN', 'NN'),
 ('NN', 'VB', 'DT', 'NN', 'NN'),
 ('NN', 'NN', 'VB', 'DT', 'NN'),
 ('NN', 'NN', 'VB', 'NN', 'NN')]

In [91]:
flatY = [item for sublist in smallY for item in sublist]
flatX = [item for sublist in smallX for item in sublist]

In [93]:
Counter(zip(flatX, flatY));