In [1]:
import re, nltk, itertools, sklearn
import numpy as np
from nltk.corpus import treebank
from collections import Counter
from nltk.classify import MaxentClassifier

In [2]:
corpus = list(treebank.tagged_words())

In [3]:
# Discard every token whose tag is '-NONE-', '-RRB-' or '-LRB-'; the remaining
# (word, tag) pairs keep their original order.
corpus = [
    (word, tag)
    for word, tag in corpus
    if tag not in ('-NONE-', '-RRB-', '-LRB-')
]

In [4]:
# Unzip the (word, tag) pairs into two parallel, index-aligned lists.
words = [pair[0] for pair in corpus]
tags = [pair[1] for pair in corpus]

## Frequent vs rare words in the data

In [99]:
# Word types ordered by corpus frequency, then split on a count threshold of 5:
# types seen at least five times are "frequent", all others are "rare".
wordfreq = nltk.FreqDist(words).most_common()
rare = [word for word, count in wordfreq if count < 5]
frequent = [word for word, count in wordfreq if count >= 5]

print('We have %d frequent words and %d rare words in training.' % (len(frequent), len(rare)))

We have 2475 frequent words and 9489 rare words in training.


In [106]:
# `rare` is a list, so `w in rare` would scan it for every token of the corpus.
# A set gives constant-time membership and leaves the selected tokens unchanged.
rare_lookup = set(rare)
rareset = [(w, t) for w, t in zip(words, tags) if w in rare_lookup]

In [108]:
print('The size of the rare corpus is %d' % len(rareset))

The size of the rare corpus is 14769


In [109]:
rareset[0:5]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 ('join', 'VB'),
 ('Vinken', 'NNP'),
 ('Elsevier', 'NNP')]

## Prefixes and suffixes

In [113]:
# Count the lower-cased leading slices (lengths 1 to 4) of every rare token that
# is purely alphabetic and longer than four characters; most frequent first.
prefixes = Counter(
    word[0:size].lower()
    for word, _tag in rareset
    if word.isalpha() and len(word) > 4
    for size in range(1, 5)
).most_common()

In [117]:
# Only prefixes observed at least five times are scored later on.
candidates = [prefix for prefix, count in prefixes if count >= 5]

print('%d candidate prefixes' % len(candidates))

1524 candidate prefixes


In [86]:
def ig(feature_set, ws, labels):
    """Score candidate prefixes by the information gain they give about the labels.

    A word "has" a prefix when the prefix equals one of the word's lower-cased
    leading slices of length 1 to 4. For every candidate the score is

        H(labels) - P(has) * H(labels | has) - P(not has) * H(labels | not has)

    with entropies measured in bits.

    Parameters
    ----------
    feature_set : iterable of str
        Candidate prefixes to score.
    ws : iterable of str
        Words, aligned position by position with ``labels``.
    labels : sequence
        Hashable class labels, one per word.

    Returns
    -------
    list of (prefix, gain) tuples, in the order of ``feature_set``. When
    ``labels`` is empty every gain is reported as 0.0.
    """
    N = float(len(labels))
    if N == 0:
        # Nothing to split: avoid the 0 / 0.0 division further down.
        return [(feature, 0.0) for feature in feature_set]

    def entropy(count_dict, s):
        # Sum of p * log2(p) over the classes of a Counter whose counts total s.
        # NOTE: this is the NEGATIVE of the Shannon entropy; callers rely on the sign.
        ts = [count_dict[label]/s * np.log2(count_dict[label]/s) for label in count_dict.keys()]
        return sum(ts)

    # The prefixes of a word do not depend on the candidate being scored, so they
    # are computed once here instead of twice per (candidate, word) pair.
    word_prefixes = [tuple(w[0:i].lower() for i in range(1, 5)) for w in ws]

    def gain(prefix):
        # Split the labels, in a single pass, by whether the word has the prefix.
        labels_with, labels_without = [], []
        for label, own_prefixes in zip(labels, word_prefixes):
            if prefix in own_prefixes:
                labels_with.append(label)
            else:
                labels_without.append(label)
        # Each (negative) conditional entropy is weighted by the share of words
        # falling on that side of the split.
        entropy_in_set = entropy(Counter(labels_with), len(labels_with)) * len(labels_with) / N
        entropy_not_in_set = entropy(Counter(labels_without), len(labels_without)) * len(labels_without) / N
        # class_entropy is positive, the two terms above are <= 0.
        return class_entropy + entropy_in_set + entropy_not_in_set

    # Entropy of the label distribution before any split.
    class_entropy = -entropy(Counter(labels), N)
    return [(feature, gain(feature)) for feature in feature_set]

In [118]:
ig_scores = ig(candidates, [w for w, t in rareset], [t for w, t in rareset])

In [119]:
ig_scores = sorted(ig_scores, key = lambda x: -x[1])

In [130]:
ig_scores[0:10]

[('re', 0.018572926248108246),
 ('c', 0.01581403394539338),
 ('s', 0.015478164455462462),
 ('p', 0.014231028979970528),
 ('r', 0.014045551638506737),
 ('u', 0.012183629968901322),
 ('b', 0.011791700659400384),
 ('a', 0.011212705927205135),
 ('un', 0.010751169259090254),
 ('m', 0.010535547978256066)]

In [123]:
# Same counting as for prefixes, but on the trailing slices (lengths 1 to 4) of
# alphabetic rare tokens longer than four characters.
suffixes = Counter(
    word[-size:].lower()
    for word, _tag in rareset
    if word.isalpha() and len(word) > 4
    for size in range(1, 5)
).most_common()
# Keep suffixes seen at least five times (this rebinds `candidates`).
candidates = [suffix for suffix, count in suffixes if count >= 5]

print('%d candidate suffixes.' % len(candidates))

1132 candidate suffixes.


In [125]:
def ig(feature_set, ws, labels):
    """Score candidate suffixes by the information gain they give about the labels.

    A word "has" a suffix when the suffix equals one of the word's lower-cased
    trailing slices of length 1 to 4. For every candidate the score is

        H(labels) - P(has) * H(labels | has) - P(not has) * H(labels | not has)

    with entropies measured in bits.

    Parameters
    ----------
    feature_set : iterable of str
        Candidate suffixes to score.
    ws : iterable of str
        Words, aligned position by position with ``labels``.
    labels : sequence
        Hashable class labels, one per word.

    Returns
    -------
    list of (suffix, gain) tuples, in the order of ``feature_set``. When
    ``labels`` is empty every gain is reported as 0.0.
    """
    N = float(len(labels))
    if N == 0:
        # Nothing to split: avoid the 0 / 0.0 division further down.
        return [(feature, 0.0) for feature in feature_set]

    def entropy(count_dict, s):
        # Sum of p * log2(p) over the classes of a Counter whose counts total s.
        # NOTE: this is the NEGATIVE of the Shannon entropy; callers rely on the sign.
        ts = [count_dict[label]/s * np.log2(count_dict[label]/s) for label in count_dict.keys()]
        return sum(ts)

    # The suffixes of a word do not depend on the candidate being scored, so they
    # are computed once here instead of twice per (candidate, word) pair.
    word_suffixes = [tuple(w[-i:].lower() for i in range(1, 5)) for w in ws]

    def gain(suffix):
        # Split the labels, in a single pass, by whether the word has the suffix.
        labels_with, labels_without = [], []
        for label, own_suffixes in zip(labels, word_suffixes):
            if suffix in own_suffixes:
                labels_with.append(label)
            else:
                labels_without.append(label)
        # Each (negative) conditional entropy is weighted by the share of words
        # falling on that side of the split.
        entropy_in_set = entropy(Counter(labels_with), len(labels_with)) * len(labels_with) / N
        entropy_not_in_set = entropy(Counter(labels_without), len(labels_without)) * len(labels_without) / N
        # class_entropy is positive, the two terms above are <= 0.
        return class_entropy + entropy_in_set + entropy_not_in_set

    # Entropy of the label distribution before any split.
    class_entropy = -entropy(Counter(labels), N)
    return [(feature, gain(feature)) for feature in feature_set]

In [126]:
ig_scores2 = ig(candidates, [w for w, t in rareset], [t for w, t in rareset])

In [127]:
ig_scores2 = sorted(ig_scores2, key = lambda x: -x[1])

In [129]:
ig_scores2[0:100]

[('s', 0.48460878400949658),
 ('ed', 0.30264209203916659),
 ('d', 0.2645901646878821),
 ('ing', 0.22843062383353363),
 ('ng', 0.22376797866087195),
 ('g', 0.22012741361743782),
 ('ly', 0.11826558116849384),
 ('es', 0.11013502876243297),
 ('y', 0.10328141511898803),
 ('e', 0.091312708165125933),
 ('rs', 0.069583697078176687),
 ('ted', 0.065480553533864949),
 ('ts', 0.064016197702356337),
 ('r', 0.052517821627971362),
 ('n', 0.051689131243768838),
 ('ers', 0.05141280360775502),
 ('t', 0.048043972535427137),
 ('ting', 0.047435192132976756),
 ('on', 0.044013522820365303),
 ('ion', 0.043646994609473744),
 ('er', 0.042113134829287802),
 ('ns', 0.041119722345153598),
 ('tion', 0.037397858544500284),
 ('l', 0.03448849853192959),
 ('lly', 0.03396912339646585),
 ('ally', 0.03165722717036612),
 ('ons', 0.029700602830903033),
 ('ions', 0.028275033009527828),
 ('al', 0.027520059472585956),
 ('a', 0.024044403755754562),
 ('red', 0.02393994040807268),
 ('ies', 0.023512349723991299),
 ('ated', 0.02296

In [167]:
# Affixes whose information gain exceeds 0.01 are the ones used as features.
legal_prefixes = [prefix for prefix, gain in ig_scores if gain > 0.01]
legal_suffixes = [suffix for suffix, gain in ig_scores2 if gain > 0.01]

In [168]:
len(legal_prefixes), len(legal_suffixes)

(10, 79)

## Word shape

In [7]:
upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
lower = 'abcdefghijklmnopqrstuvwxyz'
# Character -> shape symbol: upper-case letters become 'X', lower-case letters
# 'x' and digits 'd'. Characters absent from the table have no entry.
mapping = {
    char: symbol
    for chars, symbol in ((upper, 'X'), (lower, 'x'), ('0123456789', 'd'))
    for char in chars
}

In [133]:
def wordshape(w, mapping = mapping):
    """Return the shape of ``w``: each character replaced by its ``mapping`` symbol.

    Characters that are not keys of ``mapping`` are copied through unchanged, so
    the result always has the same length as ``w``.
    """
    return ''.join(mapping[char] if char in mapping else char for char in w)

In [134]:
def short_wordshape(w, mapping=mapping):
    """Return the shape of ``w`` with every run of identical symbols collapsed to one.

    The full shape comes from ``wordshape(w, mapping)``; consecutive repeats of
    the same symbol are then reduced to a single occurrence. An empty word
    yields an empty string (indexing the first symbol used to raise IndexError).
    """
    full_shape = wordshape(w, mapping)
    # groupby yields one key per run of equal consecutive symbols.
    return ''.join(symbol for symbol, _run in itertools.groupby(full_shape))

In [140]:
def ig(feature_set, ws, labels):
    """Score candidate word shapes by the information gain they give about the labels.

    A word matches a candidate when ``wordshape(word)`` equals it. For every
    candidate the score is

        H(labels) - P(match) * H(labels | match) - P(no match) * H(labels | no match)

    with entropies measured in bits.

    Parameters
    ----------
    feature_set : iterable of str
        Candidate shapes to score.
    ws : iterable of str
        Words, aligned position by position with ``labels``.
    labels : sequence
        Hashable class labels, one per word.

    Returns
    -------
    list of (shape, gain) tuples, in the order of ``feature_set``. When
    ``labels`` is empty every gain is reported as 0.0.
    """
    N = float(len(labels))
    if N == 0:
        # Nothing to split: avoid the 0 / 0.0 division further down.
        return [(feature, 0.0) for feature in feature_set]

    def entropy(count_dict, s):
        # Sum of p * log2(p) over the classes of a Counter whose counts total s.
        # NOTE: this is the NEGATIVE of the Shannon entropy; callers rely on the sign.
        ts = [count_dict[label]/s * np.log2(count_dict[label]/s) for label in count_dict.keys()]
        return sum(ts)

    # A word's shape does not depend on the candidate being scored, so it is
    # computed once here instead of twice per (candidate, word) pair.
    word_shapes = [wordshape(w) for w in ws]

    def gain(shape):
        # Split the labels, in a single pass, by whether the word has this shape.
        labels_with, labels_without = [], []
        for label, own_shape in zip(labels, word_shapes):
            if shape == own_shape:
                labels_with.append(label)
            else:
                labels_without.append(label)
        # Each (negative) conditional entropy is weighted by the share of words
        # falling on that side of the split.
        entropy_in_set = entropy(Counter(labels_with), len(labels_with)) * len(labels_with) / N
        entropy_not_in_set = entropy(Counter(labels_without), len(labels_without)) * len(labels_without) / N
        # class_entropy is positive, the two terms above are <= 0.
        return class_entropy + entropy_in_set + entropy_not_in_set

    # Entropy of the label distribution before any split.
    class_entropy = -entropy(Counter(labels), N)
    return [(feature, gain(feature)) for feature in feature_set]

In [141]:
# Full word shapes of the rare tokens, most frequent first; shapes seen at
# least five times become the candidates (this rebinds `candidates`).
shapes = Counter(wordshape(word) for word, _tag in rareset).most_common()
candidates = [shape for shape, occurrences in shapes if occurrences >= 5]

In [142]:
ig_score_shape = ig(candidates, [w for w, t in rareset], [t for w, t in rareset])

In [143]:
ig_score_shape = sorted(ig_score_shape, key=lambda x: -x[1])

In [144]:
ig_score_shape[0:10]

[('Xxxxxx', 0.065511138546845871),
 ('xxxxxxx', 0.062358958396059094),
 ('Xxxxx', 0.060549769672417764),
 ('Xxxxxxx', 0.059726230421480953),
 ('xxxxxx', 0.059357639303121434),
 ('xxxxxxxx', 0.056474484098516076),
 ('xxxxxxxxx', 0.050422977086370668),
 ('d.dd', 0.046854614223687641),
 ('xxxxx', 0.044827886833670672),
 ('ddd', 0.041882102348762551)]

In [145]:
def ig(feature_set, ws, labels):
    """Score candidate short word shapes by the information gain they give about the labels.

    A word matches a candidate when ``short_wordshape(word)`` equals it. For
    every candidate the score is

        H(labels) - P(match) * H(labels | match) - P(no match) * H(labels | no match)

    with entropies measured in bits.

    Parameters
    ----------
    feature_set : iterable of str
        Candidate short shapes to score.
    ws : iterable of str
        Words, aligned position by position with ``labels``.
    labels : sequence
        Hashable class labels, one per word.

    Returns
    -------
    list of (shape, gain) tuples, in the order of ``feature_set``. When
    ``labels`` is empty every gain is reported as 0.0.
    """
    N = float(len(labels))
    if N == 0:
        # Nothing to split: avoid the 0 / 0.0 division further down.
        return [(feature, 0.0) for feature in feature_set]

    def entropy(count_dict, s):
        # Sum of p * log2(p) over the classes of a Counter whose counts total s.
        # NOTE: this is the NEGATIVE of the Shannon entropy; callers rely on the sign.
        ts = [count_dict[label]/s * np.log2(count_dict[label]/s) for label in count_dict.keys()]
        return sum(ts)

    # A word's short shape does not depend on the candidate being scored, so it
    # is computed once here instead of twice per (candidate, word) pair.
    word_shapes = [short_wordshape(w) for w in ws]

    def gain(shape):
        # Split the labels, in a single pass, by whether the word has this shape.
        labels_with, labels_without = [], []
        for label, own_shape in zip(labels, word_shapes):
            if shape == own_shape:
                labels_with.append(label)
            else:
                labels_without.append(label)
        # Each (negative) conditional entropy is weighted by the share of words
        # falling on that side of the split.
        entropy_in_set = entropy(Counter(labels_with), len(labels_with)) * len(labels_with) / N
        entropy_not_in_set = entropy(Counter(labels_without), len(labels_without)) * len(labels_without) / N
        # class_entropy is positive, the two terms above are <= 0.
        return class_entropy + entropy_in_set + entropy_not_in_set

    # Entropy of the label distribution before any split.
    class_entropy = -entropy(Counter(labels), N)
    return [(feature, gain(feature)) for feature in feature_set]

In [146]:
# Collapsed word shapes of the rare tokens, most frequent first; shapes seen at
# least five times become the candidates (this rebinds `shapes` and `candidates`).
shapes = Counter(short_wordshape(word) for word, _tag in rareset).most_common()
candidates = [shape for shape, occurrences in shapes if occurrences >= 5]

In [149]:
ig_score_shape2 = ig(candidates, [w for w, t in rareset], [t for w, t in rareset])

In [150]:
ig_score_shape2 = sorted(ig_score_shape2, key=lambda x: -x[1] ) 

In [151]:
ig_score_shape2[0:10]

[('x', 0.59397088282276156),
 ('Xx', 0.44331098828203874),
 ('d.d', 0.17628408818709751),
 ('d', 0.08465020470904161),
 ('x-x', 0.075550385688024413),
 ('d,d', 0.03074741695201455),
 ('X', 0.016850989288419438),
 ('d-x', 0.0083882236320960146),
 ('Xx-x', 0.0066339574579599159),
 ('X.X.', 0.0060607364375004913)]

In [153]:
# Shapes whose information gain exceeds 0.01 are the ones used as features.
legal_shapes = [shape for shape, gain in ig_score_shape if gain > 0.01]
legal_short_shapes = [shape for shape, gain in ig_score_shape2 if gain > 0.01]

len(legal_shapes), len(legal_short_shapes)

(29, 7)

In [162]:
def features(w):
    """Build the feature dict describing a single word.

    A word found in ``frequent`` is represented by its identity alone
    (``{'word': w}``). Any other word is described by:

    * one ``'pref has <p>'`` / ``'suff has <s>'`` flag for each of its
      lower-cased leading / trailing slices (lengths 1 to 4) that appears in
      ``legal_prefixes`` / ``legal_suffixes``;
    * its full and collapsed word shapes;
    * 0/1 indicators: contains an upper-case ASCII letter, a hyphen, a digit,
      and whether ``w.isupper()`` holds.
    """
    if w in frequent:
        return {'word': w}

    feats = {}
    for size in range(1, 5):
        leading = w[0:size].lower()
        if leading in legal_prefixes:
            feats['pref has ' + leading] = 1
    for size in range(1, 5):
        trailing = w[-size:].lower()
        if trailing in legal_suffixes:
            feats['suff has ' + trailing] = 1
    feats['shape'] = wordshape(w)
    feats['short_shape'] = short_wordshape(w)
    feats['has_upper'] = int(bool(re.search('[A-Z]', w)))
    feats['has_hyphen'] = int(bool(re.search('[-]', w)))
    feats['has_digit'] = int(bool(re.search('[0-9]', w)))
    feats['is_upper'] = int(w.isupper())
    return feats

In [175]:
# Build one (features, tag) training pair per token. Each token's features are
# extended with the tag of the preceding token; that context is reset to
# 'START' after every token tagged '.'.
featuresset = []
prev = 'START'
for i in range(len(corpus)):
    w, y = words[i], tags[i]
    feats = dict(features(w), prevTAG=prev)
    featuresset.append((feats, y))
    prev = 'START' if y == '.' else y

In [181]:
# Train on the first 10000 (features, tag) pairs only, with max_iter capped at 20.
model = MaxentClassifier.train(featuresset[0:10000], max_iter=20)

  ==> Training (20 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -3.63759        0.005
             2          -1.46524        0.895
             3          -0.92967        0.940
             4          -0.68835        0.949
             5          -0.54962        0.953
             6          -0.46007        0.956
             7          -0.39770        0.958
             8          -0.35184        0.959
             9          -0.31672        0.961
            10          -0.28897        0.962
            11          -0.26649        0.963
            12          -0.24791        0.963
            13          -0.23229        0.963
            14          -0.21897        0.964
            15          -0.20748        0.965
            16          -0.19745        0.965
            17          -0.18863        0.967
            18          -0.18080        0.967
            19          -0.17381        0.967
  

In [216]:
model.show_most_informative_features(100)

  12.053 word=='whose' and label is 'WP$'
  11.480 word=='Its' and label is 'PRP$'
  11.248 word=='publishing' and label is 'VBG'
  11.185 word=='when' and label is 'WRB'
  11.065 word=='enough' and label is 'RB'
  10.981 word=='where' and label is 'WRB'
  10.686 word=='operate' and label is 'VBP'
  10.672 word=='earlier' and label is 'JJR'
  10.640 word=='More' and label is 'RBR'
  10.524 word=='35' and label is 'CD'
  10.524 word=='11' and label is 'CD'
  10.439 word=='causing' and label is 'VBG'
  10.379 word=='computer' and label is 'NN'
  10.376 word=='what' and label is 'WP'
  10.348 word=='who' and label is 'WP'
  10.321 word=='preferred' and label is 'VBN'
  10.265 word=='offering' and label is 'VBG'
  10.195 word=='reporting' and label is 'VBG'
  10.195 word=='stronger' and label is 'JJR'
  10.102 word=='more' and label is 'RBR'
  10.044 word=='least' and label is 'JJS'
  10.031 word=='rather' and label is 'RB'
  10.012 word=='likely' and label is 'JJ'
   9.987 word=='why' and

## Tagger

In [None]:
# Cut the corpus into sequences that each end on a token tagged '.'.
# Tokens following the last '.' are not placed in any sequence.
sequences = []
start = 0
for i, token in enumerate(corpus):
    if token[1] != '.':
        continue
    sequences.append(corpus[start:i+1])
    start = i + 1

In [None]:
# Parallel views of the sequences: words per sequence (X) and tags per sequence (y).
n = len(sequences)
X = [[pair[0] for pair in sequence] for sequence in sequences]
y = [[pair[1] for pair in sequence] for sequence in sequences]
print(n)

In [195]:
MaxentClassifier.classify?

In [208]:
# NOTE(review): the saved cell had no first argument (a syntax error as written).
# The feature dict of the token probed by the following prob_classify cell is
# used as a stand-in -- confirm this is the featureset that was originally encoded.
model._encoding.encode(features(words[1118]), 'VB')


[(22, 1), (24, 1), (1457, 1), (26, 1), (1621, 1)]

In [210]:
d = model.prob_classify(features(words[1118]))

In [215]:
model.labels()

['IN',
 'NNPS',
 'RB',
 'NNS',
 'DT',
 'NN',
 'VBP',
 'VB',
 'NNP',
 'MD',
 ',',
 'RBR',
 '.',
 '$',
 'VBZ',
 'JJR',
 'VBN',
 '``',
 'POS',
 'JJS',
 'WP',
 'CD',
 'WRB',
 'RBS',
 'WDT',
 'WP$',
 'JJ',
 'PRP',
 ':',
 'VBD',
 'VBG',
 'CC',
 'PRP$',
 'EX',
 'PDT',
 'TO',
 'RP',
 "''"]

In [None]:
def tagger(sequence, states, transition, emission):
    """Decode the highest-scoring state sequence for ``sequence`` (Viterbi).

    Parameters
    ----------
    sequence : sequence
        Observations (words) to tag.
    states : sequence
        Candidate states (tags); indexable by position.
    transition : callable
        Called as ``transition((state, previous_state))``. The first step uses
        ``'START'`` as previous state and the final step asks for a transition
        into ``'.'``.
    emission : callable
        Called as ``emission((observation, state))``.

    Both callables are presumably expected to return probabilities; scores are
    combined by plain multiplication, so long sequences may underflow.

    Returns
    -------
    list
        One element of ``states`` per observation; ``[]`` for an empty sequence.
    """
    T = len(sequence)
    N = len(states)
    if T == 0:
        # No observation to tag (indexing sequence[0] would otherwise fail).
        return []

    treillis = np.zeros((N, T))
    # backpointer[s, t] = index of the best predecessor of state s at step t.
    backpointer = np.zeros((N, T), dtype=int)

    # Initialization: every path starts from the artificial 'START' state.
    for s in range(N):
        treillis[s, 0] = transition((states[s], 'START')) * emission((sequence[0], states[s]))

    # Recursion: extend the best path into each state, one observation at a time.
    for t in range(1, T):
        for s in range(N):
            em = emission((sequence[t], states[s]))
            inputs = [transition((states[s], states[k])) * em * treillis[k, t-1] for k in range(N)]
            best_k = int(np.argmax(inputs))
            treillis[s, t] = inputs[best_k]
            backpointer[s, t] = best_k

    # Termination: weight each final score by the transition into '.'.
    final_scores = [treillis[k, -1] * transition(('.', states[k])) for k in range(N)]
    state = int(np.argmax(final_scores))

    # Walk the backpointers from the last step to the first, then reverse.
    best_sequence = [states[state]]
    for t in range(T - 1, 0, -1):
        state = int(backpointer[state, t])
        best_sequence.append(states[state])
    return best_sequence[::-1]