## Part 1

In [1]:
partial='./data/partial'
partial_train_path = './data/partial/train'
partial_devin_path = './data/partial/dev.in'
partial_devout_path = './data/partial/dev.out'

In [2]:
from pathlib import Path
partial = Path('./data/partial')
full = Path('./data/full')

In [3]:
def labelled(path):
    with open(path) as f:  
        X, Y, x, y = list(), list(), list(), list()
        for line in f:
            if line == '\n':
                X.append(x)
                Y.append(y)
                x, y = list(), list()
            else:
                word, tag = line.strip().split()
                x.append(word)
                y.append(tag)
    return X, Y

def unlabelled(path):
    with open(path) as f:  
        X, x = list(), list()
        for line in f:
            if line == '\n':
                X.append(x)
                x = list()
            else:
                word = line.strip()
                x.append(word)
    return X

def read_data(root):
    train, devin, devout = root/'train', root/'dev.in', root/'dev.out'     
    return labelled(train), unlabelled(devin), labelled(devout)

In [4]:
train_ds, devin_ds, devout_ds = read_data(partial)

In [5]:
def get_tag2index(Y):
    tags = list(set([word for sentence in Y for word in sentence]))
    return {tag: i for i, tag in enumerate(tags)}

In [6]:
tag2index = get_tag2index(train_ds[1])

In [7]:
import numpy as np

def emission_weight(train_ds):
    def str_(x, y):
        return 'emission:' + str(y) + '+' + str(x)
    
    vocabulary = list(set([word for sentence in train_ds[0] for word in sentence]))
    tags = list(set([word for sentence in train_ds[1] for word in sentence]))
    word2index = {word: i for i, word in enumerate(vocabulary)}
    tag2index = {tag: i for i, tag in enumerate(tags)}
    count_table = np.zeros((len(tags), len(vocabulary)))
    for X, Y in zip(train_ds[0], train_ds[1]):
        for word, tag in zip(X, Y):
            count_table[tag2index[tag], word2index[word]] += 1
            
    count_table/=count_table.sum(1)[:, None]
    
    dict_output = dict()
    for i in range(len(tags)):
        for j in range(len(vocabulary)):
            if count_table[i, j] != 0:
                dict_output[str_(vocabulary[j], tags[i])] = np.log(count_table[i, j])
    
    return dict_output             

In [8]:
emission_dict = emission_weight(train_ds)

In [9]:
from collections import defaultdict

def emission_weight_alternative(train_ds):
    def str_(x, y):
        return 'emission:' + str(y) + '+' + str(x)
    
    dict_count = defaultdict(lambda: defaultdict(int))
    for X, Y in zip(train_ds[0], train_ds[1]):
        for word, tag in zip(X, Y):
            dict_count[tag][word] += 1
    
    dict_output = dict()
    for tag, words in dict_count.items():
        tag_count = sum(words.values())
        for word in words:
            dict_output[str_(word, tag)] = np.log(words[word]/tag_count)
    
    return dict_output

In [10]:
emission_dict_alternative = emission_weight_alternative(train_ds)

### Testing for emission

In [11]:
emission_dict['emission:I-tim+decades'], emission_dict_alternative['emission:I-tim+decades']

(-3.9318256327243257, -3.9318256327243257)

In [12]:
emission_dict['emission:O+The'], emission_dict_alternative['emission:O+The']

(-4.333515843240958, -4.333515843240958)

In [13]:
assert len(emission_dict.keys()) == len(emission_dict_alternative.keys())

In [14]:
emission_dict['emission:B-per+John'], emission_dict_alternative['emission:B-per+John']

(-3.817712325956905, -3.817712325956905)

In [15]:
def emission_weight_smooth(train_ds, k):
    def str_(x, y):
        return 'emission:' + str(y) + '+' + str(x)
    
    vocabulary = list(set([word for sentence in train_ds[0] for word in sentence]))
    tags = list(set([word for sentence in train_ds[1] for word in sentence]))
    word2index = {word: i for i, word in enumerate(vocabulary)}
    tag2index = {tag: i for i, tag in enumerate(tags)}
    count_table = np.zeros((len(tags), len(vocabulary)))
    for X, Y in zip(train_ds[0], train_ds[1]):
        for word, tag in zip(X, Y):
            count_table[tag2index[tag], word2index[word]] += 1
    
    removed_index = np.sum(count_table, 0) < k
    
    print('Number of removed words:', np.sum(removed_index))
    print('Total number of words:', len(vocabulary))
    
    if (np.sum(removed_index) > 0):
        
        count_table = np.append(count_table, np.sum(count_table[:, removed_index], 1)[:,None], 1)
        count_table = np.delete(count_table, np.nonzero(removed_index), 1)
        
        new_vocab = [vocabulary[j] for j in range(len(vocabulary)) if not removed_index[j]]+['#UNK#']
        word2index = {w:i for i,w in enumerate(new_vocab)}
    
    count_table/=count_table.sum(1)[:, None]
    
    dict_output = dict()
    for i in range(len(tags)):
        for j in range(len(new_vocab)):
            if count_table[i, j] != 0:
                dict_output[str_(new_vocab[j], tags[i])] = np.log(count_table[i, j])
            else:
                dict_output[str_(new_vocab[j], tags[i])] = -np.inf
    
    return dict_output, word2index 

In [16]:
emission_smth_dict, word2index = emission_weight_smooth(train_ds, 2)

Number of removed words: 2528
Total number of words: 4068


In [17]:
emission_smth_dict['emission:B-per+#UNK#'], emission_smth_dict['emission:I-per+#UNK#']

(-1.0043016091968684, -0.5253267144567906)

In [18]:
def emission_weight_all(train_ds):
    def str_(x, y):
        return 'emission:' + str(y) + '+' + str(x)
    
    vocabulary = list(set([word for sentence in train_ds[0] for word in sentence]))+['#UNK#']
    tags = list(set([word for sentence in train_ds[1] for word in sentence]))
    word2index = {word: i for i, word in enumerate(vocabulary)}
    tag2index = {tag: i for i, tag in enumerate(tags)}
    count_table = np.zeros((len(tags), len(vocabulary)))
    for X, Y in zip(train_ds[0], train_ds[1]):
        for word, tag in zip(X, Y):
            count_table[tag2index[tag], word2index[word]] += 1
    
    count_table[:, -1] = 1
            
    count_table/=count_table.sum(1)[:, None]
    
    dict_output = dict()
    for i in range(len(tags)):
        for j in range(len(vocabulary)):
            if count_table[i, j] != 0:
                dict_output[str_(vocabulary[j], tags[i])] = np.log(count_table[i, j])
            else:
                dict_output[str_(vocabulary[j], tags[i])] = -np.inf
    
    return dict_output             

In [19]:
emission_dict_all = emission_weight_all(train_ds)

In [20]:
emission_dict_all['emission:I-per+#UNK#'], emission_dict_all['emission:B-geo+#UNK#']

(-5.71042701737487, -6.278521424165844)

In [21]:
import itertools
import copy

def transition_weight(train_ds):
    def str_(y1, y2):
        return 'transition:' + str(y1) + '+' + str(y2)
    
    def pairwise(iterable):
        a, b = itertools.tee(iterable)
        next(b, None)
        return zip(a, b)
    
    dict_count = defaultdict(lambda: defaultdict(int))
    for Y in train_ds[1]:
        Y = copy.deepcopy(Y)
        Y.insert(0, 'START')
        Y.append('STOP')
        for tag1, tag2 in pairwise(Y):
            dict_count[tag1][tag2] += 1
    
    dict_output = dict()
    for tag1, tag2s in dict_count.items():
        tag1_count = sum(tag2s.values())
        for tag2 in tag2s:
            dict_output[str_(tag1, tag2)] = np.log(tag2s[tag2]/tag1_count)
    
    return dict_output

In [22]:
transition_dict = transition_weight(train_ds)

In [23]:
def transition_weight_alternative(train_ds):
    def str_(y1, y2):
        return 'transition:' + str(y1) + '+' + str(y2)
    
    def pairwise(iterable):
        a, b = itertools.tee(iterable)
        next(b, None)
        return zip(a, b)
    
    tags = list(set([word for sentence in train_ds[1] for word in sentence]).union({'START'})) + ['STOP']
    tag2index = {tag: i for i, tag in enumerate(tags)}
    count_table = np.zeros((len(tags)-1, len(tags)))
    for Y in train_ds[1]:
        Y = copy.deepcopy(Y)
        Y.insert(0, 'START')
        Y.append('STOP')
        for tag1, tag2 in pairwise(Y):
            count_table[tag2index[tag1], tag2index[tag2]] += 1
            
    count_table/=count_table.sum(1)[:, None]
    dict_output = dict()
    for i in range(len(tags)-1):
        for j in range(len(tags)):
            if count_table[i, j] != 0:
                dict_output[str_(tags[i], tags[j])] = np.log(count_table[i, j])
    
    return dict_output               

In [24]:
transition_dict_alternative = transition_weight_alternative(train_ds)

In [25]:
assert(len(transition_dict.keys()) == len(transition_dict_alternative.keys()))

In [26]:
transition_dict['transition:B-geo+I-geo'], transition_dict_alternative['transition:B-geo+I-geo']

(-1.5945122622174248, -1.5945122622174248)

In [27]:
transition_dict['transition:START+B-geo'], transition_dict_alternative['transition:START+B-geo']

(-2.744417845273085, -2.744417845273085)

In [28]:
def transition_weight_all(train_ds):
    def str_(y1, y2):
        return 'transition:' + str(y1) + '+' + str(y2)
    
    def pairwise(iterable):
        a, b = itertools.tee(iterable)
        next(b, None)
        return zip(a, b)
    
    tags = list(set([word for sentence in train_ds[1] for word in sentence]).union({'START'})) + ['STOP']
    tag2index = {tag: i for i, tag in enumerate(tags)}
    count_table = np.zeros((len(tags)-1, len(tags)))
    for Y in train_ds[1]:
        Y = copy.deepcopy(Y)
        Y.insert(0, 'START')
        Y.append('STOP')
        for tag1, tag2 in pairwise(Y):
            count_table[tag2index[tag1], tag2index[tag2]] += 1
            
    count_table/=count_table.sum(1)[:, None]
    dict_output = dict()
    for i in range(len(tags)-1):
        for j in range(len(tags)):
            if count_table[i, j] != 0:
                dict_output[str_(tags[i], tags[j])] = np.log(count_table[i, j])
            else:
                dict_output[str_(tags[i], tags[j])] = -np.inf
    
    return dict_output  

In [29]:
transition_dict_all = transition_weight_all(train_ds)

In [30]:
transition_dict_all['transition:START+B-geo']

-2.744417845273085

## Part 2

In [188]:
def eval_sequence(X, Y, emission_dict, transition_dict):
    # First evaluate emission features
    def str_emission(x, y):
        return 'emission:' + str(y) + '+' + str(x)
    def str_transition(y1, y2):
        return 'transition:' + str(y1) + '+' + str(y2)
    def pairwise(iterable):
        a, b = itertools.tee(iterable)
        next(b, None)
        return zip(a, b)
    score = 0
    Y = copy.deepcopy(Y)
    Y.insert(0, 'START')
    Y.append('STOP')
    for word, tag in zip(X, Y):
        if str_emission(word, tag) in emission_dict:
            score += emission_dict[str_emission(word, tag)]
    for tag1, tag2 in pairwise(Y):
        if str_transition(tag1, tag2) in transition_dict:
            score += transtion_dict[str_transition(tag1, tag2)]
    return score

In [32]:
def str_emission(x, y):
    return 'emission:' + str(y) + '+' + str(x)
def str_transition(y1, y2):
    return 'transition:' + str(y1) + '+' + str(y2)

In [33]:
def viterbi(X, tag2index, emission_dict, transition_dict):
    
    index2tag = {value: key for key, value in tag2index.items()}
    score_matrix = np.zeros((len(tag2index), len(X)))
    path_matrix = np.zeros((len(tag2index), len(X)), dtype='int')
    
    for i in range(len(X)):
        if i == 0:
            for tag2 in tag2index:
                emission_key = str_emission(X[i], tag2)
                default_key = str_emission('#UNK#', tag2)
                emission_score = emission_dict[emission_key] if emission_key in emission_dict else emission_dict[default_key]
                transition_key = str_transition('START', tag2)
                transition_score = transition_dict[transition_key]
                score_matrix[tag2index[tag2], i] = transition_score + emission_score
        else:
            for tag2 in tag2index:
                competitors = np.zeros((len(tag2index)))
                for tag1 in tag2index:
                    emission_key = str_emission(X[i], tag2)
                    default_key = str_emission('#UNK#', tag2)
                    emission_score = emission_dict[emission_key] if emission_key in emission_dict else emission_dict[default_key]
                    transition_key = str_transition(tag1, tag2)
                    transition_score = transition_dict[transition_key]
                    competitors[tag2index[tag1]] = score_matrix[tag2index[tag1], i-1] + transition_score + emission_score
                score_matrix[tag2index[tag2], i] = np.max(competitors)
                path_matrix[tag2index[tag2], i] = np.argmax(competitors)
    competitors = np.zeros((len(tag2index)))
    for tag1 in tag2index:
        transition_key = str_transition(tag1, 'STOP')
        transition_score = transition_dict[transition_key]
        competitors[tag2index[tag1]] = score_matrix[tag2index[tag1], -1] + transition_score
    
#     for i in range(len(X)):
#         print('**************************')
#         print(path_matrix[:, i])
#         print(score_matrix[:, i])
    
    last_idx = np.argmax(competitors)
    path = [last_idx]
    for m in range(len(X)-1, 0, -1):
        path.insert(0, path_matrix[path[0], m])
    output_tags = [index2tag[idx] for idx in path]
    return output_tags

### Test for one case

In [34]:
viterbi(devout_ds[0][0], tag2index, emission_smth_dict, transition_dict_all), devout_ds[1][0]

(['O',
  'O',
  'B-org',
  'I-org',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-org',
  'I-org',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-org',
  'I-org',
  'I-org',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-org',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'])

In [35]:
def viterbi_output(dev_out_path, X, tag2index, emission_dict, transition_dict):
    
    tags = [viterbi(sentence, tag2index, emission_dict, transition_dict) for sentence in X]
    
    output_string = ''
    for i in range(len(X)):
        for j in range(len(X[i])):
            output_string += X[i][j] + ' ' + tags[i][j] + '\n'
        output_string += '\n'
    
    with open(dev_out_path, 'w') as f:
        f.write(output_string)
    
    print('Done with writing predictions')
    return None

In [36]:
dev_out_path = partial/'dev.p2.out'
viterbi_output(dev_out_path, devin_ds[0], tag2index, emission_smth_dict, transition_dict_all)

Done with writing predictions


In [37]:
from conlleval_ import evaluate

def sequence_evaluation(X, Y, tag2index, emission_dict, transition_dict):
    tags_ = [viterbi(sentence, tag2index, emission_dict, transition_dict) for sentence in X]
    tags = [tag for tags in tags_ for tag in tags]
    Y  = [tag for tags in Y for tag in tags]
    assert len(Y) == len(tags)
    return evaluate(Y, tags)

In [38]:
sequence_evaluation(devin_ds, devout_ds[1], tag2index, emission_smth_dict, transition_dict_all)

processed 2097 tokens with 236 phrases; found: 165 phrases; correct: 114.
accuracy:  53.82%; (non-O)
accuracy:  89.94%; precision:  69.09%; recall:  48.31%; FB1:  56.86
              art: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
              eve: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
              geo: precision:  76.36%; recall:  49.41%; FB1:  60.00  55
              gpe: precision:  88.89%; recall:  64.00%; FB1:  74.42  18
              nat: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
              org: precision:  41.67%; recall:  28.57%; FB1:  33.90  24
              per: precision:  50.00%; recall:  37.50%; FB1:  42.86  24
              tim: precision:  77.27%; recall:  64.15%; FB1:  70.10  44


(69.0909090909091, 48.30508474576271, 56.85785536159601)

In [39]:
sequence_evaluation(devin_ds, devout_ds[1], tag2index, emission_dict_all, transition_dict_all)

processed 2097 tokens with 236 phrases; found: 380 phrases; correct: 117.
accuracy:  53.82%; (non-O)
accuracy:  77.54%; precision:  30.79%; recall:  49.58%; FB1:  37.99
              art: precision:   0.00%; recall:   0.00%; FB1:   0.00  18
              eve: precision:   0.00%; recall:   0.00%; FB1:   0.00  21
              geo: precision:  65.79%; recall:  58.82%; FB1:  62.11  76
              gpe: precision:  40.62%; recall:  52.00%; FB1:  45.61  32
              nat: precision:   0.00%; recall:   0.00%; FB1:   0.00  131
              org: precision:  33.33%; recall:  28.57%; FB1:  30.77  30
              per: precision:  60.00%; recall:  37.50%; FB1:  46.15  20
              tim: precision:  61.54%; recall:  60.38%; FB1:  60.95  52


(30.789473684210527, 49.57627118644068, 37.98701298701299)

### It turns out that smoothing method introduced in ML course perform better

## Part 3

In [40]:
def forward(X, tag2index, emission_dict, transition_dict):
    """
    The returned value should be the forward matrix for sentence X and the final alpha
    The implementation so far explicitly pass emission and transition dictionary as arguments
    This can be replaced by FeatureSum function in the implementation for part 5
    """
    forward_matrix = np.zeros((len(tag2index), len(X)))
    emission_dict_exp = {k: np.exp(v) for (k, v) in emission_dict.items()}
    transition_dict_exp = {k: np.exp(v) for (k, v) in transition_dict.items()}
    for i in range(len(X)):
        if i == 0:
            for tag2 in tag2index:
                emission_key = str_emission(X[i], tag2)
                default_key = str_emission('#UNK#', tag2)
                emission_score = emission_dict_exp[emission_key] if emission_key in emission_dict_exp else emission_dict_exp[default_key]
                transition_key = str_transition('START', tag2)
                transition_score = transition_dict_exp[transition_key]
                forward_matrix[tag2index[tag2], i] = transition_score*emission_score
        else:
            for tag2 in tag2index:
                SumPotential = 0
                for tag1 in tag2index:
                    emission_key = str_emission(X[i], tag2)
                    default_key = str_emission('#UNK#', tag2)
                    emission_score = emission_dict_exp[emission_key] if emission_key in emission_dict_exp else emission_dict_exp[default_key]
                    transition_key = str_transition(tag1, tag2)
                    transition_score = transition_dict_exp[transition_key]
                    SumPotential += forward_matrix[tag2index[tag1], i-1]*transition_score*emission_score
                forward_matrix[tag2index[tag2], i] = SumPotential
    
    SumPotential = 0
    for tag1 in tag2index:
        transition_key = str_transition(tag1, 'STOP')
        transition_score = transition_dict_exp[transition_key]
        SumPotential += forward_matrix[tag2index[tag1], -1]*transition_score
    
    return forward_matrix, SumPotential

In [41]:
def Loss(X, Y, tag2index, emission_dict, transition_dict):
    
    pair_score = 0
    for i in range(len(X)):
        if i == 0:
            emission_key = str_emission(X[i], Y[i])
            default_key = str_emission('#UNK#', Y[i])
            emission_score = emission_dict[emission_key] if emission_key in emission_dict else emission_dict[default_key]
            transition_key = str_transition('START', Y[i])
            transition_score = transition_dict[transition_key]
            pair_score += (transition_score+emission_score)
        else:
            emission_key = str_emission(X[i], Y[i])
            default_key = str_emission('#UNK#', Y[i])
            emission_score = emission_dict[emission_key] if emission_key in emission_dict else emission_dict[default_key]
            transition_key = str_transition(Y[i-1], Y[i])
            transition_score = transition_dict[transition_key]
            pair_score += (transition_score+emission_score)
    transition_key = str_transition(Y[-1], 'STOP')
    transition_score = transition_dict[transition_key]
    pair_score += transition_score
    
    _, SumPotential = forward(X, tag2index, emission_dict, transition_dict)
    
    return -(pair_score-np.log(SumPotential))

In [42]:
def LossDataset(Xs, Ys, tag2index, emission_dict, transition_dict):
    return np.sum([Loss(X, Y, tag2index, emission_dict, transition_dict) for X, Y in zip(Xs, Ys)])

In [43]:
LossDataset(train_ds[0], train_ds[1], tag2index, emission_smth_dict, transition_dict_all)

2156.7293881993182

In [44]:
def backward_alternative(X, tag2index, emission_dict, transition_dict):
    """
    The returned value should be the forward matrix for sentence X and the final beta
    final beta should be the same as final alpha
    """
    backward_matrix = np.zeros((len(tag2index), len(X)))
    emission_dict_exp = {k: np.exp(v) for (k, v) in emission_dict.items()}
    transition_dict_exp = {k: np.exp(v) for (k, v) in transition_dict.items()}
    for i in range(len(X)-1, -1, -1):
        if i == len(X)-1:
            for tag1 in tag2index:
                emission_key = str_emission(X[i], tag1)
                default_key = str_emission('#UNK#', tag1)
                emission_score = emission_dict_exp[emission_key] if emission_key in emission_dict_exp else emission_dict_exp[default_key]
                transition_key = str_transition(tag1, 'STOP')
                transition_score = transition_dict_exp[transition_key]
                backward_matrix[tag2index[tag1], i] = transition_score*emission_score
        else:
            for tag1 in tag2index:
                SumPotential = 0
                for tag2 in tag2index:
                    emission_key = str_emission(X[i], tag1)
                    default_key = str_emission('#UNK#', tag1)
                    emission_score = emission_dict_exp[emission_key] if emission_key in emission_dict_exp else emission_dict_exp[default_key]
                    transition_key = str_transition(tag1, tag2)
                    transition_score = transition_dict_exp[transition_key]
                    SumPotential += backward_matrix[tag2index[tag2], i+1]*transition_score*emission_score
                backward_matrix[tag2index[tag1], i] = SumPotential
    
    SumPotential = 0
    for tag2 in tag2index:
        transition_key = str_transition('START', tag2)
        transition_score = transition_dict_exp[transition_key]
        SumPotential += backward_matrix[tag2index[tag2], 0]*transition_score
    
    return backward_matrix, SumPotential

In [45]:
def backward(X, tag2index, emission_dict, transition_dict):
    """
    The returned value should be the forward matrix for sentence X and the final beta
    final beta should be the same as final alpha
    This is considered correct according to the definition of backward algorithm
    """
    backward_matrix = np.zeros((len(tag2index), len(X)))
    emission_dict_exp = {k: np.exp(v) for (k, v) in emission_dict.items()}
    transition_dict_exp = {k: np.exp(v) for (k, v) in transition_dict.items()}
    
    for tag1 in tag2index:
        transition_key = str_transition(tag1, 'STOP')
        transition_score = transition_dict_exp[transition_key]
        backward_matrix[tag2index[tag1], -1] = transition_score
    
    for i in range(len(X)-2, -1, -1):
        for tag1 in tag2index:
            SumPotential = 0
            for tag2 in tag2index:
                emission_key = str_emission(X[i+1], tag2)
                default_key = str_emission('#UNK#', tag2)
                emission_score = emission_dict_exp[emission_key] if emission_key in emission_dict_exp else emission_dict_exp[default_key]
                transition_key = str_transition(tag1, tag2)
                transition_score = transition_dict_exp[transition_key]
                SumPotential += backward_matrix[tag2index[tag2], i+1]*transition_score*emission_score
            backward_matrix[tag2index[tag1], i] = SumPotential
    
    SumPotential = 0
    for tag2 in tag2index:
        emission_key = str_emission(X[0], tag2)
        default_key = str_emission('#UNK#', tag2)
        emission_score = emission_dict_exp[emission_key] if emission_key in emission_dict_exp else emission_dict_exp[default_key]
        transition_key = str_transition('START', tag2)
        transition_score = transition_dict_exp[transition_key]
        SumPotential += backward_matrix[tag2index[tag2], 0]*transition_score*emission_score
    
    return backward_matrix, SumPotential

### Check the implemented forward-backward algorithm

In [47]:
_, forward_sum = forward(train_ds[0][0], tag2index, emission_smth_dict, transition_dict_all)
_, backward_sum = backward_alternative(train_ds[0][0], tag2index, emission_smth_dict, transition_dict_all)
forward_sum, backward_sum

(1.1244349940099905e-50, 1.12443499400999e-50)

In [48]:
_, backward_sum_alt = backward_alternative(train_ds[0][0], tag2index, emission_smth_dict, transition_dict_all)
assert backward_sum_alt == backward_sum

In [49]:
def expected_count_emission_alternative(X, tag, word, tag2index, word2index, emission_dict, transition_dict):
    forward_matrix, NormalizationTerm = forward(X, tag2index, emission_dict, transition_dict)
    backward_matrix, _ = backward(X, tag2index, emission_dict, transition_dict)
    SumPotential = 0
    X = [x if x in word2index else '#UNK#' for x in X]
    if word not in word2index:
        word = '#UNK#'
    for i in range(len(X)-1):
        if X[i] == word:
            print('index at which the word appears:', i)
            for tag2 in tag2index:
                transition_score = np.exp(transition_dict[str_transition(tag, tag2)])
                emission_score = np.exp(emission_dict[str_emission(X[i+1], tag2)])
                SumPotential += forward_matrix[tag2index[tag], i]*backward_matrix[tag2index[tag2], i+1]*emission_score*transition_score
    transition_score = np.exp(transition_dict[str_transition(tag, 'STOP')])
    SumPotential += forward_matrix[tag2index[tag], -1]*transition_score
    return SumPotential/NormalizationTerm

In [50]:
def expected_count_emission(X, tag, word, tag2index, word2index, emission_dict, transition_dict):
    """
    This is considered correct according to the definition of f(y_i-1, y_i, x_i)
    """
    forward_matrix, NormalizationTerm = forward(X, tag2index, emission_dict, transition_dict)
    backward_matrix, _ = backward(X, tag2index, emission_dict, transition_dict)
    SumPotential = 0
    X = [x if x in word2index else '#UNK#' for x in X]
    if word not in word2index:
        word = '#UNK#'
    
    emission_score = np.exp(emission_dict[str_emission(word, tag)])
    if X[0] == word:
        transition_score = np.exp(transition_dict[str_transition('START', tag)])
        SumPotential += backward_matrix[tag2index[tag], 0]*emission_score*transition_score
    for i in range(1, len(X)):
        if X[i] == word:
            print('index at which the word appears:', i)
            for tag1 in tag2index:
                transition_score = np.exp(transition_dict[str_transition(tag1, tag)])
                SumPotential += forward_matrix[tag2index[tag1], i-1]*backward_matrix[tag2index[tag], i]*emission_score*transition_score
    return SumPotential/NormalizationTerm

### Test means nothing

In [51]:
expected_count_emission(train_ds[0][1], 'I-per', 'Omi', tag2index, word2index, emission_smth_dict, transition_dict_all)

index at which the word appears: 1
index at which the word appears: 9


0.909108875859154

In [52]:
expected_count_emission_alternative(train_ds[0][1], 'I-per', 'Omi', tag2index, word2index, emission_smth_dict, transition_dict_all)

index at which the word appears: 1
index at which the word appears: 9


0.9091088758591538

In [53]:
expected_count_emission(train_ds[0][2], 'B-geo', 'DRC', tag2index, word2index, emission_smth_dict, transition_dict_all)

index at which the word appears: 13


0.47734588521399984

In [54]:
expected_count_emission_alternative(train_ds[0][2], 'B-geo', 'DRC', tag2index, word2index, emission_smth_dict, transition_dict_all)

index at which the word appears: 13


0.47734588521399984

In [57]:
expected_count_emission_alternative(train_ds[0][1], 'O', '#UNK#', tag2index, word2index, emission_smth_dict, transition_dict_all)

index at which the word appears: 1
index at which the word appears: 9


1.9397328051797769

In [58]:
expected_count_emission(train_ds[0][1], 'O', '#UNK#', tag2index, word2index, emission_smth_dict, transition_dict_all)

index at which the word appears: 1
index at which the word appears: 9


0.9397328051797771

In [59]:
def actual_count_emission(X, Y, tag, word, tag2index, word2index):
    X = [x if x in word2index else '#UNK#' for x in X]
    if word not in word2index:
        word = '#UNK#'
    count = 0
    for x, y in zip(X, Y):
        if x == word and y == tag:
            count += 1
    return count

In [60]:
actual_count_emission(train_ds[0][1], train_ds[1][1], 'O', '#UNK#', tag2index, word2index)

1

In [72]:
def expected_count_transition(X, tag1, tag2, tag2index, word2index, emission_dict, transition_dict):
    forward_matrix, NormalizationTerm = forward(X, tag2index, emission_dict, transition_dict)
    backward_matrix, _ = backward(X, tag2index, emission_dict, transition_dict)
    X = [x if x in word2index else '#UNK#' for x in X]
    SumPotential = 0
    transition_score = np.exp(transition_dict[str_transition(tag1, tag2)])
    if tag1 == 'START':
        emission_score = np.exp(emission_dict[str_emission(X[0], tag2)])
        SumPotential += transition_score*emission_score*backward_matrix[tag2index[tag2], 0]
    elif tag2 == 'STOP':
        SumPotential += forward_matrix[tag2index[tag1], -1]*transition_score
    else:
        for i in range(len(X)-1):
            emission_score = np.exp(emission_dict[str_emission(X[i+1], tag2)])
            SumPotential += forward_matrix[tag2index[tag1], i]*transition_score*emission_score*backward_matrix[tag2index[tag2], i+1]
    return SumPotential/NormalizationTerm                                   

In [155]:
def actual_count_transition(X, Y, tag1, tag2, tag2index, word2index):
    count = 0
    def pairwise(iterable):
        a, b = itertools.tee(iterable)
        next(b, None)
        return zip(a, b)
    Y = copy.deepcopy(Y)
    Y.append('STOP')
    Y.insert(0, 'START')
    for tag1_, tag2_ in pairwise(Y):
        if tag1 == tag1_ and tag2 == tag2_:
            count += 1
    return count

### Test means nothing

In [76]:
expected_count_transition(train_ds[0][1], 'O', 'O', tag2index, word2index, emission_smth_dict, transition_dict_all)

14.783140987648911

In [77]:
expected_count_transition(train_ds[0][1], 'START', 'O', tag2index, word2index, emission_smth_dict, transition_dict_all)

0.0

In [78]:
expected_count_transition(train_ds[0][1], 'START', 'B-per', tag2index, word2index, emission_smth_dict, transition_dict_all)

1.0

In [79]:
len(train_ds[0][1]), train_ds[1][1][0]

(18, 'B-per')

In [80]:
actual_count_transition(train_ds[0][1], train_ds[1][1], 'O', 'O', tag2index, word2index)

15

### Test according to instructions

In [75]:
# Increase 'O'-'O' by epsilon
epsilon = 0.0001
transition_dict_all_copy = copy.deepcopy(transition_dict_all)
transition_dict_all_copy[str_transition('O', 'O')] += epsilon
old_loss = LossDataset(train_ds[0], train_ds[1], tag2index, emission_smth_dict, transition_dict_all)
new_loss = LossDataset(train_ds[0], train_ds[1], tag2index, emission_smth_dict, transition_dict_all_copy)
expected_count = np.sum([expected_count_transition(X, 'O', 'O', tag2index, word2index, emission_smth_dict, transition_dict_all) for X in train_ds[0]])
actual_count = np.sum([actual_count_transition(X, Y, 'O', 'O', tag2index, word2index) for (X, Y) in zip(train_ds[0], train_ds[1])])
gradient = expected_count - actual_count
gradient*epsilon, new_loss-old_loss

(-0.0068095859707824275, -0.006801962072131573)

In [81]:
epsilon = 0.0001
X_test, Y_test = train_ds[0][0], train_ds[1][0]
transition_dict_all_copy = copy.deepcopy(transition_dict_all)
transition_dict_all_copy[str_transition('O', 'O')] += epsilon
old_loss = Loss(X_test, Y_test, tag2index, emission_smth_dict, transition_dict_all)
new_loss = Loss(X_test, Y_test, tag2index, emission_smth_dict, transition_dict_all_copy)
expected_count = expected_count_transition(X_test, 'O', 'O', tag2index, word2index, emission_smth_dict, transition_dict_all)
actual_count = actual_count_transition(X_test, Y_test, 'O', 'O', tag2index, word2index)
gradient = expected_count - actual_count
new_loss - old_loss, gradient*epsilon

(-0.0003452907908894076, -0.0003453194032164845)

In [69]:
epsilon = 0.0001
X_test, Y_test = train_ds[0][0], train_ds[1][0]
emission_smth_dict_copy = copy.deepcopy(emission_smth_dict)
emission_smth_dict_copy[str_emission('#UNK#', 'O')] += epsilon
old_loss = Loss(X_test, Y_test, tag2index, emission_smth_dict, transition_dict_all)
new_loss = Loss(X_test, Y_test, tag2index, emission_smth_dict_copy, transition_dict_all)
expected_count = expected_count_emission(X_test, 'O', '#UNK#', tag2index, word2index, emission_smth_dict, transition_dict_all)
actual_count = actual_count_emission(X_test, Y_test, 'O', '#UNK#', tag2index, word2index)
gradient = expected_count - actual_count
print('Actual loss change: {}, Change according to gradient: {}'.format(new_loss - old_loss, gradient*epsilon))

index at which the word appears: 12
index at which the word appears: 13
index at which the word appears: 15
index at which the word appears: 16
index at which the word appears: 20
index at which the word appears: 21
index at which the word appears: 22
Actual loss change: -0.00021219469253708212, Change according to gradient: -0.00021220724764608195


In [70]:
epsilon = 0.0001
X_test, Y_test = train_ds[0][1], train_ds[1][1]
emission_smth_dict_copy = copy.deepcopy(emission_smth_dict)
emission_smth_dict_copy[str_emission('#UNK#', 'O')] += epsilon
old_loss = Loss(X_test, Y_test, tag2index, emission_smth_dict, transition_dict_all)
new_loss = Loss(X_test, Y_test, tag2index, emission_smth_dict_copy, transition_dict_all)
expected_count = expected_count_emission(X_test, 'O', '#UNK#', tag2index, word2index, emission_smth_dict, transition_dict_all)
actual_count = actual_count_emission(X_test, Y_test, 'O', '#UNK#', tag2index, word2index)
gradient = expected_count - actual_count
print('Actual loss change: {}, Change according to gradient: {}'.format(new_loss - old_loss, gradient*epsilon))

index at which the word appears: 1
index at which the word appears: 9
Actual loss change: -6.02587614650929e-06, Change according to gradient: -6.026719482022292e-06


#### The only bug now is in expected_count_emission_alternative, error in some edge cases

In [104]:
import os, sys
class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

In [105]:
def GradientEmission(Xs, Ys, tag, word, tag2index, word2index, emission_dict, transition_dict):
    with HiddenPrints():
        expected_count = np.sum([expected_count_emission(X, tag, word, tag2index, word2index, emission_smth_dict, transition_dict_all) for X in Xs])
        actual_count = np.sum([actual_count_emission(X, Y, tag, word, tag2index, word2index) for X, Y in zip(Xs, Ys)])
    return expected_count - actual_count

In [111]:
def GradientTransition(Xs, Ys, tag1, tag2, tag2index, word2index, emission_dict, transition_dict):
    with HiddenPrints():
        expected_count = np.sum([expected_count_transition(X, tag1, tag2, tag2index, word2index, emission_smth_dict, transition_dict_all) for X in Xs])
        actual_count = np.sum([actual_count_transition(X, Y, tag1, tag2, tag2index, word2index) for X, Y in zip(Xs, Ys)])
    return expected_count - actual_count

In [110]:
GradientEmission(train_ds[0], train_ds[1], 'O', '#UNK', tag2index, word2index, emission_smth_dict, transition_dict_all)

33.86508333377901

In [112]:
GradientTransition(train_ds[0], train_ds[1], 'O', 'O', tag2index, word2index, emission_smth_dict, transition_dict_all)

-68.09585970782427

In [151]:
def GradientTransitionAll(Xs, Ys, tag2index, word2index, emission_dict, transition_dict):
    i = 1
    gradient = dict()
    tag2index_ = copy.deepcopy(tag2index)
    tag2index_['START'] = len(tag2index_)
    tag2index_['STOP'] = len(tag2index_)
    for tag1, tag2 in itertools.product(tag2index_, tag2index_):
        if tag1 != 'STOP':
            gradient[str_transition(tag1, tag2)] = GradientTransition(Xs, Ys, tag1, tag2, tag2index, word2index, emission_dict, transition_dict)
            print('done with the {}th gradient'.format(i))
            i += 1
            if i == 4:
                return gradient
    return gradient

In [146]:
def GradientEmissionAll(Xs, Ys, tag2index, word2index, emission_dict, transition_dict):
    gradient = dict()
    for word, tag in itertools.product(word2index, tag2index):
        gradient[str_emission(word, tag)] = GradientEmission(Xs, Ys, tag, word, tag2index, word2index, emission_dict, transition_dict)
    return gradient

In [152]:
import time
start = time.time()
gradient_trans_dict = GradientTransitionAll(train_ds[0], train_ds[1], tag2index, word2index, emission_smth_dict, transition_dict_all)
end = time.time()
end - start

done with the 1th gradient
done with the 2th gradient
done with the 3th gradient


154.70667815208435

In [153]:
gradient_trans_dict

{'transition:I-gpe+I-gpe': 0.0,
 'transition:I-gpe+I-eve': 0.0,
 'transition:I-gpe+I-per': 0.0}

In [154]:
np.sum([actual_count_transition(X, Y, 'I-gpe', 'I-gpe', tag2index, word2index) for X, Y in zip(train_ds[0], train_ds[1])])

0

## Part 4

In [99]:
def LossDatasetRegularization(Xs, Ys, tag2index, emission_dict, transition_dict, param):
    return LossDataset(Xs, Ys, tag2index, emission_dict, transition_dict) +\
        param*(np.sum([w**2 for w in emission_dict.values() if w != -np.inf]) +\
               np.sum([w**2 for w in transition_dict.values() if w != -np.inf]))

In [100]:
eta = 0.1
LossDatasetRegularization(train_ds[0], train_ds[1], tag2index, emission_smth_dict, transition_dict_all, eta)

12066.057801496318

In [148]:
def feature2index(tag2index, word2index):
    emission = [str_emission(word, tag) for word, tag in itertools.product(word2index, tag2index)]
    tag2index = copy.deepcopy(tag2index)
    tag2index['START'] = len(tag2index)
    tag2index['STOP'] = len(tag2index)
    transition = [str_transition() for tag1, tag2 in itertools.product(tag2index, tag2index) if tag1 != 'STOP']
    return {feature: i for i, feature in enumerate(emission+transition)}

### Restructure Everything for Part 4 in CRFNumpy

## Part 5

In [172]:
def labelled_full(path):
    with open(path) as f:  
        X, Y, Z, x, y, z = list(), list(), list(), list(), list(), list()
        for line in f:
            if line == '\n':
                X.append(x)
                Y.append(y)
                Z.append(z)
                x, y, z = list(), list(), list()
            else:
                word, pos_tag, tag = line.strip().split()
                x.append(word)
                y.append(tag)
                z.append(pos_tag)
    return X, Z, Y

def unlabelled_full(path):
    with open(path) as f:  
        X, Z, x, z = list(), list(), list(), list()
        for line in f:
            if line == '\n':
                X.append(x)
                Z.append(z)
                x, z = list(), list()
            else:
                word, pos_tag = line.strip().split()
                x.append(word)
                z.append(pos_tag)
    return X, Z

def read_data_full(root):
    train, devin, devout = root/'train', root/'dev.in', root/'dev.out'     
    return labelled_full(train), unlabelled_full(devin), labelled_full(devout)

In [173]:
train_ds, devin_ds, devout_ds = read_data_full(full)

In [178]:
def emission_weight_POS(train_ds, tag2index):
    
    T = len(tag2index)
    postags = list(set([postag for postags in train_ds[1] for postag in postags]))
    postag2index = {postag: i for i, postag in enumerate(postags)}
    T_ = len(postag2index)
    count_table = np.zeros((T, T_))
    for Z, Y in zip(train_ds[1], train_ds[2]):
        for postag, tag in zip(Z, Y):
            count_table[tag2index[tag], postag2index[postag]] += 1
            
    count_table/=count_table.sum(1)[:, None]
    
    transition_weight_pos = np.ma.log(count_table).filled(-np.inf)
    
    return transition_weight_pos, postag2index

In [179]:
transition_weight_pos, postag2index = emission_weight_POS(train_ds, tag2index)

In [181]:
len(postag2index), transition_weight_pos.shape

(40, (16, 40))

### Word Embeddings for LSTM-CRF

In [217]:
from bert_serving.client import BertClient
bc = BertClient()
texts = ['hello world!', 'good day']
tokenized_texts = [s.split() for s in texts]
print(tokenized_texts)
vectors = bc.encode(tokenized_texts, is_tokenized=True)
vectors.shape

[['hello', 'world!'], ['good', 'day']]


(2, 4, 768)

In [218]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = '../Embeddings/glove.6B/glove.6B.50d.txt'
word2vec_output_file = '../Embeddings/glove.6B/glove.6B.50d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

Exception ignored in: <function Context.__del__ at 0x1065a2840>
Traceback (most recent call last):
  File "/Users/wutianyu/anaconda3/lib/python3.7/site-packages/zmq/sugar/context.py", line 50, in __del__
    self.term()
  File "zmq/backend/cython/context.pyx", line 91, in zmq.backend.cython.context.Context.term
  File "zmq/backend/cython/checkrc.pxd", line 12, in zmq.backend.cython.checkrc._check_rc
KeyboardInterrupt
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


(400000, 50)

In [None]:
from gensim.models import KeyedVectors
filename = '../Embeddings/glove.6B/glove.6B.50d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
# calculate (king - man) + woman
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
model['easy'].shape, result