### Read Data

In [119]:
from pathlib import Path
import numpy as np
import copy
import itertools
partial = Path('./data/partial')
full = Path('./data/full')

In [2]:
def labelled(path):
    with open(path) as f:  
        X, Y, x, y = list(), list(), list(), list()
        for line in f:
            if line == '\n':
                X.append(x)
                Y.append(y)
                x, y = list(), list()
            else:
                word, tag = line.strip().split()
                x.append(word)
                y.append(tag)
    return X, Y

def unlabelled(path):
    with open(path) as f:  
        X, x = list(), list()
        for line in f:
            if line == '\n':
                X.append(x)
                x = list()
            else:
                word = line.strip()
                x.append(word)
    return X

def read_data(root):
    train, devin, devout = root/'train', root/'dev.in', root/'dev.out'     
    return labelled(train), unlabelled(devin), labelled(devout)

In [3]:
train_ds, devin_ds, devout_ds = read_data(partial)

### Emission weight

In [39]:
def emission_weight_smooth(train_ds, k):
    
    vocabulary = list(set([word for sentence in train_ds[0] for word in sentence]))
    tags = list(set([tag for tags in train_ds[1] for tag in tags]))
    word2index = {word: i for i, word in enumerate(vocabulary)}
    tag2index = {tag: i for i, tag in enumerate(tags)}
    count_table = np.zeros((len(tags), len(vocabulary)))
    for X, Y in zip(train_ds[0], train_ds[1]):
        for word, tag in zip(X, Y):
            count_table[tag2index[tag], word2index[word]] += 1
    
    removed_index = np.sum(count_table, 0) < k
    
    print('Number of removed words:', np.sum(removed_index))
    print('Total number of words:', len(vocabulary))
    
    if (np.sum(removed_index) > 0):
        
        count_table = np.append(count_table, np.sum(count_table[:, removed_index], 1)[:,None], 1)
        count_table = np.delete(count_table, np.nonzero(removed_index), 1)
        
        new_vocab = [vocabulary[j] for j in range(len(vocabulary)) if not removed_index[j]]+['#UNK#']
        word2index = {w:i for i,w in enumerate(new_vocab)}
    
    count_table/=count_table.sum(1)[:, None]
    
    emission_weight = np.ma.log(count_table).filled(-np.inf)
    
    return emission_weight, word2index, tag2index

In [40]:
emission_weight, word2index, tag2index = emission_weight_smooth(train_ds, 2)

Number of removed words: 2528
Total number of words: 4068


### Tokenization

In [83]:
train_X, train_Y = train_ds[0], train_ds[1]
test_X, test_Y = devout_ds[0], devout_ds[1]
def tokenize(sentence):
    return [word2index[word] if word in word2index else word2index['#UNK#'] for word in sentence]
def tag2idx(tags):
    return [tag2index[tag] for tag in tags]
train_X = [tokenize(sentence) for sentence in train_X]
test_X = [tokenize(sentence) for sentence in test_X]
train_Y = [tag2idx(tags) for tags in train_Y]
test_Y = [tag2idx(tags) for tags in test_Y]

### Transition Weight

In [37]:
def transition_weight(train_ds, tag2index):
    
    T = len(tag2index)
    count_table = np.zeros((T+1, T+1))
    for Y in train_ds[1]:
        count_table[-1, tag2index[Y[0]]] += 1
        for i in range(len(Y)-1):
            count_table[tag2index[Y[i]], tag2index[Y[i+1]]] += 1
        count_table[tag2index[Y[-1]], -1] += 1
            
    count_table/=count_table.sum(1)[:, None]
    
    transition_weight = np.ma.log(count_table).filled(-np.inf)
    
    return transition_weight

In [38]:
transition_weight = transition_weight(train_ds, tag2index)

### Viterbi and Evaluation for HMM

In [73]:
def viterbi(X, tag2index, emission_weight, transition_weight):
    
    index2tag = {value: key for key, value in tag2index.items()}
    score_matrix = np.zeros((len(tag2index), len(X)))
    path_matrix = np.zeros((len(tag2index), len(X)), dtype='int')
    
    for i in range(len(X)):
        if i == 0:
            score_matrix[:, i] = transition_weight[-1, :-1] + emission_weight[:, X[i]]
        else:
            for j in range(len(tag2index)):
                competitors = emission_weight[j, X[i]] + transition_weight[:-1, j] + score_matrix[:, i-1]
                score_matrix[j, i] = np.max(competitors)
                path_matrix[j, i] = np.argmax(competitors)
    
    competitors = transition_weight[:-1, -1] + score_matrix[:, -1]
    last_idx = np.argmax(competitors)
    path = [last_idx]
    for m in range(len(X)-1, 0, -1):
        path.insert(0, path_matrix[path[0], m])
    output_tags = [index2tag[idx] for idx in path]
    return output_tags

In [44]:
def viterbi_output(dev_out_path, X_raw, tag2index, emission_weight, transition_weight):
    
    X = [tokenize(sentence) for sentence in X_raw]
    tags = [viterbi(sentence, tag2index, emission_weight, transition_weight) for sentence in X]
    
    output_string = ''
    for i in range(len(X)):
        for j in range(len(X[i])):
            output_string += X_raw[i][j] + ' ' + tags[i][j] + '\n'
        output_string += '\n'
    
    with open(dev_out_path, 'w') as f:
        f.write(output_string)
    
    print('Done with writing predictions')
    return None

In [45]:
dev_out_path = partial/'dev.p2.out'
viterbi_output(dev_out_path, devin_ds[0], tag2index, emission_weight, transition_weight)

Done with writing predictions


In [176]:
from conlleval_ import evaluate

def sequence_evaluation(X, Y, tag2index, emission_dict, transition_dict):
    index2tag = {value: key for key, value in tag2index.items()}
    tags_ = [viterbi(sentence, tag2index, emission_dict, transition_dict) for sentence in X]
    tags = [tag for tags in tags_ for tag in tags]
    Y  = [tag for tags in Y for tag in tags]
    Y = [index2tag[idx] for idx in Y]
    assert len(Y) == len(tags)
    return evaluate(Y, tags)

In [177]:
sequence_evaluation(test_X, test_Y, tag2index, emission_weight, transition_weight)

processed 2097 tokens with 236 phrases; found: 187 phrases; correct: 115.
accuracy:  54.12%; (non-O)
accuracy:  89.99%; precision:  61.50%; recall:  48.73%; FB1:  54.37
              art: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
              eve: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
              geo: precision:  55.13%; recall:  50.59%; FB1:  52.76  78
              gpe: precision:  94.12%; recall:  64.00%; FB1:  76.19  17
              nat: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
              org: precision:  41.67%; recall:  28.57%; FB1:  33.90  24
              per: precision:  50.00%; recall:  37.50%; FB1:  42.86  24
              tim: precision:  77.27%; recall:  64.15%; FB1:  70.10  44


(61.49732620320856, 48.728813559322035, 54.37352245862884)

### Loss & Forward & Backward

In [60]:
def forward(X, tag2index, emission_weight, transition_weight):
    """
    The returned value should be the forward matrix for sentence X and the final alpha
    The implementation so far explicitly pass emission and transition dictionary as arguments
    This can be replaced by FeatureSum function in the implementation for part 5
    """
    forward_matrix = np.zeros((len(tag2index), len(X)))
    emission_weight_exp = np.exp(emission_weight)
    transition_weight_exp = np.exp(transition_weight)
    for i in range(len(X)):
        if i == 0:
            forward_matrix[:, i] = transition_weight_exp[-1, :-1]*emission_weight_exp[:, X[i]]
        else:
            for j in range(len(tag2index)):
                SumPotential = np.sum(emission_weight_exp[j, X[i]]*transition_weight_exp[:-1, j]*forward_matrix[:, i-1])
                forward_matrix[j, i] = SumPotential
    
    SumPotential = np.sum(transition_weight_exp[:-1, -1]*forward_matrix[:, -1])
    
    return forward_matrix, SumPotential

In [67]:
def backward(X, tag2index, emission_weight, transition_weight):
    """
    The returned value should be the forward matrix for sentence X and the final beta
    final beta should be the same as final alpha
    This is considered correct according to the definition of backward algorithm
    """
    backward_matrix = np.zeros((len(tag2index), len(X)))
    emission_weight_exp = np.exp(emission_weight)
    transition_weight_exp = np.exp(transition_weight)
    
    
    backward_matrix[:, -1] = transition_weight_exp[:-1, -1]
    
    for i in range(len(X)-2, -1, -1):
        for j in range(len(tag2index)):
            SumPotential = np.sum(transition_weight_exp[j, :-1]*emission_weight_exp[:, X[i+1]]*backward_matrix[:, i+1])
            backward_matrix[j, i] = SumPotential
    
    SumPotential = np.sum(transition_weight_exp[-1, :-1]*emission_weight_exp[:, X[0]]*backward_matrix[:, 0])
    
    return backward_matrix, SumPotential

In [86]:
def Loss(X, Y, tag2index, emission_weight, transition_weight):
    
    pair_score = 0
    for i in range(len(X)):
        if i == 0:
            emission_score = emission_weight[Y[i], X[i]]
            transition_score = transition_weight[-1, Y[i]]
            pair_score += (transition_score+emission_score)
        else:
            emission_score = emission_weight[Y[i], X[i]]
            transition_score = transition_weight[Y[i-1], Y[i]]
            pair_score += (transition_score+emission_score)
    
    transition_score = transition_weight[Y[-1], -1]
    pair_score += transition_score
    
    _, SumPotential = forward(X, tag2index, emission_weight, transition_weight)
    
    return -(pair_score-np.log(SumPotential))

In [87]:
def LossDataset(Xs, Ys, tag2index, emission_weight, transition_weight):
    return np.sum([Loss(X, Y, tag2index, emission_weight, transition_weight) for X, Y in zip(Xs, Ys)])

In [88]:
LossDataset(train_X, train_Y, tag2index, emission_weight, transition_weight)

2156.7293881993182

In [68]:
_, forward_sum = forward(train_X[0], tag2index, emission_weight, transition_weight)
_, backward_sum = backward(train_X[0], tag2index, emission_weight, transition_weight)
forward_sum, backward_sum

(1.12443499400999e-50, 1.12443499400999e-50)

### Expectation Count & Empirical Count

In [76]:
def expected_count_emission(X, tag, word, tag2index, word2index, emission_weight, transition_weight):
    """
    This is considered correct according to the definition of f(y_i-1, y_i, x_i)
    tag is index and word is also index
    """
    forward_matrix, NormalizationTerm = forward(X, tag2index, emission_weight, transition_weight)
    backward_matrix, _ = backward(X, tag2index, emission_weight, transition_weight)
    SumPotential = 0
    
    emission_score = np.exp(emission_weight[tag, word])
    if X[0] == word:
        print('index at which the word appears:', 0)
        transition_score = np.exp(transition_weight[-1, tag])
        SumPotential += backward_matrix[tag, 0]*emission_score*transition_score
    for i in range(1, len(X)):
        if X[i] == word:
            print('index at which the word appears:', i)
            transition_scores = np.exp(transition_weight[:-1, tag])
            SumPotential += np.sum(forward_matrix[:, i-1]*backward_matrix[tag, i]*emission_score*transition_scores)
    return SumPotential/NormalizationTerm

In [77]:
expected_count_emission(train_X[1], tag2index['I-per'], 
                        word2index['#UNK#'], tag2index, word2index, 
                        emission_weight, transition_weight)

index at which the word appears: 1
index at which the word appears: 9


0.9091088758591536

In [84]:
def actual_count_emission(X, Y, tag, word, tag2index, word2index):  
    count = 0
    for x, y in zip(X, Y):
        if x == word and y == tag:
            count += 1
    return count

In [85]:
actual_count_emission(train_X[1], train_Y[1], tag2index['O'], word2index['#UNK#'], tag2index, word2index)

1

In [89]:
def expected_count_transition(X, tag1, tag2, tag2index, word2index, emission_weight, transition_weight):
    forward_matrix, NormalizationTerm = forward(X, tag2index, emission_weight, transition_weight)
    backward_matrix, _ = backward(X, tag2index, emission_weight, transition_weight)
    SumPotential = 0
    
    transition_score = np.exp(transition_weight[tag1, tag2])
    if tag1 == -1:
        emission_score = np.exp(emission_weight[tag2, X[0]])
        SumPotential += transition_score*emission_score*backward_matrix[tag2, 0]
    elif tag2 == -1:
        SumPotential += forward_matrix[tag1, -1]*transition_score
    else:
        for i in range(len(X)-1):
            emission_score = np.exp(emission_weight[tag2, X[i+1]])
            SumPotential += forward_matrix[tag1, i]*transition_score*emission_score*backward_matrix[tag2, i+1]
    return SumPotential/NormalizationTerm    

In [90]:
expected_count_transition(train_X[1], tag2index['O'], tag2index['O'], 
                          tag2index, word2index, 
                          emission_weight, transition_weight)

14.783140987648915

In [106]:
def actual_count_transition(X, Y, tag1, tag2, tag2index, word2index):
    count = 0
    if tag1 == -1 and Y[0] == tag2:
        count += 1
    elif tag2 == -1 and Y[-1] == tag1:
        count += 1
    else:
        for i in range(len(Y)-1):
            if tag1 == Y[i] and tag2 == Y[i+1]:
                count += 1
    return count

### Gradient and Test

In [91]:
import os, sys
class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

In [92]:
def GradientEmission(Xs, Ys, tag, word, tag2index, word2index, emission_weight, transition_weight):
    with HiddenPrints():
        expected_count = np.sum([expected_count_emission(X, tag, word, tag2index, word2index, emission_weight, transition_weight) for X in Xs])
        actual_count = np.sum([actual_count_emission(X, Y, tag, word, tag2index, word2index) for X, Y in zip(Xs, Ys)])
    return expected_count - actual_count

In [93]:
def GradientTransition(Xs, Ys, tag1, tag2, tag2index, word2index, emission_weight, transition_weight):
    with HiddenPrints():
        expected_count = np.sum([expected_count_transition(X, tag1, tag2, tag2index, word2index, emission_weight, transition_weight) for X in Xs])
        actual_count = np.sum([actual_count_transition(X, Y, tag1, tag2, tag2index, word2index) for X, Y in zip(Xs, Ys)])
    return expected_count - actual_count

In [98]:
def test_emission(epsilon, X_test, Y_test, tag, word, tag2index, word2index, emission_weight, transition_weight):
    emission_weight_copy = copy.deepcopy(emission_weight)
    emission_weight_copy[tag, word] += epsilon
    old_loss = Loss(X_test, Y_test, tag2index, emission_weight, transition_weight)
    new_loss = Loss(X_test, Y_test, tag2index, emission_weight_copy, transition_weight)
    expected_count = expected_count_emission(X_test, tag, word, tag2index, word2index, emission_weight, transition_weight)
    actual_count = actual_count_emission(X_test, Y_test, tag, word, tag2index, word2index)
    gradient = expected_count - actual_count
    print('Actual loss change: {}, Change according to gradient: {}'.format(new_loss - old_loss, gradient*epsilon))

In [99]:
X_test, Y_test = train_X[1], train_Y[1]
epsilon = 0.0001
test_emission(epsilon, X_test, Y_test, tag2index['O'], word2index['#UNK#'], 
              tag2index, word2index, emission_weight, transition_weight)

index at which the word appears: 1
index at which the word appears: 9
Actual loss change: -6.02587614650929e-06, Change according to gradient: -6.026719482022292e-06


In [184]:
def test_transition(epsilon, X_test, Y_test, tag1, tag2, tag2index, word2index, emission_weight, transition_weight):
    transition_weight_copy = copy.deepcopy(transition_weight)
    transition_weight_copy[tag1, tag2] += epsilon
    old_loss = Loss(X_test, Y_test, tag2index, emission_weight, transition_weight)
    new_loss = Loss(X_test, Y_test, tag2index, emission_weight, transition_weight_copy)
    expected_count = expected_count_transition(X_test, tag1, tag2, tag2index, word2index, emission_weight, transition_weight)
    actual_count = actual_count_transition(X_test, Y_test, tag1, tag2, tag2index, word2index)
    gradient = expected_count - actual_count
    print('Actual loss change: {}, Change according to gradient: {}'.format(new_loss - old_loss, gradient*epsilon))

In [109]:
epsilon = 0.0001
X_test, Y_test = train_X[0], train_Y[0]
test_transition(epsilon, X_test, Y_test, tag2index['O'], tag2index['O'], 
                tag2index, word2index, emission_weight, transition_weight)

Actual loss change: -0.0003452907908894076, Change according to gradient: -0.00034531940321648556


In [116]:
def LossDatasetRegularization(Xs, Ys, tag2index, emission_weight, transition_weight, param):   
    return LossDataset(Xs, Ys, tag2index, emission_weight, transition_weight) +\
        param*(np.sum(emission_weight[emission_weight != -np.inf]**2) +\
               np.sum(transition_weight[transition_weight != -np.inf]**2))

In [117]:
Regularization = 0.1
LossDatasetRegularization(train_X, train_Y, tag2index, emission_weight, transition_weight, Regularization)

12066.057801496318

In [122]:
def GradientTransitionAll(Xs, Ys, tag2index, word2index, emission_weight, transition_weight):
    i = 1
    T = len(tag2index)
    gradient = np.zeros((T+1, T+1))
    for tag1, tag2 in itertools.product(range(-1, T), range(-1, T)):
        gradient[tag1, tag2] = GradientTransition(Xs, Ys, tag1, tag2, 
                                                  tag2index, word2index, emission_weight, transition_weight)
        if i%10 == 0:
            print('done with the {}th gradient'.format(i))
        i += 1
    return gradient

In [None]:
def GradientEmissionAll(Xs, Ys, tag2index, word2index, emission_weight, transition_weight):
    i = 1
    T = len(tag2index)
    V = len(word2index)
    gradient = np.zeros((T, V))
    for tag, word in itertools.product(range(T), range(V)):
        gradient[tag, word] = GradientEmission(Xs, Ys, tag, word, tag2index, word2index, 
                                               emission_weight, transition_weight)
        if i%10 == 0:
            print('done with the {}th gradient'.format(i))
        i += 1
    return gradient

In [136]:
import time
start = time.time()
gradient_transition = GradientTransitionAll(train_X, train_Y, tag2index, word2index, 
                                            emission_weight, transition_weight)
end = time.time()
end - start

done with the 10th gradient
done with the 20th gradient
done with the 30th gradient
done with the 40th gradient
done with the 50th gradient
done with the 60th gradient
done with the 70th gradient
done with the 80th gradient
done with the 90th gradient
done with the 100th gradient
done with the 110th gradient
done with the 120th gradient
done with the 130th gradient
done with the 140th gradient
done with the 150th gradient
done with the 160th gradient
done with the 170th gradient
done with the 180th gradient
done with the 190th gradient
done with the 200th gradient
done with the 210th gradient
done with the 220th gradient
done with the 230th gradient
done with the 240th gradient
done with the 250th gradient
done with the 260th gradient
done with the 270th gradient
done with the 280th gradient


1072.772547006607

### Make use of forward backward to calculate all features (emission and transition separately) all at once

In [159]:
def expected_count_transition_all(Xs, tag2index, word2index, emission_weight, transition_weight):
    
    T = len(tag2index)
    counter = 1
    Expected_count = np.zeros((T+1, T+1))
    
    for X in Xs:
        forward_matrix, NormalizationTerm = forward(X, tag2index, emission_weight, transition_weight)
        backward_matrix, _ = backward(X, tag2index, emission_weight, transition_weight)

        expected_count = np.zeros((T+1, T+1))
        for tag1, tag2 in itertools.product(range(-1, T), range(-1, T)):
            SumPotential = 0

            transition_score = np.exp(transition_weight[tag1, tag2])
            if tag1 == -1:
                emission_score = np.exp(emission_weight[tag2, X[0]])
                SumPotential += transition_score*emission_score*backward_matrix[tag2, 0]
            elif tag2 == -1:
                SumPotential += forward_matrix[tag1, -1]*transition_score
            else:
                for i in range(len(X)-1):
                    emission_score = np.exp(emission_weight[tag2, X[i+1]])
                    SumPotential += forward_matrix[tag1, i]*transition_score*emission_score*backward_matrix[tag2, i+1]
            expected_count[tag1, tag2] = SumPotential/NormalizationTerm
        Expected_count += expected_count
        if counter%100 == 0:
            print('Transition: done with the {}th instances'.format(counter))
        counter += 1
    return Expected_count
        
def actual_count_transition_all(Xs, Ys, tag2index, word2index):
    T = len(tag2index)
    Empirical_count = np.zeros((T+1, T+1))
    for X, Y in zip(Xs, Ys):
        empirical_count = np.zeros((T+1, T+1))
        for tag1, tag2 in itertools.product(range(-1, T), range(-1, T)):
            empirical_count[tag1, tag2] = actual_count_transition(X, Y, tag1, tag2, tag2index, word2index)
        Empirical_count += empirical_count
    return Empirical_count

def GradientTransitionAllFast(Xs, Ys, tag2index, word2index, emission_weight, transition_weight):
    return expected_count_transition_all(Xs, tag2index, word2index, emission_weight, transition_weight) -\
            actual_count_transition_all(Xs, Ys, tag2index, word2index)

In [160]:
import time
start = time.time()
gradient_transition_fast = GradientTransitionAllFast(train_X, train_Y, tag2index, word2index, 
                                            emission_weight, transition_weight)
end = time.time()
end - start

Transition: done with the 100th instances
Transition: done with the 200th instances
Transition: done with the 300th instances
Transition: done with the 400th instances
Transition: done with the 500th instances
Transition: done with the 600th instances
Transition: done with the 700th instances


13.653486013412476

### Compare two methods for gradient calculation

In [139]:
np.sum(gradient_transition_fast - gradient_transition)

4.912958928571243e-12

In [158]:
def expected_count_emission_all(Xs, tag2index, word2index, emission_weight, transition_weight):
    
    T = len(tag2index)
    V = len(word2index)
    counter = 1
    Expected_count = np.zeros((T, V))
    
    for X in Xs:
        forward_matrix, NormalizationTerm = forward(X, tag2index, emission_weight, transition_weight)
        backward_matrix, _ = backward(X, tag2index, emission_weight, transition_weight)
    
        expected_count = np.zeros((T, V))
        for tag, word in itertools.product(range(T), range(V)):
            SumPotential = 0
        
            emission_score = np.exp(emission_weight[tag, word])
            if X[0] == word:
                transition_score = np.exp(transition_weight[-1, tag])
                SumPotential += backward_matrix[tag, 0]*emission_score*transition_score
            for i in range(1, len(X)):
                if X[i] == word:
                    transition_scores = np.exp(transition_weight[:-1, tag])
                    SumPotential += np.sum(forward_matrix[:, i-1]*backward_matrix[tag, i]*emission_score*transition_scores)
            
            expected_count[tag, word] = SumPotential/NormalizationTerm
    
        Expected_count += expected_count
        if counter%100 == 0:
            print('Emission: done with the {}th instances'.format(counter))
        counter += 1
    return Expected_count

def actual_count_emission_all(Xs, Ys, tag2index, word2index):
    T = len(tag2index)
    V = len(word2index)
    Empirical_count = np.zeros((T, V))
    for X, Y in zip(Xs, Ys):
        empirical_count = np.zeros((T, V))
        for tag, word in itertools.product(range(T), range(V)):
            empirical_count[tag, word] = actual_count_emission(X, Y, tag, word, tag2index, word2index)
        Empirical_count += empirical_count
    return Empirical_count

def GradientEmissionAllFast(Xs, Ys, tag2index, word2index, emission_weight, transition_weight):
    return expected_count_emission_all(Xs, tag2index, word2index, emission_weight, transition_weight) -\
            actual_count_emission_all(Xs, Ys, tag2index, word2index)

In [144]:
import time
start = time.time()
gradient_emission_fast = GradientEmissionAllFast(train_X, train_Y, tag2index, word2index, 
                                            emission_weight, transition_weight)
end = time.time()
end - start

done with the 100th instances
done with the 200th instances
done with the 300th instances
done with the 400th instances
done with the 500th instances
done with the 600th instances
done with the 700th instances


91.55844497680664

In [145]:
def GradientTransitionAllFastRegularization(Xs, Ys, tag2index, word2index, emission_weight, transition_weight, param):
    return expected_count_transition_all(Xs, tag2index, word2index, emission_weight, transition_weight) -\
            actual_count_transition_all(Xs, Ys, tag2index, word2index) +\
            2*param*transition_weight

def GradientEmissionAllFastRegularization(Xs, Ys, tag2index, word2index, emission_weight, transition_weight, param):
    return expected_count_emission_all(Xs, tag2index, word2index, emission_weight, transition_weight) -\
            actual_count_emission_all(Xs, Ys, tag2index, word2index) +\
            2*param*emission_weight

### Optimization for transition and emission features

In [171]:
from scipy.optimize import fmin_l_bfgs_b

T = len(tag2index)
V = len(word2index)

start = time.time()

def callbackF(w):
    """
    This function will only be called by "fmin_l_bfgs_b"
    Arg:
    w: weights, numpy array
    """
    loss = get_loss_grad(w)[0]
    print('Loss:{0:.4f}'.format(loss))

def get_loss_grad(w):
    """
    This function will only be called by "fmin_l_bfgs_b"
    Arg:
    w: weights, numpy array
    Returns:
    loss: loss, float
    grads: gradients, numpy array
    """
    # to be completed by you,
    # based on the modified loss and gradients,
    # with L2 regularization included
    with HiddenPrints():
        transition_weight = w[:(T+1)*(T+1)].reshape((T+1, T+1))
        emission_weight = w[(T+1)*(T+1):].reshape((T, V))
        loss = LossDatasetRegularization(train_X, train_Y, tag2index, 
                                         emission_weight, transition_weight, Regularization)
        grads_transition = GradientTransitionAllFastRegularization(train_X, train_Y, tag2index, word2index, 
                                                emission_weight, transition_weight, Regularization)
        grads_emission = GradientEmissionAllFastRegularization(train_X, train_Y, tag2index, word2index, 
                                              emission_weight, transition_weight, Regularization)
        grads = np.concatenate((grads_transition.reshape(-1), grads_emission.reshape(-1)))
    return loss, grads

init_w = np.zeros(((T+1)*(T+1)+T*V,))
result = fmin_l_bfgs_b(get_loss_grad, init_w, pgtol=0.01, callback=callbackF)

end = time.time()
print('{} seconds have passed'.format(end - start))

Loss:18660.6024
Loss:14330.6897
Loss:13133.8326
Loss:12688.4027
Loss:12240.1333
Loss:10976.3865
Loss:10278.0715
Loss:9360.2376
Loss:8857.4770
Loss:8058.7878
Loss:7588.8933
Loss:7307.3267
Loss:7102.4883
Loss:6987.0037
Loss:6775.7142
Loss:6582.0261
Loss:6348.0543
Loss:6145.4270
Loss:5935.1744
Loss:5586.2842
Loss:5416.3661
Loss:5256.6581
Loss:5123.4694
Loss:5061.2558
Loss:4977.0529
Loss:4839.2782
Loss:4784.0254
Loss:4734.2525
Loss:4622.3720
Loss:4463.0371
Loss:4323.5630
Loss:4168.4023
Loss:4097.3472
Loss:4050.5129
Loss:4004.9653
Loss:3952.8393
Loss:3884.9582
Loss:3850.4644
Loss:3802.7858
Loss:3767.4955
Loss:3735.7016
Loss:3683.9396
Loss:3645.8149
Loss:3629.8610
Loss:3612.8606
Loss:3587.9989
Loss:3571.6220
Loss:3526.7502
Loss:3495.6895
Loss:3470.2333
Loss:3454.3672
Loss:3429.2142
Loss:3410.1779
Loss:3399.8245
Loss:3373.5858
Loss:3363.6444
Loss:3353.8590
Loss:3332.3631
Loss:3322.0590
Loss:3310.8206
Loss:3305.1944
Loss:3298.2365
Loss:3286.7607
Loss:3282.0531
Loss:3268.5851
Loss:3260.2488
Los

### Evaluation

In [178]:
optimal_weight, final_loss, _ = result
transition_weight_optimal = optimal_weight[:(T+1)*(T+1)].reshape((T+1, T+1))
emission_weight_optimal = optimal_weight[(T+1)*(T+1):].reshape((T, V))
sequence_evaluation(test_X, test_Y, tag2index, emission_weight_optimal, transition_weight_optimal)

processed 2097 tokens with 236 phrases; found: 157 phrases; correct: 114.
accuracy:  54.12%; (non-O)
accuracy:  91.08%; precision:  72.61%; recall:  48.31%; FB1:  58.02
              art: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
              eve: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
              geo: precision:  77.19%; recall:  51.76%; FB1:  61.97  57
              gpe: precision: 100.00%; recall:  60.00%; FB1:  75.00  15
              nat: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
              org: precision:  50.00%; recall:  31.43%; FB1:  38.60  22
              per: precision:  50.00%; recall:  34.38%; FB1:  40.74  22
              tim: precision:  80.49%; recall:  62.26%; FB1:  70.21  41


(72.61146496815286, 48.30508474576271, 58.01526717557252)

### Writing output

In [179]:
dev_out_path_p4 = partial/'dev.p4.out'
viterbi_output(dev_out_path, devin_ds[0], tag2index, emission_weight, transition_weight)

Done with writing predictions


### Compare with the new implementation

In [180]:
from scipy.optimize import fmin_l_bfgs_b

T = len(tag2index)
V = len(word2index)

start = time.time()

def callbackF(w):
    """
    This function will only be called by "fmin_l_bfgs_b"
    Arg:
    w: weights, numpy array
    """
    loss = get_loss_grad(w)[0]
    print('Loss:{0:.4f}'.format(loss))

def get_loss_grad(w):
    """
    This function will only be called by "fmin_l_bfgs_b"
    Arg:
    w: weights, numpy array
    Returns:
    loss: loss, float
    grads: gradients, numpy array
    """
    # to be completed by you,
    # based on the modified loss and gradients,
    # with L2 regularization included
    with HiddenPrints():
        transition_weight = w[:(T+1)*(T+1)].reshape((T+1, T+1))
        emission_weight = w[(T+1)*(T+1):].reshape((T, V))
        loss = LossDatasetRegularization(train_X[:2], train_Y[:2], tag2index, 
                                         emission_weight, transition_weight, Regularization)
        grads_transition = GradientTransitionAllFastRegularization(train_X[:2], train_Y[:2], tag2index, word2index, 
                                                emission_weight, transition_weight, Regularization)
        grads_emission = GradientEmissionAllFastRegularization(train_X[:2], train_Y[:2], tag2index, word2index, 
                                              emission_weight, transition_weight, Regularization)
        grads = np.concatenate((grads_transition.reshape(-1), grads_emission.reshape(-1)))
    return loss, grads

init_w = np.zeros(((T+1)*(T+1)+T*V,))
result = fmin_l_bfgs_b(get_loss_grad, init_w, pgtol=0.01, callback=callbackF)

end = time.time()
print('{} seconds have passed'.format(end - start))

Loss:13.0610
Loss:8.0035
Loss:6.4260
Loss:5.0369
Loss:4.6725
Loss:4.6349
Loss:4.6184
Loss:4.6063
Loss:4.5948
Loss:4.5935
Loss:4.5933
Loss:4.5931
10.841186046600342 seconds have passed


### Begin again

In [188]:
def testTransition(epsilon, Xs, Ys, tag1, tag2, tag2index, word2index, emission_weight, transition_weight, param=0.1):
    transition_weight_copy = copy.deepcopy(transition_weight)
    transition_weight_copy[tag1, tag2] += epsilon
    old_loss = LossDatasetRegularization(Xs, Ys, tag2index, emission_weight, transition_weight, param)
    new_loss = LossDatasetRegularization(Xs, Ys, tag2index, emission_weight, transition_weight_copy, param)
    gradient = GradientTransitionAllFastRegularization(Xs, Ys, tag2index, word2index, emission_weight, transition_weight, param)
    print('Actual loss change: {}, Change according to gradient: {}'.format(new_loss - old_loss, gradient[tag1, tag2]*epsilon))

In [189]:
epsilon = 0.0001
testTransition(epsilon, train_X, train_Y, tag2index['B-per'], tag2index['I-per'], tag2index, word2index, emission_weight, transition_weight)

Transition: done with the 100th instances
Transition: done with the 200th instances
Transition: done with the 300th instances
Transition: done with the 400th instances
Transition: done with the 500th instances
Transition: done with the 600th instances
Transition: done with the 700th instances
Actual loss change: -0.00021317379105312284, Change according to gradient: -0.0002133992138216281


In [194]:
def testEmission(epsilon, Xs, Ys, tag, word, tag2index, word2index, emission_weight, transition_weight, param=0.1):
    emission_weight_copy = copy.deepcopy(emission_weight)
    emission_weight_copy[tag, word] += epsilon
    old_loss = LossDatasetRegularization(Xs, Ys, tag2index, emission_weight, transition_weight, param)
    new_loss = LossDatasetRegularization(Xs, Ys, tag2index, emission_weight_copy, transition_weight, param)
    gradient = GradientEmissionAllFastRegularization(Xs, Ys, tag2index, word2index, emission_weight, transition_weight, param)
    print('Actual loss change: {}, Change according to gradient: {}'.format(new_loss - old_loss, gradient[tag, word]*epsilon))

In [195]:
epsilon = 0.0001
testEmission(epsilon, train_X, train_Y, tag2index['O'], word2index['the'], tag2index, word2index, emission_weight, transition_weight)

Emission: done with the 100th instances
Emission: done with the 200th instances
Emission: done with the 300th instances
Emission: done with the 400th instances
Emission: done with the 500th instances
Emission: done with the 600th instances
Emission: done with the 700th instances
Actual loss change: -2.1004194422857836e-05, Change according to gradient: -2.1028369081452903e-05


In [196]:
epsilon = 0.0001
testTransition(epsilon, train_X, train_Y, tag2index['B-per'], tag2index['I-per'], tag2index, word2index, np.zeros((len(tag2index), len(word2index))), np.zeros((len(tag2index)+1, len(tag2index)+1)))

Transition: done with the 100th instances
Transition: done with the 200th instances
Transition: done with the 300th instances
Transition: done with the 400th instances
Transition: done with the 500th instances
Transition: done with the 600th instances
Transition: done with the 700th instances
Actual loss change: -0.015170809550909325, Change according to gradient: -0.015171093750000001


In [228]:
import pandas as pd
import csv

glove_data_file = '../Embeddings/glove.6B/glove.6B.50d.txt'
words = pd.read_table(glove_data_file, sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

In [241]:
def vec(w):
    return words.loc[w].to_numpy() if w in words.index else np.zeros(50,)