In [1]:
import numpy as np

# Part 1

In [2]:
def read_train_file(filename):
    with open(filename, encoding='utf-8') as f:
        file_content = f.read()

    # Split the entire file into sentences. Output: List of sentences
    sentences = file_content.strip().split('\n\n')

    # Split each sentence into their token_tag pair
    # Output: List of sentences. Each sentence is a list of token_tag_pair
    token_tag_pairs = [i.split('\n') for i in sentences]

    # Separate each token_tag_pair into a list of [token, tag].
    # Output: [[[token, tag], [token, tag], ...], [[token, tag], [token, tag], ...], ...]
    for idx, sentence in enumerate(token_tag_pairs):
        token_tags = [i.rsplit(' ', maxsplit=1) for i in sentence]
        token_tag_pairs[idx] = token_tags

    return token_tag_pairs

def read_dev_in_file(filename):
    with open(filename, encoding='utf-8') as f:
        file_content = f.read()

    # Split the entire file into sentences. Output: List of sentences
    sentences = file_co
    ntent.strip().split('\n\n')

    # Split each sentence into their tokens
    # Output: List of sentences. Each sentence is a list of tokens
    tokens = [i.split('\n') for i in sentences]

    return tokens

In [3]:
train_dataset_path = './dataset/train'
dev_dataset_path = './dataset/dev.in'

train_dataset = read_train_file(train_dataset_path)
dev_dataset = read_dev_in_file(dev_dataset_path)

In [4]:
def get_possible_states(train_dataset):
    possible_states = []

    for sentence in train_dataset:
        for token, tag in sentence:
            if tag not in possible_states:
                possible_states.append(tag)

    return possible_states

possible_states = get_possible_states(train_dataset)

In [5]:
def get_feature_counts(train_dataset):
    emission_count = {}
    transition_count = {}
    state_count = {}
    

    transition_count['START'] = {}
    for sentence in train_dataset:
        prev_state = None
        
        for token, tag in sentence:

            if emission_count.get(token) == None:
                emission_count[token] = {}
            emission_count[token][tag] = emission_count[token].get(tag, 0) + 1

            if prev_state != None:
                if transition_count.get(prev_state) == None:
                    transition_count[prev_state] = {}
                transition_count[prev_state][tag] = transition_count[prev_state].get(tag, 0) + 1

            else:
                transition_count['START'][tag] = transition_count['START'].get(tag, 0) + 1
                state_count['START'] = state_count.get('START', 0) + 1

            state_count[tag] = state_count.get(tag, 0) + 1
            prev_state = tag

        transition_count[prev_state]['STOP'] = transition_count[prev_state].get('STOP', 0) + 1

    return emission_count, transition_count, state_count

e_count, t_count, state_count = get_feature_counts(train_dataset)

In [6]:
def get_feature_dictionary(e_count, t_count, state_count):
    f = {}
    for token, tags in e_count.items():
        for tag, e_count in tags.items():
            key = "emission: " + tag + '+' + token
            e_prob = np.log(e_count/state_count[tag])
            f[key] = e_prob

    for prev_tag, next_tags in t_count.items():
        for next_tag, t_count in next_tags.items():
            key = "transition: " + prev_tag + '+' + next_tag
            t_prob = np.log(t_count/state_count[prev_tag])
            f[key] = t_prob

    return f

f = get_feature_dictionary(e_count, t_count, state_count)

# Part 2

In [7]:
'''
calculate_score(x,y):
Helps to calulate the score for a given pair of input and output sequence pair (x,y)
Based on 2 features, emission and transition

Parameters:
x: List of tokens, e.g. x = x1, x2, ..., xn         Type: list[str]
y: List of tags, e.g. y = y1, y2, ..., yn           Type: list[str]
f: Dictionary of feature weights                    Type: Dict{features: weights}
'''

def calculate_score(x,y,f):
    assert len(x) == len(y)

    feature_count = {}

    prev_tag = 'START'
    score = 0

    length = len(x)
    for i in range(length):
        e_key = "emission: " + y[i] + '+' + x[i]
        t_key = "transition: " + prev_tag + '+' + y[i]

        if e_key in f.keys():
            feature_count[e_key] = feature_count.get(e_key, 0) + 1

        if t_key in f.keys():
            feature_count[t_key] = feature_count.get(t_key, 0) + 1

        prev_tag = y[i]
        
    t_key = "transition: " + prev_tag + '+' + 'STOP'
    if t_key in f.keys():
        feature_count[t_key] = feature_count.get(t_key, 0) + 1

    for feature, count in feature_count.items():
        score += f[feature] * count

    return score

In [8]:
def viterbi(x, possible_states, f, default_index=0):
    n = len(x)
    d = len(possible_states)
    scores = np.full((n, d), -np.inf)
    bp = np.full((n, d), default_index, dtype=np.int32)

    for i in range(len(possible_states)):
        t_key = "transition: START"+possible_states[i]
        e_key = "emission: "+possible_states[i]+"+"+x[0]
        t_prob = f.get(t_key, -2**31)
        e_prob = f.get(e_key, -2**31)
        scores[0, i] = t_prob + e_prob
    
    for i in range(1, n):
        for k in range(len(possible_states)):
            for j in range(len(possible_states)):
                t_key = "transition: "+possible_states[k]+"+"+possible_states[j]
                e_key = "emission: "+possible_states[j]+"+"+x[i]
                t_prob = f.get(t_key, -2**31)
                e_prob = f.get(e_key, -2**31)
                overall_score = e_prob + t_prob + scores[i-1, k]
                if overall_score > scores[i, j]:
                    scores[i, j] = overall_score
                    bp[i,j] = k
    
    highest_score = -2**31
    highest_bp = default_index
    for i in range(len(possible_states)):
        t_key = "transition: "+possible_states[i]+"+STOP"
        t_prob = f.get(t_key, -2**31)
        overall_score = t_prob + scores[n-1, i]
        
        if overall_score > highest_score:
            highest_score = overall_score
            highest_bp = i
    
    result = [possible_states[highest_bp]]
    prev_bp = highest_bp
    for i in range(n-1, 0, -1):
        prev_bp = bp[i, prev_bp]
        output = possible_states[prev_bp]
        result = [output] + result
    
    return result

In [9]:
def decode(path, states, f, output_filename):
    default_index = states.index('O')
    sentences = list()

    with open(path) as file:
        lines = file.readlines()
        sentence = list()
        for line in lines:
            formatted_line = line.strip()   
            
            if(len(formatted_line) ==0):
                sentences.append(sentence)
                sentence = []
                continue
            sentence.append(formatted_line)

    with open(output_filename, "w") as wf:
        for sentence in sentences:
            pred_sentence = viterbi(sentence, states, f, default_index)        
            for i in range(len(sentence)):
                wf.write(sentence[i] + " " + pred_sentence[i] + "\n")
                
            wf.write("\n")

In [10]:
decode("dataset/dev.in", possible_states, f, 'dataset/dev.p2.out')

In [11]:
from conlleval import evaluate, evaluate_conll_file

def get_tags(pred,gold):
    f_pred = open(pred,encoding = 'utf-8')
    f_gold = open(gold,encoding = 'utf-8')
    data_pred = f_pred.readlines()
    data_gold = f_gold.readlines()
    gold_tags = list()
    pred_tags = list()
    
    for sentence in range(len(data_gold)):
        words_pred = data_pred[sentence].strip().split(' ')
        words_gold = data_gold[sentence].strip().split(' ')  
        if len(words_gold)==1:
            continue
        # Write original word and predicted tags
        gold_tags.append(words_gold[1])
        pred_tags.append(words_pred[1])
        # End of sentence, write newline
    return gold_tags,pred_tags


g_tags, p_tags = get_tags('dataset/dev.p2.out', 'dataset/dev.out')
print(evaluate(g_tags,p_tags,verbose=True))

processed 3809 tokens with 210 phrases; found: 168 phrases; correct: 66.
accuracy:  28.62%; (non-O)
accuracy:  92.57%; precision:  39.29%; recall:  31.43%; FB1:  34.92
         negative: precision:  32.26%; recall:  15.38%; FB1:  20.83  31
          neutral: precision:   7.14%; recall:  12.50%; FB1:   9.09  14
         positive: precision:  44.72%; recall:  40.15%; FB1:  42.31  123
(39.285714285714285, 31.428571428571427, 34.920634920634924)


# Part 3 (i)

In [12]:
def forward_algorithm(x, f, possible_states):
    forward_scores = np.zeros((len(x), len(possible_states)))
    threshold = 700

    for i in range(len(possible_states)):
        t_key = "transition: START+"+possible_states[i]
        t_prob = f.get(t_key, -2**31)
        e_key = "emission: "+possible_states[i]+"+"+x[0]
        e_prob = f.get(e_key, -2**31)
        forward_scores[0, i] = t_prob + e_prob
    
    for i in range(1, len(x)):
        for k in range(len(possible_states)):
            temp_score = 0
            for j in range(len(possible_states)):
                t_key = "transition: "+possible_states[j]+"+"+possible_states[k]
                t_prob = f.get(t_key, -2**31)
                e_key = "emission: "+possible_states[k]+"+"+x[i]
                e_prob = f.get(e_key, -2**31)
                score = e_prob + t_prob + forward_scores[i-1,j]
                if score > threshold:
                    score = threshold
                temp_score += np.exp(score)
                
            forward_scores[i, k] = np.log(temp_score) if temp_score else -2**31

    forward_prob = 0
    for j in range(len(possible_states)):
        t_key = "transition: "+possible_states[j]+"+STOP"
        t_prob = f.get(t_key, -2**31)
        score = t_prob + forward_scores[len(x)-1,j]
        if score > threshold:
            score = threshold
        overall_score = np.exp(score)
        forward_prob += overall_score
    if forward_prob > 0:
        alpha = np.log(forward_prob)
    else:
        alpha = threshold
        
    return forward_scores, alpha

In [13]:
def CRF_loss(train_dataset,f,possible_states):
    temp_loss = 0
    for sentence in train_dataset:
        x = [token_tag_pair[0] for token_tag_pair in sentence]
        y = [token_tag_pair[1] for token_tag_pair in sentence]
        _, alpha = forward_algorithm(x, f, possible_states)
        temp_loss += calculate_score(x,y,f) - alpha

    crf_loss = -temp_loss
    return crf_loss

In [14]:
CRF_loss(train_dataset,f,possible_states)

2050.7405338353615

# Part 3 (ii)

In [15]:
def backward_algorithm(x, f, possible_states):
    
    backward_scores = np.zeros((len(x), len(possible_states)))
    threshold = 700
    
    for i in range(len(possible_states)):
        t_key = "transition: "+ possible_states[i] +"+STOP"
        t_prob = f.get(t_key, -2**31)
        backward_scores[len(x)-1, i] = t_prob

    for i in range(len(x)-1,0,-1):
        for j  in range(len(possible_states)):
            temp_score = 0 
            for k in range(len(possible_states)):
                t_key = "transition: " + possible_states[j] + "+" + possible_states[k]
                e_key = "emission: "+ possible_states[k] + "+" + x[i] 
                t_prob = f.get(t_key, -2**31)
                e_prob = f.get(e_key, -2**31)
                temp_score += np.exp(min(e_prob + t_prob + backward_scores[i, k], threshold))
            
            if temp_score!=0:
                backward_scores[i-1, j] = np.log(temp_score)
            else:
                backward_scores[i-1, j] = -2**31
    
    backward_prob = 0
    for i in range(len(possible_states)):
        t_key = "transition: " + "START" + "+" + possible_states[i]
        e_key = "emission: " + possible_states[i] + "+" + x[0]
        t_prob = f.get(t_key, -2**31)
        e_prob = f.get(e_key, -2**31)
        overall_score = np.exp(min(e_prob + t_prob + backward_scores[0, i], threshold))
        backward_prob += overall_score
        
    if backward_prob!=0:
        beta = np.log(backward_prob)
    else:
        beta = -threshold    

    return backward_scores, beta


def forward_backward(x, f, possible_states):
    
    threshold = 700
    
    forward_scores, alpha = forward_algorithm(x, f, possible_states)
    forward_prob = np.exp(min(alpha, threshold))
    backward_scores, beta = backward_algorithm(x, f, possible_states)
    backward_prob = np.exp(min(beta, threshold))
    feature_expected_counts = {}

    for i in range(len(x)):
        for j in range(len(possible_states)):
            e_key = "emission: " + possible_states[j] + "+" + x[i]
            feature_expected_counts[e_key] = feature_expected_counts.get(e_key, 0.0) + np.exp(min(forward_scores[i, j] + backward_scores[i, j] - alpha, threshold))
    
    for i in range(len(possible_states)):
        start_t_key =  "transition: " + "START" + "+" + possible_states[i]
        feature_expected_counts[start_t_key] = feature_expected_counts.get(start_t_key, 0.0) + np.exp(min(forward_scores[0, i] + backward_scores[0, i] - alpha, threshold))
        stop_t_key =  "transition: " + possible_states[i] + "+"  + "STOP"
        feature_expected_counts[stop_t_key] = feature_expected_counts.get(stop_t_key, 0.0) + np.exp(min(forward_scores[len(x)-1, i] + backward_scores[len(x)-1, i] - alpha, threshold))
    
    for i in range(len(possible_states)):
        for j in range(len(possible_states)):
            t_key =  "transition: " + possible_states[i] + "+"  + possible_states[j]
            t_prob = f.get(t_key, -2**31) 
            total = 0
            for k in range(len(x)-1):
                e_key =  "emission: " + possible_states[j] + "+"  + x[k+1]
                e_prob = f.get(e_key, -2**31)

                total += np.exp(min(forward_scores[k, i] + backward_scores[k+1, j] + t_prob + e_prob - alpha, threshold))

            feature_expected_counts[t_key] = total
    
    return feature_expected_counts

In [16]:
def get_feature_count(x, y, feature_dict):
    n = len(x)
    feature_count = {}
    
    for i in range(n):
        formatted_word = x[i]
        emission_key = "emission: "+ y[i] + "+" + formatted_word
        feature_count[emission_key] = feature_count.get(emission_key, 0) + 1
    
    updated_y = ["START"] + y + ["STOP"]
    for i in range(1, n+2):
        prev_y = updated_y[i-1]
        y_i = updated_y[i]
        transition_key = "transition: " + prev_y + "+" + y_i
        feature_count[transition_key] = feature_count.get(transition_key, 0) + 1
    
    return feature_count

In [17]:
def compute_gradients(train_dataset, f, possible_states):
    feature_gradients = {}
    for sentence in train_dataset:
        x = [token_tag_pair[0] for token_tag_pair in sentence]
        y = [token_tag_pair[1] for token_tag_pair in sentence]
        feature_expected_counts = forward_backward(x, f, possible_states)
        actual_counts = get_feature_count(x, y, f)
        
        for k, v in feature_expected_counts.items():
            feature_gradients[k] = feature_gradients.get(k, 0) + v
            
        for k, v in actual_counts.items():
            feature_gradients[k] = feature_gradients.get(k, 0) - v

    return feature_gradients

In [18]:
feature_key_checks = ['emission: O+the', 'transition: START+O', 'transition: O+O']
feature_gradients = compute_gradients(train_dataset, f, possible_states)
loss1 = CRF_loss(train_dataset, f, possible_states)
delta = 1e-6

for feature_key in feature_key_checks:
    print("Running", feature_key)
    new_f = f.copy()
    new_f[feature_key] += delta

    loss2 = CRF_loss(train_dataset, new_f, possible_states)

    numerical_gradient = (loss2 - loss1) / delta
    analytical_gradient = feature_gradients[feature_key]
    print(numerical_gradient, analytical_gradient)
    
    # SANITY CHECK
    assert(abs(numerical_gradient - analytical_gradient) / max(abs(numerical_gradient), 1e-8) <= 1e-3)

Running emission: O+the
-0.3243840183131397 -0.3243869287425456
Running transition: START+O
-0.9774762474989984 -0.97748380541465
Running transition: O+O
-240.81111541818245 -240.81194339196077


# Part 4

In [19]:
from scipy.optimize import fmin_l_bfgs_b

def compute_gradients_with_reg(train_dataset, f, possible_states, eta = 0):
    feature_gradients = {}
    # for i in range(len(train_labels)):
    #     x = train_inputs[i]
    #     y = train_labels[i]
    for sentence in train_dataset:
        x = [token_tag_pair[0] for token_tag_pair in sentence]
        y = [token_tag_pair[1] for token_tag_pair in sentence]
        feature_expected_counts = forward_backward(x, f, possible_states)
        actual_counts = get_feature_count(x, y, f)
        
        for k, v in feature_expected_counts.items():
            feature_gradients[k] = feature_gradients.get(k, 0) + v
            
        for k, v in actual_counts.items():
            feature_gradients[k] = feature_gradients.get(k, 0) - v

    for k, v in f.items():
        feature_gradients[k] =feature_gradients.get(k,0) + 2*eta*f[k]

    return feature_gradients

def compute_crf_loss_with_reg(train_dataset, f, possible_states, eta=0):
    loss = 0

    for sentence in train_dataset:
        x = [token_tag_pair[0] for token_tag_pair in sentence]
        y = [token_tag_pair[1] for token_tag_pair in sentence]
    # for i in range(len(input_sequences)):
        first_term = calculate_score(x, y, f)
        _, alpha = forward_algorithm(x, f, possible_states)
        loss += (first_term - alpha) * -1

    reg_loss = 0
    for f_key in f:
        reg_loss += f[f_key]**2
    reg_loss = eta * reg_loss
    loss += reg_loss
    return loss

def callbackF(w):
    '''
    This function will be called by "fmin_l_bfgs_b"
    Arg:
        w: weights, numpy array
    '''
    loss = compute_crf_loss_with_reg(train_dataset,f,possible_states,0.1) 
    print('Loss:{0:.4f}'.format(loss))

def get_loss_grad(w, *args): 
    '''
    This function will be called by "fmin_l_bfgs_b"
    Arg:
        w: weights, numpy array
    Returns:
        loss: loss, float
        grads: gradients, numpy array
    '''
    train_dataset, f, possible_states = args
    for i,k in enumerate(f.keys()):
        f[k] = w[i]
    
    loss = compute_crf_loss_with_reg(train_dataset,f,possible_states,0.1)
    grads = compute_gradients_with_reg(train_dataset,f,possible_states,0.1)
    np_grads = np.zeros(len(f))
    for i,k in enumerate(f.keys()):
        np_grads[i] = grads[k]
    grads = np_grads
    return loss, grads

init_w = np.zeros(len(f))
result = fmin_l_bfgs_b(get_loss_grad, init_w, args=(train_dataset,f,possible_states), pgtol=0.01, callback=callbackF)

Loss:3546.3917
Loss:3200.1376
Loss:3123.8846
Loss:2983.3788
Loss:2848.3104
Loss:2644.9568
Loss:2479.1922
Loss:2428.1791
Loss:2280.4901
Loss:2181.1284
Loss:2131.0115
Loss:2086.8457
Loss:2055.4427
Loss:2021.9711
Loss:2007.9220
Loss:1976.4204
Loss:1957.3419
Loss:1946.0982
Loss:1931.1227
Loss:1927.8104
Loss:1919.8086
Loss:1914.1922
Loss:1904.4882
Loss:1894.4842
Loss:1881.5888
Loss:1875.7357
Loss:1865.2601
Loss:1861.4945
Loss:1858.5552
Loss:1854.2032
Loss:1850.8877
Loss:1847.0880
Loss:1845.0712
Loss:1843.1217
Loss:1838.7683
Loss:1837.6898
Loss:1836.2430
Loss:1835.4419
Loss:1834.6623
Loss:1833.8303
Loss:1830.6794
Loss:1829.8910
Loss:1829.3293
Loss:1828.4382
Loss:1827.8615
Loss:1827.0591
Loss:1826.6538
Loss:1825.9795
Loss:1825.7165
Loss:1825.0844
Loss:1824.4885
Loss:1823.5524
Loss:1822.8798
Loss:1822.6207
Loss:1822.1026
Loss:1821.9470
Loss:1821.8232
Loss:1821.4850
Loss:1821.2550
Loss:1820.8130
Loss:1820.6290
Loss:1820.4848
Loss:1820.2341
Loss:1820.1481
Loss:1820.0869
Loss:1820.0153
Loss:1819.

In [20]:
weight, loss, dictionary = result

for idx, key in enumerate(f.keys()):
    f[key] = weight[idx]

decode("dataset/dev.in", possible_states, f, 'dataset/dev.p4.out')
g_tags, p_tags = eval('dataset/dev.p4.out', 'dataset/dev.out')
print(evaluate(g_tags,p_tags,verbose=True))

processed 3809 tokens with 210 phrases; found: 229 phrases; correct: 79.
accuracy:  35.02%; (non-O)
accuracy:  90.39%; precision:  34.50%; recall:  37.62%; FB1:  35.99
         negative: precision:  29.27%; recall:  18.46%; FB1:  22.64  41
          neutral: precision:   0.00%; recall:   0.00%; FB1:   0.00  27
         positive: precision:  41.61%; recall:  48.91%; FB1:  44.97  161
(34.49781659388647, 37.61904761904762, 35.99088838268793)


# Part 5

In [21]:
def get_feature_count_p5(train_dataset):
    feature_counts = {}
    emission_count = {}
    transition_count = {}
    uni_count = {}
    bi_count = {}
    state_count = {}
    start_state = "START"
    stop_state = "STOP"

    for sentence in train_dataset:
        x = [token_tag_pair[0] for token_tag_pair in sentence]
        y = [token_tag_pair[1] for token_tag_pair in sentence]

        n = len(x)

        # START state
        if uni_count.get(start_state) == None:
            uni_count[start_state] = {}
        uni_count[start_state][x[0]] = uni_count[start_state].get(x[0], 0) + 1

        state_count[start_state] = state_count.get(start_state, 0) + 1

        # STOP state
        if transition_count.get(y[n-1]) == None:
            transition_count[y[n-1]] = {}
        transition_count[y[n-1]][stop_state] = transition_count[y[n-1]].get(stop_state, 0) + 1
        
        if uni_count.get(stop_state) == None:
            uni_count[stop_state] = {}
        uni_count[stop_state][x[n-1]] = uni_count[stop_state].get(x[n-1], 0) + 1

        state_count[stop_state] = state_count.get(stop_state, 0) + 1

        for i in range(n):

            # First word
            if i == 0:
                if emission_count.get(y[i]) == None:
                    emission_count[y[i]] = {}
                emission_count[y[i]][x[i]] = emission_count[y[i]].get(x[i], 0) + 1
                        
                if n>1:
                    if uni_count.get(y[i]) == None:
                        uni_count[y[i]] = {}
                    uni_count[y[i]][x[i+1]] = uni_count[y[i]].get(x[i+1], 0) + 1

                if transition_count.get(start_state) == None:
                    transition_count[start_state] = {}
                transition_count[start_state][y[i]] = transition_count[start_state].get(y[i], 0) + 1

                if bi_count.get((start_state, y[i])) == None:
                    bi_count[(start_state, y[i])] = {}
                bi_count[(start_state, y[i])][x[i]] = bi_count[(start_state, y[i])].get(x[i], 0) + 1

                state_count[(start_state, y[i])] = state_count.get((start_state, y[i]), 0) + 1                

            # Last word
            elif i == n-1:
                if emission_count.get(y[i]) == None:
                    emission_count[y[i]] = {}
                emission_count[y[i]][x[i]] = emission_count[y[i]].get(x[i], 0) + 1

                if uni_count.get(y[i]) == None:
                    uni_count[y[i]] = {}
                uni_count[y[i]][x[i-1]] = uni_count[y[i]].get(x[i-1], 0) + 1

                if transition_count.get(y[i-1]) == None:
                    transition_count[y[i-1]] = {}
                transition_count[y[i-1]][y[i]] = transition_count[y[i-1]].get(y[i], 0) + 1

                if bi_count.get((y[i-1], y[i])) == None:
                    bi_count[(y[i-1], y[i])] = {}
                bi_count[(y[i-1], y[i])][x[i]] = bi_count[(y[i-1], y[i])].get(x[i], 0) + 1

                state_count[(y[i-1], y[i])] = state_count.get((y[i-1], y[i]), 0) + 1

            # Middle words
            else:
                if emission_count.get(y[i]) == None:
                    emission_count[y[i]] = {}
                emission_count[y[i]][x[i]] = emission_count[y[i]].get(x[i], 0) + 1

                if uni_count.get(y[i]) == None:
                    uni_count[y[i]] = {}
                uni_count[y[i]][x[i-1]] = uni_count[y[i]].get(x[i-1], 0) + 1

                if uni_count.get(y[i]) == None:
                    uni_count[y[i]] = {}
                uni_count[y[i]][x[i+1]] = uni_count[y[i]].get(x[i+1], 0) + 1

                if transition_count.get(y[i-1]) == None:
                    transition_count[y[i-1]] = {}
                transition_count[y[i-1]][y[i]] = transition_count[y[i-1]].get(y[i], 0) + 1

                if bi_count.get((y[i-1], y[i])) == None:
                    bi_count[(y[i-1], y[i])] = {}
                bi_count[(y[i-1], y[i])][x[i]] = bi_count[(y[i-1], y[i])].get(x[i], 0) + 1

                state_count[(y[i-1], y[i])] = state_count.get((y[i-1], y[i]), 0) + 1

            state_count[y[i]] = state_count.get(y[i], 0) + 1

        state_count[(y[n-1], stop_state)] = state_count.get((y[n-1], stop_state), 0) + 1

    return emission_count, transition_count, uni_count, bi_count, state_count

In [22]:
def get_feature_dict_p5(emission_count, transition_count, uni_count, bi_count, state_count):
    f = {}
    for tag, tokens in emission_count.items():
        for token, e_count in tokens.items():
            key = "emission: " + tag + '+' + token
            e_prob = np.log(e_count/state_count[tag])
            f[key] = e_prob

    for prev_tag, next_tags in transition_count.items():
        for next_tag, t_count in next_tags.items():
            key = "transition: " + prev_tag + '+' + next_tag
            t_prob = np.log(t_count/state_count[prev_tag])
            f[key] = t_prob

    for tag, tokens in uni_count.items():
        for token, u_count in tokens.items():
            key = "unigram: " + tag + '+' + token
            u_prob = np.log(u_count/state_count[tag])
            f[key] = u_prob

    for tag, tokens in bi_count.items():
        for token, b_count in tokens.items():
            key = "bigram: " + tag[0] + '+' + tag[1] + '+' + token
            b_prob = np.log(b_count/state_count[tag])
            f[key] = b_prob

    return f

In [23]:
emission_count, transition_count, uni_count, bi_count, state_count = get_feature_count_p5(train_dataset)
f_p5 = get_feature_dict_p5(emission_count, transition_count, uni_count, bi_count, state_count)

In [24]:
def viterbi_p5(x, possible_states, f, default_index=0):
    n = len(x)
    d = len(possible_states)
    scores = np.full((n, d), -np.inf)
    bp = np.full((n, d), default_index, dtype=np.int32)

    for i in range(len(possible_states)):
        t_key = "transition: START"+possible_states[i]
        e_key = "emission: "+possible_states[i]+"+"+x[0]
        b_key = "bigram: START"+possible_states[i]+"+"+x[0]

        t_prob = f.get(t_key, -2**31)
        e_prob = f.get(e_key, -2**31)
        b_prob = f.get(b_key, -2**31)
        
        if n > 1:
            u_key = "unigram: "+possible_states[i]+"+"+x[1]
            u_prob = f.get(u_key, -2**31)
            scores[0, i] = t_prob + e_prob + u_prob + b_prob

        else:
            scores[0, i] = t_prob + e_prob + b_prob
    
    for i in range(1, n):
        for k in range(len(possible_states)):
            for j in range(len(possible_states)):
                t_key = "transition: "+possible_states[k]+"+"+possible_states[j]
                e_key = "emission: "+possible_states[j]+"+"+x[i]
                u_key1 = "unigram: "+possible_states[j]+"+"+x[i-1]
                b_key = "bigram: "+possible_states[k]+possible_states[j]+"+"+x[i]

                t_prob = f.get(t_key, -2**31)
                e_prob = f.get(e_key, -2**31)
                u_prob1 = f.get(u_key1, -2**31)
                b_prob = f.get(b_key, -2**31)

                if i != n-1:
                    u_key2 = "unigram: "+possible_states[j]+"+"+x[i+1]
                    u_prob2 = f.get(u_key2, -2**31)
                    overall_score = e_prob + t_prob + + u_prob1 + u_prob2 + b_prob + scores[i-1, k]

                else:
                    overall_score = e_prob + t_prob + + u_prob1 + b_prob + scores[i-1, k]

                if overall_score > scores[i, j]:
                    scores[i, j] = overall_score
                    bp[i,j] = k
    
    highest_score = -2**31
    highest_bp = default_index
    for i in range(len(possible_states)):
        t_key = "transition: "+possible_states[i]+"+STOP"
        t_prob = f.get(t_key, -2**31)

        overall_score = t_prob + scores[n-1, i]
        
        if overall_score > highest_score:
            highest_score = overall_score
            highest_bp = i
    
    result = [possible_states[highest_bp]]
    prev_bp = highest_bp
    for i in range(n-1, 0, -1):
        prev_bp = bp[i, prev_bp]
        output = possible_states[prev_bp]
        result = [output] + result
    
    return result

In [25]:
def decode_p5(path, states, f, output_filename):
    default_index = states.index('O')
    sentences = list()

    with open(path) as file:
        lines = file.readlines()
        sentence = list()
        for line in lines:
            formatted_line = line.strip()   
            
            if(len(formatted_line) ==0):
                sentences.append(sentence)
                sentence = []
                continue
            sentence.append(formatted_line)

    with open(output_filename, "w") as wf:
        for sentence in sentences:
            pred_sentence = viterbi_p5(sentence, states, f, default_index)        
            for i in range(len(sentence)):
                wf.write(sentence[i] + " " + pred_sentence[i] + "\n")
                
            wf.write("\n")

In [26]:
decode_p5("dataset/dev.in", possible_states, f_p5, 'dataset/dev.p5.out')
g_tags, p_tags = eval('dataset/dev.p5.out', 'dataset/dev.out')
print(evaluate(g_tags,p_tags,verbose=True))

processed 3809 tokens with 210 phrases; found: 218 phrases; correct: 100.
accuracy:  41.08%; (non-O)
accuracy:  92.94%; precision:  45.87%; recall:  47.62%; FB1:  46.73
         negative: precision:  35.71%; recall:  23.08%; FB1:  28.04  42
          neutral: precision:  16.67%; recall:  37.50%; FB1:  23.08  18
         positive: precision:  51.90%; recall:  59.85%; FB1:  55.59  158
(45.87155963302752, 47.61904761904761, 46.728971962616825)
