In [35]:
import math
import numpy as np
from collections import defaultdict
from scipy.optimize import fmin_l_bfgs_b
import time
from datetime import datetime
import pytz

In [36]:
# Directory holding the data files; the training file is read from DATA_FILE + '/train'.
DATA_FILE = "dataset" # dir where your train/dev.in is stored
# Directory passed to get_feature_dict as the destination of the emission/transition files.
FEATURE_OUTPUT_DIRECTORY = "features"
# NOTE(review): LEARN_CO is not referenced anywhere in the visible code —
# presumably a learning/regularisation coefficient for a later cell; confirm.
LEARN_CO = 0.1

In [37]:
# Rewrite test.in as test2.in with every pair of consecutive newlines collapsed
# into a single newline. get_prediction later reads test2.in and splits
# sentences on the blank lines that remain.
# NOTE(review): `re` is imported here but never used in the visible code.
import re
with open('./dataset/test.in', 'r', encoding='utf-8') as a:
    sents = a.read()
    # str.replace substitutes non-overlapping matches from left to right, so a
    # run of three newlines becomes two (one blank line survives) while a run
    # of exactly two becomes one (that blank line disappears).
    sents = sents.replace("\n\n", "\n")
with open('./dataset/test2.in', 'w', encoding='utf-8') as test2:
    test2.write(sents)

In [38]:
def read_train_file(directory):
    """Read a tagged training file into parallel word / tag sentence lists.

    Each non-blank line is split on whitespace; the first field is taken as
    the word and the second as its tag. A blank (or whitespace-only) line ends
    the current sentence. A final sentence that is not followed by a blank
    line is still returned.

    Args:
        directory: Path of the training file to read (a file path, despite
            the parameter name). The file is decoded as UTF-8, like the other
            dataset files in this notebook.

    Returns:
        A tuple ``(x_train, y_train, all_tags, all_words)`` where ``x_train``
        and ``y_train`` are lists of sentences (lists of words / tags), and
        ``all_tags`` / ``all_words`` are the distinct tags / words in order of
        first appearance.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    x_train = []
    y_train = []
    # Dicts are used as insertion-ordered sets: O(1) membership instead of the
    # O(n) list scan per token, while keeping first-seen order.
    seen_tags = {}
    seen_words = {}

    with open(directory, encoding='utf-8') as f:
        x_sent = []
        y = []
        for line in f:
            fields = line.split()
            if not fields:  # end of a sentence
                x_train.append(x_sent)
                y_train.append(y)
                x_sent = []
                y = []
                continue

            word, tag = fields[0], fields[1]
            x_sent.append(word)
            y.append(tag)
            seen_tags.setdefault(tag, None)
            seen_words.setdefault(word, None)

        # Flush the last sentence when the file does not end with a blank line.
        if x_sent:
            x_train.append(x_sent)
            y_train.append(y)

    return x_train, y_train, list(seen_tags), list(seen_words)

In [39]:
# Load the training sentences plus the tag / word vocabularies.
# ALL_TAGS and ALL_WORDS are read as globals by get_feature_dict.
x_train, y_train, ALL_TAGS,  ALL_WORDS = read_train_file(DATA_FILE + '/train')

In [40]:
def write_feature(feat, output_file):
    """Write a feature dictionary to a text file, one ``key value`` per line.

    Args:
        feat: Mapping of feature name to score.
        output_file: Path of the file to (over)write.

    The file is written as UTF-8, matching how the dataset files are read in
    this notebook, so feature names containing non-ASCII words do not depend
    on the platform's default encoding.
    """
    with open(output_file, "w", encoding="utf-8") as out:
        out.writelines(f"{k} {v}\n" for k, v in feat.items())

In [41]:
def get_feature_dict(x_train, y_train, output_dir):
    """Estimate log emission / transition scores and save them to disk.

    Reads the module-level ``ALL_TAGS`` and ``ALL_WORDS`` vocabularies and
    calls ``write_feature`` to save the estimates.

    Args:
        x_train: List of sentences, each a list of words.
        y_train: List of tag sequences parallel to ``x_train``.
        output_dir: Directory that receives ``emission_P1.txt`` and
            ``transition_P1.txt``.

    Returns:
        ``(features, emission)``. ``features`` maps every
        ``"transition:<a>+<b>"`` (tags plus START/STOP) and every
        ``"emission:<tag>+<word>"`` key to a score: the log relative frequency
        when the pair was observed, otherwise ``-2**21``. ``emission`` is the
        ``defaultdict`` holding only the observed emission log-probabilities.

    Empty sentences are skipped when counting transitions; previously they
    raised ``IndexError`` on ``y_train[i][0]``.
    """
    features = {}
    tags = ALL_TAGS + ["START", "STOP"]

    # Seed every possible feature with a large negative score so that pairs
    # never seen in training are strongly penalised.
    for i in tags:
        for j in tags:
            features[f"transition:{i}+{j}"] = -2**21

    for j in ALL_TAGS:
        for i in ALL_WORDS:
            features[f"emission:{j}+{i}"] = -2**21

    label_dict = defaultdict(int)  # {LABEL: COUNT}
    word_label_dict = defaultdict(int)  # {(LABEL, WORD): COUNT}

    for i in range(len(x_train)):
        for j in range(len(x_train[i])):
            label_dict[y_train[i][j]] += 1
            word_label_dict[(y_train[i][j], x_train[i][j])] += 1

    # emission score = log( count(tag, word) / count(tag) )
    emission = defaultdict(int)
    for (tag, word), count in word_label_dict.items():
        emission[f"emission:{tag}+{word}"] = math.log(float(count) / label_dict[tag])

    # Transition counts: yi_dict counts each tag as a "from" state,
    # yj_dict counts each (from, to) pair, with START / STOP added per sentence.
    yi_dict = defaultdict(int)
    yj_dict = defaultdict(int)

    for i in range(len(x_train)):
        if not y_train[i]:
            # An empty sentence has no first/last tag to attach START/STOP to.
            continue

        yi_dict['START'] += 1
        yj_dict[('START', y_train[i][0])] += 1
        yj_dict[(y_train[i][-1], 'STOP')] += 1

        for j in range(len(x_train[i]) - 1):
            yi_dict[y_train[i][j]] += 1
            yj_dict[(y_train[i][j], y_train[i][j + 1])] += 1
        # The last tag is also a "from" state (it transitions to STOP).
        yi_dict[y_train[i][-1]] += 1

    # transition score = log( count(a, b) / count(a) )
    # No START -> STOP pair can be produced because empty sentences are skipped.
    transition = defaultdict(int)
    for (prev_tag, next_tag), count in yj_dict.items():
        transition[f"transition:{prev_tag}+{next_tag}"] = math.log(float(count) / yi_dict[prev_tag])

    write_feature(emission, output_dir + "/emission_P1.txt")  # save emission dictionary
    write_feature(transition, output_dir + "/transition_P1.txt")  # save transition dictionary

    # Observed estimates override the seeded penalty values.
    features.update(emission)
    features.update(transition)

    return features, emission

In [42]:
# Build the first-order feature scores. `emission` (observed emission
# log-probabilities) is later passed to get_bi_trans_dict / get_tri_trans_dict.
feature_dict, emission = get_feature_dict(x_train, y_train, FEATURE_OUTPUT_DIRECTORY)


In [43]:
def get_states(path='./dataset/train'):
    """Return the flat tag sequence of the training file with START/STOP markers.

    Every sentence contributes ``'START'``, its tags in order, then ``'STOP'``.
    A line is treated as a sentence boundary when splitting it on a single
    space yields one field (e.g. a blank line); otherwise the second field,
    stripped of trailing whitespace, is taken as the tag — the same column
    ``read_train_file`` uses.

    Args:
        path: Training file to read. Defaults to the previously hard-coded
            ``'./dataset/train'`` so existing calls are unchanged.

    Returns:
        List of state names. For a file that ends with a blank line the result
        is identical to the previous implementation. A file whose last
        sentence is not followed by a blank line now keeps its final tag and
        gets a closing ``'STOP'`` (previously the last tag was dropped).
    """
    states = ['START']
    with open(path, 'r', encoding='utf-8') as trainfile:
        for line in trainfile:
            fields = line.split(' ')
            if len(fields) == 1:  # blank line: close the sentence, open the next
                states.append('STOP')
                states.append('START')
            else:
                states.append(fields[1].rstrip())  # append the tag

    if states[-1] == 'START':
        # Dangling START opened after the last sentence (or an empty file).
        states.pop()
    else:
        # Last sentence had no terminating blank line: close it explicitly.
        states.append('STOP')
    return states

def get_bigram_count(states):
    """Count adjacent state pairs and individual states.

    Args:
        states: Flat sequence of state names, with 'START' / 'STOP' markers.

    Returns:
        ``(bigram_count, start_stop)``: ``bigram_count`` maps
        ``'transition:<a>+<b>'`` to the number of times ``b`` directly follows
        ``a`` (pairs whose second element is 'START' are not counted);
        ``start_stop`` maps each state to its number of occurrences.
    """
    pair_totals = {}
    for left, right in zip(states, states[1:]):
        if right != 'START':
            key = 'transition:' + left + '+' + right
            pair_totals[key] = pair_totals.get(key, 0) + 1

    state_totals = {}
    for state in states:
        state_totals[state] = state_totals.get(state, 0) + 1

    return pair_totals, state_totals

In [44]:
def get_trigram_count(states):
    """Count consecutive state triples.

    Args:
        states: Flat sequence of state names, with 'START' / 'STOP' markers.

    Returns:
        Dict mapping ``'transition:<a>+<b>+<c>'`` to its number of
        occurrences. Triples whose first element is 'STOP' or whose last
        element is 'START' are not counted.
    """
    triple_totals = {}
    for first, middle, last in zip(states, states[1:], states[2:]):
        if last != 'START' and first != 'STOP':
            key = f'transition:{first}+{middle}+{last}'
            triple_totals[key] = triple_totals.get(key, 0) + 1
    return triple_totals

def get_bi_trans(y, y1, bi_dict, label_dict):
    """Return the relative frequency of ``y1`` following ``y``.

    Args:
        y: Preceding state.
        y1: Following state.
        bi_dict: Counts keyed by ``'transition:<y>+<y1>'``.
        label_dict: Occurrence count of each state.

    Returns:
        ``count(y, y1) / count(y)``, or ``0.0`` when the pair has no entry in
        ``bi_dict``.
    """
    key = f'transition:{y}+{y1}'
    if key in bi_dict:
        return bi_dict[key] / label_dict[y]
    return 0.0

def get_bi_trans_dict(emission_dict, bigram_count, label_dict, states):
    """Add log bigram transition scores to ``emission_dict`` in place.

    For each adjacent pair in ``states`` (pairs ending in 'START' are
    skipped), stores ``log(get_bi_trans(...))`` under
    ``'transition:<a>+<b>'``.

    Args:
        emission_dict: Dictionary to extend; it is mutated and returned.
        bigram_count: Pair counts as produced by ``get_bigram_count``.
        label_dict: Occurrence count of each state.
        states: Flat sequence of state names.

    Returns:
        The same ``emission_dict`` object, now also holding transition keys.
    """
    for current, following in zip(states, states[1:]):
        if following != 'START':
            probability = get_bi_trans(current, following, bigram_count, label_dict)
            emission_dict['transition:' + current + '+' + following] = math.log(probability)
    return emission_dict

def get_tri_trans(y, y1, y2, trigram_dict, bigram_dict):
    """Return the relative frequency of ``y2`` following the pair ``(y, y1)``.

    Args:
        y: First state of the triple.
        y1: Second state of the triple.
        y2: Third state of the triple.
        trigram_dict: Counts keyed by ``'transition:<y>+<y1>+<y2>'``.
        bigram_dict: Counts keyed by ``'transition:<y>+<y1>'``.

    Returns:
        ``count(y, y1, y2) / count(y, y1)``, or ``0.0`` when the triple has no
        entry in ``trigram_dict``.
    """
    triple_key = f'transition:{y}+{y1}+{y2}'
    if triple_key in trigram_dict:
        pair_key = f"transition:{y}+{y1}"
        return trigram_dict[triple_key] / bigram_dict[pair_key]
    return 0.0

def get_tri_trans_dict(emission_dict, trigram, bigram, states):
    """Add log trigram transition scores to ``emission_dict`` in place.

    For each consecutive triple in ``states`` (triples starting with 'STOP'
    or ending in 'START' are skipped), stores ``log(get_tri_trans(...))``
    under ``'transition:<a>+<b>+<c>'``.

    Args:
        emission_dict: Dictionary to extend; it is mutated and returned.
        trigram: Triple counts as produced by ``get_trigram_count``.
        bigram: Pair counts as produced by ``get_bigram_count``.
        states: Flat sequence of state names.

    Returns:
        The same ``emission_dict`` object, now also holding trigram keys.
    """
    for first, middle, last in zip(states, states[1:], states[2:]):
        if last != 'START' and first != "STOP":
            probability = get_tri_trans(first, middle, last, trigram, bigram)
            emission_dict[f'transition:{first}+{middle}+{last}'] = math.log(probability)
    return emission_dict

In [45]:
# Flat START/tag/STOP sequence of the training file, and its pair / state counts.
data_states = get_states()
bigram_count, start_stop = get_bigram_count(data_states)
# Re-read the training file; this rebinds x_train / y_train and defines the
# lower-case all_tags / all_words used further down.
x_train, y_train, all_tags, all_words = read_train_file('./dataset/train')
# get_bi_trans_dict writes the bigram scores into `emission` and returns that
# same object, so bigram_dict and emission are one dictionary.
bigram_dict = get_bi_trans_dict(emission, bigram_count, start_stop, data_states)
trigram_count = get_trigram_count(data_states)
# Likewise trigram_dict is the same object again: after this line it holds the
# emission, bigram and trigram scores together.
trigram_dict = get_tri_trans_dict(emission,trigram_count , bigram_count, data_states)

In [46]:
def viterbi_p6(x, states, trigram_dict):
    """Decode the highest-scoring tag sequence under a second-order model.

    The score of a tag sequence ``y_0 .. y_{n-1}`` (with ``y_{-1} = START``
    and ``y_n = STOP``) is the sum, over positions, of

    * ``emission:<y_i>+<word_i>``
    * ``transition:<y_{i-1}>+<y_i>``            (bigram, including START / STOP)
    * ``transition:<y_{i-2}>+<y_{i-1}>+<y_i>``  (trigram, including START / STOP)

    looked up in ``trigram_dict``; any missing key contributes ``-10e8``.
    The maximisation is exact: dynamic programming runs over pairs of
    adjacent states so that every trigram is scored on the states actually
    chosen for the path.

    Changes from the previous implementation:

    * boundary keys use upper-case ``START`` / ``STOP``, matching the keys
      written by ``get_bi_trans_dict`` / ``get_tri_trans_dict`` (the old
      lower-case keys never matched, so boundary scores were ignored);
    * trigrams are tied to the decoded path instead of a free middle state;
    * a single missing-feature penalty is used throughout;
    * the word is extracted the same way at every position;
    * an empty ``x`` returns ``[]`` instead of raising ``IndexError``.

    Args:
        x: Observations of one sentence; the first whitespace-separated token
            of each entry is used as the word.
        states: Candidate tags (without START / STOP).
        trigram_dict: Mapping from feature key to log score.

    Returns:
        List of ``len(x)`` tags taken from ``states``.

    Raises:
        ValueError: If ``x`` is non-empty but ``states`` is empty.
    """
    missing = -10e8  # score of any feature absent from trigram_dict

    n_obs = len(x)
    if n_obs == 0:
        return []
    if len(states) == 0:
        raise ValueError("viterbi_p6 needs at least one candidate state")

    def feature(key):
        return trigram_dict.get(key, missing)

    def word_of(observation):
        tokens = observation.split()
        return tokens[0] if tokens else observation

    words = [word_of(obs) for obs in x]

    # Look every feature up once; the tables are indexed by state position.
    emit = np.array(
        [[feature(f"emission:{s}+{w}") for s in states] for w in words], dtype=float
    )  # (n_obs, S)
    bi = np.array(
        [[feature(f"transition:{a}+{b}") for b in states] for a in states], dtype=float
    )  # (S, S)
    bi_start = np.array([feature(f"transition:START+{s}") for s in states], dtype=float)
    bi_stop = np.array([feature(f"transition:{s}+STOP") for s in states], dtype=float)

    # Position 0 has no trigram ending on it: only emission and START bigram.
    first = emit[0] + bi_start  # (S,)

    if n_obs == 1:
        tri_single = np.array(
            [feature(f"transition:START+{s}+STOP") for s in states], dtype=float
        )
        final = first + bi_stop + tri_single
        return [states[int(np.argmax(final))]]

    tri = np.array(
        [[[feature(f"transition:{a}+{b}+{c}") for c in states] for b in states]
         for a in states],
        dtype=float,
    )  # (S, S, S)
    tri_start = np.array(
        [[feature(f"transition:START+{a}+{b}") for b in states] for a in states],
        dtype=float,
    )  # (S, S)
    tri_stop = np.array(
        [[feature(f"transition:{a}+{b}+STOP") for b in states] for a in states],
        dtype=float,
    )  # (S, S)

    # pair[a, b]: best score of a prefix ending with states (a, b).
    pair = first[:, None] + emit[1][None, :] + bi + tri_start
    n_states = len(states)
    # back[i][b, c]: best state at position i-2 given (b, c) at (i-1, i).
    back = np.zeros((n_obs, n_states, n_states), dtype=int)

    for i in range(2, n_obs):
        candidates = pair[:, :, None] + tri  # indexed [a, b, c]
        back[i] = candidates.argmax(axis=0)
        pair = candidates.max(axis=0) + bi + emit[i][None, :]

    final = pair + bi_stop[None, :] + tri_stop
    second_last, last = np.unravel_index(int(np.argmax(final)), final.shape)

    path = [0] * n_obs
    path[n_obs - 1] = int(last)
    path[n_obs - 2] = int(second_last)
    for i in range(n_obs - 1, 1, -1):
        path[i - 2] = int(back[i][path[i - 1], path[i]])

    return [states[k] for k in path]
# Candidate tags handed to the decoder. dict.fromkeys removes duplicates while
# keeping first-seen order; the previous list(set(...)) ordered the tags by
# string hash, which changes between interpreter runs and made tie-breaking in
# the decoder non-reproducible.
states = list(dict.fromkeys(all_tags))

def get_prediction(resulting_dict, input_path='./dataset/test2.in',
                   output_path='./features/test.p6.CRF.out'):
    """Tag every sentence of the test file and write ``word tag`` lines.

    Sentences are runs of non-blank lines separated by blank (or
    whitespace-only) lines. Each sentence is decoded with ``viterbi_p6`` using
    the module-level ``states`` and written as one ``word tag`` line per
    token followed by an empty line.

    Args:
        resulting_dict: Feature scores passed to ``viterbi_p6``.
        input_path: File to read; defaults to the previously hard-coded
            ``'./dataset/test2.in'``.
        output_path: File to (over)write; defaults to the previously
            hard-coded ``'./features/test.p6.CRF.out'``.

    Compared with the previous version, a final sentence without a trailing
    blank line is no longer dropped, and consecutive blank lines no longer
    produce an empty sentence (which made the decoder fail).
    """
    sequences = []
    sequence = []
    with open(input_path, 'r', encoding="utf-8") as test_set:
        for line in test_set:
            token = line.rstrip('\n')
            if token.strip():
                sequence.append(token)
            elif sequence:
                sequences.append(sequence)
                sequence = []
    # Flush the last sentence when the file does not end with a blank line.
    if sequence:
        sequences.append(sequence)

    # The with-block closes the file; no explicit close() is needed.
    with open(output_path, "w", encoding="utf-8") as out_file:
        for x in sequences:
            predicted = viterbi_p6(x, states, resulting_dict)
            for word, tag in zip(x, predicted):
                out_file.write(word + ' ' + tag + '\n')
            out_file.write('\n')
# Decode the test sentences with the combined emission / bigram / trigram
# scores held in trigram_dict and write the predictions file.
get_prediction(trigram_dict)