# 50.007 Machine Learning Project Part 4

____

## Model Choice: Structured Perceptron

Our improved model of choice is the structured perceptron as instead of training on the entire dataset all at once, it looks at a set of training examples at a time. This model is error driven, it will only update its parameters if there is an error.

In [306]:
import pandas as pd
import codecs

In [297]:
# Read data from files using UTF-8 encoding
with open("Data/ES/train", encoding="utf-8") as f:
    es = f.read().split("\n\n")
with open("Data/RU/train", encoding="utf-8") as f:
    ru = f.read().split("\n\n")
with open("Data/ES/dev.in", encoding="utf-8") as f:
    dev_in_es = f.read().split("\n\n")
with open("Data/RU/dev.in", encoding="utf-8") as f:
    dev_in_ru = f.read().split("\n\n")

cols = ["idx", "word", "tag"]

es_train = [pd.DataFrame([[str(idx)] + x.rsplit(" ", 1) for x in line.splitlines()], columns=cols) for idx, line in enumerate(es)]

ru_train = [pd.DataFrame([[str(idx)] + x.rsplit(" ", 1) for x in line.splitlines()], columns=cols) for idx, line in enumerate(ru)]

dev_in_es = [pd.DataFrame([[str(idx), line] for line in line.splitlines()], columns=[cols[:2]]).assign(tag="") for idx, line in enumerate(dev_in_es)]

dev_in_ru = [pd.DataFrame([[str(idx), line] for line in line.splitlines()], columns=[cols[:2]]).assign(tag="") for idx, line in enumerate(dev_in_ru)]

## Structure Perceptron Functions

Function parameters X and Y are used to denote the list of words and tags respectively.

In [298]:
START_KEYWORD = '!START!'
STOP_KEYWORD = '!STOP!'

def init_w_trans(tags, init_val):
    return pd.DataFrame(init_val, index=tags + [START_KEYWORD], columns=tags + [STOP_KEYWORD])

def init_w_emit(tags):
    return pd.DataFrame(0, index=[], columns=tags)

def create_features_trans(Y: pd.Series, tags):
    phi = init_w_trans(tags, 0)
    for i in range(Y.size + 1):
        if i == 0: first_tag = START_KEYWORD
        else: first_tag = Y[i-1]
        if i == Y.size: next_tag = STOP_KEYWORD
        else: next_tag = Y[i]
        phi.loc[first_tag, next_tag] += 1
    return phi

def create_features_emit(X: pd.Series, Y: pd.Series, tags):
    phi = pd.DataFrame(0, index=Y.index, columns=tags)
    for tag in tags:
        phi.loc[Y.loc[:] == tag, tag] = 1
    phi = pd.concat([X, phi], axis=1)
    phi = phi.groupby('word')[tags].sum().reset_index()
    phi.index = phi['word'].values
    phi = phi.drop('word', axis=1)
    return phi
    

def hmm_viterbi(w_trans: pd.DataFrame, w_emit: pd.DataFrame, X: pd.Series, tags):
    if (X.size == 0):
        return pd.Series()
    best_edge = pd.DataFrame('', index=X.index, columns=tags)
    best_score = pd.DataFrame(0, index=X.index, columns=tags)
    expander = pd.DataFrame(1, index=[0], columns=tags)
    
    trans = w_trans.loc[tags, tags].T
    
    # From start
    trans_filtered = w_trans.loc[[START_KEYWORD], tags].reset_index(drop=True)
    emit_transpose = w_emit.loc[[X[0]]] if X[0] in w_emit.index else pd.DataFrame(0, index=[0], columns=tags)
    emit_transpose = emit_transpose.reset_index(drop=True)
    trans_emit = trans_filtered + emit_transpose
    best_score.loc[0, :] = trans_emit.values
    best_edge.loc[0, :] = START_KEYWORD
    # Middle to end
    for i in range(0, X.shape[0]):
        emit_transpose = w_emit.loc[[X[i]]] if X[i] in w_emit.index else pd.DataFrame(0, index=[0], columns=tags)
        emit_transpose = emit_transpose.reset_index(drop=True).T @ expander
        trans_emit = best_score.loc[[i]].reset_index(drop=True).squeeze() + trans + emit_transpose
        best_score.loc[i+1, :] = trans_emit.max().values
        best_edge.loc[i+1, :] = trans_emit.idxmax().values
        
    # # Backtracking to find best state sequence
    best_seq = []
    curr_state = best_score.loc[best_score.shape[0] - 1].idxmax()
    for i in range(best_edge.shape[0] - 1, -1, -1):
        best_seq.append(curr_state)
        curr_state = best_edge.loc[i, curr_state]
    best_seq.reverse()
    return pd.Series(best_seq)

def structured_perceptron(data, tags, iters):
    w_trans = init_w_trans(tags, 1)
    w_emit = init_w_emit(tags)
    for iteration in range(iters):
        for idx in range(len(data)):
            X = data[idx].loc[:, 'word']
            X = X.squeeze() if isinstance(X, pd.DataFrame) else X
            Y_prime = data[idx].loc[:, 'tag']
            Y_prime = Y_prime.squeeze() if isinstance(Y_prime, pd.DataFrame) else Y_prime
            Y_hat = hmm_viterbi(w_trans, w_emit, X, tags)
            phi_prime_trans = create_features_trans(Y_prime, tags)
            phi_prime_emit = create_features_emit(X, Y_prime, tags)
            phi_hat_trans = create_features_trans(Y_hat, tags)
            phi_hat_emit = create_features_emit(X, Y_hat, tags)
            w_trans += phi_prime_trans - phi_hat_trans
            w_emit = w_emit.add(phi_prime_emit.sub(phi_hat_emit, fill_value=0), fill_value=0)
    return w_trans, w_emit

def predict(test_data, w_trans,  w_emit, tags):
    pred_data = []
    for idx in range(len(test_data)):
        data_instance = test_data[idx]
        X = data_instance.loc[:, ['word']].squeeze()
        X = pd.Series(X)
        Y_pred = hmm_viterbi(w_trans, w_emit, X, tags)
        data_instance['tag'] = Y_pred
        pred_data.append(data_instance)
    return pred_data

def result_to_str(df_list):
    res_str = []
    for df in df_list:
        res_str.append(df.drop('idx', axis=1).to_csv(sep=' ', header=None, index=False, lineterminator="\n"))
    res_str = "\n".join(res_str)
    return res_str

## Parameters

In [299]:
tags = ['O', 'B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']
iterations = 1

## Training and Prediction

### Training time estimation

In [None]:
run_benchmark = False
benchmark_amount = 100 # number of tests to run to get a rough time value

In [300]:
import time
import random

if run_benchmark:
    print("=============================================================")
    print(f"Running benchmark with sample size of {benchmark_amount}")
    print("=============================================================")

    print("Benchmarking es_train...")
    start = time.perf_counter()
    structured_perceptron(random.sample(ru_train, benchmark_amount), tags, 1)
    stop = time.perf_counter()
    print(f"Time elapsed: {round(stop - start)}s")
    estimated_total_time = (stop - start) * (len(es_train) / benchmark_amount) * iterations
    print(f"Training es_train with {iterations} iterations will take an estimated {estimated_total_time // 60}m {round(estimated_total_time % 60)}s")

    print("Benchmarking ru_train...")
    start = time.perf_counter()
    structured_perceptron(random.sample(ru_train, benchmark_amount), tags, 1)
    stop = time.perf_counter()
    print(f"Time elapsed: {round(stop - start)}s")
    estimated_total_time = (stop - start) * (len(es_train) / benchmark_amount) * iterations
    print(f"Training ru_train with {iterations} iterations will take an estimated {estimated_total_time // 60}m {round(estimated_total_time % 60)}s")

    print("=============================================================")
    print(f"Benchmark complete")
    print("=============================================================")

### Dataset: ES

In [301]:
w_trans_es, w_emit_es = structured_perceptron(es_train, tags, 1)
display(w_trans_es)
display(w_emit_es)

Unnamed: 0,O,B-positive,B-neutral,B-negative,I-positive,I-neutral,I-negative,!STOP!
O,321,-830,61,322,-47,-65,-931,1814
B-positive,995,4,-4,-15,133,-6,-18,-1815
B-neutral,-30,1,0,-6,1,16,1,1
B-negative,-960,1,0,-1,0,0,69,4
I-positive,117,1,-2,-32,114,1,-1,2
I-neutral,15,1,-74,-5,1,9,1,0
I-negative,69,1,1,-1094,1,1,55,1
!START!,118,95,2,-56,-3,-8,-141,1


Unnamed: 0,O,B-positive,B-neutral,B-negative,I-positive,I-neutral,I-negative
!,3.0,0.0,0.0,-2.0,0.0,0.0,-1.0
"""",2.0,-2.0,0.0,0.0,0.0,0.0,0.0
%,1.0,0.0,0.0,0.0,0.0,0.0,-1.0
(,5.0,0.0,0.0,-1.0,0.0,-1.0,-3.0
),4.0,0.0,-1.0,-3.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
“,0.0,0.0,0.0,-1.0,1.0,0.0,0.0
”,1.0,0.0,0.0,-2.0,1.0,0.0,0.0
…,0.0,0.0,0.0,0.0,0.0,0.0,0.0
″,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [302]:
dev_out_es_p4 = predict(dev_in_es, w_trans_es, w_emit_es, tags)
file_out_es_p4 = result_to_str(dev_out_es_p4)
with codecs.open("Data/ES/dev.out.p4", "w", "utf-8") as f:
    f.write(file_out_es_p4)
    f.close()

### Dataset: RU

In [303]:
w_trans_ru, w_emit_ru = structured_perceptron(ru_train, tags, 1)
display(w_trans_ru)
display(w_emit_ru)

Unnamed: 0,O,B-positive,B-neutral,B-negative,I-positive,I-neutral,I-negative,!STOP!
O,382,-1566,120,379,-52,-20,-1589,3045
B-positive,1481,4,0,-19,351,1,-5,-3030
B-neutral,176,1,1,-20,1,29,1,1
B-negative,-1738,1,1,-6,1,1,83,1
I-positive,349,-2,1,-33,248,1,1,-13
I-neutral,29,1,1,-22,1,26,1,1
I-negative,82,1,1,-1743,1,1,43,2
!START!,-62,343,65,-192,1,-1,-147,1


Unnamed: 0,O,B-positive,B-neutral,B-negative,I-positive,I-neutral,I-negative
!,8.0,0.0,0.0,-3.0,-1.0,0.0,-4.0
"""",-17.0,5.0,2.0,-16.0,27.0,4.0,-5.0
%,2.0,0.0,0.0,-1.0,0.0,0.0,-1.0
&,-1.0,0.0,0.0,0.0,2.0,0.0,-1.0
',0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
ячменное,-1.0,0.0,0.0,0.0,1.0,0.0,0.0
–,0.0,0.0,0.0,0.0,0.0,0.0,0.0
—,0.0,0.0,0.0,0.0,0.0,0.0,0.0
…,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [307]:
dev_out_ru_p4 = predict(dev_in_ru, w_trans_ru, w_emit_ru, tags)
file_out_ru_p4 = result_to_str(dev_out_ru_p4)
with codecs.open("Data/RU/dev.out.p4", "w", "utf-8") as f:
    f.write(file_out_ru_p4)
    f.close()