# 50.007 Machine Learning Project Part 4

____

## Model Choice: Structured Perceptron

Our improved model of choice is the structured perceptron: instead of training on the entire dataset at once, it processes one training example (sentence) at a time. The model is error-driven, meaning it only changes its parameters when its predicted tag sequence differs from the gold tag sequence.

In [144]:
import pandas as pd

In [145]:
# Read data from files using UTF-8 encoding.
# Each file holds one token per line; sentences are separated by a blank line.
with open("Data/ES/train", encoding="utf-8") as f:
    es = f.read().split("\n\n")
with open("Data/RU/train", encoding="utf-8") as f:
    ru = f.read().split("\n\n")
with open("Data/ES/dev.in", encoding="utf-8") as f:
    dev_in_es_raw = f.read().split("\n\n")
with open("Data/ES/dev.out", encoding="utf-8") as f:
    dev_out_es_raw = f.read().split("\n\n")

cols = ["idx", "word", "tag"]


def _labelled_frames(sentences):
    """Build one (idx, word, tag) DataFrame per non-empty labelled sentence.

    Each row of a sentence is "<word> <tag>"; the split is done on the LAST
    space so that words containing spaces stay intact. Empty chunks (such as
    the one produced by a trailing blank line at the end of the file) and
    blank rows are skipped, because an empty sentence cannot be decoded and a
    blank row does not yield the three expected columns.
    """
    return [
        pd.DataFrame(
            [[str(idx)] + row.rsplit(" ", 1) for row in sentence.splitlines() if row.strip()],
            columns=cols,
        )
        for idx, sentence in enumerate(sentences)
        if sentence.strip()
    ]


def _unlabelled_frames(sentences):
    """Build one (idx, word, tag) DataFrame per non-empty unlabelled sentence.

    The tag column is filled with empty strings. The column labels are passed
    as a flat list: a nested list would be interpreted by pandas as the arrays
    of a MultiIndex rather than as plain column names.
    """
    return [
        pd.DataFrame(
            [[str(idx), word] for word in sentence.splitlines() if word.strip()],
            columns=cols[:2],
        ).assign(tag="")
        for idx, sentence in enumerate(sentences)
        if sentence.strip()
    ]


es_train = _labelled_frames(es)

ru_train = _labelled_frames(ru)

dev_in_es = _unlabelled_frames(dev_in_es_raw)

dev_out_es = _labelled_frames(dev_out_es_raw)

display(es_train[0])
display(dev_in_es[0])
display(dev_out_es[0])

Unnamed: 0,idx,word,tag
0,0,Estuvimos,O
1,0,hace,O
2,0,poco,O
3,0,mi,O
4,0,pareja,O
5,0,y,O
6,0,yo,O
7,0,comiendo,O
8,0,y,O
9,0,resultó,O


Unnamed: 0,idx,word,tag
0,0,Plato,
1,0,degustación,
2,0,:,
3,0,un,
4,0,poco,
5,0,abundante,
6,0,de,
7,0,más,
8,0,",",
9,0,pero,


Unnamed: 0,idx,word,tag
0,0,Plato,B-positive
1,0,degustación,I-positive
2,0,:,O
3,0,un,O
4,0,poco,O
5,0,abundante,O
6,0,de,O
7,0,más,O
8,0,",",O
9,0,pero,O


## Structured Perceptron Functions

Function parameters X and Y are used to denote the list of words and tags respectively.

In [151]:
# Sentinel pseudo-tags marking the beginning and the end of a tag sequence.
# They only appear inside transition feature names ("T <prev> <next>").
START_KEYWORD = '!START!'
STOP_KEYWORD = '!STOP!'

def create_trans(prev, next):
    """Return a one-entry Series holding the transition feature prev -> next.

    The single index label is "T <prev> <next>" and its value is 1.
    """
    feature_name = f"T {prev} {next}"
    return pd.Series({feature_name: 1})

def create_emit(y, x):
    """Return a one-entry Series holding the emission feature tag y -> word x.

    The single index label is "E <y> <x>" and its value is 1.
    """
    feature_name = f"E {y} {x}"
    return pd.Series({feature_name: 1})

def create_features(X: pd.Series, Y: pd.Series):
    """Count the transition and emission features of a tagged sentence.

    Parameters
    ----------
    X : sequence of words.
    Y : sequence of tags, one per word of X.

    Returns
    -------
    pd.Series (float64) indexed by feature name, sorted by name:
      * "T <prev> <next>" -- number of times tag <prev> is followed by <next>,
        with START_KEYWORD before the first tag and STOP_KEYWORD after the
        last one;
      * "E <tag> <word>" -- number of times <word> is tagged <tag>.

    Raises
    ------
    ValueError if X has fewer words than Y has tags.
    """
    # Work on plain lists so that elements are read by position, independent
    # of whatever index labels the incoming Series carry.
    words = list(X)
    labels = list(Y)
    if len(words) < len(labels):
        raise ValueError(
            f"create_features: got {len(words)} words for {len(labels)} tags"
        )

    # Accumulate counts in a dict and build the Series once; growing a Series
    # with repeated .add() re-aligns the whole index on every token.
    counts = {}
    path = [START_KEYWORD] + labels + [STOP_KEYWORD]
    for prev_tag, next_tag in zip(path, path[1:]):
        name = f"T {prev_tag} {next_tag}"
        counts[name] = counts.get(name, 0) + 1
    for tag, word in zip(labels, words):
        name = f"E {tag} {word}"
        counts[name] = counts.get(name, 0) + 1
    return pd.Series(counts, dtype="float64").sort_index()

def hmm_viterbi(w: pd.Series, X: pd.Series, tags):
    """Find the highest-scoring tag sequence for X under weights w (Viterbi).

    Parameters
    ----------
    w : pd.Series of feature weights indexed by feature name
        ("T <prev> <next>" for transitions, "E <tag> <word>" for emissions).
        A transition is only allowed if its feature name is present in w; an
        emission feature absent from w contributes a score of 0.
    X : sequence of words to tag.
    tags : iterable of candidate tags (without the start/stop sentinels).

    Returns
    -------
    pd.Series with one tag per word of X, in order. An empty X yields an
    empty Series.

    Raises
    ------
    KeyError if no allowed path leads from START_KEYWORD to STOP_KEYWORD.
    """
    words = list(X)  # positional access, independent of X's index labels
    n_words = len(words)
    if n_words == 0:
        # Nothing to tag; without this guard the stop step would look up
        # position-0 states that were never scored.
        return pd.Series([], dtype=object)

    tags = list(tags)
    # One dict conversion per call gives O(1) weight lookups, instead of
    # aligning the full weight vector against a 2-entry Series for every edge.
    weights = w.to_dict()

    start_node = (0, START_KEYWORD)
    best_score = {start_node: 0}
    best_edge = {}

    for i, word in enumerate(words):
        # Only the start sentinel can precede the first word.
        prev_states = [START_KEYWORD] if i == 0 else tags
        for prev in prev_states:
            if (i, prev) not in best_score:
                # State was never reached: extending it would create a
                # back-pointer with no predecessor.
                continue
            prev_best_score = best_score[(i, prev)]
            for tag in tags:
                trans_name = f'T {prev} {tag}'
                if trans_name not in weights:
                    continue
                score = prev_best_score + (
                    weights[trans_name] + weights.get(f'E {tag} {word}', 0)
                )
                node = (i + 1, tag)
                # Strict '<' keeps the first-seen predecessor on ties.
                if node not in best_score or best_score[node] < score:
                    best_score[node] = score
                    best_edge[node] = (i, prev)

    # Finish with calculating the score for the stop tag
    stop_node = (n_words + 1, STOP_KEYWORD)
    for prev in tags:
        trans_name = f'T {prev} {STOP_KEYWORD}'
        if trans_name not in weights or (n_words, prev) not in best_score:
            continue
        score = best_score[(n_words, prev)] + weights[trans_name]
        if stop_node not in best_score or best_score[stop_node] < score:
            best_score[stop_node] = score
            best_edge[stop_node] = (n_words, prev)

    # Get best tag sequence by backtracking from best_edge
    best_seq = []
    node = stop_node
    while True:
        node = best_edge[node]
        if node == start_node:
            break
        best_seq.append(node[1])
    best_seq.reverse()
    return pd.Series(best_seq)

def init_w(tags):
    """Create the initial weight vector: every allowed transition gets weight 1.

    The allowed transitions are all pairs (prev, next) with prev taken from
    tags plus START_KEYWORD and next taken from tags plus STOP_KEYWORD, except
    the direct START_KEYWORD -> STOP_KEYWORD pair.

    Parameters
    ----------
    tags : iterable of tag names (any iterable, not only a list).

    Returns
    -------
    pd.Series (float64) indexed by "T <prev> <next>" feature names, sorted by
    name, with every value equal to 1.
    """
    tag_list = list(tags)
    names = [
        f"T {prev} {next_tag}"
        for prev in tag_list + [START_KEYWORD]
        for next_tag in tag_list + [STOP_KEYWORD]
        if not (prev == START_KEYWORD and next_tag == STOP_KEYWORD)
    ]
    # Build the Series in one shot with an explicit dtype; dict.fromkeys drops
    # repeated names so the index stays unique.
    return pd.Series(1.0, index=sorted(dict.fromkeys(names)), dtype="float64")

def structured_perceptron(data, tags, iters):
    """Train structured-perceptron weights on tagged sentences.

    Parameters
    ----------
    data : sequence of DataFrames, one per sentence, each with a 'word' and a
        'tag' column.
    tags : candidate tags passed to init_w and hmm_viterbi.
    iters : number of passes over data.

    Returns
    -------
    pd.Series of feature weights indexed by feature name.

    For every sentence the current best tag sequence is decoded; when it
    differs from the gold tags, the weights are moved by
    features(gold) - features(predicted).
    """
    w = init_w(tags)
    for _ in range(iters):
        for sentence in data:
            X = sentence.loc[:, 'word']
            Y_prime = sentence.loc[:, 'tag']
            if X.size == 0:
                # An empty sentence (e.g. from a trailing blank line in the
                # data file) has nothing to decode or learn from.
                continue
            Y_hat = hmm_viterbi(w, X, tags)
            if Y_hat.tolist() == Y_prime.tolist():
                # Correct prediction: the feature difference is all zeros, so
                # skip building it and re-aligning the weight vector.
                continue
            phi_prime = create_features(X, Y_prime)
            phi_hat = create_features(X, Y_hat)
            w = w.add(phi_prime.sub(phi_hat, fill_value=0), fill_value=0)
    return w

## Parameters

In [163]:
# Candidate tags passed to the weight initialiser and the Viterbi decoder
tags = ['O', 'B-positive', 'B-negative', 'I-positive', 'I-negative']
benchmark_amount = 30 # number of sentences sampled per dataset to get a rough time value
# Number of training passes; also used to scale the estimated training time
iterations = 1

### Training time estimation

In [168]:
import time
import random

print("=============================================================")
print(f"Running benchmark with sample size of {benchmark_amount}")
print("=============================================================")

# Time one training pass over a random sample of each dataset and extrapolate
# to the full dataset. Each dataset is sampled from, and scaled by, ITSELF.
for dataset_name, dataset in (("es_train", es_train), ("ru_train", ru_train)):
    print(f"Benchmarking {dataset_name}...")
    # random.sample raises if asked for more items than are available.
    sample_size = min(benchmark_amount, len(dataset))
    if sample_size == 0:
        print(f"{dataset_name} is empty, skipping")
        continue
    start = time.perf_counter()
    structured_perceptron(random.sample(dataset, sample_size), tags, 1)
    stop = time.perf_counter()
    print(f"Time elapsed: {round(stop - start)}s")
    estimated_total_time = (stop - start) * (len(dataset) / sample_size) * iterations
    # Round once, then split, so minutes print as an integer and seconds never show as 60.
    est_minutes, est_seconds = divmod(round(estimated_total_time), 60)
    print(f"Training {dataset_name} with {iterations} iterations will take an estimated {est_minutes}m {est_seconds}s")

print("=============================================================")
print(f"Benchmark complete")
print("=============================================================")

Running benchmark with sample size of 30
Benchmarking es_train...


## Training

In [165]:


# display(structured_perceptron(es_train, tags, 1))