# 50.007 Machine Learning Project Part 4

____

## Model Choice: Structured Perceptron

Our improved model of choice is the structured perceptron as instead of training on the entire dataset all at once, it looks at a set of training examples at a time. This model is error driven, it will only update its parameters if there is an error.

References:
Nara Institute of Science and Technology. (n.d.). NLP Programming Tutorial 11 - The Structured Perceptron. Retrieved August 13, 2023, from http://www.phontron.com/slides/nlp-programming-en-12-struct.pdf

In [341]:
import pandas as pd
import codecs

In [342]:
# Read data from files using UTF-8 encoding
with open("Data/ES/train", encoding="utf-8") as f:
    es = f.read().split("\n\n")
with open("Data/RU/train", encoding="utf-8") as f:
    ru = f.read().split("\n\n")
with open("Data/ES/dev.in", encoding="utf-8") as f:
    dev_in_es = f.read().split("\n\n")
with open("Data/RU/dev.in", encoding="utf-8") as f:
    dev_in_ru = f.read().split("\n\n")

cols = ["idx", "word", "tag"]

es_train = [pd.DataFrame([[str(idx)] + x.rsplit(" ", 1) for x in line.splitlines()], columns=cols) for idx, line in enumerate(es)]

ru_train = [pd.DataFrame([[str(idx)] + x.rsplit(" ", 1) for x in line.splitlines()], columns=cols) for idx, line in enumerate(ru)]

dev_in_es = [pd.DataFrame([[str(idx), line] for line in line.splitlines()], columns=[cols[:2]]).assign(tag="") for idx, line in enumerate(dev_in_es)]

dev_in_ru = [pd.DataFrame([[str(idx), line] for line in line.splitlines()], columns=[cols[:2]]).assign(tag="") for idx, line in enumerate(dev_in_ru)]

## Structure Perceptron Functions

Function parameters X and Y are used to denote the list of words and tags respectively.

In [343]:
START_KEYWORD = '!START!'
STOP_KEYWORD = '!STOP!'

def init_w_trans(tags, init_val):
    return pd.DataFrame(init_val, index=tags + [START_KEYWORD], columns=tags + [STOP_KEYWORD])

def init_w_emit(tags):
    return pd.DataFrame(0, index=[], columns=tags)

def create_features_trans(Y: pd.Series, tags):
    phi = init_w_trans(tags, 0)
    for i in range(Y.size + 1):
        if i == 0: first_tag = START_KEYWORD
        else: first_tag = Y[i-1]
        if i == Y.size: next_tag = STOP_KEYWORD
        else: next_tag = Y[i]
        phi.loc[first_tag, next_tag] += 1
    return phi

def create_features_emit(X: pd.Series, Y: pd.Series, tags):
    phi = pd.DataFrame(0, index=Y.index, columns=tags)
    for tag in tags:
        phi.loc[Y.loc[:] == tag, tag] = 1
    phi = pd.concat([X, phi], axis=1)
    phi = phi.groupby('word')[tags].sum().reset_index()
    phi.index = phi['word'].values
    phi = phi.drop('word', axis=1)
    return phi
    

def hmm_viterbi(w_trans: pd.DataFrame, w_emit: pd.DataFrame, X: pd.Series, tags):
    if (X.size == 0):
        return pd.Series()
    best_edge = pd.DataFrame('', index=X.index, columns=tags)
    best_score = pd.DataFrame(0, index=X.index, columns=tags)
    expander = pd.DataFrame(1, index=[0], columns=tags)
    
    trans = w_trans.loc[tags, tags].T
    
    # From start
    trans_filtered = w_trans.loc[[START_KEYWORD], tags].reset_index(drop=True)
    emit_transpose = w_emit.loc[[X[0]]] if X[0] in w_emit.index else pd.DataFrame(0, index=[0], columns=tags)
    emit_transpose = emit_transpose.reset_index(drop=True)
    trans_emit = trans_filtered + emit_transpose
    best_score.loc[0, :] = trans_emit.values
    best_edge.loc[0, :] = START_KEYWORD
    # Middle to end
    for i in range(0, X.shape[0]):
        emit_transpose = w_emit.loc[[X[i]]] if X[i] in w_emit.index else pd.DataFrame(0, index=[0], columns=tags)
        emit_transpose = emit_transpose.reset_index(drop=True).T @ expander
        trans_emit = best_score.loc[[i]].reset_index(drop=True).squeeze() + trans + emit_transpose
        best_score.loc[i+1, :] = trans_emit.max().values
        best_edge.loc[i+1, :] = trans_emit.idxmax().values
        
    # # Backtracking to find best state sequence
    best_seq = []
    curr_state = best_score.loc[best_score.shape[0] - 1].idxmax()
    for i in range(best_edge.shape[0] - 1, -1, -1):
        best_seq.append(curr_state)
        curr_state = best_edge.loc[i, curr_state]
    best_seq.reverse()
    return pd.Series(best_seq)

def structured_perceptron(data, tags, iters):
    w_trans = init_w_trans(tags, 1)
    w_emit = init_w_emit(tags)
    for iteration in range(iters):
        for idx in range(len(data)):
            X = data[idx].loc[:, 'word']
            X = X.squeeze() if isinstance(X, pd.DataFrame) else X
            Y_prime = data[idx].loc[:, 'tag']
            Y_prime = Y_prime.squeeze() if isinstance(Y_prime, pd.DataFrame) else Y_prime
            Y_hat = hmm_viterbi(w_trans, w_emit, X, tags)
            phi_prime_trans = create_features_trans(Y_prime, tags)
            phi_prime_emit = create_features_emit(X, Y_prime, tags)
            phi_hat_trans = create_features_trans(Y_hat, tags)
            phi_hat_emit = create_features_emit(X, Y_hat, tags)
            w_trans += phi_prime_trans - phi_hat_trans
            w_emit = w_emit.add(phi_prime_emit.sub(phi_hat_emit, fill_value=0), fill_value=0)
    return w_trans, w_emit

def predict(test_data, w_trans,  w_emit, tags):
    pred_data = []
    for idx in range(len(test_data)):
        data_instance = test_data[idx]
        X = data_instance.loc[:, ['word']].squeeze()
        X = pd.Series(X)
        Y_pred = hmm_viterbi(w_trans, w_emit, X, tags)
        data_instance['tag'] = Y_pred
        pred_data.append(data_instance)
    return pred_data

def result_to_str(df_list):
    res_str = []
    for df in df_list:
        res_str.append(df.drop('idx', axis=1).to_csv(sep=' ', header=None, index=False, lineterminator="\n"))
    res_str = "\n".join(res_str)
    return res_str

## Parameters

In [344]:
tags = ['O', 'B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']
epoch = 15

## Training and Prediction

### Training time estimation

In [345]:
run_benchmark = True
benchmark_amount = 100 # number of tests to run to get a rough time value

In [346]:
import time
import random

if run_benchmark:
    print("=============================================================")
    print(f"Running benchmark with sample size of {benchmark_amount}")
    print("=============================================================")

    print("Benchmarking es_train...")
    start = time.perf_counter()
    structured_perceptron(random.sample(ru_train, benchmark_amount), tags, 1)
    stop = time.perf_counter()
    print(f"Time elapsed: {round(stop - start)}s")
    estimated_total_time = (stop - start) * (len(es_train) / benchmark_amount) * epoch
    print(f"Training es_train over {epoch} epoch will take an estimated {estimated_total_time // 60}m {round(estimated_total_time % 60)}s")

    print("Benchmarking ru_train...")
    start = time.perf_counter()
    structured_perceptron(random.sample(ru_train, benchmark_amount), tags, 1)
    stop = time.perf_counter()
    print(f"Time elapsed: {round(stop - start)}s")
    estimated_total_time = (stop - start) * (len(es_train) / benchmark_amount) * epoch
    print(f"Training ru_train over {epoch} epoch will take an estimated {estimated_total_time // 60}m {round(estimated_total_time % 60)}s")

    print("=============================================================")
    print(f"Benchmark complete")
    print("=============================================================")

Running benchmark with sample size of 100
Benchmarking es_train...
Time elapsed: 2s
Training es_train over 15 epoch will take an estimated 11.0m 10s
Benchmarking ru_train...
Time elapsed: 2s
Training ru_train over 15 epoch will take an estimated 11.0m 27s
Benchmark complete


### Dataset: ES

In [347]:
w_trans_es, w_emit_es = structured_perceptron(es_train, tags, epoch)
display(w_trans_es)
display(w_emit_es)

Unnamed: 0,O,B-positive,B-neutral,B-negative,I-positive,I-neutral,I-negative,!STOP!
O,5285,-12459,675,5292,-878,-338,-14982,27602
B-positive,15149,46,7,-336,2023,-9,-169,-27673
B-neutral,768,1,0,-280,1,226,1,1
B-negative,-16274,1,0,-93,0,0,1021,60
I-positive,1993,1,-4,-871,1987,-19,-1,16
I-neutral,225,1,-88,-305,1,236,1,0
I-negative,1021,1,1,-17646,1,1,980,1
!START!,2030,1446,127,-1046,-33,-26,-2491,1


Unnamed: 0,O,B-positive,B-neutral,B-negative,I-positive,I-neutral,I-negative
!,5.0,0.0,0.0,-3.0,0.0,0.0,-2.0
"""",-4.0,-2.0,0.0,-10.0,26.0,0.0,-10.0
%,3.0,0.0,0.0,-2.0,0.0,0.0,-1.0
(,86.0,0.0,0.0,-4.0,0.0,-1.0,-81.0
),17.0,0.0,-1.0,-12.0,0.0,0.0,-4.0
...,...,...,...,...,...,...,...
“,1.0,0.0,0.0,-8.0,15.0,0.0,-8.0
”,-2.0,0.0,0.0,-8.0,15.0,0.0,-5.0
…,9.0,0.0,0.0,0.0,0.0,0.0,-9.0
″,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [348]:
dev_out_es_p4 = predict(dev_in_es, w_trans_es, w_emit_es, tags)
file_out_es_p4 = result_to_str(dev_out_es_p4)
with codecs.open("Data/ES/dev.p4.out", "w", "utf-8") as f:
    f.write(file_out_es_p4)
    f.close()

### Dataset: RU

In [349]:
w_trans_ru, w_emit_ru = structured_perceptron(ru_train, tags, epoch)
display(w_trans_ru)
display(w_emit_ru)

Unnamed: 0,O,B-positive,B-neutral,B-negative,I-positive,I-neutral,I-negative,!STOP!
O,5672,-23542,1906,5671,-509,-262,-24315,45871
B-positive,22523,46,0,-221,5251,-6,-30,-45884
B-neutral,2682,1,1,-222,1,421,1,1
B-negative,-26691,1,1,-107,1,1,1231,1
I-positive,5221,12,1,-484,3706,-5,1,1
I-neutral,421,1,1,-281,1,424,1,1
I-negative,1216,1,1,-26456,1,1,829,16
!START!,-552,5159,975,-3462,1,-5,-2109,1


Unnamed: 0,O,B-positive,B-neutral,B-negative,I-positive,I-neutral,I-negative
!,38.0,0.0,0.0,-3.0,-1.0,0.0,-34.0
"""",-188.0,75.0,30.0,-194.0,405.0,59.0,-187.0
%,12.0,0.0,0.0,-5.0,0.0,0.0,-7.0
&,-3.0,0.0,0.0,-5.0,30.0,-1.0,-21.0
',0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
ячменное,-5.0,0.0,0.0,-3.0,15.0,0.0,-7.0
–,2.0,0.0,0.0,-1.0,0.0,0.0,-1.0
—,3.0,0.0,0.0,-1.0,0.0,0.0,-2.0
…,2.0,0.0,0.0,-1.0,0.0,0.0,-1.0


In [350]:
dev_out_ru_p4 = predict(dev_in_ru, w_trans_ru, w_emit_ru, tags)
file_out_ru_p4 = result_to_str(dev_out_ru_p4)
with codecs.open("Data/RU/dev.p4.out", "w", "utf-8") as f:
    f.write(file_out_ru_p4)
    f.close()

In [351]:
print("============================")
print("Part 4 ES prediction results")
print("============================")
print(f"Epoch: {epoch}")
!python EvalScript/evalResult.py Data/ES/dev.out Data/ES/dev.p4.out

Part 4 ES prediction results
Epoch: 15

#Entity in gold data: 229
#Entity in prediction: 1954

#Correct Entity : 162
Entity  precision: 0.0829
Entity  recall: 0.7074
Entity  F: 0.1484

#Correct Sentiment : 47
Sentiment  precision: 0.0241
Sentiment  recall: 0.2052
Sentiment  F: 0.0431


In [352]:
print("============================")
print("Part 4 RU prediction results")
print("============================")
print(f"Epoch: {epoch}")
!python EvalScript/evalResult.py Data/RU/dev.out Data/RU/dev.p4.out

Part 4 RU prediction results
Epoch: 15

#Entity in gold data: 389
#Entity in prediction: 172

#Correct Entity : 40
Entity  precision: 0.2326
Entity  recall: 0.1028
Entity  F: 0.1426

#Correct Sentiment : 10
Sentiment  precision: 0.0581
Sentiment  recall: 0.0257
Sentiment  F: 0.0357
