# biLSTM: Automatic glossing, otomi (corpus completo)
## Usando etiquetas POS

In [1]:
from reccurrent_model import biLSTM
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from pickle import load
from itertools import chain
import pandas as pd
import numpy as np
import torch

Sólo toma información de caracter y etiqueta. Elimina POS tags.

In [2]:
file1 = load(open('pickle_objects/predata','rb'))
print(file1[0], len(file1))

#file2 = load(open('pickle_objects/preinput_data','rb'))
#print(file2[0], len(file2))

#file3 = load(open('pickle_objects/X_input','rb'))
#print(file3[0][:8], len(file3))

#file4 = load(open('pickle_objects/y_input','rb'))
#print(file4[0], len(file4))

[[['n', 'v', 'B-psd'], ['d', 'v', 'B-1.cpl'], ['ó', 'v', 'I-1.cpl'], ['p', 'v', 'B-stem'], ['h', 'v', 'I-stem'], ['μ', 'v', 'I-stem'], ['d', 'v', 'I-stem'], ['i', 'v', 'I-stem']], [['d', 'v', 'B-1.cpl'], ['ó', 'v', 'I-1.cpl'], ['p', 'v', 'B-stem'], ['ε', 'v', 'I-stem'], ['p', 'v', 'I-stem'], ['h', 'v', 'I-stem'], ['í', 'v', 'I-stem']], [['b', 'v', 'B-3.cpl'], ['i', 'v', 'I-3.cpl'], ['t', 'v', 'B-lig'], ["'", 'v', 'B-stem'], ['μ', 'v', 'I-stem'], ['n', 'v', 'I-stem'], ['g', 'v', 'B-1.obj'], ['í', 'v', 'I-1.obj']], [['b', 'v', 'B-3.cpl'], ['i', 'v', 'I-3.cpl'], ['m', 'v', 'B-stem'], ['ä', 'v', 'I-stem'], ['h', 'v', 'I-stem'], ['t', 'v', 'I-stem'], ['r', 'v', 'I-stem'], ['a', 'v', 'I-stem'], ['t', 'v', 'I-stem'], ['á', 'v', 'I-stem'], ['g', 'v', 'B-1.obj'], ['í', 'v', 'I-1.obj']], [['k', 'obl', 'B-stem'], ['o', 'obl', 'I-stem']], [['c', 'obl', 'B-stem'], ['h', 'obl', 'I-stem'], ['í', 'obl', 'I-stem'], ['k', 'obl', 'I-stem'], ['ó', 'obl', 'I-stem'], ['h', 'obl', 'I-stem'], ['t', 'obl', 'I-

Idea simple para conservar POS en biLSTM, concatenar la etiqueta a la letra: 

$$n+v$$

Tal que 'n' es la letra y 'v' POS de verbo.

El índice asociado a cada letra dependerá de su POS: $idx(n+v) \neq idx(n+obl)$ por ejemplo. 

Intuición: las letras van a ser consideradas como diferentes embeddings dependiendo de la POS de la palabra en que aparezcan.

In [3]:
data = []

for s in file1:
    sent_data = []
    for w in s:
        #print(w)
        word = [(x[0]+'_'+x[1], x[2]) for x in w] + [(' ', ' ')]
        sent_data.append(word)
    
    one_sent = list(chain(*sent_data))
    one_sent.pop(-1)
    data.append(list(zip(*one_sent)))
    
print(data[0])

[('n_v', 'd_v', 'ó_v', 'p_v', 'h_v', 'μ_v', 'd_v', 'i_v', ' ', 'd_v', 'ó_v', 'p_v', 'ε_v', 'p_v', 'h_v', 'í_v', ' ', 'b_v', 'i_v', 't_v', "'_v", 'μ_v', 'n_v', 'g_v', 'í_v', ' ', 'b_v', 'i_v', 'm_v', 'ä_v', 'h_v', 't_v', 'r_v', 'a_v', 't_v', 'á_v', 'g_v', 'í_v', ' ', 'k_obl', 'o_obl', ' ', 'c_obl', 'h_obl', 'í_obl', 'k_obl', 'ó_obl', 'h_obl', 't_obl', 'é_obl'), ('B-psd', 'B-1.cpl', 'I-1.cpl', 'B-stem', 'I-stem', 'I-stem', 'I-stem', 'I-stem', ' ', 'B-1.cpl', 'I-1.cpl', 'B-stem', 'I-stem', 'I-stem', 'I-stem', 'I-stem', ' ', 'B-3.cpl', 'I-3.cpl', 'B-lig', 'B-stem', 'I-stem', 'I-stem', 'B-1.obj', 'I-1.obj', ' ', 'B-3.cpl', 'I-3.cpl', 'B-stem', 'I-stem', 'I-stem', 'I-stem', 'I-stem', 'I-stem', 'I-stem', 'I-stem', 'B-1.obj', 'I-1.obj', ' ', 'B-stem', 'I-stem', ' ', 'B-stem', 'I-stem', 'I-stem', 'I-stem', 'I-stem', 'I-stem', 'I-stem', 'I-stem')]


### Training and test data

In [4]:
#Data split 33%
train_pairs, eval_pairs = train_test_split(data, test_size=0.33)
print(len(train_pairs), len(eval_pairs))

#Separar el train en dos conjuntos
train_sents, train_tags = zip(*train_pairs)

#print(train_sents)
#print(train_tags)

1196 590


### Training the model

In [5]:
model = biLSTM(train_sents, train_tags)

In [6]:
%%time
model.train_model(its=150)

100%|██████████| 150/150 [1:06:29<00:00, 26.60s/it]

CPU times: user 11h 23min 21s, sys: 5min 55s, total: 11h 29min 17s
Wall time: 1h 6min 29s





### Evaluation

In [15]:
y_pred = []
y_true = []
for sent in eval_pairs:
    prediccion = model.forward(sent[0])
    y_pred.append(prediccion)
    y_true.append(sent[1])

In [16]:
y_true = list(chain(*y_true))
y_pred = list(chain(*y_pred))

In [17]:
labels = list(set(y_true))
labels.remove(' ')

print('Accuracy:', accuracy_score(y_true, y_pred))

Accuracy: 0.5560073702268251


In [18]:

prec, rec, f1, supp = precision_recall_fscore_support(y_true, y_pred, labels=labels)

  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
results = pd.DataFrame(data=np.array([prec, rec, f1, supp]).T, index=labels, columns=['Precision','Recall','F1', 'Support'])
results

Unnamed: 0,Precision,Recall,F1,Support
B-2.obj,0.0,0.0,0.0,6.0
B-mujer/v,0.0,0.0,0.0,1.0
B-1.enf,0.0,0.0,0.0,11.0
I-chico,0.0,0.0,0.0,4.0
B-lig,0.0,0.0,0.0,100.0
...,...,...,...,...
B-dem,0.0,0.0,0.0,57.0
B-como,0.0,0.0,0.0,1.0
I-mexico,0.0,0.0,0.0,4.0
I-it,0.0,0.0,0.0,13.0


In [20]:
print(results.to_string())

               Precision  Recall        F1  Support
B-2.obj         0.000000     0.0  0.000000      6.0
B-mujer/v       0.000000     0.0  0.000000      1.0
B-1.enf         0.000000     0.0  0.000000     11.0
I-chico         0.000000     0.0  0.000000      4.0
B-lig           0.000000     0.0  0.000000    100.0
I-agujerear/v   0.000000     0.0  0.000000      3.0
I-3.sg          0.000000     0.0  0.000000      6.0
B-3.icp         0.000000     0.0  0.000000    115.0
B-2.icp         0.000000     0.0  0.000000     33.0
I-aqui          0.000000     0.0  0.000000      6.0
B-muy           0.000000     0.0  0.000000     16.0
I-2.cpl         0.000000     0.0  0.000000      3.0
I-1.pss         0.000000     0.0  0.000000     57.0
I-prt           0.000000     0.0  0.000000     15.0
I-3.pot         0.000000     0.0  0.000000     91.0
B-pl.exc        0.000000     0.0  0.000000     67.0
I-solo          0.000000     0.0  0.000000      2.0
I-dem           0.000000     0.0  0.000000     93.0
I-1.icp.irr 

In [21]:
aver_prec = (supp*prec).sum()/supp.sum()
aver_rec = (supp*rec).sum()/supp.sum()
aver_f1 = (supp*f1).sum()/supp.sum()

print('Average precision: {}\nAverage recall: {}\nAverage F1: {}'.format(aver_prec, aver_rec, aver_f1))

Average precision: 0.23592653789222842
Average recall: 0.485722696496909
Average F1: 0.3175916184743015


In [14]:
#torch.save(model,'Model.biLSTM.justChar')