# biLSTM: Automatic glossing, otomi (corpus completo)

In [1]:
from reccurrent_model import biLSTM
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from pickle import load
from itertools import chain
import pandas as pd
import numpy as np
import torch

Sólo toma información de caracter y etiqueta. Elimina POS tags.

In [2]:
file1 = load(open('pickle_objects/preinput_data','rb'))
print(file1[0], len(file1))

[[["'", 'v', 'B-como'], ['á', 'v', 'I-como'], ['b', 'v', 'B-3.cpl'], ['i', 'v', 'I-3.cpl'], ['m', 'v', 'B-ctrf'], ['á', 'v', 'I-ctrf'], ['k', 'v', 'B-stem'], ['h', 'v', 'I-stem'], ['á', 'v', 'I-stem']]] 1769


In [3]:
def get_data(file):
    data = []

    for s in file:
        sent_data = []
        for w in s:
            word = [(x[0], x[2]) for x in w] + [(' ', ' ')]
            sent_data.append(word)

        one_sent = list(chain(*sent_data))
        one_sent.pop(-1)
        data.append(list(zip(*one_sent)))

    return data

### Training and test data

In [4]:
#Data split 33%
train_pairs, eval_pairs = train_test_split(get_data(file1), test_size=0.1) #33)
print(len(train_pairs), len(eval_pairs))

#Separar el train en dos conjuntos
train_sents, train_tags = zip(*train_pairs)

#print(train_sents)
#print(train_tags)

1592 177


### Training the model

In [5]:
model = biLSTM(train_sents, train_tags)

In [6]:
%%time
model.train_model(its=150)

100%|██████████| 150/150 [1:23:09<00:00, 33.27s/it]

CPU times: user 14h 24min 29s, sys: 2min 9s, total: 14h 26min 38s
Wall time: 1h 23min 9s





### Evaluation

In [7]:
y_pred = []
y_true = []
for sent in eval_pairs:
    prediccion = model.forward(sent[0])
    y_pred.append(prediccion)
    y_true.append(sent[1])

In [8]:
y_true = list(chain(*y_true))
y_pred = list(chain(*y_pred))

In [9]:
labels = list(set(y_true))
labels.remove(' ')

print('Accuracy:', accuracy_score(y_true, y_pred))

Accuracy: 0.5604813172894236


In [10]:

prec, rec, f1, supp = precision_recall_fscore_support(y_true, y_pred, labels=labels)

  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
results = pd.DataFrame(data=np.array([prec, rec, f1, supp]).T, index=labels, columns=['Precision','Recall','F1', 'Support'])
results

Unnamed: 0,Precision,Recall,F1,Support
I-1.pot,0.0,0.0,0.0,24.0
B-ctrf,0.0,0.0,0.0,27.0
B-1.enf,0.0,0.0,0.0,4.0
I-dual.exc,0.0,0.0,0.0,3.0
B-2,0.0,0.0,0.0,3.0
...,...,...,...,...
B-rapido,0.0,0.0,0.0,1.0
B-p.loc,0.0,0.0,0.0,1.0
B-prt,0.0,0.0,0.0,5.0
I-3.cpl,0.0,0.0,0.0,51.0


In [12]:
print(results.to_string())

                  Precision    Recall        F1  Support
I-1.pot            0.000000  0.000000  0.000000     24.0
B-ctrf             0.000000  0.000000  0.000000     27.0
B-1.enf            0.000000  0.000000  0.000000      4.0
I-dual.exc         0.000000  0.000000  0.000000      3.0
B-2                0.000000  0.000000  0.000000      3.0
B-3.pls            0.000000  0.000000  0.000000      1.0
I-2                0.000000  0.000000  0.000000      3.0
I-p.loc            0.000000  0.000000  0.000000      1.0
I-muy              0.000000  0.000000  0.000000      8.0
B-3.obj            0.000000  0.000000  0.000000      5.0
I-prt              0.000000  0.000000  0.000000      4.0
I-dem              0.000000  0.000000  0.000000     31.0
B-pl               0.000000  0.000000  0.000000     14.0
I-prag             0.000000  0.000000  0.000000     34.0
I-3.pot            0.000000  0.000000  0.000000     15.0
I-2.icp            0.000000  0.000000  0.000000      6.0
I-rapido           0.000000  0.

In [13]:
aver_prec = (supp*prec).sum()/supp.sum()
aver_rec = (supp*rec).sum()/supp.sum()
aver_f1 = (supp*f1).sum()/supp.sum()

print('Average precision: {}\nAverage recall: {}\nAverage F1: {}'.format(aver_prec, aver_rec, aver_f1))

Average precision: 0.39692337209177264
Average recall: 0.6518536705131353
Average F1: 0.48728441012052087


In [14]:
#torch.save(model,'Model.biLSTM.justChar')

#### Save embeddings

In [15]:
embs = {}
for w,v in model.input_voc.items():
    if w != ' ':
        vec = model.emb[0](torch.tensor([v])).detach().numpy()[0]
        embs[w] = vec        

In [16]:
from pickle import dump

dump( embs, open( "Embs.biLSTM.justChar.p", "wb" ) )