### Definimos el Modelo Base

In [2]:
from jupyterthemes import get_themes
import jupyterthemes as jt
from jupyterthemes.stylefx import set_nb_theme

set_nb_theme('monokai')
#""!pip3 install git+https://www.github.com/keras-team/keras-contrib.git
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras as k
from keras_contrib.layers import CRF
import pickle
import numpy as np

# Fixed sequence length of the network input; also used when padding sentences.
MAX_LENGTH = 149


def _load_index(path):
    """Load a lookup table stored in a ``.npy`` file.

    ``np.load`` hands back a 0-d object array when the file holds a single
    pickled Python object (for example a dict written with ``np.save``). The
    rest of the notebook uses ``len()``, ``[...]`` and ``.items()`` on the
    result, so that wrapper is unwrapped with ``.item()``. Any other kind of
    result is returned unchanged.
    """
    # NOTE(security): allow_pickle=True can execute code embedded in the file;
    # only load index files that come from a trusted source.
    loaded = np.load(path, allow_pickle=True)
    if isinstance(loaded, np.ndarray) and loaded.ndim == 0:
        return loaded.item()
    return loaded


word2index = _load_index('Models/word2index.npy')
tag2index = _load_index('Models/tag2index.npy')

word_embedding_size = 300

# Named `input_layer` (not `input`) so the Python builtin `input` is not shadowed
# for the rest of the kernel session.
input_layer = Input(shape=(MAX_LENGTH,))

# Embedding layer: one vector of size `word_embedding_size` per vocabulary index.
embedded = Embedding(input_dim=len(word2index),
                     output_dim=word_embedding_size,
                     input_length=MAX_LENGTH)(input_layer)

# Bidirectional LSTM followed by a second, unidirectional LSTM; both return the
# full sequence so that every position gets its own output.
encoded = Bidirectional(LSTM(units=word_embedding_size,
                             return_sequences=True,
                             dropout=0.5,
                             recurrent_dropout=0.5,
                             kernel_initializer=k.initializers.he_normal()))(embedded)
encoded = LSTM(units=word_embedding_size * 2,
               return_sequences=True,
               dropout=0.5,
               recurrent_dropout=0.5,
               kernel_initializer=k.initializers.he_normal())(encoded)

# Per-position dense projection to one score per tag.
tag_scores = TimeDistributed(Dense(len(tag2index), activation="relu"))(encoded)

# CRF layer on top of the per-position scores; kept in `crf` because its loss and
# accuracy functions are needed when compiling.
crf = CRF(len(tag2index))
out = crf(tag_scores)  # output

model = Model(input_layer, out)

# Optimiser
adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

# Compile model
model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 149)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 149, 300)          7349700   
_________________________________________________________________
bidirectional (Bidirectional (None, 149, 600)          1442400   
_________________________________________________________________
lstm_1 (LSTM)                (None, 149, 600)          2882400   
_________________________________________________________________
time_distributed (TimeDistri (None, 149, 291)          174891    
_________________________________________________________________
crf (CRF)                    (None, 149, 291)          170235    
Total params: 12,019,626
Trainable params: 12,019,626
Non-trainable params: 0
_________________________________________________



### Cargamos el modelo

In [3]:
from keras.models import Model, Input as Kinput
# Restore the trained parameters into the architecture built above.
WEIGHTS_PATH = "Models/mb.h5"
model.load_weights(WEIGHTS_PATH)
print("Loaded model from disk")


Loaded model from disk


### Función que permite convertir índices en tags

In [4]:
def logits_to_tokens(sequences, index):
    """Translate per-position score vectors into their labels.

    Args:
        sequences: iterable of sequences; each element of a sequence is a
            vector of scores.
        index: lookup from the position of the highest score to its label.

    Returns:
        A list with one list of labels per input sequence.
    """
    return [
        [index[np.argmax(scores)] for scores in sequence]
        for sequence in sequences
    ]

### Creamos la función que usa el modelo para el postagging



In [5]:
#!pip install tabulate
from tabulate import tabulate

from keras.preprocessing.sequence import pad_sequences
def postagging(test_samples):
    """Tag tokenized sentences with the model and render the result as text tables.

    Args:
        test_samples: list of sentences, each one a list of word strings.

    Returns:
        A string containing one ``tabulate`` table per sentence (the words as
        headers, the predicted tags as the only row), each followed by a
        newline. An empty input yields an empty string.
    """
    # Nothing to tag: skip the model call entirely.
    if len(test_samples) == 0:
        return ""

    # Convert the text into model input: each word becomes its vocabulary index,
    # looked up in lower case; unknown words fall back to the '-OOV-' entry.
    encoded = []
    for sentence in test_samples:
        indices = []
        for word in sentence:
            try:
                indices.append(word2index[word.lower()])
            except KeyError:
                indices.append(word2index['-OOV-'])
        encoded.append(indices)

    # pad_sequences truncates from the FRONT by default. Because the tags are
    # matched to the words starting at position 0 below, over-long sentences
    # must be cut at the end instead, otherwise tags and words get misaligned.
    padded = pad_sequences(encoded, maxlen=MAX_LENGTH, padding='post', truncating='post')

    # Run the trained model on the encoded input.
    predictions = model.predict(padded)

    # Convert the model output into tag names (tag2index is inverted for lookup).
    index2tag = {i: t for t, i in tag2index.items()}
    tag_rows = logits_to_tokens(predictions, index2tag)

    # Render one table per sentence; padding positions are dropped, and words
    # beyond MAX_LENGTH (which the model never saw) are not shown.
    tables = []
    for sentence, tags in zip(test_samples, tag_rows):
        head = sentence[:MAX_LENGTH]
        body = [tags[:len(head)]]
        tables.append(tabulate(body, headers=head) + "\n")

    return "".join(tables)

## postagging Freeling 4.1

## El      hombre   bajo     corre    bajo  el      puente   con  bajo  índice   de  adrenalina  .
## DA0MS0  NCMS000  AQ0MS00  VMIP3S0  SP    DA0MS0  NCMS000  SP   SP    NCMS000  SP  NCFS000     Fp


## pos tagger Stanford NLP

## El      hombre   bajo     corre    bajo  el      puente   con    bajo   índice  de    adrenalina  .
## da0000  nc0s000  aq0000   vmip000  sp000 da0000  nc0s000  sp000  aq0000 nc0s000 sp000 nc0s000     fp


### Ejemplo

In [8]:
# Example sentences, written with spaces between tokens so that a plain
# whitespace split yields the word lists expected by the tagger.
sentences = (
    "Correr es importante para mi .",
    "El hombre bajo corre bajo el puente con bajo índice de adrenalina .",
)
test_samples = [sentence.split() for sentence in sentences]
print(test_samples)

[['Correr', 'es', 'importante', 'para', 'mi', '.'], ['El', 'hombre', 'bajo', 'corre', 'bajo', 'el', 'puente', 'con', 'bajo', 'índice', 'de', 'adrenalina', '.']]


In [9]:
# Tag the example sentences; print() is used so the newlines inside the
# returned table string are rendered instead of shown as escape sequences.
print(postagging(test_samples))

Correr    es       importante    para    mi       .
--------  -------  ------------  ------  -------  ---
nccs000   vmip2s0  spcms         dn0cp0  nccs000  Fp
El        hombre    bajo    corre    bajo    el        puente    con     bajo    índice    de      adrenalina    .
--------  --------  ------  -------  ------  --------  --------  ------  ------  --------  ------  ------------  ---
pp3csd00  vasi3p0   da0fs0  aq0msp   da0fs0  pp3csd00  pd0fp000  da0fs0  da0fs0  vag0000   ao0fs0  da0fs0        Fp



### Definición de interfaz

In [11]:
from tkinter import *
import tkinter as tk
from tkinter.ttk import *
import sentencepiece as spm

# Main application window.
raiz = Tk()

raiz.configure(background='white')

raiz.title("Proyecto PLN")

# Fixed-size window: 1000x700 pixels, resizing disabled on both axes.
raiz.geometry('1000x700')
raiz.resizable(0,0)

# Title banner. `Label` resolves to the ttk widget because the ttk star-import
# comes after the tkinter one. No parent is passed here.
# NOTE(review): presumably it attaches to `raiz` as Tkinter's default root - confirm.
label= Label( text = 'Tokenizer & Postagging',  background = "white",
            font = "Helvetica 30 bold italic")
label.pack(pady=20)

def tokenize():
    """Tokenize the text of the input box with SentencePiece and display the pieces.

    Reads the module-level ``entrada`` widget, and replaces the content of the
    module-level ``salida`` widget with the string form of the list of pieces.
    """
    sp_word = spm.SentencePieceProcessor()
    sp_word.load('Models/m_word.model')

    # "end-1c" leaves out the newline that the Tk Text widget always appends
    # after the user's text.
    test_sample = entrada.get("1.0", "end-1c")
    token = str(sp_word.encode_as_pieces(test_sample))

    # The output box is created read-only: unlock it only while writing, and
    # lock it again afterwards so the user cannot edit the result.
    salida.configure(state='normal')
    try:
        salida.delete("1.0", tk.END)
        salida.insert(tk.INSERT, token)
    finally:
        salida.configure(state='disabled')

def postag():
    """Tokenize the text of the input box, tag it, and display the result table.

    Reads the module-level ``entrada`` widget, splits its text into pieces with
    SentencePiece, passes them as one sentence to ``postagging`` and writes the
    resulting table to the module-level ``salida`` widget (and to stdout).
    """
    sp_word = spm.SentencePieceProcessor()
    sp_word.load('Models/m_word.model')

    # "end-1c" leaves out the newline that the Tk Text widget always appends
    # after the user's text.
    test_sample = entrada.get("1.0", "end-1c")

    # Remove the first '▁' marker from every piece; pieces that consisted of
    # the marker only would become empty tokens, so they are dropped.
    listToken = []
    for piece in sp_word.encode_as_pieces(test_sample):
        word = piece.replace('▁', '', 1)
        if word:
            listToken.append(word)

    # With nothing to tag there is no reason to run the model.
    salidapos = postagging([listToken]) if listToken else ""
    print(salidapos)

    # The output box is created read-only: unlock it only while writing, and
    # lock it again afterwards so the user cannot edit the result.
    salida.configure(state='normal')
    try:
        salida.delete('1.0', tk.END)
        salida.insert("insert", salidapos)
    finally:
        salida.configure(state='disabled')

# Input box where the user types the text to process. It is packed first, so
# it sits above the output box in the window.
entrada = Text(raiz,height=14, width=700)
entrada.pack(pady=15)
# Action buttons, positioned with absolute coordinates; each one runs its
# callback when clicked.
btn = Button(raiz, text="Tokenizer",command=tokenize)
btn.place(x=440, y=320)
btn1 = Button(raiz, text="Postagging",command=postag)
btn1.place(x=525, y=320)
# Output box, created in the 'disabled' state; the callbacks switch it to
# 'normal' before writing into it.
salida = Text(raiz,height=14, width=700, state='disabled')
salida.pack(pady=25)

# Start the Tk event loop.
# NOTE(review): this call is expected to block the notebook cell until the window is closed - confirm.
raiz.mainloop()

El        hombre    bajo    corre    bajo    el        puente    con     bajo    índice    de      adrenalina
--------  --------  ------  -------  ------  --------  --------  ------  ------  --------  ------  ------------
pp3csd00  vasi3p0   da0fs0  aq0msp   da0fs0  pp3csd00  pd0fp000  da0fs0  da0fs0  vag0000   ao0fs0  px1ms0p0

La        visita    al      zoológico    no      se       cobra    y       se       puede      observar    la        cobra▁recién▁adquirida
--------  --------  ------  -----------  ------  -------  -------  ------  -------  ---------  ----------  --------  ------------------------
pd0fp000  vag0000   dn0cp0  px1ms0p0     aq0fp0  vasi3p0  da0fs0   dn0cp0  vaii1p0  sn.co-SUJ  pi0ms000    pd0fp000  px1ms0p0

