In [None]:
!pip install tensorflow-addons

In [None]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
!pip install tensorflow==1.13.1

In [None]:
!pip install q keras==2.2.4

In [None]:
#%tensorflow_version 1.13.1
#%keras_version 2.2.4

In [None]:
%%shell
jupyter nbconvert --to html /content/NER.ipynb

[NbConvertApp] Converting notebook /content/NER.ipynb to html
[NbConvertApp] Writing 356478 bytes to /content/NER.html




In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from keras.layers import Conv1D, Activation, MaxPooling1D, Dense, concatenate, LSTM, Bidirectional, TimeDistributed, Input, Embedding, Flatten
#import tensorflow_addons as tfa
#from tensorflow_addons.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy
from keras_contrib.layers import CRF
from keras.models import Model
from sklearn.model_selection import train_test_split
import re
from keras.utils.vis_utils import plot_model

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
print(tf.__version__)

1.13.1


## Carga de los datos

In [None]:
# Conexión a Google Drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
data_dir = '/content/gdrive/MyDrive/ColabNotebooks/NER/ner_datasetreference.csv'
ner_df = pd.read_csv(data_dir,  encoding= 'unicode_escape')

In [None]:
# Forward-fill every column down the rows (this is what carries the sentence
# label from the first token of a sentence onto its remaining tokens), then
# discard the POS column.
ner_df = ner_df.ffill(axis=0).drop(columns=['POS'])

In [None]:
# Heuristic URL / domain matcher used to drop web-address tokens.
# - Raw string, so `\.` and `\-` reach the regex engine without Python
#   escape-sequence warnings.
# - The hyphen in the first character class is escaped: unescaped, `@-_` is a
#   character *range* that also admits `[`, `\`, `]` and `^`.
# - Groups are non-capturing so `str.contains` does not warn about match groups.
web_pattern = r'(?:https?://)?[a-zA-Z0-9./?:@\-_=#]+\.[a-zA-Z]{2,6}[a-zA-Z0-9.&/?:@\-_=#]*'

# Boolean mask of rows whose token contains a web-like substring. It is named
# so that it no longer shadows the builtin `filter`; `na=False` keeps rows with
# a missing token instead of producing a NaN mask entry.
is_web_token = ner_df['Word'].str.contains(web_pattern, regex=True, na=False)
ner_df = ner_df[~is_web_token]

  


In [None]:
# Lower-case every token. `assign` builds a new frame instead of writing a
# column into `ner_df`, which at this point is the result of a boolean filter
# (column assignment on such a frame triggers SettingWithCopyWarning).
ner_df = ner_df.assign(Word=ner_df['Word'].str.lower())

In [None]:
ner_df.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,thousands,O
1,Sentence: 1,of,O
2,Sentence: 1,demonstrators,O
3,Sentence: 1,have,O
4,Sentence: 1,marched,O


## Data cleaning

In [None]:
# One row per sentence: the tokens and their tags are each joined into a single
# space-separated string. The columns are selected with a list (`[[...]]`);
# selecting several GroupBy columns with a bare tuple is deprecated in pandas.
ner_grouped_df = (
    ner_df
    .groupby(['Sentence #'], as_index=False)[['Word', 'Tag']]
    .agg(lambda values: ' '.join(values))
    .rename(columns={"Word": "Word_sequence", "Tag": "Tag_sequence"})
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
ner_grouped_df.head()

Unnamed: 0,Sentence #,Word_sequence,Tag_sequence
0,Sentence: 1,thousands of demonstrators have marched throug...,O O O O O O B-geo O O O O O B-geo O O O O O B-...
1,Sentence: 10,iranian officials say they expect to get acces...,B-gpe O O O O O O O O O O O O O O B-tim O O O ...
2,Sentence: 100,helicopter gunships saturday pounded militant ...,O O B-tim O O O O O B-geo O O O O O B-org O O ...
3,Sentence: 1000,they left after a tense hour-long standoff wit...,O O O O O O O O O O O
4,Sentence: 10000,u.n. relief coordinator jan egeland said sunda...,B-geo O O B-per I-per O B-tim O B-geo O B-gpe ...


## Transformación de los datos

In [None]:
# Word level tokenizer.
# `filters=''` disables Keras' default punctuation stripping. With the default
# filters, tokens such as "," or "." disappear and tokens such as "hour-long"
# are split in two, so the word-id sequence stops lining up one-to-one with the
# per-word tag sequence and with the character arrays (both of which are built
# from whitespace-separated tokens).
word_tokenizer = Tokenizer(lower=False, filters='', split=' ', oov_token="<OOV>")
word_tokenizer.fit_on_texts(ner_grouped_df['Word_sequence'])

In [None]:
# Character-level vocabulary, fitted on the same sentence strings as the word
# tokenizer; 'UNK' is the out-of-vocabulary placeholder.
char_tokenizer = Tokenizer(oov_token='UNK', char_level=True)
char_tokenizer.fit_on_texts(ner_grouped_df['Word_sequence'])

In [None]:
# Tag vocabulary. The filter list contains no '-', so hyphenated tags such as
# "B-geo" are kept as single tokens; case is preserved.
tag_tokenizer = Tokenizer(filters='!"#$%&()*+,./:;<=>?@[\\]^_`{|}~\t\n', lower=False)
tag_tokenizer.fit_on_texts(ner_grouped_df['Tag_sequence'])

In [None]:
ntags = len(tag_tokenizer.word_index)
char_vocab_size = len(char_tokenizer.word_index)

In [None]:
# Scan every whitespace-separated token and remember the longest one seen so
# far (first occurrence wins on ties), together with the sentence it came from.
max_word, _word, sentence_ = 0, None, None
for sentence in ner_grouped_df['Word_sequence']:
  for word in sentence.split():
    word_length = len(word)
    if word_length <= max_word:
      continue
    max_word, _word, sentence_ = word_length, word, sentence

In [None]:
max_word

27

In [None]:
# Length (in word-tokenizer tokens) of the longest sentence in the corpus.
max_sequence = max(map(len, word_tokenizer.texts_to_sequences(ner_grouped_df['Word_sequence'])))

In [None]:
print("number of tags:", ntags)
print("max length of the word sequence:", max_sequence)
print('Max length of word:', max_word)

number of tags: 17
max length of the word sequence: 89
Max length of word: 27


In [None]:
# Integer-encode the tag strings and right-pad them with 0 up to max_sequence,
# then one-hot encode each sentence. `ntags + 1` classes are used because id 0
# is the padding value.
z = pad_sequences(
    tag_tokenizer.texts_to_sequences(ner_grouped_df['Tag_sequence']),
    maxlen=max_sequence,
    padding='post',
)
y = [to_categorical(tag_ids, num_classes=ntags + 1) for tag_ids in z]

Muestra de entrenamiento, validación y test

In [None]:
X = ner_grouped_df['Word_sequence']

# First cut: 80% train, 20% held out (shuffled, fixed seed).
X_train, X_, y_train, y_ = train_test_split(
    X, y,
    test_size=0.20,
    shuffle=True,
    random_state=1,
)

# Second cut: the held-out part is halved into validation and test.
X_cv, X_test, y_cv, y_test = train_test_split(
    X_, y_,
    test_size=0.50,
    random_state=1,
)

In [None]:
print("Shape of training data:", X_train.shape)
print("Shape of validation data:", X_cv.shape)
print("Shape of test data:", X_test.shape)

Shape of training data: (38367,)
Shape of validation data: (4796,)
Shape of test data: (4796,)


In [None]:
def build_char_sequences(X_sentences, char_tokenizer, max_word, max_sequence, truncating='pre'):
  """Encode every word of every sentence as a fixed-length row of character ids.

  Args:
    X_sentences: sized iterable (pandas Series, list, array, ...) of sentences
      whose words are separated by whitespace.
    char_tokenizer: object exposing ``texts_to_sequences(list_of_str)`` that
      returns one list of integer ids per input string.
    max_word: number of character slots per word. Longer words are cut at the
      end, shorter words are zero-padded on the right.
    max_sequence: number of word slots per sentence. Shorter sentences are
      zero-padded at the end.
    truncating: which side to drop when a sentence has more than
      ``max_sequence`` words. ``'pre'`` keeps the last words, ``'post'`` keeps
      the first ones. The default is ``'pre'`` because that is what
      ``pad_sequences`` does by default for the word-id and tag arrays built
      elsewhere in this notebook; using the same side keeps the three arrays
      aligned position by position.

  Returns:
    Integer ndarray of shape ``(len(X_sentences), max_sequence, max_word)``.

  Raises:
    ValueError: if ``truncating`` is neither ``'pre'`` nor ``'post'``.
  """
  if truncating not in ('pre', 'post'):
    raise ValueError("truncating must be 'pre' or 'post', got %r" % (truncating,))

  pad_char_rep = np.zeros(shape=(len(X_sentences), max_sequence, max_word), dtype=int)
  if max_sequence <= 0 or max_word <= 0:
    return pad_char_rep

  for idx, sentence in enumerate(X_sentences):
    words = sentence.split()
    if len(words) > max_sequence:
      words = words[-max_sequence:] if truncating == 'pre' else words[:max_sequence]
    if not words:
      continue
    # One tokenizer call per sentence instead of one call per word.
    char_ids = char_tokenizer.texts_to_sequences([word[:max_word] for word in words])
    for position, ids in enumerate(char_ids):
      ids = ids[:max_word]
      if ids:
        pad_char_rep[idx, position, :len(ids)] = ids
  return pad_char_rep

## Muestras de entrenamiento, validación y test

In [None]:
def get_train_dev_test_data(X_train, X_cv, X_test, word_tokenizer, char_tokenizer):
  """Turn the three sentence splits into padded word-id and char-id arrays.

  The target lengths are read from the module-level names ``max_sequence`` and
  ``max_word``; they are not parameters of this function.

  Returns:
    A 6-tuple ``(X_word_train, X_word_cv, X_word_test,
    X_char_train, X_char_cv, X_char_test)``.
  """
  splits = (X_train, X_cv, X_test)

  # Word level: integer ids, right-padded to max_sequence.
  word_arrays = [
      pad_sequences(word_tokenizer.texts_to_sequences(split), maxlen=max_sequence, padding='post')
      for split in splits
  ]

  # Character level: one row of character ids per word slot.
  char_arrays = [
      build_char_sequences(split, char_tokenizer, max_word=max_word, max_sequence=max_sequence)
      for split in splits
  ]
  return tuple(word_arrays + char_arrays)

In [None]:
X_word_train, X_word_cv, X_word_test, X_char_train, X_char_cv, X_char_test = get_train_dev_test_data(X_train, X_cv, X_test, word_tokenizer, char_tokenizer)

In [None]:
# Report the shape of each encoded split. The first line of each group
# describes the *training* arrays (it was previously labelled "test").
print('Shape of the word train representation: ', X_word_train.shape)
print('Shape of the word validation representation: ', X_word_cv.shape)
print('Shape of the word test representation: ', X_word_test.shape)
print('Shape of the char train representation: ', X_char_train.shape)
print('Shape of the char validation representation: ', X_char_cv.shape)
print('Shape of the char test representation: ', X_char_test.shape)

Shape of the word test representation:  (38367, 89)
Shape of the word validation representation:  (4796, 89)
Shape of the word test representation:  (4796, 89)
Shape of the char test representation:  (38367, 89, 27)
Shape of the char validation representation:  (4796, 89, 27)
Shape of the char test representation:  (4796, 89, 27)


## Construcción del modelo

In [None]:
def create_word_embedding_layer(tokenized_corpus, glove_dir, embedding_dim, trainable):
  """Build the embedding weight matrix for a fitted tokenizer from a GloVe text file.

  Despite the name, this returns the weight matrix, not a Keras layer.

  Args:
    tokenized_corpus: fitted tokenizer; only its ``word_index`` mapping
      (word -> positive integer id) is read.
    glove_dir: path to the GloVe text file (one ``word v1 v2 ...`` entry per
      line), not a directory.
    embedding_dim: number of components of each vector in that file.
    trainable: accepted for backward compatibility; not used by this function.

  Returns:
    ndarray of shape ``(len(word_index) + 1, embedding_dim)``. Row ``i`` holds
    the vector of the word with id ``i``; rows for words missing from the file,
    and row 0, are left at zero.

  Raises:
    ValueError: if a vocabulary word's vector does not have ``embedding_dim``
      components.
  """
  embeddings_index = {}

  # The encoding is explicit so the result does not depend on the platform
  # default; `with` closes the handle even if a line fails to parse.
  with open(glove_dir, encoding='utf-8') as glove_file:
    for line in glove_file:
      values = line.split()
      if not values:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

  print('Found %s word vectors.' % len(embeddings_index))

  vocab_size = len(tokenized_corpus.word_index) + 1
  word_embedding_matrix = np.zeros((vocab_size, embedding_dim))
  for word, i in tokenized_corpus.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
      continue
    if embedding_vector.shape[0] != embedding_dim:
      # Fail loudly instead of relying on (or being fooled by) broadcasting.
      raise ValueError(
          'Vector for %r has %d components, expected %d'
          % (word, embedding_vector.shape[0], embedding_dim))
    word_embedding_matrix[i] = embedding_vector

  return word_embedding_matrix

In [None]:
#!unzip gdrive/MyDrive/ColabNotebooks/NER/glove*.zip

Archive:  gdrive/MyDrive/ColabNotebooks/NER/glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
embedding_dim = 100
word_embedding_matrix = create_word_embedding_layer(word_tokenizer, glove_dir = "glove.6B.100d.txt", embedding_dim = embedding_dim, trainable = False)

Found 400000 word vectors.


In [None]:
def get_CNN_BiLSTM_CRF_model(max_sequence, max_word, word_embedding_matrix, word_embedding_dim, char_embedding_dim, word_vocab_size, char_vocab_size, num_tags):
  """Assemble the two-input tagger: word embeddings + char CNN -> BiLSTM -> CRF.

  Args:
    max_sequence: number of word slots per sentence.
    max_word: number of character slots per word.
    word_embedding_matrix: initial weights for the (frozen) word embedding.
    word_embedding_dim: output size of the word embedding.
    char_embedding_dim: output size of the (trainable) character embedding.
    word_vocab_size: word vocabulary size; the embedding gets one extra row.
    char_vocab_size: character vocabulary size; the embedding gets one extra row.
    num_tags: number of tags; the CRF gets one extra unit.

  Returns:
    An uncompiled ``Model`` taking ``[word_ids, char_ids]`` and returning the
    output of the CRF layer, which is named "output".
  """
  # Word branch: frozen embedding initialised from the supplied matrix.
  word_input = Input(shape=(max_sequence,))
  word_vectors = Embedding(
      input_dim=word_vocab_size + 1,
      output_dim=word_embedding_dim,
      weights=[word_embedding_matrix],
      mask_zero=True,
      trainable=False,
  )(word_input)

  # Character branch: the same trainable embedding is applied to every word slot.
  char_input = Input(shape=(max_sequence, max_word))
  char_features = TimeDistributed(
      Embedding(
          input_dim=char_vocab_size + 1,
          output_dim=char_embedding_dim,
          mask_zero=True,
          trainable=True,
      )
  )(char_input)

  # Two Conv1D + MaxPooling1D stages, applied per word slot.
  for n_filters, kernel_width in ((30, 5), (15, 3)):
    char_features = TimeDistributed(
        Conv1D(filters=n_filters, kernel_size=kernel_width, activation='relu')
    )(char_features)
    char_features = TimeDistributed(MaxPooling1D(pool_size=2))(char_features)

  # Flatten each word's feature map and join it with that word's embedding.
  char_features = TimeDistributed(Flatten())(char_features)
  merged = concatenate([word_vectors, char_features])

  encoded = Bidirectional(LSTM(10, return_sequences=True), merge_mode='concat')(merged)
  tag_scores = CRF(num_tags + 1, name="output")(encoded)
  return Model(inputs=[word_input, char_input], outputs=tag_scores)

In [None]:
model = get_CNN_BiLSTM_CRF_model(max_sequence=max_sequence, max_word=max_word, word_embedding_matrix=word_embedding_matrix, word_embedding_dim=embedding_dim, char_embedding_dim=30, word_vocab_size=len(word_tokenizer.word_index), char_vocab_size=len(char_tokenizer.word_index), num_tags=ntags)

Instructions for updating:
Colocations handled automatically by placer.


In [None]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 89, 27)       0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 89, 27, 30)   2220        input_2[0][0]                    
__________________________________________________________________________________________________
time_distributed_2 (TimeDistrib (None, 89, 23, 30)   4530        time_distributed_1[0][0]         
__________________________________________________________________________________________________
time_distributed_3 (TimeDistrib (None, 89, 11, 30)   0           time_distributed_2[0][0]         
__________________________________________________________________________________________________
time_distr

In [None]:
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
#from keras_contrib.losses import crf_loss3
#from keras_contrib.metrics import crf_accuracy

In [None]:
model.compile(optimizer=Adam(lr=0.001), loss=crf_loss, metrics=[crf_accuracy])

In [None]:
model.fit([X_word_train, X_char_train], 
          np.array(y_train),
          batch_size=32,
          epochs=20,
          #verbose=1,
          validation_data=([X_word_cv, X_char_cv], np.array(y_cv))
          )

Train on 38367 samples, validate on 4796 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f2ac6e04210>

In [None]:
model.save('gdrive/MyDrive/ColabNotebooks/NER/ner_model.h5')

In [None]:
# Predictions
p = model.predict([X_word_test, X_char_test])

In [None]:
n = len(y_test)
y_test_ar = np.reshape(np.array(y_test, dtype=int), newshape=(n, max_sequence, ntags+1))

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Flatten gold and predicted tag ids to one entry per timestep.
y_true_ids = np.argmax(y_test, 2).ravel()
y_pred_ids = np.argmax(p, axis=2).ravel()

# Id 0 is the padding value of the tag arrays, so a gold id of 0 marks a padded
# timestep. Those positions are dropped so that whatever the model outputs on
# padding cannot count as a false positive for a real tag.
real_tokens = y_true_ids != 0

print(classification_report(
    y_true_ids[real_tokens],
    y_pred_ids[real_tokens],
    labels=list(tag_tokenizer.index_word.keys()),
    target_names=list(tag_tokenizer.index_word.values()),
))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           O       0.98      0.99      0.98     88868
       B-geo       0.83      0.85      0.84      3703
       B-tim       0.90      0.84      0.87      2056
       B-org       0.70      0.55      0.61      1975
       I-per       0.80      0.85      0.82      1696
       B-per       0.79      0.76      0.78      1655
       I-org       0.57      0.60      0.58      1671
       B-gpe       0.95      0.88      0.91      1555
       I-geo       0.75      0.68      0.71       757
       I-tim       0.76      0.71      0.74       702
       B-art       0.00      0.00      0.00        53
       B-eve       0.56      0.21      0.31        43
       I-art       0.00      0.00      0.00        58
       I-eve       0.33      0.23      0.27        40
       B-nat       0.57      0.18      0.28        22
       I-gpe       0.71      0.45      0.56        11
       I-nat       0.00      0.00      0.00         7

   micro avg       0.95   

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#y_test_seq = np.argmax(y_test, axis=-1)
#y_test_tag = tag_tokenizer.sequences_to_texts(y_test_seq)
#y_pred_tag = tag_tokenizer.sequences_to_texts(p)