In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package punkt to
[nltk_data]     /home/randomspace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import tensorflow as tf
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")

Default GPU Device: /device:GPU:0


In [3]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
data = data.drop(['POS'], axis =1)
data = data.fillna(method="ffill")
data.head(10)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Thousands,O
1,Sentence: 1,of,O
2,Sentence: 1,demonstrators,O
3,Sentence: 1,have,O
4,Sentence: 1,marched,O
5,Sentence: 1,through,O
6,Sentence: 1,London,B-geo
7,Sentence: 1,to,O
8,Sentence: 1,protest,O
9,Sentence: 1,the,O


In [4]:
print(data.loc[data['Tag'] == 'I-art'][0:10])

         Sentence #               Word    Tag
264    Sentence: 12  Non-Proliferation  I-art
3811  Sentence: 171                V-6  I-art
4016  Sentence: 183             Simple  I-art
4017  Sentence: 183               Life  I-art
4142  Sentence: 188            Morning  I-art
4143  Sentence: 188            America  I-art
5248  Sentence: 236             Mirror  I-art
5923  Sentence: 270                 De  I-art
5924  Sentence: 270             Gaulle  I-art
5935  Sentence: 270      International  I-art


In [5]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [6]:
getter = SentenceGetter(data)
sentences = getter.sentences

In [7]:
print(sentences[0])

[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('London', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('Iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('British', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')]


In [8]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words)
print(n_words)
print(words)

35179


In [9]:
tags = list(set(data["Tag"].values))
n_tags = len(tags)
print(n_tags)
print(tags)

17
['B-eve', 'I-per', 'I-art', 'I-eve', 'B-geo', 'B-org', 'I-org', 'O', 'B-art', 'I-tim', 'I-nat', 'I-gpe', 'B-gpe', 'B-tim', 'B-per', 'B-nat', 'I-geo']


In [10]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [11]:
max_len = 50

X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post",value=word2idx["ENDPAD"])

y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

y = [to_categorical(i, num_classes=n_tags) for i in y]

In [12]:
print(X[0])

[25786 12164 17580 22723 15261  2862 21460 12051   417 15933 19456 31469
 17535  8671 18323 15933 30977 12164  5210  5342  2398 11454  2018 35043
 35178 35178 35178 35178 35178 35178 35178 35178 35178 35178 35178 35178
 35178 35178 35178 35178 35178 35178 35178 35178 35178 35178 35178 35178
 35178 35178]


In [13]:
print(y[0])

[[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.

In [14]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size = 0.2)

In [15]:
print(len(X_tr))
print(len(y_tr))

38367
38367


In [16]:
print(len(X_te))
print(len(y_te))

9592
9592


In [17]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=max_len, input_length=max_len)(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=400, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)

model = Model(input, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

W0820 17:07:29.266101 139759870887744 deprecation_wrapper.py:119] From /home/randomspace/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0820 17:07:29.269333 139759870887744 deprecation_wrapper.py:119] From /home/randomspace/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0820 17:07:29.279546 139759870887744 deprecation_wrapper.py:119] From /home/randomspace/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0820 17:07:29.292844 139759870887744 deprecation_wrapper.py:119] From /home/randomspace/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.p

In [18]:
# Train model
history = model.fit(X_tr,\
                    np.array(y_tr),\
                    batch_size = 32,\
                    epochs = 3,\
                    validation_split = 0.2,\
                    verbose = 1)

W0820 17:09:02.556863 139759870887744 deprecation.py:323] From /home/randomspace/.local/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 30693 samples, validate on 7674 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [19]:
model.summary()
model.save('model.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 50)            1758950   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 800)           1443200   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 50, 17)            13617     
Total params: 3,215,767
Trainable params: 3,215,767
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Test model
test_pred = model.predict(X_te, verbose=1)



In [21]:
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        for p in pred_i:
            p_i = np.argmax(p, axis = -1)
            out.append(idx2tag[p_i])
    return out

pred_labels = pred2label(test_pred)
true_labels = pred2label(y_te)

print(classification_report(true_labels, pred_labels))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        86
       B-eve       0.83      0.26      0.40        57
       B-geo       0.85      0.89      0.87      7452
       B-gpe       0.96      0.93      0.95      3097
       B-nat       1.00      0.10      0.19        39
       B-org       0.83      0.67      0.74      4005
       B-per       0.84      0.81      0.82      3254
       B-tim       0.91      0.89      0.90      4002
       I-art       0.00      0.00      0.00        62
       I-eve       0.00      0.00      0.00        43
       I-geo       0.77      0.81      0.79      1474
       I-gpe       0.76      0.53      0.63        30
       I-nat       0.00      0.00      0.00        11
       I-org       0.78      0.76      0.77      3252
       I-per       0.85      0.86      0.85      3286
       I-tim       0.81      0.76      0.78      1310
           O       0.99      1.00      1.00    448140

    accuracy              

In [34]:
# Test user's text
test_text = "The Pretty Little Liars alum did a little shopping in London’s Covent Garden on a recent trip across the pond."

In [35]:
list_tokens = word_tokenize(test_text)
X_test = []
for token in list_tokens:
    if token not in word2idx:
        word2idx[token] = len(word2idx)
        words.append(token)
    X_test.append(word2idx[token])

X_test = pad_sequences(maxlen=max_len, sequences=[X_test], padding="post",value=word2idx["ENDPAD"])

In [36]:
p = model.predict(X_test)
p = np.argmax(p, axis=-1)
print("{:14} ({:4})".format("Word", "Pred"))
for w,pred in zip(X_test[0],p[0]):
    print("{:14}: {}".format(words[w],tags[pred]))

Word           (Pred)
The           : O
Pretty        : O
Little        : O
Liars         : O
alum          : O
did           : O
a             : O
little        : O
shopping      : O
in            : O
London        : B-geo
’             : O
s             : I-org
Covent        : I-org
Garden        : I-org
on            : O
a             : O
recent        : O
trip          : O
across        : O
the           : O
pond          : O
.             : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
