In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

Using TensorFlow backend.
[nltk_data] Downloading package punkt to /home/wenbo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
data = data.drop(['POS'], axis =1)
data = data.fillna(method="ffill")
data.head(10)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Thousands,O
1,Sentence: 1,of,O
2,Sentence: 1,demonstrators,O
3,Sentence: 1,have,O
4,Sentence: 1,marched,O
5,Sentence: 1,through,O
6,Sentence: 1,London,B-geo
7,Sentence: 1,to,O
8,Sentence: 1,protest,O
9,Sentence: 1,the,O


In [3]:
print(data.loc[data['Tag'] == 'I-art'][0:10])

         Sentence #               Word    Tag
264    Sentence: 12  Non-Proliferation  I-art
3811  Sentence: 171                V-6  I-art
4016  Sentence: 183             Simple  I-art
4017  Sentence: 183               Life  I-art
4142  Sentence: 188            Morning  I-art
4143  Sentence: 188            America  I-art
5248  Sentence: 236             Mirror  I-art
5923  Sentence: 270                 De  I-art
5924  Sentence: 270             Gaulle  I-art
5935  Sentence: 270      International  I-art


In [4]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [5]:
getter = SentenceGetter(data)
sentences = getter.sentences

In [6]:
print(sentences[0])

[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('London', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('Iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('British', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')]


In [7]:
def tag_text(text):
    sentence = []
    for word in text.split():
        if word.startswith("*"):
            sentence.append((word[1:], "B-geo"))
        elif word.startswith("!"):
            sentence.append((word[1:], "I-art"))
        else:
            sentence.append((word, "O"))
    return sentence

In [8]:
train_text = "I lived in *Munich last summer. *Germany has a relaxing, slow summer lifestyle. One night, I got food poisoning and couldn't find !Tylenol to make the pain go away, they insisted I take !aspirin instead."

In [9]:
sentence = tag_text(train_text)
print(sentence)

[('I', 'O'), ('lived', 'O'), ('in', 'O'), ('Munich', 'B-geo'), ('last', 'O'), ('summer.', 'O'), ('Germany', 'B-geo'), ('has', 'O'), ('a', 'O'), ('relaxing,', 'O'), ('slow', 'O'), ('summer', 'O'), ('lifestyle.', 'O'), ('One', 'O'), ('night,', 'O'), ('I', 'O'), ('got', 'O'), ('food', 'O'), ('poisoning', 'O'), ('and', 'O'), ("couldn't", 'O'), ('find', 'O'), ('Tylenol', 'I-art'), ('to', 'O'), ('make', 'O'), ('the', 'O'), ('pain', 'O'), ('go', 'O'), ('away,', 'O'), ('they', 'O'), ('insisted', 'O'), ('I', 'O'), ('take', 'O'), ('aspirin', 'I-art'), ('instead.', 'O')]


In [10]:
sentences.append(sentence)

In [11]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
for word in sentence:
    if word[0] not in words:
        words.append(word[0])
n_words = len(words)

In [12]:
tags = list(set(data["Tag"].values))
n_tags = len(tags)

In [13]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [14]:
max_len = 50

X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post",value=word2idx["ENDPAD"])

y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

y = [to_categorical(i, num_classes=n_tags) for i in y]

In [15]:
X_tr, X_te, y_tr, y_te = train_test_split(X[0:-2], y[0:-2], test_size = 0.2)

In [16]:
print(len(X_tr))
print(len(y_tr))

38366
38366


In [17]:
#Augment training data with user's input data
X_tr = np.vstack((X_tr, X[-1]))
y_tr.append(y[-1])

In [18]:
print(len(X_tr))
print(len(y_tr))

38367
38367


In [19]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=max_len, input_length=max_len)(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)

model = Model(input, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [20]:
# Train model
history = model.fit(X_tr,\
                    np.array(y_tr),\
                    batch_size = 32,\
                    epochs = 3,\
                    validation_split = 0.2,\
                    verbose = 1)

Train on 30693 samples, validate on 7674 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [21]:
model.summary()
model.save('model.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 50)            1759400   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 200)           120800    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 50, 17)            3417      
Total params: 1,883,617
Trainable params: 1,883,617
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Test model
test_pred = model.predict(np.array(X_te), verbose=1)



In [23]:
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        for p in pred_i:
            p_i = np.argmax(p, axis = -1)
            out.append(idx2tag[p_i])
    return out

pred_labels = pred2label(test_pred)
true_labels = pred2label(y_te)

print(classification_report(true_labels, pred_labels))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        81
       B-eve       1.00      0.03      0.06        67
       B-geo       0.88      0.85      0.86      7466
       B-gpe       0.95      0.93      0.94      3177
       B-nat       0.00      0.00      0.00        35
       B-org       0.75      0.69      0.72      4013
       B-per       0.82      0.81      0.81      3419
       B-tim       0.91      0.87      0.89      4053
       I-art       0.00      0.00      0.00        48
       I-eve       0.00      0.00      0.00        61
       I-geo       0.81      0.76      0.78      1454
       I-gpe       1.00      0.23      0.37        31
       I-nat       0.00      0.00      0.00         8
       I-org       0.76      0.77      0.77      3452
       I-per       0.85      0.85      0.85      3539
       I-tim       0.82      0.70      0.75      1248
           O       0.99      1.00      1.00    447448

   micro avg       0.98   

In [24]:
# Test user's text
test_text = "When I lived in Paris last year, France was experiencing a recession. The night life was too fun, I developed an addiction to Adderall and Ritalin."

In [29]:
list_tokens = word_tokenize(test_text)
X_test = []
for token in list_tokens:
    if token not in word2idx:
        word2idx[token] = len(word2idx)
        words.append(token)
    X_test.append(word2idx[token])

X_test = pad_sequences(maxlen=max_len, sequences=[X_test], padding="post",value=word2idx["ENDPAD"])

In [33]:
i = 0
p = model.predict(np.array(X_test))
p = np.argmax(p, axis=-1)
print("{:14} ({:4})".format("Word", "Pred"))
for w,pred in zip(X_test[i],p[0]):
    print("{:14}: {}".format(words[w],tags[pred]))

Word           (Pred)
When          : O
I             : O
lived         : O
in            : O
Paris         : B-geo
last          : O
year          : O
,             : O
France        : B-geo
was           : O
experiencing  : O
a             : O
recession     : O
.             : O
The           : O
night         : O
life          : O
was           : O
too           : O
fun           : O
,             : O
I             : O
developed     : O
an            : O
addiction     : O
to            : O
Adderall      : O
and           : O
Ritalin       : O
.             : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
ENDPAD        : O
