# Load the data

In [1]:
import pandas as pd
import numpy as np
import os

os.chdir("d:\\School Stuff\\SEM_5\\Big Data Analysis\\科研实践 ——————Research Practice")
data = pd.read_csv("practice course/data/ner_dataset.csv", encoding="latin1")

In [2]:
data = data.fillna(method="ffill")


In [3]:
data.tail(10)


Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [4]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words


35179

In [5]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags


17

In [6]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None


In [7]:
getter = SentenceGetter(data)


In [8]:
sent = getter.get_next()


In [9]:
print(sent)


[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [10]:
sentences = getter.sentences


In [11]:
max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}


In [12]:
word2idx["Obama"]


31152

In [13]:
tag2idx["B-geo"]


3

# Tokenize and prepare the sentences

In [14]:
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w[0]] for w in s] for s in sentences]


Using TensorFlow backend.


In [15]:
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)


In [16]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]


In [17]:
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])


In [18]:
from keras.utils import to_categorical


In [19]:
y = [to_categorical(i, num_classes=n_tags) for i in y]


In [20]:
from sklearn.model_selection import train_test_split


In [21]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)


# Train the model

In [22]:
from keras.models import Sequential
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

In [23]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=20,
                  input_length=max_len)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [24]:
model = Model(input, out)

In [25]:
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])


In [26]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 75)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 75, 20)            703600    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 75, 100)           28400     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 75, 50)            5050      
_________________________________________________________________
crf_1 (CRF)                  (None, 75, 17)            1190      
Total params: 738,240
Trainable params: 738,240
Non-trainable params: 0
_________________________________________________________________


In [27]:
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=5,
                    validation_split=0.1, verbose=1)

Instructions for updating:
Use tf.cast instead.
Train on 38846 samples, validate on 4317 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [28]:
hist = pd.DataFrame(history.history)


In [29]:
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12, 12))
plt.plot(hist["crf_viterbi_accuracy"])
plt.plot(hist["val_crf_viterbi_accuracy"])
plt.show()


<Figure size 1200x1200 with 1 Axes>

# Evaluate

In [30]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report


In [31]:
test_pred = model.predict(X_te, verbose=1)




In [32]:
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)


In [33]:
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))


F1-score: 81.9%


In [34]:
print(classification_report(test_labels, pred_labels))


              precision    recall  f1-score   support

         art       0.00      0.00      0.00        36
         eve       0.00      0.00      0.00        35
         geo       0.81      0.89      0.85      3887
         gpe       0.96      0.93      0.94      1568
         nat       0.00      0.00      0.00        27
         org       0.72      0.63      0.67      2017
         per       0.77      0.76      0.76      1688
         tim       0.89      0.84      0.87      2020

   micro avg       0.82      0.81      0.82     11278
   macro avg       0.52      0.51      0.51     11278
weighted avg       0.82      0.81      0.81     11278



In [35]:
i = 1928
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_te[i], -1)
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_te[i], true, p[0]):
    if w != 0:
        print("{:15}: {:5} {}".format(words[w-1], tags[t], tags[pred]))


Word           ||True ||Pred
Russia         : B-geo B-geo
's             : O     O
state-run      : O     O
natural        : O     O
gas            : O     O
company        : O     O
,              : O     O
Gazprom        : B-org B-org
,              : O     O
cut            : O     O
off            : O     O
exports        : O     O
to             : O     O
Ukraine        : B-geo B-geo
Sunday         : B-tim B-tim
after          : O     O
Kiev           : B-org B-geo
refused        : O     O
a              : O     O
contract       : O     O
that           : O     O
increased      : O     O
the            : O     O
price          : O     O
by             : O     O
400            : O     O
percent        : O     O
.              : O     O


# Predict on new data

In [36]:
test_sentence = ["Hawking", "was", "a", "Fellow", "of", "the", "Royal", "Society", ",", "a", "lifetime", "member",
                 "of", "the", "Pontifical", "Academy", "of", "Sciences", ",", "and", "a", "recipient", "of",
                 "the", "Presidential", "Medal", "of", "Freedom", ",", "the", "highest", "civilian", "award",
                 "in", "the", "United", "States", "."]


In [37]:
x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in test_sentence]],
                            padding="post", value=0, maxlen=max_len)


In [38]:
tags


['B-gpe',
 'O',
 'I-gpe',
 'B-eve',
 'I-per',
 'I-tim',
 'B-org',
 'B-geo',
 'B-art',
 'I-art',
 'I-nat',
 'B-nat',
 'I-eve',
 'B-tim',
 'I-org',
 'I-geo',
 'B-per']

In [39]:
p = model.predict(np.array([x_test_sent[0]]))
p = np.argmax(p, axis=-1)
print("{:15}||{}".format("Word", "Prediction"))
print(30 * "=")
for w, pred in zip(test_sentence, p[0]):
    print("{:15}: {:5}".format(w, tags[pred]))


Word           ||Prediction
Hawking        : O    
was            : O    
a              : O    
Fellow         : O    
of             : O    
the            : O    
Royal          : B-org
Society        : I-org
,              : O    
a              : O    
lifetime       : O    
member         : O    
of             : O    
the            : O    
Pontifical     : B-org
Academy        : I-org
of             : I-org
Sciences       : I-org
,              : O    
and            : O    
a              : O    
recipient      : O    
of             : O    
the            : O    
Presidential   : O    
Medal          : O    
of             : O    
Freedom        : B-org
,              : O    
the            : O    
highest        : O    
civilian       : O    
award          : O    
in             : O    
the            : O    
United         : B-geo
States         : I-geo
.              : O    
