In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

data = pd.read_csv("dataset kaggle/ner_dataset.csv", encoding="latin1").fillna(method="ffill")
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [4]:
words = list(set(data["Word"].values))
n_words = len(words); n_words

35178

In [6]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags

17

In [7]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [8]:
getter = SentenceGetter(data)
sentences = getter.sentences

In [9]:
labels = [[s[2] for s in sent] for sent in sentences]
sentences = [" ".join([s[0] for s in sent]) for sent in sentences]
sentences[0]

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [11]:
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [12]:
from collections import Counter
from keras.preprocessing.sequence import pad_sequences

word_cnt = Counter(data["Word"].values)
vocabulary = set(w[0] for w in word_cnt.most_common(5000))

Using TensorFlow backend.


In [13]:
max_len = 50
word2idx = {"PAD": 0, "UNK": 1}
word2idx.update({w: i for i, w in enumerate(words) if w in vocabulary})
tag2idx = {t: i for i, t in enumerate(tags)}

In [16]:
X = [[word2idx.get(w, word2idx["UNK"]) for w in s.split()] for s in sentences]

In [17]:
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word2idx["PAD"])

In [18]:
y = [[tag2idx[l_i] for l_i in l] for l in labels]

In [19]:
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [20]:
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, shuffle=False)

In [28]:
from tensorflow.keras.models import Model
from tensorflow.keras import Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, SpatialDropout1D, Bidirectional, Input

In [30]:
word_input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(word_input)
model = SpatialDropout1D(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)



In [31]:
model = Model(word_input, out)
model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [32]:
history = model.fit(X_tr, y_tr.reshape(*y_tr.shape, 1),
                    batch_size=32, epochs=5,
                    validation_split=0.1, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [34]:
from eli5.lime import TextExplainer
from eli5.lime.samplers import MaskingTextSampler



In [35]:
class NERExplainerGenerator(object):
    
    def __init__(self, model, word2idx, tag2idx, max_len):
        self.model = model
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.idx2tag = {v: k for k,v in tag2idx.items()}
        self.max_len = max_len
        
    def _preprocess(self, texts):
        X = [[self.word2idx.get(w, self.word2idx["UNK"]) for w in t.split()]
             for t in texts]
        X = pad_sequences(maxlen=self.max_len, sequences=X,
                          padding="post", value=self.word2idx["PAD"])
        return X
    
    def get_predict_function(self, word_index):
        def predict_func(texts):
            X = self._preprocess(texts)
            p = self.model.predict(X)
            return p[:,word_index,:]
        return predict_func

In [36]:
index = 46781
label = labels[index]
text = sentences[index]
print(text)
print()
print(" ".join([f"{t} ({l})" for t, l in zip(text.split(), label)]))

Nigeria 's President Olusegun Obasanjo expressed his condolences , noting the late pontiff promoted religious tolerance and democracy in the West African nation .

Nigeria (B-geo) 's (O) President (B-per) Olusegun (I-per) Obasanjo (I-per) expressed (O) his (O) condolences (O) , (O) noting (O) the (O) late (O) pontiff (O) promoted (O) religious (O) tolerance (O) and (O) democracy (O) in (O) the (O) West (O) African (B-gpe) nation (O) . (O)


In [37]:
for i, w in enumerate(text.split()):
    print(f"{i}: {w}")

0: Nigeria
1: 's
2: President
3: Olusegun
4: Obasanjo
5: expressed
6: his
7: condolences
8: ,
9: noting
10: the
11: late
12: pontiff
13: promoted
14: religious
15: tolerance
16: and
17: democracy
18: in
19: the
20: West
21: African
22: nation
23: .


In [38]:
explainer_generator = NERExplainerGenerator(model, word2idx, tag2idx, max_len)

In [40]:
word_index = 4
predict_func = explainer_generator.get_predict_function(word_index=word_index)

In [41]:
sampler = MaskingTextSampler(
    replacement="UNK",
    max_replace=0.7,
    token_pattern=None,
    bow=False
)

In [42]:
samples, similarity = sampler.sample_near(text, n_samples=4)
print(samples)

("Nigeria 's President UNK UNK expressed UNK UNK , noting the late pontiff promoted religious UNK and UNK in UNK UNK UNK nation .", "UNK 's UNK Olusegun Obasanjo UNK his UNK , UNK the late pontiff promoted UNK tolerance UNK UNK UNK the West African UNK .", "Nigeria 'UNK President Olusegun UNK expressed UNK UNK , UNK the late pontiff promoted UNK tolerance UNK democracy in UNK West African UNK .", "Nigeria 'UNK President UNK Obasanjo UNK UNK condolences , noting UNK UNK pontiff UNK religious UNK and democracy UNK UNK West African UNK .")


In [43]:
te = TextExplainer(
    sampler=sampler,
    position_dependent=True,
    random_state=42
)

te.fit(text, predict_func)

te.explain_prediction(
    target_names=list(explainer_generator.idx2tag.values()),
    top_targets=3
)

Contribution?,Feature
4.131,Highlighted in text (sum)
-0.391,<BIAS>

Contribution?,Feature
-1.178,<BIAS>
-2.86,Highlighted in text (sum)

Contribution?,Feature
-0.319,Highlighted in text (sum)
-4.008,<BIAS>


In [46]:
preds = model.predict(X_te)

In [64]:
tags[np.argmax(preds[0][0])], X_te,
word2idx
search_number = 3505
for name, age in word2idx.items():
    if age == search_number:
        print(name)

The


In [65]:
model.save('ModeloTestInicialExemploInternet.h5')

In [66]:
import tensorflow as tf
model_load = tf.keras.models.load_model('ModeloTestInicialExemploInternet.h5') 



In [67]:
model_load.predict(X_te[0])



array([[[1.15639505e-05, 1.13679875e-04, 4.71319845e-06, 2.77629556e-06,
         1.15233520e-03, 5.08128178e-05, 2.07925332e-05, 9.81708581e-05,
         2.84267444e-05, 1.21134457e-04, 2.90967035e-03, 1.06746134e-04,
         5.29559693e-05, 2.60413032e-07, 2.27524160e-05, 9.94926572e-01,
         3.76516691e-04]],

       [[2.50009180e-05, 3.69176298e-04, 1.61493681e-05, 6.19811863e-06,
         6.04283880e-04, 4.33768328e-05, 6.59549478e-05, 9.02706015e-05,
         2.06063996e-05, 6.09474140e-04, 4.76114277e-04, 4.72351239e-04,
         5.60766493e-05, 1.30801925e-06, 4.74560329e-05, 9.96962965e-01,
         1.33247391e-04]],

       [[7.11442726e-06, 2.25686748e-03, 1.98420676e-05, 4.63881133e-06,
         4.05521365e-04, 2.55815357e-05, 3.24534689e-04, 3.56102741e-04,
         4.21336199e-06, 1.15333882e-04, 7.61578325e-04, 7.48601960e-05,
         3.75192067e-05, 6.52009362e-07, 2.03205145e-05, 9.95456696e-01,
         1.28622793e-04]],

       [[2.08730100e-07, 2.04193184e-05,