In [1]:
import spacy
from spacy.util import minibatch, compounding
from spacy.training import Example
import random
import warnings
from tqdm import tqdm
from itertools import chain
import numpy as np
import pandas as pd
import csv

Performed Named entity recognition using 2 models(Spacy's-CNN model & ELMo). Transfer learning was adopted for both cases.

METHOD:

• After performing EDA, I found out that words and tags are seperated by '\t' delimiters. And the lines are seperated by '\n'.

• The data was pre-processed 2 times to fit 2 different model architectures.

• The first model uses transfer learning on the "en_core_web_sm" model (a CNN model from the spaCy library).

• The evaluvation metrics used in the spacy model are MAE, precision, recall and f1 score.

• The loss was brought down to 931.53. Since this is a simple model, the metrics are not comparable to the sophisticated ELMo     model.

• The second model uses transfer learning on the ELMo model. It is a well-known Bi-directional language model trained on a billion   word dataset. I have added LSTM units to the same model to tailor our NER problem.

• The evaluvation metrics used in the ELMo model are MSE, precision, recall and f1 score.

• The loss and the accuracy for this model are 0.0467 and 0.9871.

• Clearly the ELMo model performs better due to it's complexity.

• But, both the models perform well on unseen data ('test.txt').

SPACY'S CONVOLUTIONAL NEURAL NETWORK

STEP:1 FUNCTION TO LOAD DATA & DO BASIC PRE-PROCESSING

In [2]:
def load_data_for_spacy(file_path):
    file = open(file_path, 'r')
    training_data, entities, sentence, unique_labels = [], [], [], []
    current_annotation = None
    start =0
    end = 0
    for line in file:
        line = line.strip().split("\t")
        # lines with len > 1 are words
        if len(line) > 1:
            label = line[1]
            if(label != 'O'):
                label = line[1]
            word = line[0]
            sentence.append(word)
            start = end
            end += (len(word) + 1)
           
            if label.startswith('I-') == True: 
                entities.append(( start,end-1, label))
                              
            if label.startswith('B-') == True:
                entities.append(( start,end-1, label))
                
           
           
            if label != 'O' and label not in unique_labels:
                unique_labels.append(label)
 
        if len(line) == 1:
            if(len(entities) > 0):
                sentence = " ".join(sentence)
                training_data.append([sentence, {'entities' : entities}])
            end = 0 
            start = 0
            entities, sentence = [], []
            
    file.close()
    return training_data, unique_labels

In [3]:
TRAIN_DATA, LABELS = load_data_for_spacy("train.txt")
VALID_DATA = TRAIN_DATA[1150:]
TRAIN_DATA = TRAIN_DATA[:1150]

In [4]:
TRAIN_DATA

[["@paulwalk It 's the view from where I 'm living for two weeks . Empire State Building = ESB . Pretty bad storm here last evening .",
  {'entities': [(64, 70, 'B-location'),
    (71, 76, 'I-location'),
    (77, 85, 'I-location'),
    (88, 91, 'B-location')]}],
 ['From Green Newsfeed : AHFA extends deadline for Sage Award to Nov . 5 http://tinyurl.com/24agj38',
  {'entities': [(22, 26, 'B-group')]}],
 ['Pxleyes Top 50 Photography Contest Pictures of August 2010 ... http://bit.ly/bgCyZ0 #photography',
  {'entities': [(0, 7, 'B-corporation')]}],
 ["4Dbling 's place til monday , party party party . &lt; 3",
  {'entities': [(0, 7, 'B-person')]}],
 ["watching the VMA pre-show again lol it was n't even a good show the first time ... so bored !",
  {'entities': [(13, 16, 'B-creative-work')]}],
 ["@Suzie55 whispering cause I may have had 1 too many vodka 's last night and am a lil fragile , hold me ?",
  {'entities': [(52, 57, 'B-product')]}],
 ["RT @midgetmegs : hate people who write drunk s

STEP:2 FUNCTIONS TO CALCULATE EVALUVATION METRICS

In [6]:
def calc_precision(pred, true):        
    precision = len([x for x in pred if x in true]) / (len(pred) + 1e-20) # true positives / total pred
    return precision

def calc_recall(pred, true):
    recall = len([x for x in true if x in pred]) / (len(true) + 1e-20)    # true positives / total test
    return recall

def calc_f1(precision, recall):
    f1 = 2 * ((precision * recall) / (precision + recall + 1e-20))
    return f1

In [7]:
def evaluate(ner, data):
    preds = [ner(x[0]) for x in data]

    precisions, recalls, f1s = [], [], []

    for pred, true in zip(preds, data):
        true = [x[2] for x in list(chain.from_iterable(true[1].values()))]
        pred = [i.label_ for i in pred.ents]
        precision = calc_precision(true, pred)
        precisions.append(precision)
        recall = calc_recall(true, pred)
        recalls.append(recall)
        f1s.append(calc_f1(precision, recall))

    return {"textcat_p": np.mean(precisions), "textcat_r": np.mean(recalls), "textcat_f":np.mean(f1s)}

STEP:3 FUNCTION TO TRAIN THE CNN MODEL

In [8]:
def train_spacy(train_data, labels, iterations, dropout = 0.5, display_freq = 1):
    
    valid_f1scores=[]
    test_f1scores=[]
    
    model = None
    
    nlp = spacy.load("en_core_web_sm")
    
    if model is not None:
        nlp = spacy.load(model) 
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')
        print("Created blank 'en' model")
        
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe('ner', last=True)
    
    else:
        ner = nlp.get_pipe("ner")
        
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
            
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        if model is None:
            nlp.begin_training()
        optimizer = nlp.initialize()
        for itn in range(iterations):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for text, annotation in train_data:
                example = Example.from_dict(nlp.make_doc(text), annotation)
                nlp.update(          
                    [example],
                    drop = dropout,  
                    sgd = optimizer,
                    losses = losses)

            scores = evaluate(nlp,VALID_DATA)
            valid_f1scores.append(scores["textcat_f"])
            print('=======================================')
            print('Interation = '+str(itn+1))
            print('Losses = '+str(losses))
            print('===============VALID DATA========================')
            
            print('F1-score = '+str(scores["textcat_f"]))
            print('Precision = '+str(scores["textcat_p"]))
            print('Recall = '+str(scores["textcat_r"]))
            
    return nlp,valid_f1scores,test_f1scores

STEP:4 RUNNING THE MODEL AND SAVING IT IN A LOCAL DIRECTORY

In [9]:
ner,valid_f1scores,test_f1scores = train_spacy(TRAIN_DATA, LABELS,50)
ner.to_disk("C:\Codes and resources\Codes\infrrd")

Created blank 'en' model
Interation = 1
Losses = {'ner': 5070.352267130611}
F1-score = 0.25873385873385873
Precision = 0.2308608058608059
Recall = 0.3269230769230769
Interation = 2
Losses = {'ner': 4245.313856270026}
F1-score = 0.3511069340016708
Precision = 0.35213675213675216
Recall = 0.3758241758241759
Interation = 3
Losses = {'ner': 3882.9839151664037}
F1-score = 0.4166151930857813
Precision = 0.4191900691900692
Recall = 0.4559829059829059
Interation = 4
Losses = {'ner': 3672.171075441074}
F1-score = 0.45111925111925105
Precision = 0.4551282051282051
Recall = 0.48290598290598286
Interation = 5
Losses = {'ner': 3445.798167085043}
F1-score = 0.38403404962109416
Precision = 0.3804029304029304
Recall = 0.4243589743589743
Interation = 6
Losses = {'ner': 3345.528188522912}
F1-score = 0.45622013487867147
Precision = 0.44618437118437115
Recall = 0.5032051282051282
Interation = 7
Losses = {'ner': 3227.3611591837744}
F1-score = 0.41327191327191326
Precision = 0.4032661782661783
Recall = 0.47

Interation = 37
Losses = {'ner': 1249.946551698866}
F1-score = 0.4938618304151039
Precision = 0.49761904761904757
Recall = 0.5241452991452992
Interation = 38
Losses = {'ner': 1193.8211954919893}
F1-score = 0.4596687425634794
Precision = 0.48205128205128206
Recall = 0.46944444444444455
Interation = 39
Losses = {'ner': 1144.7421621076069}
F1-score = 0.5245396315722994
Precision = 0.5414224664224664
Recall = 0.5514957264957265
Interation = 40
Losses = {'ner': 1066.819898747556}
F1-score = 0.4731425981425981
Precision = 0.47884615384615375
Recall = 0.50491452991453
Interation = 41
Losses = {'ner': 1048.4629804634778}
F1-score = 0.5009548691284001
Precision = 0.5151709401709402
Recall = 0.5305250305250305
Interation = 42
Losses = {'ner': 1092.9205281395778}
F1-score = 0.4805456669233759
Precision = 0.48479853479853485
Recall = 0.5121794871794872
Interation = 43
Losses = {'ner': 1108.1644216169466}
F1-score = 0.4066261719139733
Precision = 0.41978021978021984
Recall = 0.42206959706959707
Int

STEP:5 LOADING TEST DATA, PRE-PROCESSING & TESTING IT ON THE SAVED MODEL

In [11]:
spacy_df = pd.read_csv('test.txt', sep=' ', quoting=csv.QUOTE_NONE, skip_blank_lines = False, header=None)
spacy_df['Sentence #'] = ''
i = 0
for ind in spacy_df.index-1:
    if type(spacy_df.iloc[ind,0])==float:
        i = i+1
        spacy_df.iloc[ind+1,1] = 'Sentence: ' + str(i)
    spacy_df.iloc[ind,1] = 'Sentence: ' + str(i)
spacy_df = spacy_df.dropna()

In [12]:
agg_func = lambda s: [ w for w in s[0]]
TEST_DATA = spacy_df.groupby("Sentence #").apply(agg_func)

In [13]:
def load_model(model_path):
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe('ner')
    ner = nlp.from_disk(model_path)
    return ner

In [14]:
ner = load_model("C:\Codes and resources\Codes\infrrd")

test_sentences = [' '.join(x) for x in TEST_DATA]
for x in test_sentences:
    doc = ner(x)
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

Defence 138 145 B-person
Spokesman 146 155 I-person
Colonel 156 163 I-person
Kalia 171 176 B-location
Juve 50 54 B-person
Verratti 55 63 I-person
M 80 81 B-person
Leel 5 9 B-person
Not 10 13 B-group
Answers 14 21 I-product
| 22 23 I-corporation
Box 24 27 I-product
of 28 30 I-group
Crayons 31 38 I-group
female 31 37 B-person
jungkook 62 70 B-person
Donald 18 24 B-person
Nazaire 5 12 B-creative-work
Trump 18 23 B-person
Appalachia 57 67 B-person
5 16 17 B-group
Highest 18 25 I-product
Young 60 65 B-person
Ae 66 68 I-person
Saimdang 71 79 B-group
Diary 92 97 I-creative-work
Billion 109 116 B-person
oi 13 15 B-location
Kid 6 9 B-person
Development 51 62 B-creative-work
katarina 2 10 B-location
Trump 78 83 B-person
Collaborations 14 28 B-person
Sims 36 40 B-location
Uroxatral 0 9 I-location
Gandalf 10 17 B-person
SoCal 35 40 B-location
meoss 5 10 B-corporation
MarioLopezExtra 5 20 B-location
Willpower 2 11 B-group
Coe 0 3 B-person
Glasgow 43 50 B-location
Max 88 91 B-group
Maria 62 67 B-per

Demolition 16 26 B-person
Man 27 30 I-person
SPLC 51 55 B-creative-work
FIFA 17 21 B-group
shitting 59 67 B-corporation
Trump 19 24 B-person
Waltengoo 54 63 B-location
EDIT 0 4 B-location
OP 71 73 B-location
Edit 0 4 B-location
Maronti 2 9 B-person
Mirror 27 33 I-location
HD 34 36 I-product
HD 20 22 I-product
2002 255 259 B-location
World 260 265 I-corporation
Qatar 55 60 B-product
2022 61 65 I-product
World 66 71 I-location
Cup 72 75 I-location
Trump 15 20 B-person
Kelly 47 52 B-group
Heh 0 3 B-person
The 0 3 B-group
OP 4 6 I-group
jawans 47 53 B-location
Siachen 81 88 B-person
Glacier 89 96 I-person
Obligatory 0 10 B-location
karma 16 21 B-person
Obligatory 0 10 B-location
António 16 23 B-person
Guterres 24 32 I-person
UN 40 42 B-person
folk 39 43 B-person
Trump 82 87 B-person
Siachen 33 40 B-location
Mo-bot 15 21 B-location
NAH 9 12 B-person
NAAAHHH 13 20 I-person
Trump 111 116 B-person
Navy 5 9 B-person
Yaletown 55 63 B-group
The 0 3 B-creative-work
Prem 4 8 B-person
IMO 51 54 B-lo

Auckland 71 79 B-person
Legality 8 16 B-location
Citizens 39 47 B-product
Checked 48 55 I-product
UN 20 22 B-person
UN 31 33 B-person
Bank 68 72 I-corporation
Javier 0 6 B-corporation
Assassins 36 45 B-group
Creed 46 51 I-group
CandiedOrange 2 15 B-location
molecules 76 85 B-location
Ireland 123 130 B-location
CP 42 44 B-location
C- 63 65 B-person
CP 81 83 B-location
Comments 0 8 B-corporation
XiongChiamiov 2 15 B-corporation
input 121 126 B-person
The 57 60 B-location
Next 61 65 I-creative-work
Three 66 71 I-creative-work
Days 72 76 I-group
Higgs 0 5 B-group
However 123 130 B-person
^ 362 363 B-location
Super 42 47 B-corporation
Yautja 48 54 I-corporation
TOS 15 18 I-product
Discovery 69 78 B-location
potential 9 18 B-person
Identify 0 8 B-group
Grimaldo 11 19 B-person
Highschool 11 21 B-group
Fedxa 102 107 B-location
Does 0 4 B-product
Rogue 5 10 I-product
Hyperspace 59 69 B-group
Radio 70 75 I-group
Books 0 5 B-group
itpastorn 2 11 B-corporation
Wildcard 2 10 B-group
Bsc 150 153 B-p

Respect 18 25 B-person
Business 32 40 B-corporation
Advertise 73 82 B-person
Anything 83 91 I-person
ICYMI- 14 20 B-group
Lee 21 24 I-location
Joon 25 29 I-location
Of 63 65 I-creative-work
The 66 69 I-creative-work
Actor 70 75 I-group
Dating 76 82 I-group
HugotDre 5 13 B-location
Serbia 37 43 B-person
Jak 139 142 B-person
The 21 24 B-creative-work
Black 25 30 I-creative-work
Keys 31 35 I-creative-work
Lonely 40 46 B-creative-work
Boy 47 50 I-group
The 25 28 I-group
Perfect 29 36 I-group
Outreach 37 45 I-creative-work
quora 76 81 B-product
Linux 48 53 B-person
Don 18 21 B-person
jzaffos 5 12 B-group
Confidential 114 126 B-group
Tom 4 7 B-person
Don 0 3 B-person
teen 9 13 B-person
GayPrideBelize 5 19 B-location
Belize 57 63 B-creative-work
BiggDawg 38 46 B-creative-work
C 47 48 I-creative-work
Loc 49 52 I-creative-work
DamnFeelings 5 17 B-location
KleinISD 5 13 B-location
KleinISD 22 30 B-location
Daily 31 36 I-location
KISD 84 88 B-location
_ 89 90 I-person
DoctorZen 5 14 B-person
EPA 

ELMo (EMBEDDING FROM LANGUAGE MDEL)

STEP:1 LOADING TRAIN DATA & DOING BASIC PRE-PROCESSING AFTER COMPLETING EDA

In [15]:
df = pd.read_csv('train.txt', sep = '\t', header = None, quoting=csv.QUOTE_NONE, skip_blank_lines = False)
df['Sentence #'] = ''
i = 1
for ind in df.index-1:
    if type(df.iloc[ind,0])==float:
        i = i+1
        df.iloc[ind+1,2] = 'Sentence: ' + str(i)
    df.iloc[ind,2] = 'Sentence: ' + str(i)
df = df.dropna()

In [16]:
df

Unnamed: 0,0,1,Sentence #
0,@paulwalk,O,Sentence: 2
1,It,O,Sentence: 2
2,'s,O,Sentence: 2
3,the,O,Sentence: 2
4,view,O,Sentence: 2
...,...,...,...
66118,39,O,Sentence: 3400
66119,GMT+0000,O,Sentence: 3400
66120,(,O,Sentence: 3400
66121,UTC,O,Sentence: 3400


In [17]:
agg_func = lambda s: [[w,t] for w,t in zip(s[0],s[1])]

In [19]:
grouped = df.groupby("Sentence #").apply(agg_func)

In [20]:
print(len(grouped))

3397


In [21]:
tags = list(set(df[1].values))
n_tags = len(tags)

In [22]:
largest_sen = max(len(sen) for sen in grouped)
print('biggest sentence has {} words'.format(largest_sen))

biggest sentence has 41 words


STEP:1.1 PADDING EACH SENTENCE AND TAGS TO MAKE THEM UNIFORMLY SIZED

In [24]:
max_len = 41
X = [[w[0]for w in s] for s in grouped]
new_X = []
for seq in X:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:
            new_seq.append("PADword")
    new_X.append(new_seq)
print(new_X[22])

['OH-OH', 'It', 'seems', 'Lindsay', 'Lohan', 'maybe', 'be', 'heading', 'back', 'to', 'jail', '.', 'The', 'actress', 'has', 'failed', 'a', 'court', 'mandated', 'drug', 'test', '&amp;', 'could', 'face', '30', 'days', 'in', 'prison', 'PADword', 'PADword', 'PADword', 'PADword', 'PADword', 'PADword', 'PADword', 'PADword', 'PADword', 'PADword', 'PADword', 'PADword', 'PADword']


In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
tags2index = {t:i for i,t in enumerate(tags)}
y = [[tags2index[w[1]] for w in s] for s in grouped]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tags2index["O"])
y[22]

array([ 2,  2,  2, 10,  7,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2])

STEP:2 SPLITTING THE DATA INTO TRAINING TESTING AND VALIDATION SET

In [26]:
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(new_X, y, test_size=0.1, random_state=2018)

STEP:3 DOWNLOADING THE ORIGINAL MODEL, INITIALIZING THE VARIABLES, CONVERTING THE PRE-PROCESSED DATA INTO TENSORS, ADDING NEW LAYERS TO MODEL & RUNNING THE MODEL

In [27]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import tensorflow_hub as hub

sess = tf.Session()
tf.compat.v1.keras.backend.set_session(sess)
elmo_model = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

Instructions for updating:
non-resource variables are not supported in the long term


In [28]:
batch_size = 32
def ElmoEmbedding(x):
    return elmo_model(inputs={"tokens": tf.squeeze(tf.cast(x,tf.string)),"sequence_len": tf.constant(batch_size*[max_len])
                     },
                      signature="tokens",
                      as_dict=True)["elmo"]

In [29]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda, Input, add

input_text = Input(shape=(max_len,), dtype=tf.string)
embedding = Lambda(ElmoEmbedding, output_shape=(max_len, 1024))(input_text)
x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedding)
x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
x = add([x, x_rnn])  # residual connection to the first biLSTM
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)
model = Model(input_text, out)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [30]:
batch_size = 32
X_tr, X_val = X_tr[:75*batch_size], X_tr[-20*batch_size:]
y_tr, y_val = y_tr[:75*batch_size], y_tr[-20*batch_size:]
y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
y_val = y_val.reshape(y_val.shape[0], y_val.shape[1], 1)

In [32]:
history = model.fit(np.array(X_tr), y_tr, validation_data=(np.array(X_val), y_val),batch_size=batch_size, epochs=3, verbose=1)

Train on 2400 samples, validate on 640 samples
Epoch 1/3



Epoch 2/3
Epoch 3/3


STEP 4: TESTING THE MODEL

In [33]:
len(X_te[300])

41

In [34]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
X_test = X_te[:10*batch_size]
test_pred = model.predict(np.array(X_test), verbose=1)

STEP 4.1: CONVERTING THE PREDICTED VALUES BACK TO LABELS & CALCULATING THE METRICS

In [35]:
idx2tag = {i: w for w, i in tags2index.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PADword", "O"))
        out.append(out_i)
    return out
def test2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p].replace("PADword", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = test2label(y_te[:10*32])
print(classification_report(test_labels, pred_labels))

               precision    recall  f1-score   support

  corporation       0.38      0.33      0.36        15
creative-work       0.43      0.23      0.30        13
        group       0.50      0.20      0.29        35
     location       0.63      0.44      0.52        54
       person       0.74      0.63      0.68        51
      product       0.33      0.33      0.33        12

    micro avg       0.59      0.42      0.49       180
    macro avg       0.50      0.36      0.41       180
 weighted avg       0.58      0.42      0.48       180



STEP 5: TESTING ON UNSEEN DATA (TEST.TXT)

STEP 5.1: LOADING AND PRE-PROCESSING TEST DATA

In [36]:
df1 = pd.read_csv('test.txt', sep=' ', quoting=csv.QUOTE_NONE, skip_blank_lines = False, header=None)
df1['Sentence #'] = ''
i = 0
for ind in df1.index-1:
    if type(df1.iloc[ind,0])==float:
        i = i+1
        df1.iloc[ind+1,1] = 'Sentence: ' + str(i)
    df1.iloc[ind,1] = 'Sentence: ' + str(i)
df1 = df1.dropna()

In [37]:
agg_func = lambda s: [ w for w in s[0]]
TEST_DATA = df1.groupby("Sentence #").apply(agg_func)

In [38]:
t_pred = model.predict(np.array(X_te[300:332]), verbose=1)
p_labels = pred2label(t_pred)

In [39]:
max_len = 41
X = TEST_DATA
new_TEST_X = []
for seq in X:
    new_seq = []
    for i in range(max_len):
        try:
            new_seq.append(seq[i])
        except:
            new_seq.append("PADword")
    new_TEST_X.append(new_seq)

In [40]:
i = 180 #TAKES TO THE 180TH SENTENCE 
p = model.predict(np.array(new_TEST_X[i:i+32]))[0] #TO CALCULATE PREDICTIONS FOR 32 MORE SENTENCES 
p = np.argmax(p, axis=-1)
print("{:15} {:5}: ({})".format("Word", "Pred", "True"))
print("="*30)
for w, true, pred in zip(X_te[i], y_te[i], p):
    if w != "__PAD__":
        print("{:15}:{:5} ({})".format(w, tags[pred], tags[true]))

Word            Pred : (True)
@CHAMBERSfever :O     (O)
i              :O     (O)
used           :O     (O)
to             :O     (O)
have           :O     (O)
fishes         :O     (O)
but            :O     (O)
they           :O     (O)
lived          :O     (O)
like           :O     (O)
5              :O     (O)
days           :O     (O)
what           :O     (O)
pet            :O     (O)
would          :O     (O)
you            :O     (O)
like           :O     (O)
to             :O     (O)
have           :O     (O)
?              :O     (O)
i'm            :O     (O)
deeply         :O     (O)
in             :O     (O)
love           :O     (O)
with           :O     (O)
Eli            :O     (B-person)
PADword        :O     (O)
PADword        :O     (O)
PADword        :O     (O)
PADword        :O     (O)
PADword        :O     (O)
PADword        :O     (O)
PADword        :O     (O)
PADword        :O     (O)
PADword        :O     (O)
PADword        :O     (O)
PADword        :O     (O)
P