In [1]:
# Import the required libraries.
import re
import math
import random
import collections
import operator
import numpy as np

from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from collections import defaultdict, Counter

from keras.utils import to_categorical
from keras.layers import *
from keras.models import Model
from keras import Model, Sequential
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import *
from sklearn.metrics import classification_report

# Seed both the stdlib and the NumPy global generators with the same value.
SEED = 11
random.seed(SEED)
np.random.seed(SEED)

Using TensorFlow backend.


In [2]:
!ls

 NER-Dataset-10Types-Train.txt
 NER-Dataset--TestSet.txt
 NER-Dataset-Train.txt
 NER-TestSet-10Types-RNN-Predictions.txt
 NER-TestSet-RNN-Predictions.txt
'Q2 - NER Prediction - 10 Types (RNN).ipynb'
'Q2  - NER Prediction (RNN).ipynb'


In [3]:
# Read the labelled training file as a list of raw lines.
# The encoding is given explicitly so decoding does not depend on the
# platform default; it matches the UTF-8 used when predictions are written.
with open('NER-Dataset-Train.txt', 'r', encoding='utf-8') as f:
    ner_dataset = f.readlines()

In [4]:
# Group the raw lines into sentences. A non-blank line is "<word>\t<tag>";
# a blank line closes the current sentence. Each sentence is stored as a
# (words, tags) pair of parallel lists.
sentences = []
words, tags = [], []
for line in ner_dataset:
    line = line.strip()
    if line:
        word, tag = line.split('\t')
        words.append(word)
        tags.append(tag)
        continue
    # Blank line: flush whatever has been collected so far.
    sentences.append((words, tags))
    words, tags = [], []

# The file may not end with a blank line; flush the trailing sentence.
if words:
    sentences.append((words, tags))
    words, tags = [], []

In [5]:
# Token frequencies over the training sentences, most frequent first.
# Tokens are streamed straight into the Counter instead of concatenating the
# per-sentence lists with sum(..., []), which copies the growing list each time.
vocab_counts = Counter(word for s in sentences for word in s[0]).most_common()
# Only tokens seen more than once get their own vocabulary entry.
words_to_keep = {word for word, count in vocab_counts if count > 1}

In [6]:
# Read the unlabelled test file. The encoding is explicit so decoding does not
# depend on the platform default and matches the UTF-8 predictions file.
with open('NER-Dataset--TestSet.txt', 'r', encoding='utf-8') as f:
    test_dataset = f.readlines()

# Each non-blank line is taken as one token; a blank line closes the sentence.
# Sentences are stored as 1-tuples (words,) so that s[0] is the word list.
test_sentences = []
words = []
for line in test_dataset:
    line = line.strip()
    if line == '':
        test_sentences.append((words,))
        words = []
    else:
        words.append(line)

# Flush the last sentence when the file does not end with a blank line.
if len(words) > 0:
    test_sentences.append((words,))
    words = []

In [7]:
# Names of the word-shape classes a token can be assigned.
# The position of a name in this list is its index in the one-hot encoding.
word_features = [
    # numeric shapes
    'twoDigitNum', 'fourDigitNum',
    'containsDigitAndAlpha', 'containsDigitAndDash', 'containsDigitAndSlash',
    'containsDigitAndComma', 'containsDigitAndPeriod',
    'otherNum',
    # capitalisation / position shapes
    'allCaps', 'capPeriod', 'firstWord', 'initCap', 'lowerCase',
    # fallback
    'other',
]

In [8]:
def get_word_features(sentence):
    """Assign exactly one word-shape class to every token of a sentence.

    The checks run in priority order and the first match wins.

    Fixes relative to the earlier version:
      * The dash/slash/period classes now require the separator to actually be
        present. Previously any plain number (e.g. "123") matched the dash
        test, so 'otherNum' could never be produced.
      * 'capPeriod' is tested before 'allCaps', because "A.".isupper() is True
        and therefore always shadowed it.
      * An empty token no longer raises IndexError in the 'initCap' test.

    Args:
        sentence: iterable of token strings in sentence order.

    Returns:
        A list of feature-name strings, one per token, in the same order.
    """
    features = []
    for position, word in enumerate(sentence):
        is_number = word.isnumeric()
        if is_number and len(word) == 2:
            feature = 'twoDigitNum'
        elif is_number and len(word) == 4:
            feature = 'fourDigitNum'
        elif word.isalnum() and not word.isalpha() and not is_number:
            feature = 'containsDigitAndAlpha'
        elif '-' in word and word.replace('-', '').isnumeric():
            feature = 'containsDigitAndDash'
        elif '/' in word and word.replace('/', '').isnumeric():
            feature = 'containsDigitAndSlash'
        elif ',' in word and word.replace('.', '').replace(',', '').isnumeric():
            feature = 'containsDigitAndComma'
        elif '.' in word and word.replace('.', '').isnumeric():
            feature = 'containsDigitAndPeriod'
        elif is_number:
            feature = 'otherNum'
        elif len(word) == 2 and word[0].isupper() and word[1] == '.':
            # Must precede the allCaps test: str.isupper() ignores the period.
            feature = 'capPeriod'
        elif word.isupper():
            feature = 'allCaps'
        elif position == 0:
            feature = 'firstWord'
        elif word[:1].isupper():
            # Slice rather than index so an empty token falls through to 'other'.
            feature = 'initCap'
        elif word.islower():
            feature = 'lowerCase'
        else:
            feature = 'other'
        features.append(feature)

    return features

In [9]:
# Longest training sentence (in tokens), then rounded up to the next
# multiple of 50 using ceiling division; an exact multiple is left unchanged.
max_len_found = max(len(sentence[0]) for sentence in sentences)
max_len = -(-max_len_found // 50) * 50

In [10]:
# One-hot row for each word-shape class: row i of the identity matrix is
# paired with the i-th name in word_features.
eye_mat = list(np.eye(len(word_features)))
wordfeat2float = dict(zip(word_features, eye_mat))

In [11]:
# Vocabulary index: 0 and 1 are reserved for unknown and padding tokens,
# kept words follow in sorted order starting at 2.
# NOTE(review): a corpus token literally spelled 'UNK' or 'PAD' would
# overwrite the reserved entry -- confirm the data never contains them.
kept_vocab = sorted(words_to_keep)
word2idx = {
    'UNK': 0,
    'PAD': 1,
    **dict(zip(kept_vocab, range(2, len(kept_vocab) + 2))),
}

In [12]:
def numberize_sentence(words, max_len=50):
    """Convert a token list into padded model inputs.

    Args:
        words: list of token strings for one sentence.
        max_len: length to pad up to. A sentence longer than this is NOT
            truncated; it simply receives no padding.

    Returns:
        (word_idx, feat_np): an array of vocabulary indices (tokens missing
        from word2idx map to 'UNK', padding positions to 'PAD') and an array
        with one word-shape one-hot row per token, where padding rows are
        filled with the value 2.
    """
    n_padding = max_len - len(words)

    token_ids = [word2idx.get(w, word2idx['UNK']) for w in words]
    shape_rows = [wordfeat2float[name] for name in get_word_features(words)]

    # range() of a negative count is empty, so over-long input adds nothing.
    token_ids.extend(word2idx['PAD'] for _ in range(n_padding))
    shape_rows.extend(np.ones((len(word_features),)) * 2 for _ in range(n_padding))

    return np.asarray(token_ids), np.asarray(shape_rows)

In [13]:
# All distinct tags seen in the training data.
labels = set.union(*(set(s[1]) for s in sentences))
# Iterating a set of strings can yield a different order in every interpreter
# session (string hash randomisation), which would silently change which index
# stands for which tag. A sorted order keeps the mapping stable across runs.
ordered_labels = sorted(labels)
idx2labels = dict(enumerate(ordered_labels))
n_labels = len(ordered_labels)
# One-hot row per tag; row i belongs to ordered_labels[i], matching idx2labels.
eye_mat = list(np.eye(n_labels))
labels2float = {label: eye_mat[i] for i, label in enumerate(ordered_labels)}

def numberize_labels(gt_labels, max_len=50):
    """One-hot encode a sentence's tags and pad the result.

    Args:
        gt_labels: list of tag strings for one sentence.
        max_len: length to pad up to; padding positions receive the one-hot
            row of the 'O' tag. Longer inputs are not truncated.

    Returns:
        Array with one one-hot row per position.
    """
    rows = [labels2float[tag] for tag in gt_labels]
    rows.extend(labels2float['O'] for _ in range(max_len - len(gt_labels)))
    return np.asarray(rows)

In [14]:
def create_model():
    """Build the (uncompiled) bidirectional-RNN tagger and print its summary.

    Inputs are a sequence of max_len word indices and a parallel sequence of
    word-shape vectors. Word indices go through a 50-dimensional embedding with
    dropout, are concatenated with the shape vectors, fed to a bidirectional
    SimpleRNN (100 units per direction, one output per timestep) and finally
    to a per-timestep softmax over n_labels.

    NOTE(review): the Masking layer on the word indices is applied before the
    Embedding, on a 2-D tensor -- confirm that it masks per timestep as
    intended.

    Returns:
        The Keras Model taking [word_indices, word_shape_features].
    """
    n_shape_features = len(word_features)
    vocab_size = len(word2idx)

    # Layers are created in the same order as before so auto-generated layer
    # names in the summary stay the same.
    word_in = Input(shape=(max_len,))
    feat_in = Input(shape=(max_len, n_shape_features))
    word_masked = Masking(mask_value=1)(word_in)    # 1 is the 'PAD' index
    feat_masked = Masking(mask_value=2)(feat_in)    # padding rows are all 2
    embedded = Embedding(input_dim=vocab_size, output_dim=50, input_length=max_len)(word_masked)
    embedded = Dropout(0.1)(embedded)
    merged = Concatenate()([embedded, feat_masked])
    recurrent = Bidirectional(
        SimpleRNN(units=100, return_sequences=True, recurrent_dropout=0.1)
    )(merged)
    tag_probs = TimeDistributed(Dense(n_labels, activation="softmax"))(recurrent)

    model = Model(inputs=[word_in, feat_in], outputs=tag_probs)
    model.summary()
    return model

In [15]:
# Numberise every sentence. max_len is passed explicitly: the helpers default
# to 50, whereas the model's input layers are sized from the global max_len,
# so relying on the default only works while max_len happens to be 50.
parsed_sentences = [
    (numberize_sentence(s[0], max_len=max_len), numberize_labels(s[1], max_len=max_len))
    for s in sentences
]
parsed_test_sentences = [numberize_sentence(s[0], max_len=max_len) for s in test_sentences]

In [16]:
# Tag frequencies over the raw training labels. Bound to a name because only
# the last expression of a cell is displayed. Tags are streamed into the
# Counter instead of concatenating lists with sum(..., []).
tag_counts = Counter(tag for s in sentences for tag in s[1])
# Frequencies of the encoded label indices; these also include the padded
# positions that numberize_labels filled with the 'O' row.
Counter(idx for s in parsed_sentences for idx in np.argmax(s[1], axis=-1).tolist())

Counter({1: 44028, 0: 582, 2: 390})

In [17]:
# Build the test and training sets of sentences.
# 5-fold cross-validation: a fresh model is trained for every fold and scored
# on the held-out sentences, counting real (non-PAD) token positions only.
kf = KFold(n_splits = 5, shuffle = False)
scores = []
y_pred_idx = []
y_test_idx = []

preds = []
fold_count = 0
foldwise_score_outputs = []
fold_report = '[Fold ({}/{})] Accuracy: {}, Precision: {}, Recall: {}, FScore: {}'

# parsed_sentences stays a plain list and folds are selected by index.
# Converting it with np.asarray builds a ragged object array from the nested
# (inputs, labels) tuples, which newer NumPy releases reject with a ValueError.
for train_index, test_index in kf.split(parsed_sentences):
    fold_count += 1

    train_data = [parsed_sentences[i] for i in train_index]
    test_data = [parsed_sentences[i] for i in test_index]
    # Each item is ((word_idx, feat_np), labels_np); stack the parts per input.
    X_train = [np.asarray([a[0][0] for a in train_data]), np.asarray([a[0][1] for a in train_data])]
    Y_train = np.asarray([a[1] for a in train_data])
    X_test = [np.asarray([a[0][0] for a in test_data]), np.asarray([a[0][1] for a in test_data])]
    Y_test = np.asarray([a[1] for a in test_data])

    # `model` is rebound on every fold; after the loop it refers to the model
    # trained on the last fold's training split.
    model = create_model()
    model.compile(optimizer = 'rmsprop',
                  loss = 'categorical_crossentropy',
                  metrics = ['accuracy'])

    model.fit(X_train, Y_train, epochs = 3, validation_split = 0.1, batch_size = 4)

    y_pred_padded = np.argmax(model.predict(X_test), axis = -1)
    y_true_padded = np.argmax(Y_test, axis = -1)

    # Boolean mask of real tokens. Mask indexing walks the array row by row,
    # i.e. in the same order as a nested sentence/position loop.
    real_tokens = X_test[0] != word2idx['PAD']
    y_pred_idx_fold = list(y_pred_padded[real_tokens])
    y_test_idx_fold = list(y_true_padded[real_tokens])
    scores_fold = list(y_pred_padded[real_tokens] == y_true_padded[real_tokens])

    # Pool the fold's tokens into the overall lists used after the loop.
    y_pred_idx.extend(y_pred_idx_fold)
    y_test_idx.extend(y_test_idx_fold)
    scores.extend(scores_fold)

    prec_, rec_, fscore_, _ = precision_recall_fscore_support(y_test_idx_fold, y_pred_idx_fold, average = 'weighted')
    fold_message = fold_report.format(fold_count, kf.n_splits, np.asarray(scores_fold).mean(), prec_, rec_, fscore_)
    print(fold_message)
    foldwise_score_outputs.append(fold_message)

# Overall metrics are computed on the tokens pooled from all folds.
prec, rec, fscore, _ = precision_recall_fscore_support(y_test_idx, y_pred_idx, average = 'weighted')
print('Accuracy: {}, Precision: {}, Recall: {}, FScore: {}'.format(np.asarray(scores).mean(), prec, rec, fscore))

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
masking_1 (Masking)             (None, 50)           0           input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 50)       72900       masking_1[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 50, 14)       0                                            
____________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 648 samples, validate on 72 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[Fold (1/5)] Accuracy: 0.9526610644257703, Precision: 0.9485268096309561, Recall: 0.9526610644257703, FScore: 0.949593291838548
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
masking_3 (Masking)             (None, 50)           0           input_3[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 50, 50)       72900       masking_3[0][0]                  
_______________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 648 samples, validate on 72 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[Fold (2/5)] Accuracy: 0.9534751773049646, Precision: 0.9505178345375412, Recall: 0.9534751773049646, FScore: 0.9510871517860034
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
masking_5 (Masking)             (None, 50)           0           input_5[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 50, 50)       72900       masking_5[0][0]                  
______________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 648 samples, validate on 72 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[Fold (3/5)] Accuracy: 0.9556722076407116, Precision: 0.9493129500272377, Recall: 0.9556722076407116, FScore: 0.9487352416691274
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
masking_7 (Masking)             (None, 50)           0           input_7[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 50, 50)       72900       masking_7[0][0]                  
______________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 648 samples, validate on 72 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[Fold (4/5)] Accuracy: 0.9653212052302445, Precision: 0.9605602782699517, Recall: 0.9653212052302445, FScore: 0.9611318240196485
Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
masking_9 (Masking)             (None, 50)           0           input_9[0][0]                    
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 50, 50)       72900       masking_9[0][0]                  
______________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 648 samples, validate on 72 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[Fold (5/5)] Accuracy: 0.9520069808027923, Precision: 0.9422223141228804, Recall: 0.9520069808027923, FScore: 0.9450074261739186
Accuracy: 0.9558352402745995, Precision: 0.9498458461158625, Recall: 0.9558352402745995, FScore: 0.9514009972305221


In [18]:
# Re-print the per-fold lines collected during cross-validation, followed by
# the overall metrics (computed on the tokens pooled across all folds).
overall_accuracy = np.asarray(scores).mean()
summary_line = 'Accuracy: {}, Precision: {}, Recall: {}, FScore: {}'.format(overall_accuracy, prec, rec, fscore)

print("Foldwise scores:")
for fold_line in foldwise_score_outputs:
    print(fold_line)
print("\n----\nAveraged Cross-Validation scores:")
print(summary_line)

Foldwise scores:
[Fold (1/5)] Accuracy: 0.9526610644257703, Precision: 0.9485268096309561, Recall: 0.9526610644257703, FScore: 0.949593291838548
[Fold (2/5)] Accuracy: 0.9534751773049646, Precision: 0.9505178345375412, Recall: 0.9534751773049646, FScore: 0.9510871517860034
[Fold (3/5)] Accuracy: 0.9556722076407116, Precision: 0.9493129500272377, Recall: 0.9556722076407116, FScore: 0.9487352416691274
[Fold (4/5)] Accuracy: 0.9653212052302445, Precision: 0.9605602782699517, Recall: 0.9653212052302445, FScore: 0.9611318240196485
[Fold (5/5)] Accuracy: 0.9520069808027923, Precision: 0.9422223141228804, Recall: 0.9520069808027923, FScore: 0.9450074261739186

----
Averaged Cross-Validation scores:
Accuracy: 0.9558352402745995, Precision: 0.9498458461158625, Recall: 0.9558352402745995, FScore: 0.9514009972305221


In [19]:
# Stack the numberised test sentences into the two model inputs
# (word indices, word-shape features) and predict per-position tag scores.
# `model` is whatever the cross-validation loop left bound last.
test_word_ids = np.asarray([a[0] for a in parsed_test_sentences])
test_shape_feats = np.asarray([a[1] for a in parsed_test_sentences])
X_test_data = [test_word_ids, test_shape_feats]
predictions_full = model.predict(X_test_data)

In [20]:
# For every test sentence keep the arg-max label index of each real token;
# positions beyond the sentence's own length (padding) are ignored.
predictions_list = [
    [np.argmax(predictions_full[i][j]) for j in range(len(s[0]))]
    for i, s in enumerate(test_sentences)
]

In [21]:
# Write predictions as "<word>\t<tag>" lines with a blank line after each
# sentence.
with open('NER-TestSet-RNN-Predictions.txt', 'w', encoding = 'utf-8') as f:
    for entry, predicted in zip(test_sentences, predictions_list):
        sentence_words = entry[0]
        assert len(sentence_words) == len(predicted)
        for token, label_idx in zip(sentence_words, predicted):
            f.write(token + '\t' + idx2labels[label_idx] + '\n')
        f.write('\n')