In [106]:
# Import the required libraries.
import re
import math
import random
import collections
import operator
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from collections import defaultdict, Counter

from keras.utils import to_categorical
from keras.layers import *
from keras.models import Model
from keras import Model, Sequential
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import *
from sklearn.metrics import classification_report
# from sklearn.utils import class_weight.compute_class_weight

import math

random.seed(11)
np.random.seed(11)

In [2]:
!ls

NER-Dataset-10Types-Train.txt
NER-Dataset--TestSet.txt
NER-Dataset-Train.txt
Q1 - NER Prediction 10 Types (buggy).ipynb
Q1 - NER prediction-10 Types (no test output)-Copy1.ipynb
Q1 - NER prediction-10 Types (no test output).ipynb
Q1 - NER prediction-10 Types (with test output).ipynb
Q1 - NER prediction.ipynb
Q1 - NER prediction (with test output).ipynb
Q2.ipynb


In [3]:
with open('NER-Dataset-Train.txt', 'r') as f:
    ner_dataset = f.readlines()

In [4]:
sentences = []
words = []
tags = []
for line in ner_dataset:
    line = line.strip()
    if line == '':
        sentences.append((words, tags))
        words = []
        tags = []
    else:
        word, tag = line.split('\t')
        words.append(word)
        tags.append(tag)

if len(words) > 0:
    sentences.append((words, tags))
    words = []
    tags= []

In [5]:
len(sentences)

900

In [6]:
vocab_counts = Counter(sum([a[0] for a in sentences], [])).most_common()

In [7]:
words_to_keep = set([word for word, count in vocab_counts if count > 1])
len(words_to_keep)

1456

In [8]:
# parsed_sentences = [([w if w in words_to_keep else 'UNK' for w in words], tags) for words, tags in sentences]

In [9]:
with open('NER-Dataset--TestSet.txt', 'r') as f:
    test_dataset = f.readlines()

test_sentences = []
words = []
for line in test_dataset:
    line = line.strip()
    if line == '':
        test_sentences.append((words,))
        words = []
    else:
        word = line
        words.append(word)

if len(words) > 0:
    test_sentences.append((words,))
    words = []


In [10]:
len(sentences), len(test_sentences)

(900, 100)

In [11]:
word_features = ['twoDigitNum',
                'fourDigitNum',
                'containsDigitAndAlpha',
                'containsDigitAndDash',
                'containsDigitAndSlash',
                'containsDigitAndComma',
                'containsDigitAndPeriod',
                'otherNum',
                'allCaps',
                'capPeriod',
                'firstWord',
                'initCap',
                'lowerCase',
                'other']

In [12]:
def get_word_features(sentence):
    features = []
    ## Optimize and use an Enum!
    firstword = True
    for word in sentence:
        if word.isnumeric() and len(word) == 2:
            features.append('twoDigitNum')
        elif word.isnumeric() and len(word) == 4:
            features.append('fourDigitNum')
        elif word.isalnum() and not word.isalpha() and not word.isnumeric():
            features.append('containsDigitAndAlpha')
        elif word.replace('-', '').isnumeric():
            features.append('containsDigitAndAlpha')
        elif word.replace('/', '').isnumeric():
            features.append('containsDigitAndSlash')
        elif word.replace('.', '').replace(',', '').isnumeric() and ',' in word:
            features.append('containsDigitAndComma')
        elif word.replace('.', '').isnumeric():
            features.append('containsDigitAndPeriod')
        elif word.isnumeric():
            features.append('otherNum')
        elif word.isupper():
            features.append('allCaps')
        elif len(word) == 2 and word[0].isupper() and word[1] == '.':
            features.append('capPeriod')
        elif firstword:
            features.append('firstWord')
        elif word[0].isupper():
            features.append('initCap')
        elif word.islower():
            features.append('lowerCase')
        else:
            features.append('other')
        firstword = False

    return features

In [13]:
max_len_found = max(len(s[0]) for s in sentences)

In [14]:
max_len = max_len_found + ((50 - (max_len_found % 50)) % 50)

In [15]:
eye_mat = list(np.eye(len(word_features)))
wordfeat2float = {feat: eye_mat[i] for i, feat in enumerate(word_features)}

In [16]:
word2idx = {'UNK': 0, 'PAD': 1}
word2idx.update({word: i + 2 for i, word in enumerate(sorted(words_to_keep))})

In [17]:
def numberize_sentence(words, max_len=50):
    features = get_word_features(words)
    word_idx = [word2idx[w] if w in word2idx.keys() else word2idx['UNK'] for w in words]
    feat_np = [wordfeat2float[f] for f in features]
    word_padding = [word2idx['PAD'] for _ in range(max_len - len(word_idx))]
    feat_padding = [np.ones((len(word_features),)) * 2 for _ in range(max_len - len(word_idx))]
    word_idx = np.asarray(word_idx + word_padding)
    feat_np = np.asarray(feat_np + feat_padding)
    return word_idx, feat_np

In [18]:
labels = set.union(*(set(s[1]) for s in sentences))
n_labels = len(labels)
eye_mat = list(np.eye(len(labels)))
labels2float = {feat: eye_mat[i] for i, feat in enumerate(labels)}

def numberize_labels(gt_labels, max_len=50):
    labels_np = [labels2float[l] for l in gt_labels]
    labels_padding = [labels2float['O'] for _ in range(max_len - len(gt_labels))]
    return np.asarray(labels_np + labels_padding)

In [19]:
def create_model():
    input_words = Input(shape=(max_len,))
    input_feats = Input(shape=(max_len, len(word_features)))
    masked_words = Masking(mask_value = 1)(input_words)
    masked_feats = Masking(mask_value = 2)(input_feats)
    emb = Embedding(input_dim=(len(word2idx)), output_dim=50, input_length=max_len)(masked_words)
    drop_emb = Dropout(0.1)(emb)
    concat_out = Concatenate()([drop_emb, masked_feats])
    rnn_out = Bidirectional(SimpleRNN(units=100, return_sequences=True, recurrent_dropout=0.1))(concat_out)
    dense_out = TimeDistributed(Dense(n_labels, activation="softmax"))(rnn_out)
    model = Model(inputs=[input_words, input_feats], outputs=dense_out)
    model.summary()
    return model

In [20]:
parsed_sentences = [(numberize_sentence(s[0]), numberize_labels(s[1])) for s in sentences]

In [21]:
parsed_test_sentences = [numberize_sentence(s[0]) for s in test_sentences]

In [46]:
Counter(sum([s[1] for s in sentences], []))

Counter({'O': 16508, 'B': 582, 'I': 390})

In [51]:
Counter(sum([np.argmax(s[1], axis=-1).tolist() for s in parsed_sentences], []))

Counter({0: 44028, 1: 582, 2: 390})

In [61]:
train_data[7][0][0].shape, train_data[7][1].shape

((50,), (50, 3))

In [65]:
X_train[0][7].shape, Y_train[7].shape

((50,), (50, 3))

In [74]:
# Build the test and training sets of sentences.
kf = KFold(n_splits = 5, shuffle = False)
parsed_sentences = np.asarray(parsed_sentences)
scores = []
scores1 = []
y_pred_idx = []
y_pred_idx1 = []
y_test_idx = []
y_test_idx1 = []

preds = []

for train_index, test_index in kf.split(parsed_sentences):
    train_data = parsed_sentences[train_index]
    test_data = parsed_sentences[test_index]
    X_train = [np.asarray([a[0][0] for a in train_data]), np.asarray([a[0][1] for a in train_data])]
    Y_train = np.asarray([a[1] for a in train_data])
    X_test = [np.asarray([a[0][0] for a in test_data]), np.asarray([a[0][1] for a in test_data])]
    Y_test = np.asarray([a[1] for a in test_data])
    model = create_model()
    model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

    model.fit(X_train, Y_train, epochs=3, validation_split=0.2, batch_size=4)

    y_pred_padded = np.argmax(model.predict(X_test), axis=-1)
    y_true_padded = np.argmax(Y_test, axis=-1)
    
    for i in range(X_test[0].shape[0]):
        for j in range(X_test[0].shape[1]):
            if X_test[0][i][j] == word2idx['PAD']:
                continue
            else:
                pred = y_pred_padded[i][j]
                true = y_true_padded[i][j]
                y_pred_idx.append(pred)
                y_test_idx.append(true)
                scores.append(pred == true)

    prec_, rec_, fscore_, _ = precision_recall_fscore_support(y_test_idx, y_pred_idx, average = 'macro')
    print('[still updating...] Accuracy: {}, Precision: {}, Recall: {}, FScore: {}'.format(np.asarray(scores).mean(), prec_, rec_, fscore_))
    
prec, rec, fscore, _ = precision_recall_fscore_support(y_test_idx, y_pred_idx, average = 'macro')
print('Accuracy: {}, Precision: {}, Recall: {}, FScore: {}'.format(np.asarray(scores).mean(), prec, rec, fscore))

Model: "model_16"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_31 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
masking_31 (Masking)            (None, 50)           0           input_31[0][0]                   
__________________________________________________________________________________________________
embedding_16 (Embedding)        (None, 50, 50)       72900       masking_31[0][0]                 
__________________________________________________________________________________________________
input_32 (InputLayer)           (None, 50, 14)       0                                            
___________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 576 samples, validate on 144 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[still updating...] Accuracy: 0.9495798319327731, Precision: 0.6827391340480347, Recall: 0.5613509653986885, FScore: 0.6059678416300517
Model: "model_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_33 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
masking_33 (Masking)            (None, 50)           0           input_33[0][0]                   
__________________________________________________________________________________________________
embedding_17 (Embedding)        (None, 50, 50)       72900       masking_33[0][0]                 
_____________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 576 samples, validate on 144 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[still updating...] Accuracy: 0.9481324876673713, Precision: 0.7442810193706251, Recall: 0.536210706090852, FScore: 0.6018719264523495
Model: "model_18"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_35 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
masking_35 (Masking)            (None, 50)           0           input_35[0][0]                   
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, 50, 50)       72900       masking_35[0][0]                 
______________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 576 samples, validate on 144 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[still updating...] Accuracy: 0.9488787533257317, Precision: 0.7586130797750258, Recall: 0.5312732211038077, FScore: 0.6004491588837595
Model: "model_19"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_37 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
masking_37 (Masking)            (None, 50)           0           input_37[0][0]                   
__________________________________________________________________________________________________
embedding_19 (Embedding)        (None, 50, 50)       72900       masking_37[0][0]                 
_____________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 576 samples, validate on 144 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[still updating...] Accuracy: 0.9525708588520154, Precision: 0.7627007394702495, Recall: 0.5433828854596211, FScore: 0.6124723545493426
Model: "model_20"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_39 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
masking_39 (Masking)            (None, 50)           0           input_39[0][0]                   
__________________________________________________________________________________________________
embedding_20 (Embedding)        (None, 50, 50)       72900       masking_39[0][0]                 
_____________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 576 samples, validate on 144 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
[still updating...] Accuracy: 0.9517734553775744, Precision: 0.7371925586274307, Recall: 0.5313648049441254, FScore: 0.59650134529246
Accuracy: 0.9517734553775744, Precision: 0.7371925586274307, Recall: 0.5313648049441254, FScore: 0.59650134529246


In [77]:
X_test_data = [np.asarray([a[0] for a in parsed_test_sentences]), np.asarray([a[1] for a in parsed_test_sentences])]

In [81]:
predictions_full = model.predict(X_test_data)

In [82]:
predictions_full.shape

(100, 50, 3)

In [99]:
predictions_list = []
for i, s in enumerate(test_sentences):
    output = []
    for j, w in enumerate(s[0]):
        output.append(np.argmax(predictions_full[i][j]))
    predictions_list.append(output)

In [103]:
len(predictions_list)

100

In [105]:
print(predictions_list)

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0