In [0]:
# Import the required libraries.
import re
import math
import random
import collections
import operator
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict

random.seed(11)
np.random.seed(11)

In [2]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Conv1D, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation, Masking
from keras.optimizers import Adam, SGD
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
from google.colab import files
uploaded = files.upload()

Saving Brown_train.txt to Brown_train (1).txt


In [0]:
def parse_sentence(sentence):
    '''
    Function for parsing the words and tags from the
    sentences of the input corpus.
    '''
    word_tag_pairs = sentence.split(" ")
    words = []
    tags = []

    for i, word_tag in enumerate(word_tag_pairs):
        word, tag = word_tag.strip().rsplit('/', 1)
        words.append(word)
        tags.append(tag)
        
    return words, tags

In [0]:
# Parse the sentences into a list.
parsed_sentences = []

with open('./Brown_train.txt', 'r') as file:
    sentences = file.readlines()

    for sentence in sentences:
        sentence = sentence.strip()
        parsed_sentences.append(parse_sentence(sentence))

In [0]:
def get_vocab(X_train, Y_train):
    '''
    Function for building the vocabulary from the training set of
    words and tags.
    '''
    vocabulary2id = dict()    
    tag2id = dict()
    vocabulary2id['PAD'] = 0
    vocabulary2id['UNK'] = 1

    for sent in X_train:
        for word in sent:
            if word not in vocabulary2id.keys():
                vocabulary2id[word] = len(vocabulary2id)
    
    tag2id['PAD'] = 0
    for sent in Y_train:
        for tag in sent:
            if tag not in tag2id.keys():
                tag2id[tag] = len(tag2id)
    
    return vocabulary2id, tag2id

def get_word_tag_counts(X_train, Y_train, vocabulary2id, tag2id):
    '''
    Function for calculating the counts pertaining to the
    individual word tags.
    '''
    wordcount = defaultdict(int)
    tagcount = defaultdict(int)
    tagpaircount = defaultdict(int)
    tagtriplecount = defaultdict(int)
    
    for sent in X_train:
        for word in sent:
            wordcount[word] += 1
    
    for sent in Y_train:
        for tag in sent:
            tagcount[tag] += 1
    
    for sent in Y_train:
        for i in range(len(sent) - 1):
            tagpaircount[sent[i], sent[i + 1]] += 1

    for sent in Y_train:
        for i in range(len(sent) - 2):
            tagtriplecount[sent[i], sent[i + 1], sent[i + 2]] += 1
    
    return wordcount, tagcount, tagpaircount, tagtriplecount

In [8]:
parsed_sentences[:5]

[(['At',
   'that',
   'time',
   'highway',
   'engineers',
   'traveled',
   'rough',
   'and',
   'dirty',
   'roads',
   'to',
   'accomplish',
   'their',
   'duties',
   '.'],
  ['ADP',
   'DET',
   'NOUN',
   'NOUN',
   'NOUN',
   'VERB',
   'ADJ',
   'CONJ',
   'ADJ',
   'NOUN',
   'PRT',
   'VERB',
   'DET',
   'NOUN',
   '.']),
 (['Using',
   'privately-owned',
   'vehicles',
   'was',
   'a',
   'personal',
   'hardship',
   'for',
   'such',
   'employees',
   ',',
   'and',
   'the',
   'matter',
   'of',
   'providing',
   'state',
   'transportation',
   'was',
   'felt',
   'perfectly',
   'justifiable',
   '.'],
  ['VERB',
   'ADJ',
   'NOUN',
   'VERB',
   'DET',
   'ADJ',
   'NOUN',
   'ADP',
   'ADJ',
   'NOUN',
   '.',
   'CONJ',
   'DET',
   'NOUN',
   'ADP',
   'VERB',
   'NOUN',
   'NOUN',
   'VERB',
   'VERB',
   'ADV',
   'ADJ',
   '.']),
 (['Once',
   'the',
   'principle',
   'was',
   'established',
   ',',
   'the',
   'increase',
   'in',
   'state-owned'

In [0]:
def build_model():
    model = Sequential()
    # model.add(InputLayer(input_shape=(padlen, )))
    model.add(Masking(mask_value=float(vocabulary2id['UNK']),input_shape=(padlen,)))
    model.add(Embedding(len(vocabulary2id), 100))
    model.add(Bidirectional(LSTM(int((128+256)/2), return_sequences=True)))
    model.add(TimeDistributed(Dense(len(tag2id))))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(0.001),
                  metrics=['accuracy'])
    model.summary()
    return model

In [0]:
def id2onehot(Y, numtags):
    out = []
    for s in Y:
        categories = []
        for item in s:
            categories.append(np.zeros(numtags))
            categories[-1][item] = 1.0
        out.append(categories)
    return np.array(out)

In [12]:
# Build the test and training sets of sentences.
kf = KFold(n_splits = 3, shuffle = False)
parsed_sentences = np.asarray(parsed_sentences)
scores = []
scores1 = []
y_pred_idx = []
y_pred_idx1 = []
y_test_idx = []
y_test_idx1 = []

preds_all_folds = []
golds_all_folds = []

for fold_num, (train_index, test_index) in enumerate(kf.split(parsed_sentences)):
    train_data = parsed_sentences[train_index]
    test_data = parsed_sentences[test_index]
    X_train = [a[0] for a in train_data]
    Y_train = [a[1] for a in train_data]
    X_test = [a[0] for a in test_data]
    Y_test = [a[1] for a in test_data]
    
    # Build the vocabulary and word counts.
    vocabulary2id, tag2id = get_vocab(X_train, Y_train)
    
    padlen = max(len(i) for i in X_train)
    def pad(sentence, padid=vocabulary2id['PAD']):
        out = sentence[:padlen]
        padding = [padid for _ in range(padlen - len(out))]
        return out + padding
    
    X_train_ids = np.asarray([pad([vocabulary2id[word] if word in vocabulary2id.keys() else vocabulary2id['UNK'] for word in sent]) for sent in X_train])
    X_test_ids = np.array([pad([vocabulary2id[word] if word in vocabulary2id.keys() else vocabulary2id['UNK'] for word in sent]) for sent in X_test])

    Y_train_ids = np.asarray([pad([tag2id[word] if word in tag2id.keys() else tag2id['UNK'] for word in sent], tag2id['PAD']) for sent in Y_train])
    Y_test_ids = np.asarray([pad([tag2id[word] if word in tag2id.keys() else tag2id['UNK'] for word in sent], tag2id['PAD']) for sent in Y_test])

    Y_train_onehot = id2onehot(Y_train_ids, len(tag2id))
    Y_test_onehot = id2onehot(Y_test_ids, len(tag2id))

    model = build_model()
    model.fit(X_train_ids, Y_train_onehot, batch_size=128, epochs=5, validation_split=0.2)

    predictions = model.predict(X_test_ids)

    # test_accuracy = np.sum((Y_test_ids == np.argmax(predictions, axis=-1)) * (Y_test_ids != 0)) / np.sum((Y_test_ids != 0))
    # print('Fold {} test_accuracy: {}'.format(fold_num + 1, test_accuracy))

    predictions_argmax = np.argmax(predictions, axis=-1)

    y_pred_nopad = []
    y_true_nopad = []

    for i in range(len(Y_test_ids)):
        for j in range(len(Y_test_ids[i])):
            if Y_test_ids[i][j] != 0 and predictions_argmax[i][j] != 0:
                y_true_nopad.append(Y_test_ids[i][j])
                if predictions_argmax[i][j] == 0:
                    y_pred_nopad.append(1)
                else:
                    y_pred_nopad.append(predictions_argmax[i][j])

    preds_all_folds.extend(y_pred_nopad)
    golds_all_folds.extend(y_true_nopad)

    y_pred_nopad = np.asarray(y_pred_nopad)
    y_true_nopad = np.asarray(y_true_nopad)
    test_accuracy = (y_pred_nopad == y_true_nopad).mean()
    print('Fold {} test_accuracy: {}'.format(fold_num + 1, test_accuracy))
    prec, rec, fscore, _ = precision_recall_fscore_support(y_true_nopad, y_pred_nopad, average = 'weighted')
    print('Fold {} Precision: {} Recall: {} F1-Score: {}'.format(fold_num + 1, prec, rec, fscore))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_1 (Masking)          (None, 172)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 172, 100)          2211500   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 172, 384)          450048    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 172, 13)           5005      
_________________________________________________________________
activation_1 (Activation)    (None, 172, 13)           0         
Total params: 2,666,553
Trainable params: 2,666,553
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 14661 samples, validate on 3666 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 1 test_accuracy: 0.9010598534402203
Fold 1 Precision: 0.9030468569039867 Recall: 0.9010598534402203 F1-Score: 0.8959284425488834
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_2 (Masking)          (None, 386)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 386, 100)          2850100   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 386, 384)          450048    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 386, 13)           5005      
_________________________________________________________________
activation_2 (Activation)    (None, 386, 13)           0         
Total params: 3,305

  _warn_prf(average, modifier, msg_start, len(result))


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_3 (Masking)          (None, 386)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 386, 100)          2745900   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 386, 384)          450048    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 386, 13)           5005      
_________________________________________________________________
activation_3 (Activation)    (None, 386, 13)           0         
Total params: 3,200,953
Trainable params: 3,200,953
Non-trainable params: 0
_________________________________________________________________
Train on 14662 samples, validate on 3666 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 3 test

In [14]:
print("---Averaged Results over all the epochs---")
test_accuracy = (np.asarray(preds_all_folds) == np.asarray(golds_all_folds)).mean()
print('Average K-Fold Test Accuracy: {}'.format(test_accuracy))
prec, rec, fscore, _ = precision_recall_fscore_support(preds_all_folds, golds_all_folds, average = 'weighted')
print('Average K-Fold Precision: {} Recall: {} F1-Score: {}'.format(prec, rec, fscore))

---Averaged Results over all the epochs---
Average K-Fold Test Accuracy: 0.9151568111261053
Average K-Fold Precision: 0.9203422617207301 Recall: 0.9151568111261053 F1-Score: 0.9163942865540586


In [15]:
id2tag = {v: k for k, v in tag2id.items()}
print(classification_report([id2tag[i] for i in golds_all_folds], [id2tag[i] for i in preds_all_folds]))

              precision    recall  f1-score   support

           .       0.99      1.00      1.00     74779
         ADJ       0.84      0.86      0.85     51841
         ADP       0.90      0.97      0.94     38739
         ADV       0.85      0.72      0.78     37519
        CONJ       0.85      0.94      0.89     73239
         DET       0.95      0.93      0.94     44180
        NOUN       0.91      0.97      0.94     83267
         NUM       0.97      0.81      0.88      6794
        PRON       0.98      0.81      0.89     27093
         PRT       0.95      0.89      0.92     45995
        VERB       0.94      0.92      0.93     58401
           X       0.00      0.00      0.00       589

    accuracy                           0.92    542436
   macro avg       0.84      0.82      0.83    542436
weighted avg       0.92      0.92      0.91    542436

