In [0]:
# Import the required libraries.
import re
import math
import random
import collections
import operator
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict

random.seed(11)
np.random.seed(11)

In [0]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Conv1D, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation, Masking, Flatten
from keras.optimizers import Adam, SGD
from keras.preprocessing.sequence import pad_sequences

In [4]:
from google.colab import files
uploaded = files.upload()

Saving Brown_train.txt to Brown_train.txt


In [0]:
def parse_sentence(sentence):
    '''
    Function for parsing the words and tags from the
    sentences of the input corpus.
    '''
    word_tag_pairs = sentence.split(" ")
    words = []
    tags = []

    for i, word_tag in enumerate(word_tag_pairs):
        word, tag = word_tag.strip().rsplit('/', 1)
        words.append(word)
        tags.append(tag)
        
    return words, tags

In [0]:
# Parse the sentences into a list.
parsed_sentences = []

with open('./Brown_train.txt', 'r') as file:
    sentences = file.readlines()

    for sentence in sentences:
        sentence = sentence.strip()
        parsed_sentences.append(parse_sentence(sentence))

In [0]:
def get_vocab(X_train, Y_train):
    '''
    Function for building the vocabulary from the training set of
    words and tags.
    '''
    vocabulary2id = dict()    
    tag2id = dict()
    vocabulary2id['PAD'] = 0
    vocabulary2id['UNK'] = 1

    for sent in X_train:
        for word in sent:
            if word not in vocabulary2id.keys():
                vocabulary2id[word] = len(vocabulary2id)
    
    tag2id['PAD'] = 0
    for sent in Y_train:
        for tag in sent:
            if tag not in tag2id.keys():
                tag2id[tag] = len(tag2id)
    
    return vocabulary2id, tag2id

def get_word_tag_counts(X_train, Y_train, vocabulary2id, tag2id):
    '''
    Function for calculating the counts pertaining to the
    individual word tags.
    '''
    wordcount = defaultdict(int)
    tagcount = defaultdict(int)
    tagpaircount = defaultdict(int)
    tagtriplecount = defaultdict(int)
    
    for sent in X_train:
        for word in sent:
            wordcount[word] += 1
    
    for sent in Y_train:
        for tag in sent:
            tagcount[tag] += 1
    
    for sent in Y_train:
        for i in range(len(sent) - 1):
            tagpaircount[sent[i], sent[i + 1]] += 1

    for sent in Y_train:
        for i in range(len(sent) - 2):
            tagtriplecount[sent[i], sent[i + 1], sent[i + 2]] += 1
    
    return wordcount, tagcount, tagpaircount, tagtriplecount

In [8]:
parsed_sentences[:5]

[(['At',
   'that',
   'time',
   'highway',
   'engineers',
   'traveled',
   'rough',
   'and',
   'dirty',
   'roads',
   'to',
   'accomplish',
   'their',
   'duties',
   '.'],
  ['ADP',
   'DET',
   'NOUN',
   'NOUN',
   'NOUN',
   'VERB',
   'ADJ',
   'CONJ',
   'ADJ',
   'NOUN',
   'PRT',
   'VERB',
   'DET',
   'NOUN',
   '.']),
 (['Using',
   'privately-owned',
   'vehicles',
   'was',
   'a',
   'personal',
   'hardship',
   'for',
   'such',
   'employees',
   ',',
   'and',
   'the',
   'matter',
   'of',
   'providing',
   'state',
   'transportation',
   'was',
   'felt',
   'perfectly',
   'justifiable',
   '.'],
  ['VERB',
   'ADJ',
   'NOUN',
   'VERB',
   'DET',
   'ADJ',
   'NOUN',
   'ADP',
   'ADJ',
   'NOUN',
   '.',
   'CONJ',
   'DET',
   'NOUN',
   'ADP',
   'VERB',
   'NOUN',
   'NOUN',
   'VERB',
   'VERB',
   'ADV',
   'ADJ',
   '.']),
 (['Once',
   'the',
   'principle',
   'was',
   'established',
   ',',
   'the',
   'increase',
   'in',
   'state-owned'

In [0]:
def build_model():
    model = Sequential()
    model.add(InputLayer(input_shape=(5, )))
    # model.add(Masking(mask_value=float(vocabulary2id['UNK']),input_shape=(5,)))
    model.add(Embedding(len(vocabulary2id), 100))
    model.add(Flatten())
    # model.add(Bidirectional(LSTM(int((128+256)/2), return_sequences=True)))
    # model.add(TimeDistributed(Dense(len(tag2id))))
    model.add(Dense(len(tag2id)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(0.001),
                  metrics=['accuracy'])
    model.summary()
    return model

In [0]:
def id2onehot(Y, numtags):
    out = []
    for s in Y:
        out.append(np.zeros(numtags))
        out[-1][s] = 1.0
    return np.array(out)

In [0]:
def make_example(words, vocabulary2id):
    words_new = ['PAD', 'PAD'] + words + ['PAD', 'PAD']
    examples = []
    for i in range(len(words)):
        context_words = words_new[i: i + 5]
        context_word_idx = [vocabulary2id[w] if w in vocabulary2id.keys() else vocabulary2id['UNK'] for w in context_words]
        examples.append(context_word_idx)

    return examples

In [65]:
# Build the test and training sets of sentences.
kf = KFold(n_splits = 3, shuffle = False)
parsed_sentences = np.asarray(parsed_sentences)
scores = []
scores1 = []
y_pred_idx = []
y_pred_idx1 = []
y_test_idx = []
y_test_idx1 = []

preds_all_folds = []
golds_all_folds = []

for fold_num, (train_index, test_index) in enumerate(kf.split(parsed_sentences)):
    train_data = parsed_sentences[train_index]
    test_data = parsed_sentences[test_index]
    X_train = [a[0] for a in train_data]
    Y_train = [a[1] for a in train_data]
    X_test = [a[0] for a in test_data]
    Y_test = [a[1] for a in test_data]

    # Build the vocabulary and word counts.
    vocabulary2id, tag2id = get_vocab(X_train, Y_train)

    # padlen = max(len(i) for i in X_train)
    # def pad(sentence, padid=vocabulary2id['PAD']):
    #     out = sentence[:padlen]
    #     padding = [padid for _ in range(padlen - len(out))]
    #     return out + padding
    # break
    X_train_ids = []
    Y_train_ids = []
    for x_sent, y_sent in zip(X_train, Y_train):
        X_train_ids.extend(make_example(x_sent, vocabulary2id))
        Y_train_ids.extend([tag2id[word] if word in tag2id.keys() else tag2id['UNK'] for word in y_sent])

    X_test_ids = []
    Y_test_ids = []
    for x_sent, y_sent in zip(X_test, Y_test):
        X_test_ids.extend(make_example(x_sent, vocabulary2id))
        Y_test_ids.extend([tag2id[word] if word in tag2id.keys() else tag2id['UNK'] for word in y_sent])
    
    X_train_ids = np.asarray(X_train_ids)
    X_test_ids = np.asarray(X_test_ids)

    # X_test_ids = np.array([pad([vocabulary2id[word] if word in vocabulary2id.keys() else vocabulary2id['UNK'] for word in sent]) for sent in X_test])

    # Y_train_ids = np.asarray([pad([tag2id[word] if word in tag2id.keys() else tag2id['UNK'] for word in sent], tag2id['PAD']) for sent in Y_train])
    # Y_test_ids = np.asarray([pad([tag2id[word] if word in tag2id.keys() else tag2id['UNK'] for word in sent], tag2id['PAD']) for sent in Y_test])

    Y_train_onehot = id2onehot(Y_train_ids, len(tag2id))
    Y_test_onehot = id2onehot(Y_test_ids, len(tag2id))

    model = build_model()
    model.fit(X_train_ids, Y_train_onehot, batch_size=128, epochs=5, validation_split=0.2)

    predictions = model.predict(X_test_ids)

    # test_accuracy = np.sum((Y_test_ids == np.argmax(predictions, axis=-1)) * (Y_test_ids != 0)) / np.sum((Y_test_ids != 0))
    # print('Fold {} test_accuracy: {}'.format(fold_num + 1, test_accuracy))

    predictions_argmax = np.argmax(predictions, axis=-1)

    y_pred_nopad = predictions_argmax[:]
    y_true_nopad = Y_test_ids[:]

    # for i in range(len(Y_test_ids)):
    #     if Y_test_ids[i] != 0 and predictions_argmax[i] != 0:
    #         y_true_nopad.append(Y_test_ids[i][j])
    #         if predictions_argmax[i][j] == 0:
    #             y_pred_nopad.append(1)
    #         else:
    #             y_pred_nopad.append(predictions_argmax[i][j])

    preds_all_folds.extend(y_pred_nopad)
    golds_all_folds.extend(y_true_nopad)

    y_pred_nopad = np.asarray(y_pred_nopad)
    y_true_nopad = np.asarray(y_true_nopad)
    test_accuracy = (y_pred_nopad == y_true_nopad).mean()
    print('Fold {} test_accuracy: {}'.format(fold_num + 1, test_accuracy))
    prec, rec, fscore, _ = precision_recall_fscore_support(y_true_nopad, y_pred_nopad, average = 'weighted')
    print('Fold {} Precision: {} Recall: {} F1-Score: {}'.format(fold_num + 1, prec, rec, fscore))

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 5, 100)            2211500   
_________________________________________________________________
flatten_4 (Flatten)          (None, 500)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 13)                6513      
_________________________________________________________________
activation_3 (Activation)    (None, 13)                0         
Total params: 2,218,013
Trainable params: 2,218,013
Non-trainable params: 0
_________________________________________________________________


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 238759 samples, validate on 59690 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 1 test_accuracy: 0.9145525132815693
Fold 1 Precision: 0.9154295258590754 Recall: 0.9145525132815693 F1-Score: 0.9129063570699006


  _warn_prf(average, modifier, msg_start, len(result))


Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 5, 100)            2850100   
_________________________________________________________________
flatten_5 (Flatten)          (None, 500)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 13)                6513      
_________________________________________________________________
activation_4 (Activation)    (None, 13)                0         
Total params: 2,856,613
Trainable params: 2,856,613
Non-trainable params: 0
_________________________________________________________________
Train on 318057 samples, validate on 79515 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 2 test_accuracy: 0.9538182542571972
Fold 2 Precision: 0.9539599018345375 Recall: 0.9538182542571972 F1-Score: 0.9538492424867576
Model: 

In [67]:
print("---Averaged Results over all the epochs---")
test_accuracy = (np.asarray(preds_all_folds) == np.asarray(golds_all_folds)).mean()
print('Average K-Fold Test Accuracy: {}'.format(test_accuracy))
prec, rec, fscore, _ = precision_recall_fscore_support(preds_all_folds, golds_all_folds, average = 'weighted')
print('Average K-Fold Precision: {} Recall: {} F1-Score: {}'.format(prec, rec, fscore))

---Averaged Results over all the epochs---
Average K-Fold Test Accuracy: 0.9347582339284433
Average K-Fold Precision: 0.9363865544203436 Recall: 0.9347582339284433 F1-Score: 0.9351816000908588


  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
id2tag = {v: k for k, v in tag2id.items()}
print(classification_report([id2tag[i] for i in golds_all_folds], [id2tag[i] for i in preds_all_folds]))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           .       0.99      1.00      0.99     74854
         ADJ       0.87      0.89      0.88     51942
         ADP       0.96      0.97      0.96     38784
         ADV       0.89      0.79      0.84     37582
        CONJ       0.88      0.93      0.91     73425
         DET       0.95      0.94      0.95     44196
        NOUN       0.95      0.95      0.95     83364
         NUM       0.97      0.95      0.96      6795
         PAD       0.00      0.00      0.00         0
        PRON       0.98      0.92      0.95     27098
         PRT       0.95      0.93      0.94     46106
        VERB       0.94      0.95      0.95     58413
           X       0.46      0.22      0.30       590

    accuracy                           0.93    543149
   macro avg       0.83      0.80      0.81    543149
weighted avg       0.93      0.93      0.93    543149

