In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk import bigrams
from sklearn import preprocessing
from sklearn.metrics import recall_score, precision_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import keras
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras.layers import Dense, Embedding, Flatten, BatchNormalization, Dropout, Input, Activation
from keras.models import Sequential
from keras.utils import np_utils

In [7]:
# Path to the pretrained word-vector file.
embeddingPath = './../data/GoogleNews-vectors-negative300.bin'
# Load the vectors; binary=True because the file is in word2vec's binary format.
embeddings = KeyedVectors.load_word2vec_format(embeddingPath, binary=True)

In [8]:
# Location of the tagged training corpus that the loading cell opens.
DATA_PATH = './../data/Brown_tagged_train.txt'

In [9]:
# Read the whole corpus and keep it as a NumPy array with one entry per line
# (line terminators are removed by splitlines()).
with open(DATA_PATH, 'r') as f:
    data = np.array(f.read().splitlines())


In [10]:
def split_Xy(test_Xy):
    """
    Separate tagged sentences into parallel token and tag lists.

    test_Xy: iterable of strings, each one sentence made of space-separated
             "word/tag" items
    Returns: (tokens, tags) -- two lists with one inner list per sentence;
             tokens[s][k] and tags[s][k] describe the same item
    """
    all_tokens, all_tags = [], []

    for line in test_Xy:
        # Splitting on a single space produces empty pieces for repeated or
        # leading/trailing spaces; those are dropped before parsing.
        pairs = [split_tag_word(item) for item in line.split(' ') if item != ""]
        all_tokens.append([token for token, _ in pairs])
        all_tags.append([label for _, label in pairs])

    return all_tokens, all_tags


In [11]:
def split_tag_word(inp):
    """
    Split a "word/tag" item at its final '/'.

    Everything before the last '/' is the word (so a word that itself
    contains '/' stays intact) and everything after it is the tag.  An item
    without any '/' yields an empty word and the whole item as the tag.

    Returns: (word, tag)
    """
    word, _, tag = inp.rpartition('/')
    return word, tag


In [12]:
# split_Xy returns (tokens, tags): trainSet holds the per-sentence token lists
# and, despite its name, testXy holds the matching per-sentence tag lists.
trainSet, testXy = split_Xy(data)

In [18]:
def transform_to_dataset(sentences, tags, window=1):
    """
    Flatten tagged sentences into per-token feature and label lists.

    sentences: list of token lists
    tags:      list of tag lists, indexed the same way as `sentences`
    window:    passed through unchanged to features_embs
    Returns:   (features, labels, unknown_total) with one feature entry and
               one label per token, in corpus order

    NOTE(review): features_embs is not defined in this part of the notebook;
    from its use here it is expected to return a (vector, unknown_count)
    pair for the token at the given position -- confirm against its definition.
    """
    features, labels = [], []
    unknown_total = 0
    for sent_idx in tqdm(range(len(sentences))):
        tokens = sentences[sent_idx]
        for pos in range(len(tokens)):
            vec, n_unknown = features_embs(tokens, pos, window)
            features.append(vec)
            labels.append(tags[sent_idx][pos])
            unknown_total += n_unknown
    return features, labels, unknown_total


def vectorize(trainSent, trainTags, window=1, embedding='word2vec'):
    """
    Build object-dtype feature and label arrays for tagged sentences.

    trainSent: list of token lists
    trainTags: list of tag lists aligned with `trainSent`
    window:    forwarded to transform_to_dataset
    embedding: accepted for interface compatibility; not used by this function
    Returns:   (X, y) as NumPy arrays with dtype=object

    Prints the unknown-word count and both array shapes as a side effect.
    """
    features, labels, n_unknown = transform_to_dataset(trainSent, trainTags, window)
    print('Unknown words:', n_unknown)
    X, y = (np.asarray(seq, dtype=object) for seq in (features, labels))
    print('X shape:', X.shape)
    print('y shape:', y.shape)
    return X, y

def getYtrain(sentences, tags):
    """
    Flatten per-sentence tag lists into a single 1-D label array.

    sentences: list of token lists; only their lengths are used, to decide
               how many tags to take from each sentence
    tags:      list of tag lists, indexed the same way as `sentences`
    Returns:   NumPy array with one tag per token, in corpus order
    """
    labels = []
    for sent_idx in tqdm(range(len(sentences))):
        n_tokens = len(sentences[sent_idx])
        labels.extend(tags[sent_idx][pos] for pos in range(n_tokens))
    return np.array(labels)


In [19]:
# Flatten the per-sentence tag lists into one label per token.
y_train = getYtrain(trainSet, testXy)

100%|██████████| 27491/27491 [00:00<00:00, 114988.01it/s]


In [20]:
# Distinct tags present in the labels, in sorted order.
classes = sorted(set(y_train))


In [21]:
# Display the sorted tag inventory.
classes

['.',
 'ADJ',
 'ADP',
 'ADV',
 'CONJ',
 'DET',
 'NOUN',
 'NUM',
 'PRON',
 'PRT',
 'VERB',
 'X']

In [22]:
# Encode each tag string as an integer id, then expand the ids into one-hot rows.
# NOTE: this rebinds y_train, so re-running this cell on its own would feed the
# one-hot matrix back into the encoder; re-run the cell that builds y_train first.
le = preprocessing.LabelEncoder()
y_train = np_utils.to_categorical(le.fit_transform(y_train))

print(y_train.shape)


(543149, 12)


In [23]:
# Look up one embedding vector per token, walking sentences and tokens in the
# same order used to build y_train so that embs[k] lines up with label row k.
embs = []
out_ = 0  # tokens with no pretrained vector (out-of-vocabulary)
in_ = 0   # tokens found in the pretrained vocabulary
oov_rng = np.random.default_rng(42)  # seeded so the OOV stand-ins are reproducible
oov_vectors = {}  # word -> random stand-in vector, shared by every occurrence
for i in tqdm(range(len(trainSet))):
    for j in trainSet[i]:
        try:
            vec = embeddings[j]
            in_ += 1
        except KeyError:
            # Only a vocabulary miss is expected here; any other error (or a
            # keyboard interrupt) should surface instead of being swallowed.
            vec = oov_vectors.get(j)
            if vec is None:
                # Draw once per word so a given word always maps to the same
                # vector.  Cast to float32 (gensim's default load dtype --
                # confirm if a custom datatype is used) so that stacking the
                # list later does not upcast every vector to float64.
                vec = oov_rng.uniform(
                    -0.25, 0.25, embeddings.vector_size).astype(np.float32)
                oov_vectors[j] = vec
            out_ += 1
        embs.append(vec)


100%|██████████| 27491/27491 [00:16<00:00, 1647.26it/s]


In [24]:
# Sanity check: number of embedding vectors next to the shape of the one-hot labels.
len(embs), y_train.shape


(543149, (543149, 12))

In [25]:
# Hold out 20% of the (vector, label) pairs; random_state fixes the shuffle so
# the split is reproducible.  train_test_split is imported in the notebook's
# import cell rather than here, so all dependencies are declared in one place.
embs_train1, embs_test1, y_train1, y_test1 = train_test_split(
    embs, y_train, test_size=0.2, random_state=42)
print("Previous: ", len(embs), y_train.shape)
print("Current: ", len(embs_train1), len(
    embs_test1), y_train1.shape, y_test1.shape)


Previous:  543149 (543149, 12)
Current:  434519 108630 (434519, 12) (108630, 12)


In [26]:
# Convert the split outputs to NumPy arrays; the model cell reads
# embs_train1.shape[1] as its input width.
embs_train1 = np.array(embs_train1)
embs_test1 = np.array(embs_test1)

In [27]:
# Feed-forward classifier: one embedding vector in, a softmax over the label
# columns out, with three ReLU hidden layers of decreasing width in between.
n_features = embs_train1.shape[1]
n_classes = y_train1.shape[1]
model = Sequential([
    Dense(600, activation='relu', input_dim=n_features),
    Dense(300, activation='relu'),
    Dense(150, activation='relu'),
    Dense(n_classes, activation='softmax'),
])
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
model.summary()

2021-10-31 22:24:39.091503: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 600)               180600    
_________________________________________________________________
dense_1 (Dense)              (None, 300)               180300    
_________________________________________________________________
dense_2 (Dense)              (None, 150)               45150     
_________________________________________________________________
dense_3 (Dense)              (None, 12)                1812      
Total params: 407,862
Trainable params: 407,862
Non-trainable params: 0
_________________________________________________________________


In [28]:
nb_epoch = 7
batch_size = 128

# The model is compiled with metrics=['accuracy'], so the validation metric is
# logged as 'val_accuracy'; monitoring 'val_acc' would never find a value.
# Save the model only on epochs where validation accuracy improves.
cp = ModelCheckpoint(filepath="tagger.h5",
                     monitor='val_accuracy',
                     save_best_only=True,
                     verbose=1)

# Write training logs for TensorBoard under ./logs.
tb = TensorBoard(log_dir='./logs',
                 histogram_freq=0,
                 write_graph=True,
                 write_images=True)

# Stop early if validation accuracy has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)

# The callbacks only take effect when handed to fit(); previously they were
# constructed but never used.
history = model.fit(embs_train1, y_train1,
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    shuffle=True,
                    verbose=1, validation_data=(embs_test1, y_test1),
                    callbacks=[cp, tb, early_stopping])


2021-10-31 22:25:14.946734: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2021-10-31 22:25:14.946757: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2021-10-31 22:25:14.949936: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2021-10-31 22:25:19.762063: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
