In [38]:
from tqdm import tqdm
from nltk import bigrams
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, precision_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import gensim
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.decomposition import PCA
from keras.utils import np_utils
from matplotlib import pyplot
from keras.preprocessing.sequence import pad_sequences
from tensorflow.python import keras
from keras import utils as np_util
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.math import argmax
from sklearn.model_selection import train_test_split
import copy
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard


In [2]:
DATA_PATH = './../data/Brown_tagged_train.txt'


In [3]:
with open(DATA_PATH, 'r') as f:
    data = f.read().splitlines()
data = np.array(data)
trainData, validData = train_test_split(data, test_size=0.2, random_state=0)

In [4]:
def split_Xy(test_Xy):
    """
    test_Xy: List of list of tokens and tags
    Returns: List of tokens and list of tags
    """
    test_y = []
    test_X = []

    for sent in test_Xy:
        tagged_sent = sent.split(' ')
        sent_y = []
        sent_X = []

        for word in tagged_sent:
            if word == "":
                continue
            actual_word, tag = split_tag_word(word)
            sent_X.append(actual_word)
            sent_y.append(tag)

        test_y.append(sent_y)
        test_X.append(sent_X)

    return test_X, test_y

In [5]:
def split_tag_word(inp):
    """
    Returns word, tag for the given input
    """
    arr = inp.split('/')
    tag = arr[-1]
    del arr[-1]
    word = '/'.join(arr)
    return word, tag


In [6]:
trainSents, trainSentTags = split_Xy(trainData)


In [7]:
testSents, testSentTags = split_Xy(validData)


In [8]:

def preprocessData(trainSents, trainSentTags):
    uniqueWords = set()
    for sent in trainSents:
        for word in sent:
            uniqueWords.add(word)

    uniqueTags = set()
    for sent in trainSentTags:
        for tag in sent:
            uniqueTags.add(tag)

    tagIndex = {}
    idx = 0
    for i in uniqueTags:
        tagIndex[i] = idx
        idx += 1

    wordIndex = {}
    idx = 0
    for i in uniqueWords:
        wordIndex[i] = idx
        idx += 1

    # trainSentsText = trainSents.copy()
    # trainTagsText = trainSentTags.copy()
    trainSents[0]
    wordTokenize = Tokenizer()
    wordTokenize.fit_on_texts(trainSents)
    trainSents = wordTokenize.texts_to_sequences(trainSents)
    paddedTrainSents = pad_sequences(
        trainSents, maxlen=387, padding='pre', truncating='post')
    tagTokenize = Tokenizer()
    tagTokenize.fit_on_texts(trainSentTags)
    trainSentTags = tagTokenize.texts_to_sequences(trainSentTags)
    paddedTrainSentTags = pad_sequences(
        trainSentTags, maxlen=387, padding='pre', truncating='post')

    oneHotEncodedTrainSentTags = to_categorical(
        paddedTrainSentTags, num_classes=None, dtype='float32')

    return paddedTrainSents, oneHotEncodedTrainSentTags, tagIndex, wordIndex, trainSents, wordTokenize, tagTokenize


In [10]:
gloveEmbeddings = {} 
f = open('../data/glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    gloveEmbeddings[word] = coefs
f.close()

In [11]:
def makeModel(trainSents, trainSentTags, testSents, testSentTags):
    trainPaddedSents, trainOneHotEncodedTags, trainTagIndex, trainWordIndex, trainSents, trainWordTokenizer, traintagWordTokenizer = preprocessData(
        trainSents, trainSentTags)
    testPaddedSents, testOneHotEncodedTags, testTagIndex, testWordIndex, testSents, testWordTokenizer, testTagWordTokenizer = preprocessData(
        testSents, testSentTags)
    w2vgensim = gensim.models.word2vec.Word2Vec(
        trainSents, vector_size=300, min_count=1, window=5)
    embeddings = np.zeros((len(trainWordTokenizer.word_index) + 1, 300))
    wordIndices = trainWordTokenizer.word_index
    for word, index in wordIndices.items():
        try:
            embeddings[index, :] = gloveEmbeddings[word]
        except:
            embeddings[index, :] = np.random.uniform(-0.25, 0.25, 300)

    mlp = keras.Sequential()
    mlp.add(keras.layers.Embedding(len(trainWordTokenizer.word_index) + 1,
                                   300, weights=[embeddings], input_length=387, trainable=True))
    mlp.add((keras.layers.Dense(100, activation='relu')))
    mlp.add((keras.layers.Dense(13, activation='softmax')))
    mlp.compile(loss='mean_squared_error',
                optimizer='adam', metrics=['accuracy'])
    mlp.summary()
    mlp.fit(trainPaddedSents, trainOneHotEncodedTags, batch_size=128,
            epochs=2, validation_data=(testPaddedSents, testOneHotEncodedTags))
    return mlp, traintagWordTokenizer, testTagWordTokenizer, trainPaddedSents, trainOneHotEncodedTags, testPaddedSents, testOneHotEncodedTags


In [12]:
mlp, trainTagWordTokenizer, testTagWordTokenizer, trainFinalSents, trainFinalTags, testFinalSents, testFinalTags = makeModel(
    trainSents, trainSentTags, testSents, testSentTags)

2021-10-31 21:43:35.520433: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 387, 300)          8256000   
_________________________________________________________________
dense (Dense)                (None, 387, 100)          30100     
_________________________________________________________________
dense_1 (Dense)              (None, 387, 13)           1313      
Total params: 8,287,413
Trainable params: 8,287,413
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2


2021-10-31 21:43:36.075259: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 2/2


# Without taking context

In [18]:
train_embs = []
train_tags = []
out_ = 0
in_ = 0
for i in tqdm(range(len(trainSents))):
    for k in range(len(trainSents[i])):
        j = trainSents[i][k]
        train_tags.append(trainSentTags[i][k])
        try:
            train_embs.append(gloveEmbeddings[j])
            in_ += 1
        except:
            oov = np.random.uniform(-0.25, 0.25, 300)
            train_embs.append(oov)
            out_ += 1

100%|██████████| 21992/21992 [00:03<00:00, 6299.31it/s]


In [20]:
len(train_embs), len(train_tags)

(433559, 433559)

In [21]:
test_embs = []
test_tags = []
out_ = 0
in_ = 0
for i in tqdm(range(len(testSents))):
    for k in range(len(testSents[i])):
        j = testSents[i][k]
        test_tags.append(testSentTags[i][k])
        try:
            test_embs.append(gloveEmbeddings[j])
            in_ += 1
        except:
            oov = np.random.uniform(-0.25, 0.25, 300)
            test_embs.append(oov)
            out_ += 1


100%|██████████| 5499/5499 [00:01<00:00, 5327.70it/s]


In [23]:
len(test_embs), len(test_tags)

(109590, 109590)

In [34]:
model = keras.Sequential()
model.add(keras.layers.Dense(600, activation='relu', input_dim=300))
model.add(keras.layers.Dense(300, activation='relu'))
model.add(keras.layers.Dense(150, activation='relu'))
model.add(keras.layers.Dense(1, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 600)               180600    
_________________________________________________________________
dense_7 (Dense)              (None, 300)               180300    
_________________________________________________________________
dense_8 (Dense)              (None, 150)               45150     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 151       
Total params: 406,201
Trainable params: 406,201
Non-trainable params: 0
_________________________________________________________________


In [35]:
train_embs = np.array(train_embs)
train_tags = np.array(train_tags)
test_embs = np.array(test_embs)
test_tags = np.array(test_tags)

In [39]:
le = preprocessing.LabelEncoder()
train_embs = le.fit_transform(train_embs)
train_embs = np_utils.to_categorical(train_embs)

ValueError: y should be a 1d array, got an array of shape (433559, 300) instead.

In [36]:
nb_epoch = 5
batch_size = 128
cp = ModelCheckpoint(filepath="tagger.h5",
                     monitor='val_acc',
                     save_best_only=True,
                     verbose=1)

tb = TensorBoard(log_dir='./logs',
                 histogram_freq=0,
                 write_graph=True,
                 write_images=True)

early_stopping = EarlyStopping(monitor='val_acc', patience=5)

history = model.fit(train_embs, train_tags,
                    epochs=nb_epoch,
                    batch_size=batch_size,
                    shuffle=True,
                    verbose=1, validation_data=(test_embs, test_tags))


2021-10-31 22:03:54.097325: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2021-10-31 22:03:54.097360: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2021-10-31 22:03:54.098195: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.


Epoch 1/5


2021-10-31 22:04:03.687607: W tensorflow/core/framework/op_kernel.cc:1669] OP_REQUIRES failed at cast_op.cc:121 : Unimplemented: Cast string to float is not supported


UnimplementedError:  Cast string to float is not supported
	 [[node categorical_crossentropy/Cast (defined at var/folders/fc/z3ktrz354nddfg1wt432tbm80000gn/T/ipykernel_16244/319088529.py:15) ]] [Op:__inference_train_function_2862]

Function call stack:
train_function
