In [2]:
from tqdm import tqdm
from nltk import bigrams
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, precision_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import gensim
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.decomposition import PCA
from matplotlib import pyplot
from keras.preprocessing.sequence import pad_sequences
from tensorflow.python import keras
from keras import utils as np_util
import numpy as np
from gensim.models import KeyedVectors
from tensorflow.keras.utils import to_categorical
from tensorflow.math import argmax
from sklearn.model_selection import train_test_split
import copy
from keras.preprocessing.text import Tokenizer


In [3]:
DATA_PATH = './../data/Brown_tagged_train.txt'

In [4]:
with open(DATA_PATH, 'r') as f:
    data = f.read().splitlines()
data = np.array(data)

In [5]:
trainData, validData = train_test_split(data, test_size=0.2, random_state=0)

In [6]:
def split_Xy(test_Xy):
    """
    test_Xy: List of list of tokens and tags
    Returns: List of tokens and list of tags
    """
    test_y = []
    test_X = []

    for sent in test_Xy:
        tagged_sent = sent.split(' ')
        sent_y = []
        sent_X = []

        for word in tagged_sent:
            if word == "":
                continue
            actual_word, tag = split_tag_word(word)
            sent_X.append(actual_word)
            sent_y.append(tag)

        test_y.append(sent_y)
        test_X.append(sent_X)

    return test_X, test_y

In [7]:
def split_tag_word(inp):
    """
    Returns word, tag for the given input
    """
    arr = inp.split('/')
    tag = arr[-1]
    del arr[-1]
    word = '/'.join(arr)
    return word, tag

In [71]:
trainSents, trainSentTags = split_Xy(trainData)

In [72]:
testSents, testSentTags = split_Xy(validData)

In [73]:
def preprocessData(trainSents, trainSentTags):
    uniqueWords = set()
    for sent in trainSents:
        for word in sent:
            uniqueWords.add(word)

    uniqueTags = set()
    for sent in trainSentTags:
        for tag in sent:
            uniqueTags.add(tag)

    tagIndex = {}
    idx = 0
    for i in uniqueTags:
        tagIndex[i] = idx
        idx += 1

    wordIndex = {}
    idx = 0
    for i in uniqueWords:
        wordIndex[i] = idx
        idx += 1

    # trainSentsText = trainSents.copy()
    # trainTagsText = trainSentTags.copy()
    trainSents[0]
    wordTokenize = Tokenizer()
    wordTokenize.fit_on_texts(trainSents)
    trainSents = wordTokenize.texts_to_sequences(trainSents)
    paddedTrainSents = pad_sequences(
        trainSents, maxlen=387, padding='pre', truncating='post')
    tagTokenize = Tokenizer()
    tagTokenize.fit_on_texts(trainSentTags)
    trainSentTags = tagTokenize.texts_to_sequences(trainSentTags)
    paddedTrainSentTags = pad_sequences(trainSentTags, maxlen=387, padding='pre', truncating='post')

    oneHotEncodedTrainSentTags = to_categorical(paddedTrainSentTags, num_classes=None, dtype='float32')

    return paddedTrainSents, oneHotEncodedTrainSentTags, tagIndex, wordIndex, trainSents, wordTokenize, tagTokenize


In [11]:
embeddingPath = './../data/GoogleNews-vectors-negative300.bin'
embeddingsw2v = KeyedVectors.load_word2vec_format(embeddingPath, binary=True)


In [76]:
def makeModel(trainSents, trainSentTags, testSents, testSentTags):
    trainPaddedSents, trainOneHotEncodedTags, trainTagIndex, trainWordIndex, trainSents, trainWordTokenizer, traintagWordTokenizer = preprocessData(
        trainSents, trainSentTags)
    testPaddedSents, testOneHotEncodedTags, testTagIndex, testWordIndex, testSents, testWordTokenizer, testTagWordTokenizer = preprocessData(
        testSents, testSentTags)
    w2vgensim = gensim.models.word2vec.Word2Vec(
        trainSents, vector_size=300, min_count=1, window=5)
    embeddings = np.zeros((len(trainWordTokenizer.word_index) + 1, 300))
    wordIndices = trainWordTokenizer.word_index
    for word, index in wordIndices.items():
        try:
            embeddings[index, :] = embeddingsw2v[word]
        except:
            embeddings[index, :] = np.random.uniform(-0.25, 0.25, 300)
    
    mlp = keras.Sequential()
    mlp.add(keras.layers.Embedding(len(trainWordTokenizer.word_index) + 1,
                                300, weights=[embeddings], input_length=387, trainable=True))
    mlp.add((keras.layers.Dense(100, activation='relu')))
    mlp.add((keras.layers.Dense(13, activation='softmax')))
    mlp.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    mlp.summary()
    mlp.fit(trainPaddedSents, trainOneHotEncodedTags, batch_size=128,
            epochs=2, validation_data=(testPaddedSents, testOneHotEncodedTags))
    return mlp, traintagWordTokenizer, testTagWordTokenizer, trainPaddedSents, trainOneHotEncodedTags, testPaddedSents, testOneHotEncodedTags


In [77]:
mlp, trainTagWordTokenizer, testTagWordTokenizer, trainFinalSents, trainFinalTags, testFinalSents, testFinalTags = makeModel(trainSents, trainSentTags, testSents, testSentTags)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 387, 300)          8256000   
_________________________________________________________________
dense_8 (Dense)              (None, 387, 100)          30100     
_________________________________________________________________
dense_9 (Dense)              (None, 387, 13)           1313      
Total params: 8,287,413
Trainable params: 8,287,413
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2


In [78]:
pred = mlp.predict_classes(testFinalSents[0])





In [79]:
pred.shape

(387, 1)

In [80]:
reverseTagMapTest = dict(map(reversed, testTagWordTokenizer.word_index.items()))

In [81]:
reverseTagMapTest

{1: 'noun',
 2: 'verb',
 3: '.',
 4: 'adp',
 5: 'det',
 6: 'adj',
 7: 'adv',
 8: 'pron',
 9: 'conj',
 10: 'prt',
 11: 'num',
 12: 'x'}

In [82]:
reverseTagMapTest[0] = 'pad'

In [83]:
reverseTagMapTest

{1: 'noun',
 2: 'verb',
 3: '.',
 4: 'adp',
 5: 'det',
 6: 'adj',
 7: 'adv',
 8: 'pron',
 9: 'conj',
 10: 'prt',
 11: 'num',
 12: 'x',
 0: 'pad'}

In [84]:
pred = pred.reshape(pred.shape[0],)

In [85]:
pred = [reverseTagMapTest[i] for i in pred]

In [86]:
pred

['pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',


In [87]:
expected = testFinalTags[0]

In [88]:
expected = [np.argmax(i) for i in expected]

In [89]:
expected = [reverseTagMapTest[i] for i in expected]

In [90]:
expected

['pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
 'pad',
