In [1]:
from tqdm import tqdm
from nltk import bigrams
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, precision_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import gensim
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.decomposition import PCA
from matplotlib import pyplot
from keras.preprocessing.sequence import pad_sequences
from tensorflow.python import keras
from keras import utils as np_util
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.math import argmax
from sklearn.model_selection import train_test_split
import copy
from keras.preprocessing.text import Tokenizer

In [2]:
# Path to the tagged corpus file, relative to the notebook's working directory.
# The file is read line by line below; each line is later split into
# space-separated "word/tag" tokens.
DATA_PATH = './../data/Brown_tagged_train.txt'

In [3]:
# Load the corpus as one string per line. The encoding is given explicitly so
# that decoding does not depend on the platform's default locale.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    # Build the array in one step so `data` never holds a plain list first.
    data = np.array(f.read().splitlines())

In [74]:
# Hold out 20% of the corpus lines for validation. random_state pins the
# shuffle so the same split is produced on every run.
trainData, validData = train_test_split(data, test_size=0.2, random_state=0)

In [75]:
def split_Xy(test_Xy):
    """
    Separate tagged sentences into parallel word and tag sequences.

    test_Xy: iterable of sentence strings; each sentence is a run of
             space-separated tokens that `split_tag_word` turns into
             a (word, tag) pair
    Returns: (list of per-sentence word lists, list of per-sentence tag lists)
    """
    all_words, all_tags = [], []

    for line in test_Xy:
        # Consecutive spaces produce empty tokens; drop them before splitting.
        pairs = [split_tag_word(token) for token in line.split(' ') if token != ""]
        all_words.append([word for word, _ in pairs])
        all_tags.append([tag for _, tag in pairs])

    return all_words, all_tags

In [76]:
def split_tag_word(inp):
    """
    Split a "word/tag" token on its LAST slash.

    Everything after the last '/' is the tag; everything before it is the
    word, so the word itself may contain slashes. A token with no slash is
    returned as an empty word plus the whole token as the tag.

    Returns: (word, tag)
    """
    word, _, tag = inp.rpartition('/')
    return word, tag

In [77]:
# Split the training lines into per-sentence word lists and tag lists.
trainSents, trainSentTags = split_Xy(trainData)

In [78]:
# Same split for the held-out validation lines.
testSents, testSentTags = split_Xy(validData)

In [79]:
def preprocessData(trainSents, trainSentTags):
    uniqueWords = set()
    for sent in trainSents:
        for word in sent:
            uniqueWords.add(word)

    uniqueTags = set()
    for sent in trainSentTags:
        for tag in sent:
            uniqueTags.add(tag)

    tagIndex = {}
    idx = 0
    for i in uniqueTags:
        tagIndex[i] = idx
        idx += 1

    wordIndex = {}
    idx = 0
    for i in uniqueWords:
        wordIndex[i] = idx
        idx += 1

    # trainSentsText = trainSents.copy()
    # trainTagsText = trainSentTags.copy()
    trainSents[0]
    wordTokenize = Tokenizer()
    wordTokenize.fit_on_texts(trainSents)
    trainSents = wordTokenize.texts_to_sequences(trainSents)
    paddedTrainSents = pad_sequences(
        trainSents, maxlen=300, padding='pre', truncating='post')
    tagTokenize = Tokenizer()
    tagTokenize.fit_on_texts(trainSentTags)
    trainSentTags = tagTokenize.texts_to_sequences(trainSentTags)
    paddedTrainSentTags = pad_sequences(trainSentTags, maxlen=300, padding='pre', truncating='post')

    oneHotEncodedTrainSentTags = to_categorical(paddedTrainSentTags, num_classes=None, dtype='float32')

    return paddedTrainSents, oneHotEncodedTrainSentTags, tagIndex, wordIndex, trainSents, wordTokenize, tagTokenize


In [80]:
trainPaddedSents, trainOneHotEncodedTags, trainTagIndex, trainWordIndex, trainSents, trainWordTokenizer, tagWordTokenize = preprocessData(trainSents, trainSentTags)

In [81]:
testPaddedSents, testOneHotEncodedTags, testTagIndex, testWordIndex, testSents, testWordTokenizer, testTagWordTokenizer = preprocessData(testSents, testSentTags)

In [82]:
# Train 300-dimensional Word2Vec vectors. At this point trainSents holds the
# integer-encoded sentences returned by preprocessData, so the Word2Vec
# vocabulary keys are tokenizer ids rather than word strings.
# min_count=1 keeps every id, however rare.
w2vgensim = gensim.models.word2vec.Word2Vec(trainSents, vector_size=300, min_count=1, window=5)


In [83]:
# One 300-d row per tokenizer id. The +1 leaves room for row 0, which is the
# value pad_sequences fills with by default; it stays all zeros.
embeddings = np.zeros((len(trainWordTokenizer.word_index) + 1, 300))

In [84]:
# word -> integer id mapping of the tokenizer fitted on the training sentences.
wordIndices = trainWordTokenizer.word_index

In [85]:
# Copy the trained Word2Vec vector of every tokenizer id into its row of the
# embedding matrix. Ids that Word2Vec never saw get a small random vector.
fallbackRng = np.random.default_rng(0)  # seeded so fallback rows are reproducible
wordVectors = w2vgensim.wv
for word, index in wordIndices.items():
    # Word2Vec was trained on integer-encoded sentences, so its keys are the
    # tokenizer ids. Membership is tested explicitly: get_vector() with an
    # integer that is not a key falls back to positional lookup and would
    # silently return some other entry's vector instead of raising.
    if index in wordVectors.key_to_index:
        embeddings[index, :] = wordVectors.get_vector(index)
    else:
        embeddings[index, :] = fallbackRng.uniform(-0.25, 0.25, embeddings.shape[1])

In [86]:
# Per-token tagger: pretrained embedding -> hidden layer -> tag distribution.
# All sizes are read from the prepared arrays so the model stays in step with
# the data instead of relying on hard-coded numbers.
vocabSize, embeddingDim = embeddings.shape
sequenceLength = trainPaddedSents.shape[1]
numTagClasses = trainOneHotEncodedTags.shape[-1]  # tag vocabulary + padding class 0

mlp = keras.Sequential()
mlp.add(keras.layers.Embedding(vocabSize, embeddingDim, weights=[embeddings],
                               input_length=sequenceLength, trainable=True))
mlp.add(keras.layers.Dense(100, activation='relu'))
# Targets are one-hot tag vectors, so the output layer is a softmax trained with
# categorical cross-entropy rather than a ReLU regressed with squared error.
mlp.add(keras.layers.Dense(numTagClasses, activation='softmax'))
# NOTE(review): padded positions (class 0) are not masked, so they count toward
# both the loss and the reported accuracy.
mlp.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
mlp.summary()


Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 300, 300)          8256000   
_________________________________________________________________
dense_6 (Dense)              (None, 300, 100)          30100     
_________________________________________________________________
dense_7 (Dense)              (None, 300, 13)           1313      
Total params: 8,287,413
Trainable params: 8,287,413
Non-trainable params: 0
_________________________________________________________________


In [87]:
# Train for 2 epochs in mini-batches of 128 sentences. The held-out split is
# passed as validation_data so Keras evaluates on it at the end of each epoch.
mlp.fit(trainPaddedSents, trainOneHotEncodedTags, batch_size=128,
          epochs=2, validation_data=(testPaddedSents, testOneHotEncodedTags))

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7ff4a0dc5460>