In [None]:
import os
import random

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, TimeDistributed, Embedding
from tensorflow.keras.models import Sequential

In [None]:
def read_only_consistent(path):
    """Read a one-token-per-line NER file into (words, tags) pairs.

    Every non-blank line is split on single spaces; the first field is taken
    as the word and the last field as its named-entity tag. Blank lines
    (including whitespace-only ones) separate sentences.

    Args:
        path: Path of the UTF-8 encoded annotation file.

    Returns:
        A list of ``(sentence, ne_representation)`` tuples, where ``sentence``
        is the list of words and ``ne_representation`` the parallel list of
        tags. Leading, trailing or repeated blank lines never produce empty
        sentences.
    """
    sentences = []
    sentence, ne_representation = [], []
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Sentence boundary: flush only if something was collected, so
                # runs of blank lines do not emit ([], []) entries.
                if sentence:
                    sentences.append((sentence, ne_representation))
                    sentence, ne_representation = [], []
                continue

            parts = line.split(" ")
            sentence.append(parts[0].strip())
            ne_representation.append(parts[-1].strip())

    # The last sentence may not be followed by a blank line.
    if sentence:
        sentences.append((sentence, ne_representation))
    return sentences

# Load the train, dev and test splits (blank-line separated token/tag files).
split_paths = {
    "train": "data/gungor.ner.train.14.only_consistent",
    "dev": "data/gungor.ner.dev.14.only_consistent",
    "test": "data/gungor.ner.test.14.only_consistent",
}
train, dev, test = (
    read_only_consistent(split_paths[split_name])
    for split_name in ("train", "dev", "test")
)

In [None]:
# Peek at one randomly chosen training sentence together with its named-entity tags.
sentence, ne_tags = random.choice(train)
print(sentence, ":", ne_tags)

In [None]:

# Wrap every split in a two-column DataFrame: word lists and their NE-tag lists.
def _pairs_to_frame(pairs):
    """Build a DataFrame with 'sentence' and 'ne' columns from (words, tags) pairs."""
    words_column = [pair[0] for pair in pairs]
    tags_column = [pair[1] for pair in pairs]
    return pd.DataFrame({'sentence': words_column, 'ne': tags_column})


traindf = _pairs_to_frame(train)
testdf = _pairs_to_frame(test)
devdf = _pairs_to_frame(dev)

# Show the first row of each split.
traindf.head(1), devdf.head(1), testdf.head(1)

In [None]:
# Model inputs (word lists) and targets (tag lists) for the training split.
trainx, trainy = traindf['sentence'], traindf['ne']

# Show the first training example.
trainx[0], trainy[0]

In [None]:
####### VECTORIZATION (words) #######
# Reserve an explicit out-of-vocabulary id. Without `oov_token`, texts_to_sequences
# silently drops every word that was not seen during fitting, which makes dev/test
# input sequences shorter than their tag sequences and shifts words against labels.
# Note that the tokenizer lowercases tokens by default.
word_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<OOV>")
word_tokenizer.fit_on_texts(trainx)                    # build the word -> id mapping from the training words
X_encoded = word_tokenizer.texts_to_sequences(trainx)  # replace each word with its integer id

In [None]:
####### VECTORIZATION (tags) #######
# A second tokenizer maps every named-entity tag to an integer id; id 0 is never
# assigned to a tag by the Keras Tokenizer.
tag_tokenizer = tf.keras.preprocessing.text.Tokenizer()
tag_tokenizer.fit_on_texts(trainy)                    # build the tag -> id mapping from the training tags
Y_encoded = tag_tokenizer.texts_to_sequences(trainy)  # replace each tag with its integer id

In [None]:
# Sanity check: each encoded sentence must carry exactly one tag id per word id.
different_length = [
    int(len(word_ids) != len(tag_ids))
    for word_ids, tag_ids in zip(X_encoded, Y_encoded)
]
mismatch_total = sum(different_length)
print("{} sentences have disparate input-output lengths.".format(mismatch_total))

In [None]:
# Number of tokens in every encoded training sentence.
lengths = list(map(len, X_encoded))
longest_sentence = max(lengths)
print("Length of longest sentence: {}".format(longest_sentence))

In [None]:
# Horizontal boxplot of the training-sentence lengths (in tokens).
# The data is passed by keyword because the meaning of the first positional
# argument of sns.boxplot differs between seaborn versions.
fig, ax = plt.subplots()
sns.boxplot(x=lengths, orient='h', ax=ax)
ax.set_title("Boxplot of sentence lengths")
ax.set_xlabel("tokens per sentence")
plt.show()

In [None]:
MAX_SEQ_LENGTH = 130  # longer sequences are cut down to this many positions

# Shorter sequences are padded at the front ("pre"); longer ones lose their tail ("post").
pad_options = dict(maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
X_padded = tf.keras.preprocessing.sequence.pad_sequences(X_encoded, **pad_options)
Y_padded = tf.keras.preprocessing.sequence.pad_sequences(Y_encoded, **pad_options)

In [None]:
# Show the first padded input row and, after a few blank lines, its padded tag row.
first_input_row, first_tag_row = X_padded[0], Y_padded[0]
print(first_input_row, "\n"*3)
print(first_tag_row)

In [None]:
# Location of the pre-trained word vectors (the file name suggests GloVe in text form).
# Set the GLOVE_PATH environment variable to point somewhere else without editing the
# notebook; the fallback is the original hard-coded location.
MODEL = os.environ.get("GLOVE_PATH", "C:/Users/karab/Desktop/Models/glove.txt")
EMBEDDING_SIZE  = 300  # dimensionality expected of every pre-trained word vector
VOCABULARY_SIZE = len(word_tokenizer.word_index) + 1  # +1 because the tokenizer never assigns id 0

# load the pre-trained vectors: plain-text word2vec layout without the header line
print('loading word embeddings...')
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(
    MODEL,
    binary=False,
    no_header=True)

# Fail early with a clear message instead of a broadcasting error while the
# embedding matrix is being filled.
if word_vectors.vector_size != EMBEDDING_SIZE:
    raise ValueError(
        "Loaded vectors have {} dimensions but EMBEDDING_SIZE is {}.".format(
            word_vectors.vector_size, EMBEDDING_SIZE))

In [None]:
# Build the embedding matrix: row i holds the pre-trained vector of the word with
# tokenizer id i. Rows of words without a pre-trained vector (and row 0) stay zero.
embedding_weights = np.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))
word2id = word_tokenizer.word_index
missing_words = []  # vocabulary words that have no pre-trained vector
for word, index in word2id.items():
    try:
        embedding_weights[index, :] = word_vectors[word]
    except KeyError:
        # Best effort: keep the zero row, but remember the word so coverage can be reported.
        missing_words.append(word)

print("{} of {} vocabulary words have no pre-trained vector.".format(len(missing_words), len(word2id)))

In [None]:
# Report the (vocabulary size, embedding size) shape of the matrix.
embedding_shape_message = "Embeddings shape: {}".format(embedding_weights.shape)
print(embedding_shape_message)

In [None]:
# One-hot encode the padded tag ids. The number of classes is fixed from the tag
# vocabulary (+1 because id 0 is never assigned to a tag) rather than inferred from
# the largest id that happens to be present in the array.
Y_final = tf.keras.utils.to_categorical(Y_padded, num_classes=len(tag_tokenizer.word_index) + 1)
# print the shape of the one-hot encoded targets
print(Y_final.shape)  # (number of sequences, length of each sequence, number of entities)

In [None]:
### Perform all the operations for dev set ###
devx = devdf['sentence']
devy = devdf['ne']

# Encode with the tokenizers fitted on the training split.
devx_encoded = word_tokenizer.texts_to_sequences(devx)
devy_encoded = tag_tokenizer.texts_to_sequences(devy)

# A tokenizer without an OOV token silently drops unknown items, which would shift
# words against their tags; report any sentence where that happened.
dev_misaligned = sum(len(word_ids) != len(tag_ids) for word_ids, tag_ids in zip(devx_encoded, devy_encoded))
print("{} dev sentences have disparate input-output lengths.".format(dev_misaligned))

devx_padded = tf.keras.preprocessing.sequence.pad_sequences(devx_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
devy_padded = tf.keras.preprocessing.sequence.pad_sequences(devy_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

# Fix the one-hot depth from the tag vocabulary so it does not depend on which tag
# ids happen to occur in this split.
devy_final = tf.keras.utils.to_categorical(devy_padded, num_classes=len(tag_tokenizer.word_index) + 1)

In [None]:
### Perform all the operations for test set ###
testx = testdf['sentence']
testy = testdf['ne']

# Encode with the tokenizers fitted on the training split.
testx_encoded = word_tokenizer.texts_to_sequences(testx)
testy_encoded = tag_tokenizer.texts_to_sequences(testy)

# A tokenizer without an OOV token silently drops unknown items, which would shift
# words against their tags; report any sentence where that happened.
test_misaligned = sum(len(word_ids) != len(tag_ids) for word_ids, tag_ids in zip(testx_encoded, testy_encoded))
print("{} test sentences have disparate input-output lengths.".format(test_misaligned))

testx_padded = tf.keras.preprocessing.sequence.pad_sequences(testx_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
testy_padded = tf.keras.preprocessing.sequence.pad_sequences(testy_encoded, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

# Fix the one-hot depth from the tag vocabulary so it does not depend on which tag
# ids happen to occur in this split.
testy_final = tf.keras.utils.to_categorical(testy_padded, num_classes=len(tag_tokenizer.word_index) + 1)

In [None]:
# Print the input/output array shapes of every split, separated by a dashed rule.
split_arrays = (
    ("TRAINING DATA", X_padded, Y_final),
    ("VALIDATION DATA", devx_padded, devy_final),
    ("TESTING DATA", testx_padded, testy_final),
)
for position, (heading, input_array, output_array) in enumerate(split_arrays):
    if position > 0:
        print("-"*50)
    print(heading)
    print('Shape of input sequences: {}'.format(input_array.shape))
    print('Shape of output sequences: {}'.format(output_array.shape))

In [None]:
def _unique_tags(tag_lists):
    """Return the set of distinct tags found across all the given tag lists."""
    return {tag for tags in tag_lists for tag in tags}


# Distinct tags per split.
print("Unique elements in trainy: {}".format(_unique_tags(trainy)))
print("Unique elements in devy: {}".format(_unique_tags(devy)))
print("Unique elements in testy: {}".format(_unique_tags(testy)))

In [None]:
# Number of output classes = size of the last (one-hot) axis of the training targets.
NUM_CLASSES = Y_final.shape[-1]
NUM_CLASSES

In [None]:
# Sequence tagger: frozen pre-trained embeddings -> bidirectional LSTM -> softmax
# over the tag classes at every position.
# NOTE(review): the Embedding layer is created without mask_zero, so zero-padded
# positions appear to count towards the loss and the 'acc' metric — confirm
# whether masking is wanted.
bidirect_model = Sequential([
    Embedding(input_dim     = VOCABULARY_SIZE,
              output_dim    = EMBEDDING_SIZE,
              input_length  = MAX_SEQ_LENGTH,
              weights       = [embedding_weights],  # start from the pre-trained matrix
              trainable     = False),               # keep the embeddings fixed during training
    Bidirectional(LSTM(16, return_sequences=True)),  # one output vector per position
    TimeDistributed(Dense(NUM_CLASSES, activation='softmax')),
])

In [None]:
# Training configuration: categorical cross-entropy loss, Adam optimizer, accuracy metric.
compile_options = {
    "loss": 'categorical_crossentropy',
    "optimizer": 'adam',
    "metrics": ['acc'],
}
bidirect_model.compile(**compile_options)

In [None]:
# Print a summary of the model built above.
bidirect_model.summary()

In [None]:
# Train on the training split and track loss/accuracy on the dev split after each epoch.
bidirect_training = bidirect_model.fit(
    x=X_padded,
    y=Y_final,
    batch_size=128,
    epochs=10,
    validation_data=(devx_padded, devy_final),
)

In [None]:
# visualise training history: accuracy per epoch on the training and validation (dev) data
fig, ax = plt.subplots()
ax.plot(bidirect_training.history['acc'], label='train')
# 'val_acc' comes from the validation_data passed to fit(), i.e. the dev split,
# so it is labelled "validation" rather than "test".
ax.plot(bidirect_training.history['val_acc'], label='validation')
ax.set_title('model accuracy')
ax.grid()
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Evaluate on the held-out test split; evaluate() returns the loss followed by the compiled metric.
test_scores = bidirect_model.evaluate(testx_padded, testy_final, verbose = 1)
loss, accuracy = test_scores
print("Loss: {0},\nAccuracy: {1}".format(loss, accuracy))