In [1]:
#import of modules
import pickle
import numpy as np
import os

In [2]:
#import of modules for traning neural network
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input
from keras.layers import TimeDistributed
from keras.layers import LSTM, Bidirectional
from keras.models import Model

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# import models for using pos tagger 
from keras.models import load_model

In [4]:
# number of files from brown corpus to be used
n_sample_files = 500

# glove file that will be used
GLOVE_FILE = 'glove.6B.50d.txt'

MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 50
TEST_SPLIT = 0.3
VALIDATION_SPLIT = 0.3
BATCH_SIZE = 32

MODEL_NAME_TRAIN = 'test_50_tt03'
MODEL_NAME_EVALUATE = 'test_50_tt03'
MODEL_NAME_TEST = 'test_50_tt03'

# if train is true, new module will be created and saved with name MODEL_NAME_TRAIN
TRAIN = True
# if evaluate is true, model with name MODEL_NAME_EVALUATE will be evaluated
EVALUATE = True

# new sentence to be tested
sentence_for_test = 'i want to dance with a dog'

In [6]:
#listing corpus dir and choosing number of files
files = os.listdir('brown/')

print('TOTAL NO. OF FILES ', len(files), '\n')
print('RUNNING ON ', n_sample_files, ' FILES\n')

TOTAL NO. OF FILES  500 

RUNNING ON  500  FILES



In [7]:
#reading files from corpus and creating list with contents
raw_corpus = ''

for file in files[0:n_sample_files]:
    with open('brown/' + file) as f:
        raw_corpus = raw_corpus + '\n' + f.read()

corpus = raw_corpus.split('\n')
print('CORPUS SIZE', len(corpus), '\n')

CORPUS SIZE 89830 



In [8]:
# initalizing variables
X_train = []
Y_train = []

words = []
tags = []

with_slash = False
n_omitted = 0

print(corpus[70])

He/pps arraigned/vbd the/at citizens/nns in/in language/nn of/in so/ql little/ap courtesy/nn that/cs they/ppss had/hvd to/to respond/vb with/in ,/, at/in the/at least/ap ,/, resentment/nn ./.


In [9]:
# extract words and tags and separate them and remove omitted words
for line in corpus:
    # for every file with content
    if(len(line)>0):
        tempX = []
        tempY = []
        for word in line.split():
            try:
                # if word has '/', first part is word, second is tag
                w, tag = word.split('/')
            except:
                # with_slash = True
                # else word generaly containing "/" 
                n_omitted = n_omitted + 1
                #print(word)
                #print(line)
                #print("\n")
                break

            w = w.lower()

            # save word and tag
            words.append(w)
            tags.append(tag)
            
            # add each word and tag
            tempX.append(w)
            tempY.append(tag)
        
        # add words and tags from sentence
        X_train.append(tempX)
        Y_train.append(tempY)

In [10]:
print('OMITTED sentences: ', n_omitted, '\n')
print('TOTAL NO OF SAMPLES: ', len(X_train), '\n')

print('sample X_train: ', X_train[42], '\n')
print('sample Y_train: ', Y_train[42], '\n')

# removing duplicates
words = set(words)
tags = set(tags)

print('VOCAB SIZE: ', len(words))
print('TOTAL TAGS: ', len(tags))

# number of words and tags should be the same
assert len(X_train) == len(Y_train)

OMITTED sentences:  182 

TOTAL NO OF SAMPLES:  57341 

sample X_train:  ['opinion', 'is', 'less', 'individual', 'or', 'runs', 'more', 'into', 'masses', ',', 'and', 'often', 'rules', 'with', 'a', 'rod', 'of', 'iron', "''", '.'] 

sample Y_train:  ['nn', 'bez', 'ql', 'jj', 'cc', 'vbz', 'rbr', 'in', 'nns', ',', 'cc', 'rb', 'vbz', 'in', 'at', 'nn', 'in', 'nn', "''", '.'] 

VOCAB SIZE:  49645
TOTAL TAGS:  472


In [11]:
# enumerate words and tags in sets
word2int = {}
int2word = {}

for i, word in enumerate(words):
    word2int[word] = i+1
    int2word[i+1] = word

tag2int = {}
int2tag = {}

for i, tag in enumerate(tags):
    tag2int[tag] = i+1
    int2tag[i+1] = tag
    
X_train_numberised = []
Y_train_numberised = []

# save ordinal numbers of words and tags in sentences
for sentence in X_train:
    tempX = []
    for word in sentence:
        tempX.append(word2int[word])
    X_train_numberised.append(tempX)
    
for tags in Y_train:
    tempY = []
    for tag in tags:
        tempY.append(tag2int[tag])
    Y_train_numberised.append(tempY)

print('sample X_train_numberised: ', X_train_numberised[42], '\n')
print('sample Y_train_numberised: ', Y_train_numberised[42], '\n')

sample X_train_numberised:  [21575, 24789, 45288, 18509, 20114, 8339, 48472, 39631, 32023, 33592, 49407, 148, 35321, 22749, 13293, 48370, 7379, 14074, 25908, 22161] 

sample Y_train_numberised:  [368, 178, 70, 434, 153, 97, 353, 300, 141, 162, 153, 310, 97, 300, 375, 368, 300, 368, 174, 270] 



In [12]:
# create arrays from lists
X_train_numberised = np.asarray(X_train_numberised)
Y_train_numberised = np.asarray(Y_train_numberised)

# collect data to save in picle files
pickle_files = [X_train_numberised, Y_train_numberised, word2int, int2word, tag2int, int2tag]

# create directory if it doesn't exist
if not os.path.exists('PickledData/'):
    print('MAKING DIRECTORY PickledData/ to save pickled glove file')
    os.makedirs('PickledData/')

# open file and write 
with open('PickledData/data.pkl', 'wb') as f:
    pickle.dump(pickle_files, f)

print('Saved as pickle file')

Saved as pickle file


In [13]:
embeddings_index = {}

glove_file = open(GLOVE_FILE, encoding="utf8")

for line in glove_file:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
glove_file.close()

# create directory if it doesn't exist
if not os.path.exists('PickledData/'):
    print('MAKING DIRECTORY PickledData/ to save pickled glove file')
    os.makedirs('PickledData/')

# saving dictionary to pickle file
with open('PickledData/Glove.pkl', 'wb') as f:
    pickle.dump(embeddings_index, f)

print('SUCESSFULLY SAVED Glove data as a pickle file in PickledData/')

SUCESSFULLY SAVED Glove data as a pickle file in PickledData/


In [14]:
# reading data from Pickle files
with open('PickledData/data.pkl', 'rb') as f:
    X, y, word2int, int2word, tag2int, int2tag = pickle.load(f)
    
with open('PickledData/Glove.pkl', 'rb') as f:
	embeddings_index = pickle.load(f)

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [15]:
# generator function for creating batches
def generator(all_X, all_y, n_classes, batch_size=BATCH_SIZE):
    num_samples = len(all_X)

    while True:

        for offset in range(0, num_samples, batch_size):
            
            X = all_X[offset:offset+batch_size]
            y = all_y[offset:offset+batch_size]

            y = to_categorical(y, num_classes=n_classes)
            
            # shuffle both lists in same order
            yield shuffle(X, y)

In [16]:
# number of tags available
n_tags = len(tag2int)

#print(X[50:53])

# TODO: pad_sequences ne radi nista ovde?
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
y = pad_sequences(y, maxlen=MAX_SEQUENCE_LENGTH)

#print(X[50:53])

# y = to_categorical(y, num_classes=len(tag2int) + 1)

print('TOTAL TAGS', n_tags)
print('TOTAL WORDS', len(word2int))

# shuffle the data in the same order for both arrays
X, y = shuffle(X, y)

TOTAL TAGS 472
TOTAL WORDS 49645


In [17]:
# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT,random_state=42)

# split training data into train and validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=VALIDATION_SPLIT, random_state=1)

n_train_samples = X_train.shape[0]
n_val_samples = X_val.shape[0]
n_test_samples = X_test.shape[0]

print('We have %d TRAINING samples' % n_train_samples)
print('We have %d VALIDATION samples' % n_val_samples)
print('We have %d TEST samples' % n_test_samples)

# make generators for training and validation sets
train_generator = generator(all_X=X_train, all_y=y_train, n_classes=n_tags + 1)
validation_generator = generator(all_X=X_val, all_y=y_val, n_classes=n_tags + 1)

We have 28096 TRAINING samples
We have 12042 VALIDATION samples
We have 17203 TEST samples


In [18]:
# + 1 to include the unkown word
embedding_matrix = np.random.random((len(word2int) + 1, EMBEDDING_DIM))

print(embedding_matrix.shape)

for word, i in word2int.items():
    # taking vector of embeddings index for specific word from glove file
    embedding_vector = embeddings_index.get(word)
    
    if embedding_vector is not None:
        # words not found in embeddings_index will remain unchanged and thus will be random.
        embedding_matrix[i] = embedding_vector

print('Embedding matrix shape', embedding_matrix.shape)
print('X_train shape', X_train.shape)

(49646, 50)
Embedding matrix shape (49646, 50)
X_train shape (28096, 100)


In [18]:
if TRAIN:
    embedding_layer = Embedding(len(word2int) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    l_lstm = Bidirectional(LSTM(64, return_sequences=True))(embedded_sequences)
    preds = TimeDistributed(Dense(n_tags + 1, activation='softmax'))(l_lstm)
    model = Model(sequence_input, preds)

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])

    print("model fitting - Bidirectional LSTM")
    model.summary()

    model.fit_generator(train_generator, 
                         steps_per_epoch=n_train_samples//BATCH_SIZE,
                         validation_data=validation_generator,
                         validation_steps=n_val_samples//BATCH_SIZE,
                         epochs=2,
                         verbose=1,
                         workers=4)

model fitting - Bidirectional LSTM
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 50)           2482300   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          58880     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 473)          61017     
Total params: 2,602,197
Trainable params: 2,602,197
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2


In [22]:
if TRAIN:
    # create folder is needed and save model that is generated   
    if not os.path.exists('Models/'):
        print('MAKING DIRECTORY Models/ to save model file')
        os.makedirs('Models/')
    
    model.save('Models/' + MODEL_NAME_TRAIN)
    print('MODEL SAVED in Models/ as ' + MODEL_NAME_TRAIN)    

MODEL SAVED in Models/ as test_50_tt03


In [19]:
if EVALUATE:

    #import model that is previously generated
    from keras.models import load_model
    model = load_model('Models/' + MODEL_NAME_EVALUATE)
    
    y_test = to_categorical(y_test, num_classes=n_tags+1)
    test_results = model.evaluate(X_test, y_test, verbose=0)
    print('TEST LOSS %f \nTEST ACCURACY: %f' % (test_results[0], test_results[1]))

TEST LOSS 2.264156 
TEST ACCURACY: 0.799294
MAKING DIRECTORY Statistic/ to save evaluation data


TypeError: write() argument must be str, not numpy.float64

In [33]:
model.summary()
    
if not os.path.exists('Statistic/'):
    print('MAKING DIRECTORY Statistic/ to save evaluation data')
    os.makedirs('Statistic/')

if not os.path.exists('Statistic/' + MODEL_NAME_EVALUATE):
    file = open('Statistic/' + MODEL_NAME_EVALUATE + ".txt","w")
    file.write("MODEL NAME: " + MODEL_NAME_EVALUATE + '\n')
    file.write("TEST LOSS:")
    file.write(str(test_results[0]) + '\n')
    file.write("TEST ACCURACY:")
    file.write(str(test_results[1]) + '\n')

    file.close() 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 50)           2482300   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          58880     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 473)          61017     
Total params: 2,602,197
Trainable params: 2,602,197
Non-trainable params: 0
_________________________________________________________________


In [24]:
# loading Pickle files and removing train data
with open('PickledData/data.pkl', 'rb') as f:
    X_train, Y_train, word2int, int2word, tag2int, int2tag = pickle.load(f)

    del X_train
    del Y_train

In [26]:
if TEST:

    # split sentence to be tested
    sentence_split = sentence_for_test.split()
    
    tokenized_sentence = []

    for word in sentence_split:
        tokenized_sentence.append(word2int[word])

    tokenized_sentence = np.asarray([tokenized_sentence])
    #keras
    padded_tokenized_sentence = pad_sequences(tokenized_sentence, maxlen=100)

    print('The sentence is ', sentence_for_test)
    print('The tokenized sentence is ',tokenized_sentence)
    print('The padded tokenized sentence is ', padded_tokenized_sentence)

    model = load_model('Models/' + MODEL_NAME_TEST)

    prediction = model.predict(padded_tokenized_sentence)

    print(prediction.shape)

    for i, pred in enumerate(prediction[0]):
        try:
            print(sentence[i], ' : ', int2tag[np.argmax(pred)])
        except:
            pass
            # print('NA')


The sentence is  i want to dance with a dog
The tokenized sentence is  [[12019 19534 23289 36754 41225 49137 16068]]
The padded tokenized sentence is  [[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0 12019 19534 23289
  36754 41225 49137 16068]]
(1, 100, 473)
