In [1]:
import conllu
from conllu import parse,parse_incr
import pandas as pd
import numpy as np
from nltk.util import ngrams
from math import floor
import wget

In [2]:
def list_flat(alist):
    flat_list = []
    for sublist in alist:
        for item in sublist:
            flat_list.append(item)
    return flat_list

In [3]:
### Convert Conllu files into datasets

In [4]:
def convert_to_dataset(conllu_file):
    data_file = open(conllu_file, "r", encoding="utf-8")
    sentences = []
    pos_tags = []
    length = []
    for tokenlist in parse_incr(data_file):
        sentence = [f['lemma'].lower() for f in tokenlist]
        tags = [f['upos'] for f in tokenlist]
        length.append(len(sentence))
        sentences.append(sentence)
        pos_tags.append(tags)
    
    avg_length = floor(np.array(length).mean())
    print(f"Found {len(sentences)} sentences of average size {avg_length} in the dataset!\n")
    
    return sentences, pos_tags

In [5]:
train_data = convert_to_dataset('gum/train.conllu')
dev_data = convert_to_dataset('gum/dev.conllu')
test_data = convert_to_dataset('gum/test.conllu')

Found 4287 sentences of average size 19 in the dataset!

Found 784 sentences of average size 19 in the dataset!

Found 890 sentences of average size 17 in the dataset!



In [6]:
def convert_to_window_dataset(data, window_size, pad_symbol='</s>'):
    z_sentences = data[0]
    z_pos_tags = list_flat(data[1])
    windowed_sentences = []
    for sentence in z_sentences:
        gen = ngrams(sentence, window_size, pad_left=True, pad_right=True, left_pad_symbol=pad_symbol, right_pad_symbol=pad_symbol)
        for f in list(gen)[window_size//2:-(window_size//2)]:
            windowed_sentences.append(f)
    return windowed_sentences, z_pos_tags

In [7]:
x_train, y_train = convert_to_window_dataset(train_data,5)
x_dev, y_dev = convert_to_window_dataset(dev_data,5)

In [8]:
### Get all known words in train set and make a vocabulary out of them (add special </s> token)
train_vocab = set(list_flat(x_train))
train_vocab = {v:k for k,v in enumerate(train_vocab)}

In [9]:
train_target = set(y_train)
train_target = {v:k for k,v in enumerate(train_target)}

In [10]:
### Download Word2Vec Embeddings

In [11]:
#wget.download('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz')

In [12]:
### Use Gensim to Load the Weights
from gensim.models import KeyedVectors

In [13]:
filename = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [14]:
embedding_matrix = np.zeros((len(train_vocab), 300))

for i in range(len(model.wv.vocab)):
    candidate_word = model.wv.index2word[i]
    if candidate_word in train_vocab:
        embedding_vector = model.wv[model.wv.index2word[i]]
        if embedding_vector is not None:
            embedding_matrix[train_vocab[candidate_word]] = embedding_vector

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  


In [15]:
### Convert train and dev sets
x_train_tokenized = []
for sentence in x_train:
    tokenized_sentence = []
    for token in sentence:
        if token in train_vocab:
            tokenized_sentence.append(train_vocab[token])
        else:
            tokenized_sentence.append(train_vocab['</s>'])
    x_train_tokenized.append(tokenized_sentence)


x_dev_tokenized = []
for sentence in x_dev:
    tokenized_sentence = []
    for token in sentence:
        if token in train_vocab:
            tokenized_sentence.append(train_vocab[token])
        else:
            tokenized_sentence.append(train_vocab['</s>'])
    x_dev_tokenized.append(tokenized_sentence)

In [16]:
### Convert train and dev targets
y_train_tokenized = []
for label in y_train:
    y_train_tokenized.append(train_target[label])
    
y_dev_tokenized = []
for label in y_dev:
    y_dev_tokenized.append(train_target[label])

In [21]:
### Ready for Model Preprocessing and Feeding

In [63]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [71]:
model = Sequential()

In [72]:
model.add(Input(shape=(5,),dtype='float32',name='MyInputLayer'))
model.add(Embedding(len(train_vocab), 300, weights=[embedding_matrix], input_length=5, trainable=True))
model.add(Flatten())
model.add(Dense(units=200, activation='relu'))
model.add(Dropout(rate=0.3))
model.add(Dense(units=100, activation='relu'))
model.add(Dropout(rate=0.1))
model.add(Dense(units=len(train_target), activation='softmax'))

In [73]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 5, 300)            2731500   
_________________________________________________________________
flatten_1 (Flatten)          (None, 1500)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 200)               300200    
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 17)               

In [74]:
xt = np.array(x_train_tokenized)
xt = xt.astype('float32')

xd = np.array(x_dev_tokenized)
xd = xd.astype('float32')


yt = np.array(y_train_tokenized)
yt = yt.astype('int')

yd = np.array(y_dev_tokenized)
yd = yd.astype('int')

In [85]:
from sklearn.metrics import *

In [86]:
### Custom Callbacks
class Metrics(tf.keras.callbacks.Callback):
    def __init__(self, valid_data):
        super(Metrics, self).__init__()
        self.validation_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        val_predict = np.argmax(self.model.predict(self.validation_data[0]), -1)
        val_targ = self.validation_data[1]
        if len(val_targ.shape) == 2 and val_targ.shape[1] != 1:
            val_targ = np.argmax(val_targ, -1)
        val_targ = tf.cast(val_targ,dtype=tf.float32)
        

        _val_f1 = f1_score(val_targ, val_predict,average="weighted")
        _val_recall = recall_score(val_targ, val_predict,average="weighted")
        _val_precision = precision_score(val_targ, val_predict,average="weighted")

        logs['val_f1'] = _val_f1
        logs['val_recall'] = _val_recall
        logs['val_precision'] = _val_precision
        print(" — val_f1: %f — val_precision: %f — val_recall: %f" % (_val_f1, _val_precision, _val_recall))
        return
    

In [87]:
cp = ModelCheckpoint('weights.hdf5',
 monitor='val_f1', 
 mode='max', verbose=2,
 save_best_only=True,
 save_weights_only=True)

In [88]:
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy')

In [89]:
model.fit(xt,yt,epochs=200, batch_size=256, callbacks=[Metrics(valid_data=(xd, yd)),cp])

Train on 81857 samples

Epoch 00001: val_f1 improved from -inf to 0.90138, saving model to weights.hdf5


<tensorflow.python.keras.callbacks.History at 0x1ed79d3c828>

In [56]:
phrase_to_predict = "this a dog who run"

In [57]:
phrase_to_predict = [train_vocab[f] for f in phrase_to_predict.split(' ')]

In [58]:
phrase_to_predict

[1000, 4447, 8561, 8356, 4977]

In [59]:
phrase_to_predict = np.array([phrase_to_predict])

In [60]:
np.argmax(model.predict(phrase_to_predict))

4

In [61]:
train_target

{'X': 0,
 'NUM': 1,
 'VERB': 2,
 'PART': 3,
 'NOUN': 4,
 'ADV': 5,
 'SYM': 6,
 'AUX': 7,
 'ADJ': 8,
 'DET': 9,
 'INTJ': 10,
 'SCONJ': 11,
 'PRON': 12,
 'ADP': 13,
 'PROPN': 14,
 'PUNCT': 15,
 'CCONJ': 16}