In [1]:
import tensorflow as tf
import torch

In [83]:
import nltk
# nltk.download('treebank')
 
tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 100676


In [84]:
import numpy as np
 
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))
 
# Let's see how a sequence looks
 
print(sentences[5])
print(sentence_tags[5])

['Lorillard' 'Inc.' ',' 'the' 'unit' 'of' 'New' 'York-based' 'Loews'
 'Corp.' 'that' '*T*-2' 'makes' 'Kent' 'cigarettes' ',' 'stopped' 'using'
 'crocidolite' 'in' 'its' 'Micronite' 'cigarette' 'filters' 'in' '1956'
 '.']
['NNP' 'NNP' ',' 'DT' 'NN' 'IN' 'JJ' 'JJ' 'NNP' 'NNP' 'WDT' '-NONE-' 'VBZ'
 'NNP' 'NNS' ',' 'VBD' 'VBG' 'NN' 'IN' 'PRP$' 'NN' 'NN' 'NNS' 'IN' 'CD'
 '.']


# Bangla data

In [85]:
from bangla_processor import *
import codecs

with codecs.open('data.txt', 'r', 'utf-8') as f:
    sentences, sentence_tags =[], [] 
    for line in f:
        sentences.append(word_tokenizer_bangla(line))
        tag_line = f.readline()
        sentence_tags.append(tag_line.strip().split(' '))
 


In [86]:
print(sentences[0])
print(sentence_tags[0])

['আজ', 'দোকানপাট', 'বন্ধ', 'থাকবে']
['ADV', 'Np', 'VB', 'VBf']


In [87]:
from sklearn.model_selection import train_test_split
train_sentences, train_tags = sentences, sentence_tags
 
(train_sentences, test_sentences, train_tags, test_tags) = train_test_split(sentences, sentence_tags, test_size=0.00001)

In [88]:
words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w.lower())
 
for ts in train_tags:
    for t in ts:
        tags.add(t)
 
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding
 

In [89]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []
 
for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)
 
for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)
 
for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])
 
for s in test_tags:
    try:
        test_tags_y.append([tag2index[t] for t in s])
    except:
        test_tags_y.append([])
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[2, 14, 23, 26, 8, 12, 7]
[4, 29, 10, 34, 1]
[15, 3, 10, 15, 14, 9, 9]
[13, 9, 15, 16, 18]


In [90]:
print(len(train_sentences_X))
print(len(test_sentences_X))
print(len(train_tags_y))
print(len(test_tags_y))

9
1
9
1


In [91]:
MAX_LENGTH = len(max(train_sentences_X, key=len))
print(MAX_LENGTH)  # 271

7


In [92]:
from keras.preprocessing.sequence import pad_sequences
 
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])
 

[ 2 14 23 26  8 12  7]
[ 4 29 10 34  1  0  0]
[15  3 10 15 14  9  9]
[13  9 15 16 18  0  0]


In [93]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
 
 
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])
 
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 7, 128)            4480      
_________________________________________________________________
bidirectional_4 (Bidirection (None, 7, 512)            788480    
_________________________________________________________________
time_distributed_4 (TimeDist (None, 7, 19)             9747      
_________________________________________________________________
activation_4 (Activation)    (None, 7, 19)             0         
Total params: 802,707
Trainable params: 802,707
Non-trainable params: 0
_________________________________________________________________


In [94]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [95]:
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
print(cat_train_tags_y[0])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [53]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=1, epochs=40, validation_split=0.1)

Train on 8 samples, validate on 1 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.callbacks.History at 0x7fdf994f1a50>

In [54]:
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")   # acc: 99.09751977804825

accuracy: 85.71428656578064


# Testing

In [56]:
test_samples = [
        "মা ছবিটা আনতে পারত".split(),
        "মা ছবিটা আনতে পারত".split(),
        "গতকাল ছবিটা দেখনি".split(),
        "ছবিটা না দেখে যাব না".split(),
        "এসব স্কুলে গতকাল উপস্থিত ছিল".split(),
        "ছবিটা ঢাকা ছিল".split(),
        "মুখেও কি ছিল না".split()
]
print(test_samples)

[['মা', 'ছবিটা', 'আনতে', 'পারত'], ['মা', 'ছবিটা', 'আনতে', 'পারত'], ['গতকাল', 'ছবিটা', 'দেখনি'], ['ছবিটা', 'না', 'দেখে', 'যাব', 'না'], ['এসব', 'স্কুলে', 'গতকাল', 'উপস্থিত', 'ছিল'], ['ছবিটা', 'ঢাকা', 'ছিল'], ['মুখেও', 'কি', 'ছিল', 'না']]


In [57]:
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)
 
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
print(test_samples_X)
 

[[ 2  6 33  9  0  0  0]
 [ 2  6 33  9  0  0  0]
 [29  6 25  0  0  0  0]
 [ 6  4 23 20  4  0  0]
 [19 10 29 21 13  0  0]
 [ 6 27 13  0  0  0  0]
 [32 28 13  4  0  0  0]]


In [68]:
predictions = model.predict(test_samples_X)
print(predictions.shape)

(7, 7, 19)


In [59]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences

In [61]:
print(test_samples)
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))

[['মা', 'ছবিটা', 'আনতে', 'পারত'], ['মা', 'ছবিটা', 'আনতে', 'পারত'], ['গতকাল', 'ছবিটা', 'দেখনি'], ['ছবিটা', 'না', 'দেখে', 'যাব', 'না'], ['এসব', 'স্কুলে', 'গতকাল', 'উপস্থিত', 'ছিল'], ['ছবিটা', 'ঢাকা', 'ছিল'], ['মুখেও', 'কি', 'ছিল', 'না']]
[['PN2s', 'Ns', 'Ns', 'VBint', '-PAD-', '-PAD-', '-PAD-'], ['PN2s', 'Ns', 'Ns', 'VBint', '-PAD-', '-PAD-', '-PAD-'], ['ADV', 'Ns', 'Ns', 'VB2', '-PAD-', '-PAD-', '-PAD-'], ['PN3s', 'Ns', 'Ns', 'VBint', 'VB1', '-PAD-', '-PAD-'], ['PN1', 'Ns', 'Ns', 'ADJ', 'VB2', '-PAD-', '-PAD-'], ['PN1', 'Ns', 'Vf', 'VB1', '-PAD-', '-PAD-', '-PAD-'], ['PN1', 'Ns', 'Vf', 'ADV', '-PAD-', '-PAD-', '-PAD-']]


# Custom loss
You probably are fairly acquainted with the PennTreebank tagset by now and you’re probably disappointed with the result. What’s wrong?

For most of the sentences, the largest part is “padding tokens”. These are really easy to guess, hence the super high performance. Let’s write a custom accuracy, that ignores the paddings:

In [96]:
from keras import backend as K
 
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [97]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
 
 
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy', ignore_class_accuracy(0)])
 
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 7, 128)            4480      
_________________________________________________________________
bidirectional_5 (Bidirection (None, 7, 512)            788480    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 7, 19)             9747      
_________________________________________________________________
activation_5 (Activation)    (None, 7, 19)             0         
Total params: 802,707
Trainable params: 802,707
Non-trainable params: 0
_________________________________________________________________


In [98]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=1, epochs=40, validation_split=0.1)

Train on 8 samples, validate on 1 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.callbacks.History at 0x7fdf9947b210>