In [1]:
import nltk
nltk.download("treebank")

[nltk_data] Downloading package treebank to /home/sann-
[nltk_data]     htet/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [2]:
import numpy as np
import os
import tensorflow as tf

2023-08-20 10:32:56.129222: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-20 10:32:56.167736: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-20 10:32:56.168764: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def download_and_read(dataset_dir, num_pairs=None):
    sent_filename = os.path.join(dataset_dir, "treebank-sents.txt")
    poss_filename = os.path.join(dataset_dir, "treebank-poss.txt")
    if not(os.path.exists(sent_filename) and os.path.exists(poss_filename)):
        if not os.path.exists(dataset_dir):
            os.makedirs(dataset_dir)
        fsents = open(sent_filename, "w")
        fposs = open(poss_filename, "w")
        sentences = nltk.corpus.treebank.tagged_sents()
        for sent in sentences:
            fsents.write(" ".join([w for w, p in sent]) + "\n")
            fposs.write(" ".join([p for w, p in sent]) + "\n")
        fsents.close()
        fposs.close()
    sents, poss = [], []
    with open(sent_filename, "r") as fsent:
        for idx, line in enumerate(fsent):
            sents.append(line.strip())
            if num_pairs is not None and idx >= num_pairs:
                break
    with open(poss_filename, "r") as fposs:
        for idx, line in enumerate(fposs):
            poss.append(line.strip())
            if num_pairs is not None and idx >= num_pairs:
                break
    return sents, poss

In [4]:
sents, poss = download_and_read("./datasets")
assert(len(sents) == len(poss))
print("# of records: {:d}".format(len(sents)))

# of records: 3914


In [5]:
def tokenize_and_build_vocab(texts, vocab_size=None, lower=True):
    if vocab_size is None:
        tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=lower)
    else:
        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size+1, oov_token="UNK", lower=lower)
    tokenizer.fit_on_texts(texts)
    if vocab_size is not None:
        tokenizer.word_index = {e:i for e, i in tokenizer.word_index.items() if i <= vocab_size+1}
    word2idx = tokenizer.word_index
    idx2word = {v:k for k, v in word2idx.items()}
    return word2idx, idx2word, tokenizer

In [6]:
word2idx_s, idx2word_s, tokenizer_s = tokenize_and_build_vocab(sents, vocab_size=9000)
word2idx_t, idx2word_t, tokenizer_t = tokenize_and_build_vocab(poss, vocab_size=38, lower=False)
source_vocab_size = len(word2idx_s)
target_vocab_size = len(word2idx_t)
print("vocab sizes (source): {:d}, (target): {:d}".format(source_vocab_size, target_vocab_size))

vocab sizes (source): 9001, (target): 39


In [7]:
sequence_lengths = np.array([len(s.split()) for s in sents])
print([(p, np.percentile(sequence_lengths, p)) for p in [75, 80, 90, 95, 99, 100]])

[(75, 33.0), (80, 35.0), (90, 41.0), (95, 47.0), (99, 58.0), (100, 271.0)]


In [8]:
max_seqlen = 271

sents_as_ints = tokenizer_s.texts_to_sequences(sents)
sents_as_ints = tf.keras.preprocessing.sequence.pad_sequences(sents_as_ints, maxlen=max_seqlen, padding="post")

# convert POS tags to sequence of (categorical) integers
poss_as_ints = tokenizer_t.texts_to_sequences(poss)
poss_as_ints = tf.keras.preprocessing.sequence.pad_sequences(poss_as_ints, maxlen=max_seqlen, padding="post")

poss_as_catints = []
for p in poss_as_ints:
    poss_as_catints.append(tf.keras.utils.to_categorical(p, num_classes=target_vocab_size+1, dtype="int32"))
    
poss_as_catints = tf.keras.preprocessing.sequence.pad_sequences(poss_as_catints, maxlen=max_seqlen)

dataset = tf.data.Dataset.from_tensor_slices((sents_as_ints, poss_as_catints))

idx2word_s[0], idx2word_t[0] = "PAD", "PAD"

# split into training, validation, and test datasets
dataset = dataset.shuffle(10000)
test_size = len(sents) // 3
val_size = (len(sents) - test_size) // 10

test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

# create batches
batch_size = 128
train_dataset = train_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

2023-08-20 10:32:58.189474: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [9]:
class POSTaggingModel(tf.keras.Model):
    def __init__(self, source_vocab_size, target_vocab_size, embedding_dim, max_seqlen, rnn_output_dim, **kwargs):
        super(POSTaggingModel, self).__init__(**kwargs)
        self.embed = tf.keras.layers.Embedding(source_vocab_size, embedding_dim, input_length=max_seqlen)
        self.dropout = tf.keras.layers.SpatialDropout1D(0.2)
        self.rnn = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(rnn_output_dim, return_sequences=True))
        self.dense = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(target_vocab_size+1))
        self.activation = tf.keras.layers.Activation("softmax")
    
    def call(self, x):
        x = self.embed(x)
        x = self.dropout(x)
        x = self.rnn(x)
        x = self.dense(x)
        x = self.activation(x)
        return x

embedding_dim = 128
rnn_output_dim = 256
model = POSTaggingModel(source_vocab_size, target_vocab_size, embedding_dim, max_seqlen, rnn_output_dim)

model.build(input_shape=(batch_size, max_seqlen))

model.summary()

Model: "pos_tagging_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  1152128   
                                                                 
 spatial_dropout1d (Spatial  multiple                  0         
 Dropout1D)                                                      
                                                                 
 bidirectional (Bidirection  multiple                  592896    
 al)                                                             
                                                                 
 time_distributed (TimeDist  multiple                  20520     
 ributed)                                                        
                                                                 
 activation (Activation)     multiple                  0         
                                                 

In [10]:
def masked_accuracy():
    def masked_accuracy_fn(ytrue, ypred):
        ytrue = tf.keras.backend.argmax(ytrue, axis=-1)
        ypred = tf.keras.backend.argmax(ypred, axis=-1)
        mask = tf.keras.backend.cast(tf.keras.backend.not_equal(ypred, 0), tf.int32)
        matches = tf.keras.backend.cast(tf.keras.backend.equal(ytrue, ypred), tf.int32) * mask
        numer = tf.keras.backend.sum(matches)
        denom = tf.keras.backend.maximum(tf.keras.backend.sum(mask), 1)
        accuracy = numer / denom
        return accuracy
    return masked_accuracy_fn

In [11]:
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy", masked_accuracy()])

In [12]:
num_epochs = 50
data_dir = './data/'
logs_dir = os.path.join("./logs_pos")

best_model_file = os.path.join(data_dir, "best_model_pos.h5")
checkpoint = tf.keras.callbacks.ModelCheckpoint(best_model_file, save_weights_only=True, save_best_only=True)
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=logs_dir)

history = model.fit(train_dataset,
                    epochs=num_epochs,
                    validation_data=val_dataset,
                    callbacks=[checkpoint, tensorboard])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50


Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [13]:
# evaluate with test set
best_model = POSTaggingModel(source_vocab_size, target_vocab_size,
    embedding_dim, max_seqlen, rnn_output_dim)
best_model.build(input_shape=(batch_size, max_seqlen))
best_model.load_weights(best_model_file)
best_model.compile(
    loss="categorical_crossentropy",
    optimizer="adam", 
    metrics=["accuracy", masked_accuracy()])

test_loss, test_acc, test_masked_acc = best_model.evaluate(test_dataset)
print("test loss: {:.3f}, test accuracy: {:.3f}, masked test accuracy: {:.3f}".format(
    test_loss, test_acc, test_masked_acc))

test loss: 0.066, test accuracy: 0.980, masked test accuracy: 0.779


In [14]:
# predict on batches
labels, predictions = [], []
is_first_batch = True
accuracies = []

for test_batch in test_dataset:
    inputs_b, outputs_b = test_batch
    preds_b = best_model.predict(inputs_b)
    # convert from categorical to list of ints
    preds_b = np.argmax(preds_b, axis=-1)
    outputs_b = np.argmax(outputs_b.numpy(), axis=-1)
    for i, (pred_l, output_l) in enumerate(zip(preds_b, outputs_b)):
        assert(len(pred_l) == len(output_l))
        pad_len = np.nonzero(output_l)[0][0]
        acc = np.count_nonzero(
            np.equal(
                output_l[pad_len:], pred_l[pad_len:]
            )
        ) / len(output_l[pad_len:])
        accuracies.append(acc)
        if is_first_batch:
            words = [idx2word_s[x] for x in inputs_b.numpy()[i][pad_len:]]
            postags_l = [idx2word_t[x] for x in output_l[pad_len:] if x > 0]
            postags_p = [idx2word_t[x] for x in pred_l[pad_len:] if x > 0]
            print("labeled  : {:s}".format(" ".join(["{:s}/{:s}".format(w, p) 
                for (w, p) in zip(words, postags_l)])))
            print("predicted: {:s}".format(" ".join(["{:s}/{:s}".format(w, p) 
                for (w, p) in zip(words, postags_p)])))
            print(" ")
    is_first_batch = False

accuracy_score = np.mean(np.array(accuracies))
print("pos tagging accuracy: {:.3f}".format(accuracy_score))

labeled  : robert/NNP s/NNP jenkins/NNP cambridge/NNP mass/NNP
predicted: robert/NNP s/NNP jenkins/NNP cambridge/NNP mass/NNP
 
labeled  : these/DT imports/NNS totaled/VBD about/IN 17/CD million/CD u/NONE last/JJ year/NN
predicted: these/DT imports/NNS totaled/VBD about/IN 17/CD million/CD u/NONE last/JJ year/NN
 
labeled  : he/PRP predicted/VBD that/IN the/DT board/NN would/MD give/VB the/DT current/JJ duo/NN until/IN early/JJ next/JJ year/NN before/IN naming/NONE a/VBG new/DT chief/JJ executive/NN PAD/NN
predicted: he/PRP predicted/VBD that/IN the/DT board/NN would/MD give/VB the/DT current/JJ duo/NN until/IN early/JJ next/JJ year/NN before/IN naming/VBG a/VBG new/JJ chief/NN executive/NN PAD/NN
 
labeled  : today/NN pc/NN shipments/NNS annually/RB total/VBP some/DT 38/CD 3/CD billion/NONE u/JJ
predicted: today/NN pc/NN shipments/NNS annually/RB total/VBP some/DT 38/CD 3/CD billion/NONE u/NN
 
labeled  : the/DT energy/NN segment/NN with/IN a/DT 15/CD rise/NN in/NN operating/IN profit

labeled  : for/IN the/DT nine/CD months/NNS the/DT company/NN reported/VBD a/DT net/JJ loss/NN of/IN UNK/CD UNK/NONE u/CC or/CD 39/NNS cents/DT a/NN share/VBN compared/IN with/JJ year/JJ earlier/NN net/IN income/CD of/NONE UNK/CC UNK/CD u/NNS or/DT 62/NN
predicted: for/IN the/DT nine/CD months/NNS the/DT company/NN reported/VBD a/DT net/JJ loss/NN of/IN UNK/CD UNK/NONE u/CC or/CD 39/NNS cents/DT a/NN share/IN compared/IN with/JJ year/JJ earlier/JJ net/NN income/IN of/NNP UNK/NNP UNK/CC u/CC or/CD 62/NNS cents/DT
 
labeled  : the/DT company/NN said/VBD 0/NONE it/PRP made/VBD the/DT purchase/NN in/IN order/NN 1/NONE to/TO locally/RB produce/VB hydraulically/RB operated/VBN shovels/NNS
predicted: the/DT company/NN said/VBD 0/NONE it/PRP made/VBD the/DT purchase/NN in/IN order/NN 1/NONE to/TO locally/RB produce/VB hydraulically/RB operated/VBN shovels/NNS
 
labeled  : james/NNP mason/NNP assistant/NN secretary/NN for/IN health/NN said/VBD 0/NONE the/DT ban/NN on/IN federal/JJ funding/NN of

pos tagging accuracy: 0.980


---