# Install and import dependencies

In [None]:
!pip install tensorflow_addons --quiet

[?25l[K     |▎                               | 10 kB 29.3 MB/s eta 0:00:01[K     |▋                               | 20 kB 3.8 MB/s eta 0:00:01[K     |█                               | 30 kB 5.5 MB/s eta 0:00:01[K     |█▏                              | 40 kB 7.2 MB/s eta 0:00:01[K     |█▌                              | 51 kB 8.7 MB/s eta 0:00:01[K     |█▉                              | 61 kB 10.2 MB/s eta 0:00:01[K     |██                              | 71 kB 11.6 MB/s eta 0:00:01[K     |██▍                             | 81 kB 12.7 MB/s eta 0:00:01[K     |██▊                             | 92 kB 13.9 MB/s eta 0:00:01[K     |███                             | 102 kB 15.1 MB/s eta 0:00:01[K     |███▎                            | 112 kB 15.1 MB/s eta 0:00:01[K     |███▋                            | 122 kB 15.1 MB/s eta 0:00:01[K     |███▉                            | 133 kB 15.1 MB/s eta 0:00:01[K     |████▏                           | 143 kB 15.1 MB/s eta 0:00:0

Import the required dependencies.

In [1]:
from dataclasses import dataclass
import os
import os.path
import re
import shutil
import typing
import urllib.request

import keras
import keras.callbacks
import keras.layers
import keras.losses
import keras.optimizers
import keras.regularizers
import numpy as np
import pandas as pd
import tensorflow as tf
import tqdm

# Dataset

In [3]:
# Print one raw treebank file. Each non-blank line has three tab-separated columns
# (token, POS tag, and a number that is presumably the dependency head index — TODO confirm);
# a blank line separates sentences.
# NOTE(review): this reads from dependency_treebank/, which is only created by the
# download cell in the "Download the data" section below — run that cell first.
!cat dependency_treebank/wsj_0001.dp

Pierre	NNP	2
Vinken	NNP	8
,	,	2
61	CD	5
years	NNS	6
old	JJ	2
,	,	2
will	MD	0
join	VB	8
the	DT	11
board	NN	9
as	IN	9
a	DT	15
nonexecutive	JJ	15
director	NN	12
Nov.	NNP	9
29	CD	16
.	.	8

Mr.	NNP	2
Vinken	NNP	3
is	VBZ	0
chairman	NN	3
of	IN	4
Elsevier	NNP	7
N.V.	NNP	12
,	,	12
the	DT	12
Dutch	NNP	12
publishing	VBG	12
group	NN	5
.	.	3


## Download the data

- Download the dataset
- Unzip the dataset in the local path

In [2]:
# Remote archive of the dependency treebank, plus the local paths (inside the
# current working directory) of the downloaded zip and of the extracted folder.
file_url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip"
zip_name = os.path.join(os.getcwd(), "dependency_treebank.zip")
file_name = os.path.join(os.getcwd(), "dependency_treebank")

# Download and unpack only when the extracted folder is missing, so re-running is cheap.
if not os.path.isdir(file_name):
    if not os.path.isfile(zip_name):
        # Stream into a temporary ".part" file and rename it once complete, so an
        # interrupted download never leaves a truncated archive under the final name
        # (which a later run would otherwise try, and fail, to unpack).
        partial_name = zip_name + ".part"
        with urllib.request.urlopen(file_url) as res, open(partial_name, "wb") as f:
            shutil.copyfileobj(res, f)
        os.replace(partial_name, zip_name)
    shutil.unpack_archive(zip_name, os.getcwd())

## Load the data

- Read the data for training (1-100), validation (101-150) and test (151-199)

In [4]:
class DataLoader:
    """Read ``wsj_XXXX.dp`` files into a DataFrame with one row per sentence.

    Every non-blank line of a file must contain exactly three whitespace-separated
    fields; the first is kept as the token and the second as its tag (the third is
    ignored). A blank line starts a new sentence. Tokens whose tag consists only of
    characters matched by ``PUNCTUATIONS`` are dropped.
    """

    PUNCTUATIONS = r"[\.,\\\"'\-\?\:\!;\[\]\(\)\{\}⟨⟩‒–—―‐…\/⁄‘’“”`]+"

    def __init__(self, data_dir: typing.Optional[str] = None):
        """
        Args:
            data_dir: Folder containing the ``wsj_XXXX.dp`` files. When omitted, the
                notebook-level ``file_name`` path is looked up at call time.
        """
        self.prog = re.compile(self.PUNCTUATIONS)
        self.data_dir = data_dir

    def _read_sentences(self, path: str) -> typing.Tuple[typing.List[str], typing.List[str]]:
        """Parse one file into parallel lists of space-joined tokens and tags."""
        # Start with one open sentence; each blank line opens another one. A sentence
        # that ends up without tokens is still emitted as an empty string.
        token_rows: typing.List[typing.List[str]] = [[]]
        tag_rows: typing.List[typing.List[str]] = [[]]
        with open(path, "r") as f:
            for line in f:
                fields = line.strip().split()
                if not fields:
                    token_rows.append([])
                    tag_rows.append([])
                    continue
                assert len(fields) == 3, f"expected 3 fields in {path}, got {fields!r}"
                # The filter is applied to the tag column, not to the token itself
                if self.prog.fullmatch(fields[1]) is None:
                    token_rows[-1].append(fields[0])
                    tag_rows[-1].append(fields[1])
        return [" ".join(row) for row in token_rows], [" ".join(row) for row in tag_rows]

    def __call__(self, start_i: int, end_i: int, desc: typing.Optional[str] = None):
        """Load the files numbered ``start_i`` to ``end_i`` (both inclusive).

        Args:
            start_i: Number of the first file to read.
            end_i: Number of the last file to read.
            desc: Optional split name shown in the progress bar.

        Returns:
            A DataFrame with string columns ``text`` and ``tags`` and a fresh
            0-based index; each row holds one sentence.
        """
        root = file_name if self.data_dir is None else self.data_dir
        label = "Reading data" if desc is None else f"Reading {desc} data"

        # Accumulate plain lists and build the frame once, instead of concatenating
        # a DataFrame per file (quadratic, and it starts from an empty frame).
        texts: typing.List[str] = []
        tags: typing.List[str] = []
        for file_idx in tqdm.trange(start_i, end_i + 1, desc=label):
            data_file = os.path.join(root, f"wsj_{file_idx:04d}.dp")
            file_texts, file_tags = self._read_sentences(data_file)
            texts.extend(file_texts)
            tags.extend(file_tags)

        return pd.DataFrame({"text": texts, "tags": tags}, columns=["text", "tags"], dtype=object)

data_loader = DataLoader()

# (first file, last file, progress-bar label) for the train / validation / test splits
df_train, df_val, df_test = (
    data_loader(first, last, desc=label)
    for first, last, label in [
        (1, 100, "train"),
        (101, 150, "validation"),
        (151, 199, "test"),
    ]
)

Reading train data: 100%|██████████| 100/100 [00:00<00:00, 717.20it/s]
Reading validation data: 100%|██████████| 50/50 [00:00<00:00, 639.76it/s]
Reading test data: 100%|██████████| 49/49 [00:00<00:00, 779.08it/s]


## Closer look at datasets

In [5]:
# Show the first five sentences of the training split next to their tag sequences
df_train.head(n=5)

Unnamed: 0,text,tags
0,Pierre Vinken 61 years old will join the board...,NNP NNP CD NNS JJ MD VB DT NN IN DT JJ NN NNP CD
1,Mr. Vinken is chairman of Elsevier N.V. the Du...,NNP NNP VBZ NN IN NNP NNP DT NNP VBG NN
2,Rudolph Agnew 55 years old and former chairman...,NNP NNP CD NNS JJ CC JJ NN IN NNP NNP NNP NNP ...
3,A form of asbestos once used to make Kent ciga...,DT NN IN NN RB VBN TO VB NNP NN NNS VBZ VBN DT...
4,The asbestos fiber crocidolite is unusually re...,DT NN NN NN VBZ RB JJ IN PRP VBZ DT NNS IN RB ...


## TODO
- Distribution of classes
- Sentences length
    - Split sentences too long


# Model

In [56]:
@dataclass
class ModelConfiguration:
    """Hyper-parameters shared by the preprocessing steps and the model.

    The attributes carry type annotations so that ``@dataclass`` registers them as
    fields: they appear in ``repr`` and can be overridden through the constructor,
    e.g. ``ModelConfiguration(MAX_SEQUENCE=64)``. The defaults are unchanged.
    """

    # Width of each embedding vector; also selects which glove.6B.<dim>d.txt file is read
    EMBEDDING_DIM: int = 300
    # Fixed length that token and label sequences are padded or truncated to
    MAX_SEQUENCE: int = 128
    # Upper bound on the number of entries in the text vocabulary
    VOCABULARY_SIZE: int = 15000

model_conf = ModelConfiguration()

## Dataframe to dataset

In [44]:
def df_to_dataset(df: pd.DataFrame, batch_size: int = 32):
    """Turn the ``text`` and ``tags`` columns of ``df`` into a batched dataset.

    Args:
        df: Frame with one sentence per row in the columns ``text`` and ``tags``.
        batch_size: Number of rows per batch.

    Returns:
        A ``tf.data.Dataset`` of ``(text, tags)`` pairs grouped into batches.
    """
    frame = df.copy()
    pairs = tf.data.Dataset.from_tensor_slices((frame["text"], frame["tags"]))
    return pairs.batch(batch_size)

# One batched dataset per split
raw_train_ds = df_to_dataset(df_train, batch_size=32)
raw_val_ds = df_to_dataset(df_val, batch_size=256)
raw_test_ds = df_to_dataset(df_test, batch_size=512)

# All three splits stacked together in train, validation, test order
raw_all_ds = df_to_dataset(
    pd.concat([df_train, df_val, df_test]),
    batch_size=512,
)

## Tokenization and encoding

### Tokenize features

TODO
- [ ] Tokenization should also cover the validation and test sets (to handle OOV words)

In [57]:
class TokenizeFeatures:
    """Fit a ``TextVectorization`` layer on a dataset and apply it to the text feature.

    The layer lower-cases its input, keeps at most ``vocab_size`` tokens and emits
    integer sequences of length ``max_sequence``.
    """

    vocab_size: int
    max_sequence: int
    layer: keras.layers.TextVectorization

    def __init__(self, dataset: tf.data.Dataset, vocab_size: int = 10000, max_sequence: int = 64):
        """
        Args:
            dataset: Batched dataset of ``(text, tags)`` pairs used to fit the vocabulary.
            vocab_size: Value passed to the layer as ``max_tokens``.
            max_sequence: Value passed to the layer as ``output_sequence_length``.
        """
        self.vocab_size = vocab_size
        self.max_sequence = max_sequence

        self.layer = keras.layers.TextVectorization(
            max_tokens=vocab_size,
            output_mode="int",
            standardize="lower",
            output_sequence_length=max_sequence,
        )

        def as_token_column(text, _):
            # Split each sentence on single spaces, densify the ragged result and add
            # a trailing axis of size one before handing the tokens to adapt()
            tokens = tf.strings.split(text, sep=" ").to_tensor()
            return tf.expand_dims(tokens, -1)

        self.layer.adapt(dataset.map(as_token_column))

    def __call__(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
        """Replace the first element of every pair with the layer's output; the second is passed through."""
        def vectorize(text, tags):
            return self.layer(text), tags

        return dataset.map(vectorize)

    def get_vocabulary(self):
        """Return the vocabulary held by the fitted layer."""
        return self.layer.get_vocabulary()

# The vocabulary is fitted on all three splits; pass raw_train_ds instead of
# raw_all_ds to fit it on the training split only.
tokenize_features = TokenizeFeatures(
    dataset=raw_all_ds,
    vocab_size=model_conf.VOCABULARY_SIZE,
    max_sequence=model_conf.MAX_SEQUENCE,
)

### Encoding of labels

In [58]:
class EncodeLabels:
    """Fit a ``StringLookup`` layer on tag sequences and use it to encode labels.

    Tag strings are split on single spaces, cut to ``max_sequence`` entries and
    right-padded with empty strings up to exactly ``max_sequence`` entries. The same
    preparation is used for fitting the layer and for encoding.
    """

    max_sequence: int
    layer: keras.layers.StringLookup

    def __init__(self, dataset: tf.data.Dataset, max_sequence: int):
        """
        Args:
            dataset: Batched dataset of ``(text, tags)`` pairs used to fit the lookup.
            max_sequence: Fixed number of label positions per sentence.
        """
        self.max_sequence = max_sequence
        self.layer = keras.layers.StringLookup(output_mode="int")

        # The padded empty strings are part of the data the layer is adapted on
        self.layer.adapt(dataset.map(lambda _, y: self._split_and_pad(y)))

    def _split_and_pad(self, tags):
        """Split a batch of tag strings and force the last axis to ``max_sequence``."""
        tokens = tf.strings.split(tags, sep=" ").to_tensor()
        tokens = tokens[..., :self.max_sequence]
        missing = self.max_sequence - tf.shape(tokens)[-1]
        filler = tf.fill((tf.shape(tokens)[0], missing), "")
        return tf.concat([tokens, filler], -1)

    def __call__(self, dataset: tf.data.Dataset):
        """Replace the second element of every pair with its integer encoding; the first is passed through."""
        padded = dataset.map(lambda x, y: (x, self._split_and_pad(y)))
        return padded.map(lambda x, y: (x, self.layer(y)))

    def get_vocabulary(self):
        """Return the vocabulary held by the fitted lookup layer."""
        return self.layer.get_vocabulary()

    def get_vocabulary_size(self):
        """Return the vocabulary size reported by the lookup layer."""
        return self.layer.vocabulary_size()

# The label vocabulary is fitted on all three splits; pass raw_train_ds instead of
# raw_all_ds to fit it on the training split only.
encode_labels = EncodeLabels(
    dataset=raw_all_ds,
    max_sequence=model_conf.MAX_SEQUENCE,
)

### Decoding of labels (used after inference)

In [59]:
class DecodeLabels:
    """Turn per-position class scores back into space-separated tag strings."""

    layer: keras.layers.StringLookup

    def __init__(self, vocabulary):
        """
        Args:
            vocabulary: Vocabulary given to the inverted ``StringLookup`` layer.
        """
        self.layer = keras.layers.StringLookup(
            vocabulary=vocabulary,
            invert=True,
            output_mode="int",
        )

    def __call__(self, dataset):
        """Decode ``dataset`` by taking the arg-max over its last axis.

        The winning indices are mapped to strings by the inverted lookup, joined
        along the last axis with single spaces, and stripped of surrounding whitespace.
        """
        class_ids = tf.math.argmax(dataset, axis=-1)
        tag_tokens = self.layer(class_ids)
        joined = tf.strings.reduce_join(tag_tokens, axis=-1, separator=" ")
        return tf.strings.strip(joined)

# Reuse the vocabulary of the fitted label encoder for the inverse lookup
decode_labels = DecodeLabels(vocabulary=encode_labels.get_vocabulary())

### Encode and tokenize datasets

In [60]:
def prepare_data(ds: tf.data.Dataset):
    """Encode the labels, then vectorize the text, then enable prefetching.

    Args:
        ds: Batched dataset of raw ``(text, tags)`` string pairs.

    Returns:
        The dataset after ``encode_labels`` and ``tokenize_features`` have been
        applied, prefetched with ``tf.data.AUTOTUNE``.
    """
    with_label_ids = encode_labels(ds)
    with_token_ids = tokenize_features(with_label_ids)
    return with_token_ids.prefetch(tf.data.AUTOTUNE)

# Apply the same preparation to each split
train_ds, val_ds, test_ds = map(
    prepare_data,
    (raw_train_ds, raw_val_ds, raw_test_ds),
)

## Embedding layer

- [ ] Change the GloVe embeddings

In [7]:
!wget -q https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip
!unzip -qq glove.6B.zip

### Get GloVe vocabulary

In [32]:
# Build a word -> vector lookup from the GloVe text file of the configured width.
glove_path = f"glove.6B.{model_conf.EMBEDDING_DIM}d.txt"
embeddings_index = {}
# Read as UTF-8 explicitly instead of relying on the platform's default encoding.
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...": split off the word, parse the rest as floats
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, dtype=float, sep=" ")

print(f"Found {len(embeddings_index)} word vectors.")

Found 400001 word vectors.


### Configure pretrained embedding

TODO
- [ ] Create the vocabulary from the training, validation and test sets

In [61]:
# Row i of the matrix belongs to the i-th entry of the tokenizer vocabulary
vocabulary = tokenize_features.get_vocabulary()
vocabulary_size = len(vocabulary)
word_index = {token: position for position, token in enumerate(vocabulary)}

embedding_matrix = np.zeros((vocabulary_size, model_conf.EMBEDDING_DIM))
rng = np.random.default_rng(seed=42)

hits = misses = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No GloVe vector for this entry: fill the row with seeded random values
        embedding_matrix[i] = rng.random(model_conf.EMBEDDING_DIM)
        misses += 1
    else:
        embedding_matrix[i] = embedding_vector
        hits += 1

print(f"Converted {hits} words ({misses} misses)")

Converted 10259 words (678 misses)


## Baseline model

In [62]:
# Width of the model's output layer: the vocabulary size reported by the label lookup layer.
# NOTE(review): this presumably counts more than the POS tags themselves (the lookup's
# out-of-vocabulary entry and the "" used for padding) — confirm against get_vocabulary().
NUM_CLASSES = encode_labels.get_vocabulary_size()
print(f"Number of classes is: {NUM_CLASSES}")

Number of classes is: 42


In [39]:
# Create the checkpoint folder; exist_ok keeps the cell from failing when it is re-run
os.makedirs("model_checkpoints", exist_ok=True)

In [78]:
# Input: one row of MAX_SEQUENCE integer token ids per sentence
inputs = keras.layers.Input(shape=(model_conf.MAX_SEQUENCE,), dtype="int64")

# Embedding initialised from embedding_matrix and kept frozen during training
embedded = keras.layers.Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)(inputs)

x = keras.layers.SpatialDropout1D(rate=0.3)(embedded)

# Bidirectional GRU returning one output per position (the variable keeps its
# historical name `lstm`). An LSTM with the same arguments can be swapped in:
#   keras.layers.LSTM(128, return_sequences=True,
#                     recurrent_regularizer=keras.regularizers.l2(0.02))
lstm = keras.layers.GRU(
    units=128,
    return_sequences=True,
    recurrent_regularizer=keras.regularizers.l2(0.02),
)
x = keras.layers.Bidirectional(lstm)(x)

x = keras.layers.Dropout(rate=0.2)(x)

# Softmax over the NUM_CLASSES label ids at every position
outputs = keras.layers.Dense(units=NUM_CLASSES, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs)

opt = keras.optimizers.Adam()
model.compile(
    optimizer=opt,
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

model.summary()

Model: "model_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 128)]             0         
                                                                 
 embedding_9 (Embedding)     (None, 128, 300)          3281100   
                                                                 
 spatial_dropout1d_7 (Spatia  (None, 128, 300)         0         
 lDropout1D)                                                     
                                                                 
 bidirectional_10 (Bidirecti  (None, 128, 256)         330240    
 onal)                                                           
                                                                 
 dropout_4 (Dropout)         (None, 128, 256)          0         
                                                                 
 dense_8 (Dense)             (None, 128, 42)           1079

## Training

In [79]:
# Checkpoint callback that keeps only the best model by validation accuracy.
# It is created here but not passed to fit() below.
model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath="./model_checkpoints",
    monitor="val_accuracy",
    save_best_only=True,
)
# Other callbacks that were tried:
# early_stopping = EarlyStopping(monitor="val_loss", patience=5)
# reduce_lr = ReduceLROnPlateau(monitor="val_loss", patience=1, factor=0.3)

history = model.fit(x=train_ds, validation_data=val_ds, epochs=20)
# Variant with checkpointing enabled:
# history = model.fit(train_ds, validation_data=val_ds, epochs=20, callbacks=[model_checkpoint])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Inference

- [ ] Compute F1-macro score

In [65]:
ypreds = model.predict(test_ds)

# Compare the decoded prediction with the reference tags for one test sentence
idx = 1
predicted_tags = decode_labels(ypreds)[idx].numpy().decode("utf-8")
expected_tags = df_test["tags"].iloc[idx]
print(predicted_tags)
print(expected_tags)

DT NN NNP NNP NN NNP NNP POS NN TO CD NN IN CD NN CC MD VB VB NNP NNP IN VBG DT NN IN DT NN NN
DT NN VBZ NNP NNP NNP NNP POS NN TO CD NN IN CD NN CC MD VB VB NNP NNP IN VBG DT NN IN DT NNS NN


In [None]:
# print(classification_report(np.argmax(y_test, axis=1), np.argmax(model.predict(xtest_raw_ds), axis=1), digits=4))

### Plot the loss and the F1 score

Old code, remove if not useful

In [None]:
# fig, axs = plt.subplots(2, figsize=(8, 10))

# x_len = np.arange(1, len(history.history['f1_score'])+ 1)

# axs[0].plot(x_len, history.history['loss'])
# axs[0].plot(x_len, history.history['val_loss'])
# axs[0].set_title('model loss')
# axs[0].set_ylabel('loss')
# axs[0].set_xlabel('epoch')
# axs[0].legend(['train', 'val'], loc='upper left')
# axs[0].set_xticks(x_len)

# axs[1].plot(x_len, history.history['f1_score'])
# axs[1].plot(x_len, history.history['val_f1_score'])
# axs[1].set_title('model f1 score macro avg')
# axs[1].set_ylabel('f1 macro avg')
# axs[1].set_xlabel('epoch')
# axs[1].legend(['train', 'val'], loc='upper left')
# axs[1].set_xticks(x_len)
# axs[1].set_yticks(np.arange(0.35, 0.65, step=0.02))

# fig.show()