In [None]:
from tasks import preprocessing, util

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import os

In [None]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

INPUT_DIR = "input/UD_English-EWT"
OUTPUT_DIR = "output"
INTERMEDIATE_DIR = "intermediate"

## Dataset

Acquiring and preprocessing our data with the goal of eventually acquiring a sufficient representation of our text is the most difficult and time-consuming task. We thus split it in distinct phases:

* Original dataset acquisition and parsing
* Qualitative analysis and preprocessing
* Transformation for the NLP task

Note that due to the relative custom code complexity, most of the code used in this section was developed and imported from python source files located in the `tasks` module. In-depth documentation and implementation details can be found in these files. 

In [None]:
print("Loading training dataset...")
train_df = preprocessing.conllu_to_pd(os.path.join(INPUT_DIR, "en_ewt-ud-train.conllu"))
print("Loading validation dataset...")
val_df = preprocessing.conllu_to_pd(os.path.join(INPUT_DIR, "en_ewt-ud-dev.conllu"))
print("Loading test dataset...")
test_df = preprocessing.conllu_to_pd(os.path.join(INPUT_DIR, "en_ewt-ud-test.conllu"))

print(f"Training data shape: {train_df.shape}\nValidation data shape: {val_df.shape}"
      "\nTest data shape: {test_df.shape}")

Below we can see a preview of our parsed training dataset. Our preprocessing exploits pandas's ordering scheme in order to make sure the words are inserted in the order they appear in the sentence. This ordering will prove important later.

In [None]:
train_df

As mentioned above, our dataset features words connected with punctuation such as "don't". These are normally treated as two words, with the first being their intuitive POS tag ("do" - AUX) and the second as part of the first ("n't" - PART).

This dataset contains both the full words and their split versions, with only the latter featuring valid POS tags. The former are instead marked by a pseudo-tag (here "_").

In [None]:
invalid_idx = train_df.pos == "_"
train_df[invalid_idx]

In [None]:
" ".join(train_df[invalid_idx].words.unique()[:30])

Below we can see an example of a word being contained both times in the dataset, one in full with the pseudo-tag, and the other as split words with valid POS tags.

In [None]:
train_df.iloc[176:179]

We thus remove the full words including the pseudo-tag from our datasets, ensuring that all target POS tags will be compliant with the UPOS scheme.

In [None]:
train_df = train_df[~invalid_idx]
val_df = val_df[val_df.pos != "_"]
test_df = test_df[test_df.pos != "_"]

### Qualitative Analysis

We analyze our dataset in two granualities: sentences and individual words. We begin by analyzing how many words are in each sentence, which will give us an idea on the size of context available for each word.

In [None]:
def length_sentences(df: pd.DataFrame) -> float:
    lengths = df.groupby(["sent_id"]).agg(lambda x: len(x))
    return lengths.words


train_length = length_sentences(train_df)
val_length = length_sentences(val_df)
test_length = length_sentences(test_df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


stats_df = pd.DataFrame({"words": pd.concat([train_length, val_length, test_length], ignore_index=True),
                        "type": len(train_length)*["train"] +
                         len(val_length)*["validation"] + 
                         len(test_length)*["test"]})

sns.histplot(x="words", 
             hue="type", 
             data=stats_df, 
             multiple="stack")

plt.title("Number of sentences by word count")
util.save_plot("ex_2_dataset_stats.png", OUTPUT_DIR)
plt.show()

In [None]:
train_length.describe()

In [None]:
val_length.describe()

In [None]:
test_length.describe()

In [None]:
vocab_size = len(set(train_df.words))
print(f"Vocabulary size: {vocab_size}")

In [None]:
print(f"Total word count:\nTraining: {train_df.shape[0]}"
      f"\nValidation: {val_df.shape[0]}"
      f"\nTesting: {test_df.shape[0]}")

In [None]:
print(f"Total sentence count:\nTraining: {len(set(train_df.sent_id))}"
      f"\nValidation: {len(set(val_df.sent_id))}"
      f"\nTesting: {len(set(test_df.sent_id))}")

## Creating the RNN model

In [None]:
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Embedding, Bidirectional, GRU,\
                                    TextVectorization, LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy

### Vectorization

In [None]:
from sklearn.preprocessing import LabelBinarizer


lb = LabelBinarizer()

y_train = lb.fit_transform(train_df.pos)
y_valid = lb.transform(val_df.pos)
y_test = lb.transform(test_df.pos)


In [None]:
MAX_SEQUENCE_LENGTH = int(np.quantile(train_length, 0.95))
MAX_SEQUENCE_LENGTH

In [None]:
MAX_WORDS = vocab_size
EMBEDDING_DIM = 300

In [None]:
vectorizer = TextVectorization(max_tokens=MAX_WORDS, output_mode='int', ngrams=1, 
              output_sequence_length=MAX_SEQUENCE_LENGTH)

print("Setting up vectorizer...")
vectorizer.adapt(train_df.words.values)

### Word embeddings

In [None]:
gpus = tf.config.list_physical_devices('GPU')
print("Executing with ", gpus[0].name if len(gpus) != 0 else "CPU")

In [None]:
# download and unzip only if the download and unzipped files do not exist 
!wget -nc -P input/fasttext https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

![ -f "input/fasttext/cc.en.300.bin" ] && echo "Skipping model file" || gzip --decompress --keep --force "input/fasttext/cc.en.300.bin.gz"   

In [None]:
import fasttext 


print("Loading embedding model...")
fasttext_model = fasttext.load_model('input/fasttext/cc.en.300.bin')
embedding_matrix = np.zeros(shape=(MAX_WORDS, 300))

print("Computing word embeddings...")
for w2idx, _word in tqdm(enumerate(vectorizer.get_vocabulary()), 
                          total = len(vectorizer.get_vocabulary())):
    # Skip PAD and UNK tokens
    if w2idx < 2:
      continue
    embedding_matrix[w2idx] = fasttext_model.get_word_vector(_word)

# reclaim memory
del fasttext_model

(https://arxiv.org/pdf/1801.05134.pdf)

In [None]:
GRU_SIZE = 300
DENSE = 1000


# create empty sequential model
model = tf.keras.Sequential()
model.add(Input(shape=(1,), dtype=tf.string))
model.add(vectorizer)

model.add(Embedding(MAX_WORDS, EMBEDDING_DIM, weights=[embedding_matrix], 
                    input_length=MAX_SEQUENCE_LENGTH, mask_zero=True, trainable=False))

model.add(Bidirectional(GRU(GRU_SIZE, return_sequences=True)))
model.add(LayerNormalization())

model.add(Bidirectional(GRU(GRU_SIZE, return_sequences=False)))
model.add(LayerNormalization())

# add an MLP with 1 hidden layer
model.add(Dense(units=DENSE, activation='tanh' ))
model.add(Dropout(0.33))
model.add(Dense(len(np.unique(train_df.pos)), activation='softmax'))

print(model.summary())

In [31]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=["categorical_accuracy"])

train_data = train_df.words.values
val_data = val_df.words.values

history = model.fit(train_data, y_train,
              validation_data=(val_data, y_valid),
              batch_size=256,
              epochs=10,
              shuffle=True,
              callbacks=[util.Metrics(valid_data=(val_data, y_valid))])

Epoch 1/10
 — val_f1: 0.827580 — val_precision: 0.849112 — val_recall: 0.835003
Epoch 2/10
 — val_f1: 0.834724 — val_precision: 0.874597 — val_recall: 0.828364
Epoch 3/10
 — val_f1: 0.849330 — val_precision: 0.866722 — val_recall: 0.856711
Epoch 4/10
 — val_f1: 0.818269 — val_precision: 0.836999 — val_recall: 0.823950
Epoch 5/10
 — val_f1: 0.840201 — val_precision: 0.871193 — val_recall: 0.831345
Epoch 6/10
 — val_f1: 0.830842 — val_precision: 0.849783 — val_recall: 0.836434
Epoch 7/10
 — val_f1: 0.835939 — val_precision: 0.868446 — val_recall: 0.827290
Epoch 8/10
 — val_f1: 0.837204 — val_precision: 0.868404 — val_recall: 0.826853
Epoch 9/10
 — val_f1: 0.837643 — val_precision: 0.875884 — val_recall: 0.829119
Epoch 10/10
 — val_f1: 0.842884 — val_precision: 0.879063 — val_recall: 0.835321
