## Needed imports and versions

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import requests
import os
import copy
from preprocess import *
from oracle import *
from prediction import *
from keras import layers

In [2]:
# record the TensorFlow version this notebook runs with
print(tf.__version__)

2.13.0


In [3]:
# record the Keras version this notebook runs with
print(keras.__version__)

2.13.1


## Downloading and preprocessing the datasets

In [4]:
def get_dataset(url, name):
    """Download the text found at `url` and save it as `name` under the working directory.

    Parameters
    ----------
    url : str
        Address fetched with an HTTP GET request.
    name : str
        Destination path; it is joined onto the current working directory.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx / 5xx).
    requests.Timeout
        If the server does not respond within the timeout.
    """
    filename = os.path.join(os.getcwd(), name) #we save on the working directory

    r = requests.get(url, timeout=60) #get the webpage, without waiting forever on a stalled connection
    r.raise_for_status() #fail loudly instead of saving an error page as if it were the dataset

    # make sure the destination folder exists before trying to create the file in it
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding="utf-8") as f: #and write it to a file
        f.write(r.text)

In [5]:
# download the train, dev and test splits from github: (source url, destination path) pairs
dataset_downloads = [
    ("https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-train.conllu",
     "./datasets/original/en_partut-ud-train.conllu"),
    ("https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-dev.conllu",
     "./datasets/original/en_partut-ud-dev.conllu"),
    ("https://raw.githubusercontent.com/UniversalDependencies/UD_English-ParTUT/master/en_partut-ud-test.conllu",
     "./datasets/original/en_partut-ud-test.conllu"),
]
for source_url, destination in dataset_downloads:
    get_dataset(source_url, destination)

In [6]:
# preprocess the datasets to remove commentaries, multiwords and empty tokens
# each pair is (downloaded file, cleaned file to produce)
preprocess_jobs = [
    ("./datasets/original/en_partut-ud-train.conllu", "./datasets/original/my_en_train.conllu"),
    ("./datasets/original/en_partut-ud-dev.conllu", "./datasets/original/my_en_dev.conllu"),
    ("./datasets/original/en_partut-ud-test.conllu", "./datasets/original/my_en_test.conllu"),
]
for raw_path, clean_path in preprocess_jobs:
    preprocess_dataset(raw_path, clean_path)

In [7]:
# generate a dataframe for every dataset, removing non_projective sentences and adding ROOT
# (the splits are processed in the order train, dev, test)
df_train, df_val, df_test = [
    generate_dataframe(clean_path)
    for clean_path in (
        "./datasets/original/my_en_train.conllu",
        "./datasets/original/my_en_dev.conllu",
        "./datasets/original/my_en_test.conllu",
    )
]

## Apply the arc-eager oracle to generate training samples for our models

In [8]:
# dictionaries to encode the actions, the part-of-speech tags, and the dependency relations
# every label is numbered consecutively starting at 1, in the order in which it is listed
actions = {label: code for code, label in enumerate(
    ['left-arc', 'right-arc', 'shift', 'reduce', 'end'], start=1)}

pos = {label: code for code, label in enumerate(
    ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN',
     'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'], start=1)}

deprel = {label: code for code, label in enumerate(
    ['root', 'nsubj', 'obj', 'iobj', 'csubj', 'ccomp', 'xcomp', 'obl', 'vocative', 'expl', 'dislocated',
     'advcl', 'advmod', 'discourse', 'aux', 'cop', 'mark', 'nmod', 'appos', 'nummod', 'acl',
     'amod', 'det', 'clf', 'case', 'conj', 'cc', 'fixed', 'flat', 'list', 'parataxis', 'compound',
     'orphan', 'goeswith', 'reparandum', 'punct', 'dep', 'acl:relcl', 'advcl:relcl', 'advmod:emph',
     'advmod:lmod', 'aux:pass', 'cc:preconj', 'compound:lvc', 'compound:prt', 'compound:redup', 'compound:svc',
     'csubj:outer', 'csubj:pass', 'det:numgov', 'det:nummod', 'det:poss', 'expl:impers', 'expl:pass',
     'expl:pv', 'flat:foreign', 'nmod:poss', 'nmod:tmod', 'nsubj:outer', 'nsubj:pass', 'nummod:gov',
     'obl:agent', 'obl:arg', 'obl:lmod', 'obl:tmod', 'det:predet', 'nmod:npmod'], start=1)}

In [9]:
# the models output integer codes, so we also keep code -> label dictionaries for actions and
# dependency relations, to "trace back" those outputs
inv_actions = dict(zip(actions.values(), actions.keys()))

inv_deprel = dict(zip(deprel.values(), deprel.keys()))

In [10]:
def _select_window(sequence, n_w, from_end):
  """Take n_w entries from one end of `sequence` as int32 values, zero-padded to length n_w.

  With from_end=True the last n_w entries are taken and the padding goes on the left;
  otherwise the first n_w entries are taken and the padding goes on the right.
  """
  if from_end:
    chosen = sequence[-n_w:]
    padding = (n_w - len(chosen), 0)
  else:
    chosen = sequence[0:n_w]
    padding = (0, n_w - len(chosen))
  return list(np.array(np.pad(chosen, padding, 'constant', constant_values=(0, 0)), dtype=np.int32))


def _pos_codes(selected_ids, items):
  """Map each selected sentence id to the integer code of its part of speech (0 for id 0)."""
  # NOTE(review): id 0 is both the padding value and the position where the ROOT token is inserted
  # in process_traces, so both end up with POS code 0 - confirm this is intended.
  return [0 if element == 0 else pos[items[element][3]] for element in selected_ids]


# this function will take a dataframe, a text_vectorizer vocabulary, and the number of features to extract from
# stack and buffer, and apply our arc-eager oracle to generate an execution trace and process it so it can be
# used as the training input of a predictive model
def process_traces(dataframe, vocabulario, n_w):
  """Build one training sample per oracle step for every sentence in `dataframe`.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Sentences; positional column 1 holds the items (words and their features) and
      positional column 2 holds the arcs.
  vocabulario : list
      Vocabulary of the text vectorizer; the position of a word in it is its token.
  n_w : int
      Number of entries taken from the end of the stack and from the start of the buffer.

  Returns
  -------
  pandas.DataFrame
      Columns 'stack_tokens', 'buffer_tokens', 'stack_pos', 'buffer_pos', 'action' and 'dependency'.

  Raises
  ------
  KeyError
      If an action, part of speech or dependency relation is missing from the
      `actions`, `pos` or `deprel` dictionaries.
  """
  # position of each vocabulary entry, computed once so that tokenizing a word is a dictionary lookup
  # instead of a scan of the whole vocabulary (setdefault keeps the first position, like list.index)
  vocab_index = {}
  for position, token in enumerate(vocabulario):
    vocab_index.setdefault(token, position)

  data_list = [] # list to store the training inputs
  for i in range(len(dataframe)): # for each element in the dataframe
    # a single positional lookup; integer keys on the row Series (dataframe.iloc[i][1]) are
    # deprecated as positional access in recent pandas versions
    items = dataframe.iloc[i, 1] # we extract the items (words and their features)
    arcs = dataframe.iloc[i, 2] # we extract the arcs
    traza = oracle(items, arcs) # we use the oracle to generate an execution trace

    # we generate an array with the words in the original sentence, and then an equivalent one that contains their
    # respective "token" form, obtained from our vocabulary: the lowercase form is tried first, then the word as
    # written, and token 1 (Out Of Vocabulary) is used if neither exists.
    words = [item[1] for item in items]
    vectorizer_index = [vocab_index.get(word.lower(), vocab_index.get(word, 1)) for word in words[1:]]
    # we insert an "artificial" token at the beginning, that represents ROOT
    vectorizer_index.insert(0, len(vocabulario))

    for step in traza: # for each step of the trace
      stack, buffer, action_name, deps = step[0], step[1], step[2], step[3] # features of the state

      action = actions[action_name] # get the integer corresponding to the action
      if action < 3: # codes 1 and 2 are the two arc actions
        dependency = deprel[deps[-1][1]] # establish the dependency
      else: # if it is not an arc, then we ignore the dependency
        dependency = 0

      # n_w sentence ids from the end of the stack and from the start of the buffer, padded with 0,
      # and the part of speech of each of them
      pos_stack = _pos_codes(_select_window(stack, n_w, True), items)
      pos_buffer = _pos_codes(_select_window(buffer, n_w, False), items)

      # we create an equivalent stack and buffer using the tokenized form of the words instead of the baseline ids
      # (each sentence numbers its own words 0, 1, 2..., but the model needs the tokenized words)
      new_stack = [vectorizer_index[indice] for indice in stack]
      new_buffer = [vectorizer_index[indice] for indice in buffer]

      # the same windows as above, now over the tokenized forms
      stack_sel = _select_window(new_stack, n_w, True)
      buffer_sel = _select_window(new_buffer, n_w, False)

      # we append the features of the input sample to our data list
      data_list.append([stack_sel, buffer_sel, pos_stack, pos_buffer, action, dependency])
  # we build and return a dataframe with the data list
  return pd.DataFrame(data_list, columns=['stack_tokens', 'buffer_tokens', 'stack_pos', 'buffer_pos', 'action', 'dependency'])


In [11]:
# we train a text vectorizer with our train data (first column of the train dataframe),
# and get its vocabulary to tokenize our words
train_X_tensor = tf.convert_to_tensor(df_train.iloc[:, 0])
text_vectorizer = layers.TextVectorization(
    output_mode='int',
    standardize='lower',
    split='whitespace',
)
text_vectorizer.adapt(train_X_tensor)
vocabulario = text_vectorizer.get_vocabulary()

# we process the three dataframes, so that we can use them to train, validate and test the model
# the last argument of process_traces is the number of features that will be extracted from stack and buffer
train_dataframe, val_dataframe, test_dataframe = [
    process_traces(split_dataframe, vocabulario, 2)
    for split_dataframe in (df_train, df_val, df_test)
]

## Create, train and test the models

In [12]:
# We convert to tensor all the columns in our dataframes
def _dataframe_to_tensors(frame):
  """Return one tensor per column for the first six columns of `frame`, in positional order."""
  return [tf.convert_to_tensor(list(frame.iloc[:, column])) for column in range(6)]

# column order: stack tokens, buffer tokens, stack pos, buffer pos, action, dependency
(st_tensor, bt_tensor, sp_tensor,
 bp_tensor, ac_tensor, de_tensor) = _dataframe_to_tensors(train_dataframe)

(st_tensor_val, bt_tensor_val, sp_tensor_val,
 bp_tensor_val, ac_tensor_val, de_tensor_val) = _dataframe_to_tensors(val_dataframe)

(st_tensor_test, bt_tensor_test, sp_tensor_test,
 bp_tensor_test, ac_tensor_test, de_tensor_test) = _dataframe_to_tensors(test_dataframe)

In [13]:
# parameters for the network
# num_features must match the value passed to process_traces when the dataframes were generated,
# so if it is modified here it should be modified in those calls too
num_features = 2
# sizes of the label sets and of the vocabulary
num_pos = len(pos)
num_deprels = len(deprel)
num_actions = len(actions)
num_words = len(vocabulario)

In [14]:
# we define our model

# Inputs
# one input layer for each type of feature, each receiving num_features integer ids per sample.
# The shape is written as a one-element tuple: `(num_features)` without the comma is just an int,
# while Input documents its shape argument as a tuple.
stack_tokens_input = layers.Input(shape=(num_features,), name="stack_tokens_input")
buffer_tokens_input = layers.Input(shape=(num_features,), name="buffer_tokens_input")
stack_pos_input = layers.Input(shape=(num_features,), name="stack_pos_input")
buffer_pos_input = layers.Input(shape=(num_features,), name="buffer_pos_input")

# Embedding
# one embedding for each type of feature; id 0 is the padding value, hence mask_zero=True.
# Token ids go up to len(vocabulario) (the id given to ROOT), so the table needs num_words+1 rows;
# part-of-speech codes go from 1 to num_pos, plus 0 for padding, so num_pos+1 rows.
stack_tokens_embedding = layers.Embedding(num_words+1, 16, input_length=num_features, mask_zero=True, name="stack_tokens_embedding")(stack_tokens_input)
buffer_tokens_embedding = layers.Embedding(num_words+1, 16, input_length=num_features, mask_zero=True, name="buffer_tokens_embedding")(buffer_tokens_input)
stack_pos_embedding = layers.Embedding(num_pos+1, 16, input_length=num_features, mask_zero=True, name="stack_pos_embedding")(stack_pos_input)
buffer_pos_embedding = layers.Embedding(num_pos+1, 16, input_length=num_features, mask_zero=True, name="buffer_pos_embedding")(buffer_pos_input)

# concatenate the embeddings and flatten the result
concatenate_embeddings = layers.Concatenate(axis=1, name="concatenate_embeddings")([stack_tokens_embedding, buffer_tokens_embedding, stack_pos_embedding, buffer_pos_embedding])
flat_embeddings = layers.Flatten(name="flat_embeddings")(concatenate_embeddings)

# Dense-Outputs
# one dense layer for each desirable output. The labels are used directly as class indices:
# actions are numbered from 1 and dependencies use 0 for "no dependency", so each layer
# has one more unit than there are labels.
action_dense = layers.Dense(num_actions+1, activation='softmax', name="action_dense")(flat_embeddings)
dependency_dense = layers.Dense(num_deprels+1, activation='softmax', name="dependency_dense")(flat_embeddings)

# Model
model = keras.Model(inputs=[stack_tokens_input, buffer_tokens_input, stack_pos_input, buffer_pos_input],
                     outputs=[action_dense, dependency_dense],
                     name="dependency_parsing")

In [15]:
# compile the model: the same loss and metric are applied to both outputs (action and dependency),
# whose targets are integer class labels
model.compile(optimizer='adam', loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# and summarize its characteristics
model.summary()

Model: "dependency_parsing"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 stack_tokens_input (InputL  [(None, 2)]                  0         []                            
 ayer)                                                                                            
                                                                                                  
 buffer_tokens_input (Input  [(None, 2)]                  0         []                            
 Layer)                                                                                           
                                                                                                  
 stack_pos_input (InputLaye  [(None, 2)]                  0         []                            
 r)                                                                              

In [16]:
# train the model, with the validation set
# the inputs follow the order of the model inputs and the outputs the order of the model outputs
inputs = [st_tensor, bt_tensor, sp_tensor, bp_tensor]
outputs = [ac_tensor, de_tensor]
val_inputs = [st_tensor_val, bt_tensor_val, sp_tensor_val, bp_tensor_val]
val_outputs = [ac_tensor_val, de_tensor_val]

history = model.fit(
    x=inputs,
    y=outputs,
    epochs=10,
    validation_data=(val_inputs, val_outputs),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
# test the model on the test set

test_inputs = [
    st_tensor_test,
    bt_tensor_test,
    sp_tensor_test,
    bp_tensor_test,
]
test_outputs = [ac_tensor_test, de_tensor_test]

# the metric names label the values returned by evaluate, in the same order
print(model.metrics_names)
model.evaluate(x=test_inputs, y=test_outputs)

['loss', 'action_dense_loss', 'dependency_dense_loss', 'action_dense_accuracy', 'dependency_dense_accuracy']


[1.0487929582595825,
 0.5185903310775757,
 0.5302026867866516,
 0.8392366170883179,
 0.8554198741912842]

## Run predictions for the test set, repair the trees and generate CONLLU evaluation files

In [18]:
# We perform "vertical" prediction: in every round, each sentence that is not finished yet
# receives one predicted action, and all of them are predicted by the model in a single batch

lote_estados = df_test.iloc[:, 1] # take a batch of states, in this case the whole dataset can be processed
index = list(range(len(df_test))) # create numerical ids from 0 to the length of the dataset
estados = [initial_state(i) for i in lote_estados] # compute the initial states for the batch
estados = list(zip(estados, index)) # zip together the state with the id, which references the row in the dataset
items = list(lote_estados) # take the items (the words and their features) for the batch

# position of each vocabulary entry, computed once so that tokenizing a word is a dictionary lookup
# instead of a scan of the whole vocabulary (setdefault keeps the first position, like list.index)
vocab_index = {}
for position, token in enumerate(vocabulario):
  vocab_index.setdefault(token, position)
root_token = len(vocabulario) # the ROOT token, computed once instead of rebuilding the vocabulary per sentence

# as we did before, build (for each sentence of the batch) an array of words, and the corresponding array of
# tokenized words: lowercase form first, then the word as written, then 1 (Out Of Vocabulary).
# The first position of every sentence receives the ROOT token.
words = [[item[1] for item in sentence] for sentence in items]
vectorized_words = [
    [root_token] + [vocab_index.get(word.lower(), vocab_index.get(word, 1)) for word in sentence_words[1:]]
    for sentence_words in words
]

# array to store the final, predicted trees
predicted_trees=[]

# counter to control the infinite loop
count = 0
max_steps = 1000

# loop infinitely
while True:
  stack_list = [] # lists for the different input features of each state
  buffer_list = []
  stack_pos_list = []
  buffer_pos_list = []
  finish_indexes=[] # list to store indexes of sentences whose processing has finished

  for i in range(len(estados)): # for each state
    if final_state(estados[i][0]): # if we're done with this sentence, store its index and pass to the next one
      finish_indexes.append(i)
      continue
    # extract the features of this state and store them on the lists
    stack, buffer, stack_pos, buffer_pos = extract_features(items[i], estados[i][0], vectorized_words[i], num_features, pos)
    stack_list.append(stack)
    buffer_list.append(buffer)
    stack_pos_list.append(stack_pos)
    buffer_pos_list.append(buffer_pos)

  # we reverse the ordering of the indexes of finished sentences (to not mess up the others while deleting) and for each one
  # (the loop variable has its own name so that it does not overwrite the list of ids defined above)
  for finished in sorted(finish_indexes, reverse=True):
    # we store the id (row of original dataset) and the arcs
    predicted_trees.append((estados[finished][1], estados[finished][0][3]))
    # we delete this sentence from everywhere
    del estados[finished]
    del items[finished]
    del vectorized_words[finished]

  # if we end up having no states left, we break the infinite loop
  if not estados:
    break

  # we transform our feature lists to tensors and use them as inputs to get the current set of predictions
  net_inputs = [tf.convert_to_tensor(stack_list), tf.convert_to_tensor(buffer_list), tf.convert_to_tensor(stack_pos_list), tf.convert_to_tensor(buffer_pos_list)]
  action_pred, deprel_pred = model.predict(x=net_inputs, verbose=0)

  # for each state, and according to the predictions, we apply the action/dependency and update the state (keeping the same id)
  for i in range(len(estados)):
    action, state = check_action_and_state(deprel_pred[i], action_pred[i], estados[i][0], inv_deprel)
    estados[i] = (state, estados[i][1])

  # we count one more step, and once we overtake a thousand, we break out of the infinite loop
  count = count + 1
  if count > max_steps:
    # the sentences that are still pending would otherwise be dropped without notice, and would never
    # receive predicted arcs; we keep the arcs they have so far instead.
    # NOTE(review): assumes the later repair step copes with incomplete trees - confirm
    print(f"Warning: step limit reached with {len(estados)} unfinished sentence(s); keeping their partial arcs")
    predicted_trees.extend((row_id, pending_state[3]) for pending_state, row_id in estados)
    break


In [19]:
# we create a deep copy of our test dataframe, so that writing the predictions does not touch the original
df_test_predicted = pd.DataFrame(
    data=copy.deepcopy(df_test.values),
    columns=df_test.columns,
)

# each predicted tree is a (row of the dataset, arcs) pair: we repair its possible errors and then
# write the information of every arc into the items of that row in the new dataframe
for tree in predicted_trees:
  repair_tree(tree, df_test)
  row_id, tree_arcs = tree
  predicted_items = df_test_predicted.iloc[row_id, 1]
  for arc in tree_arcs:
    target_word = predicted_items[arc[2]] # arc[2] selects the word that receives the arc information
    target_word[6] = arc[0]
    target_word[7] = arc[1]

In [24]:
# this writes the prediction dataset to a file in the CONLLU format:
# one tab-separated line per word and a blank line after every sentence

predictions_path = "./datasets/generated/en_test_predictions_c4.conllu"
# make sure the output folder exists before creating the file
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)
# the file is opened in 'w' mode: in append mode, running this cell again would add a second
# copy of every sentence to the file
with open(predictions_path, 'w', encoding="utf-8") as f:
  for item in df_test_predicted['items']:
    for word in item:
      if word[0] == 0: # id 0 is skipped (presumably the artificial ROOT entry - confirm)
        continue
      f.write('\t'.join(str(el) for el in word)+'\n')
    f.write('\n')


In [23]:
# this writes our test dataframe (the one which is clean, with non-projective sentences removed) to a file in the
# CONLLU format: one tab-separated line per word and a blank line after every sentence

clean_test_path = "./datasets/generated/en_test_clean.conllu"
# make sure the output folder exists before creating the file
os.makedirs(os.path.dirname(clean_test_path), exist_ok=True)
# the file is opened in 'w' mode: in append mode, running this cell again would add a second
# copy of every sentence to the file
with open(clean_test_path, 'w', encoding="utf-8") as f:
  for item in df_test['items']:
    for word in item:
      if word[0] == 0: # id 0 is skipped (presumably the artificial ROOT entry - confirm)
        continue
      f.write('\t'.join(str(el) for el in word)+'\n')
    f.write('\n')
    f.write('\n')