# Graduate Assement 3
## Standard Neural Arc Dependency Parsing
### By: Kota Vinay Kumar

In [None]:
pip install conllu #installing conllu library

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting conllu
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.2


##### Importing Necessary Libraries

In [None]:
import conllu
from collections import Counter
import copy
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.functional import one_hot

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense

#### Downloading English EWT Files

In [None]:
!wget "https://www.csee.umbc.edu/courses/undergraduate/473/f22/materials/a1/data/UD_English-EWT/en_ewt-ud-train.conllu"

!wget "https://www.csee.umbc.edu/courses/undergraduate/473/f22/materials/a1/data/UD_English-EWT/en_ewt-ud-dev.conllu"


!wget "https://www.csee.umbc.edu/courses/undergraduate/473/f22/materials/a1/data/UD_English-EWT/en_ewt-ud-test.conllu"

#### Parsing train, dev, test into a list of TokenList

In [226]:
with open('en_ewt-ud-train.conllu','r') as s:
  training_data = [a for a in conllu.parse_incr(s)]

with open('en_ewt-ud-dev.conllu','r') as s:
  dev_data = [a for a in conllu.parse_incr(s)]

with open('en_ewt-ud-test.conllu','r') as s:
  test_data = [a for a in conllu.parse_incr(s)]

#### function to generate gold dep trees, for comparision metrics

In [None]:
def get_god_deps(data):
  gold_deps=[]
  for token in data:
    for t in token:
      for deps,id in t['deps']:
        if type(id) is not tuple:
          head = token[id-1]['form'] if id > 0 else 'ROOT'
          # print(head,t,deps)
          gold_deps.append((head,t['form'],deps))

In [None]:
train_gold_deps = get_god_deps(training_data) #gold dep trees for training data
dev_gold_deps = get_god_deps(dev_data) #gold dep trees for dev data

In [None]:
# gold_deps=[]
# for token in dev_data:
#   for t in token:
#     for deps,id in t['deps']:
#       if type(id) is not tuple:
#         head = token[id-1]['form'] if id > 0 else 'ROOT'
#         # print(head,t,deps)
#         gold_deps.append((head,t['form'],deps))

In [None]:
len(gold_deps)

26242

In [199]:
def get_info_deps(data):
    '''Function that takes the tokenlist and returns 
    a list of all identities, words, pos tags, head and
    the dependency relation between the word and its head'''
  
    iden = []
    word = []
    pos = []
    head = []
    deprel = []


    for i in range(len(data)):
        for j in range(len(data[i])):
            iden.append(str(data[i][j]['id']))
            word.append(data[i][j]['form'])
            pos.append(data[i][j]['upos'])
            head.append(data[i][j]['head'])
            deprel.append(data[i][j]['deprel'])    

    return iden, word, pos, head, deprel     

In [None]:
# def get_info_deps(data):
  
#     iden = []
#     word = []
#     pos = []
#     head = []
#     deprel = []


#     for i in range(len(data)):
#         for j in range(len(data[i])):
#             iden.append(str(data[i][j]['id']))
#             word.append(data[i][j]['form'])
#             pos.append(data[i][j]['upos'])
#             head.append(str(data[i][j]['head']))
#             deprel.append(data[i][j]['deprel'])    

#     return iden, word, pos, head, deprel     
            

In [200]:
iden_t, word_t, pos_t, head_t, deprel_t = get_info_deps(training_data) 
# iden_d, word_d, pos_d, head_d, deprel_d = get_info_deps(dev_data)

In [201]:
class struct(object): 
    '''
    This class is helpful for getting the required information for each token list, whenever 
    necessary'''
    
    def __init__(self, training_sentence):
        self.iden = []
        self.word = []
        self.pos = []
        self.head = []
        self.deprel = []
        self.training_sentence = training_sentence
        
    def seperation(self):
        for i in range(len(self.training_sentence)):
            self.iden.append(self.training_sentence[i]['id'])
            self.word.append(self.training_sentence[i]['form'])
            self.pos.append(self.training_sentence[i]['upos'])
            self.head.append(self.training_sentence[i]['head'])
            self.deprel.append(self.training_sentence[i]['deprel'])
        
    

In [202]:
with open('Modified_conllu.txt','w') as m: #Storing the clean conllu data if there is any further use
    for a,b,c,d,e in zip(iden_t,word_t,pos_t,head_t,deprel_t): 
        m.write(f'{a} {b} {c} {d} {e}\n')

In [203]:
unique_words = set(word.lower() for word in word_t) #removing duplicates of words
unique_pos = set(pos_t) #removing duplicate pos tags

unique_words = ['UNK','ROOT','NULL']+list(unique_words) #appending unk, root, null for flexibility
pos_tags = ['UNK','ROOT','NULL']+list(unique_pos) #appending unk, root, null to pos tag for flexibility of usage
        

In [204]:
def create_word_Embeds(unique_words, pos_tags, deprel):
    '''
    A multipuporse function that helps us to create 
    embedding for the words, pos, deprels, and output tags 
    which can be used in the embedding part of the model development
    '''

      word_dict = {}
      for i,j in enumerate(unique_words):
          word_dict[j] = i

      pos_dict = {}
      for i,j in enumerate(pos_tags):
          pos_dict[j] = i

      unique_dependency = set(deprel)
      dependencies = {}
      for i,j in enumerate(unique_dependency):
          dependencies[j] = i

      labels = []
      labels.append(("shift",None))
      for i in unique_dependency:
          labels.append(("L",i))
          labels.append(("R",i))

      output_representations = {}
      for i, label in enumerate(labels):
          output_representations[label]= i

      indexed_output = dict((index,label) for (label,index) in output_representations.items())


      return word_dict, pos_dict, dependencies, output_representations, indexed_output

In [205]:
training_word_dict, training_pos_dict, training_dependencies, training_output_reps, indexed_output =  create_word_Embeds(unique_words,pos_tags,deprel_t)

In [206]:
indexed_output

{0: ('shift', None),
 1: ('L', 'dep'),
 2: ('R', 'dep'),
 3: ('L', 'csubj'),
 4: ('R', 'csubj'),
 5: ('L', 'csubj:pass'),
 6: ('R', 'csubj:pass'),
 7: ('L', 'ccomp'),
 8: ('R', 'ccomp'),
 9: ('L', 'iobj'),
 10: ('R', 'iobj'),
 11: ('L', 'conj'),
 12: ('R', 'conj'),
 13: ('L', 'mark'),
 14: ('R', 'mark'),
 15: ('L', 'orphan'),
 16: ('R', 'orphan'),
 17: ('L', 'parataxis'),
 18: ('R', 'parataxis'),
 19: ('L', 'obl:npmod'),
 20: ('R', 'obl:npmod'),
 21: ('L', 'cop'),
 22: ('R', 'cop'),
 23: ('L', 'det:predet'),
 24: ('R', 'det:predet'),
 25: ('L', 'cc:preconj'),
 26: ('R', 'cc:preconj'),
 27: ('L', 'acl'),
 28: ('R', 'acl'),
 29: ('L', 'nummod'),
 30: ('R', 'nummod'),
 31: ('L', 'nmod:poss'),
 32: ('R', 'nmod:poss'),
 33: ('L', 'aux:pass'),
 34: ('R', 'aux:pass'),
 35: ('L', '_'),
 36: ('R', '_'),
 37: ('L', 'punct'),
 38: ('R', 'punct'),
 39: ('L', 'acl:relcl'),
 40: ('R', 'acl:relcl'),
 41: ('L', 'list'),
 42: ('R', 'list'),
 43: ('L', 'cc'),
 44: ('R', 'cc'),
 45: ('L', 'discourse'),
 

In [207]:
class State: #the main useful class which gives us the actual dependencies, both while training and evaluating
    
    def __init__(self, sentence=[]):
        self.stack = []
        self.buffer = sentence[::-1]
        self.dep = set()
        
    def shift(self): self.stack.append(self.buffer.pop()) #pop a word from buffer and append to stack
        
    def left_arc(self,label): self.dep.add((self.buffer[-1],self.stack.pop(),label))  #pop a word from stack and create dep relation
    
    def right_arc(self, label):
        top = self.stack.pop() #pop a word from the buffer
        buffer_last = self.buffer.pop() 
        self.dep.add((top,buffer_last,label)) #add a dep relation
        self.buffer.append(top) #append it to the buffer again

In [208]:
def select_2_features(state,word_dict, pos_dict):
    '''
    A function that is useful in embedding and feature selection.
    It takes the a single state and uses the first two words of stack
    and the next two words of buffer and create a 4 feature matrix
    '''
    
    stack_length = len(state.stack)
    buffer_length = len(state.buffer)
    feat = []


    i = -1
    while stack_length > 0:
        if i <= -3: break
        word = state.stack[i]
        # print(word)
        
        if word in word_dict.keys(): feat.append(word_dict[word])
        # elif word in pos_dict.keys(): feat.append(pos_dict[word])
        elif word.lower() not in word_dict.keys(): feat.append(word_dict['UNK'])
        # elif word.lower() is None: feat.append(word_dict['ROOT'])
        # elif word.lower() not in pos_dict.keys(): feat.append(pos_dict['UNK'])
        else: feat.append(word_dict[word.lower()])
            
        stack_length =stack_length-1
        i = i-1

    while i>=-2:
        feat.append(word_dict['NULL'])
        i = i-1
        
    i= -1
    while buffer_length> 0:
        
        if i <= -3: break
            
        word = state.buffer[i]

        if word in word_dict.keys(): feat.append(word_dict[word])
        # elif word in pos_dict.keys(): feat.append(pos_dict[word])        
        elif word.lower() not in word_dict.keys(): feat.append(word_dict['UNK'])
        # elif word.lower() is None: feat.append(word_dict['ROOT'])
        # elif word.lower() not in pos_dict.keys(): feat.append(pos_dict['UNK'])
        else: feat.append(word_dict[word.lower()])
            
        buffer_length = buffer_length-1
        i = i-1

    while i>=-2:
        feat.append(word_dict['NULL'])
        i = i-1
    
    return np.array(feat)
        

In [209]:
def process_token_list(data):
    '''
    A function that is solely responsible for generating state for each operation
    and generting output sequences'''

      x=[]
      for i in data:
         x.append(i)
    

      seq = []
      for i in x:
          s = struct(i)
          s.seperation()
          
          sequence = []
          state = State(s.word)

          # print(state.buffer)
          
          childrens = Counter(s.head)
              
          words = [None] + s.word
          pos =[None]+ s.pos
          state.stack.append('NULL')

          while state.buffer:
              
              if not state.stack:
                  sequence.append((copy.deepcopy(state),("shift",None)))
                  state.shift()
                  continue
              
              
              last_word_stack = state.stack[-1]
              last_word_buffer = state.buffer[-1]
              
              if last_word_stack=='NULL': stack_index = 0
              else:
                stack_index = s.word.index(last_word_stack)
              buffer_index = s.word.index(last_word_buffer)
              
              if s.head[stack_index] == s.iden[buffer_index]:
                  childrens[s.iden[buffer_index]] -=1
                  sequence.append((copy.deepcopy(state),("L",s.deprel[stack_index])))
                  state.left_arc(s.deprel[stack_index])
              elif s.head[buffer_index] == s.iden[stack_index] and childrens[s.iden[buffer_index]] == 0:
                  childrens[s.iden[stack_index]] -=1
                  sequence.append((copy.deepcopy(state),("R",s.deprel[buffer_index])))
                  state.right_arc(s.deprel[buffer_index])
                  
              else:
                  sequence.append((copy.deepcopy(state),("shift",None)))
                  state.shift()
              

          seq.append(sequence)

      return seq

In [210]:
training_sequence = process_token_list(training_data)

In [None]:
training_sequence[:1]

In [None]:
# training_seq = []
# for i in training_sequence:
#   for j in i:
#     training_seq.append(j)

In [212]:
s=[]  #this cell gives us the output lables and the features in an encoded format
labels = []
for i in training_sequence:
  # print(i)
  for j,k in i:
    instu = select_2_features(j,training_word_dict, training_pos_dict)
    labels.append(training_output_reps[k])
    s.append(instu)

In [213]:
train_inputs = pd.DataFrame(s, columns=['stack_1','stack_2','buffer_1','buffer_2']) #dataframe generation of inputs and outputs
train_outputs = pd.DataFrame(labels, columns=['Outputs'])

In [214]:
print(f"size of train inpus is {train_inputs.shape} , and the output shape is {train_outputs.shape}")

size of train inpus is (351135, 4) , and the output shape is (351135, 1)


In [215]:
encoded_labels = keras.utils.to_categorical(torch.tensor(np.array(train_outputs)), num_classes=101)

In [221]:
def Depmodel(words, outputs): 
    '''
    Keras model on which we are implementing the embeddings'''
  
  model = Sequential()
  model.add(Embedding(words,32, input_length=4)) #embedding layer
  model.add(Flatten())
  model.add(Dense(units=100, activation='relu'))
  model.add(Dense(units=10, activation='relu'))
  model.add(Dense(units=101, activation='softmax'))
  model.compile(keras.optimizers.Adam(learning_rate=0.01),loss='categorical_crossentropy')

  return model

In [222]:
model = Depmodel(len(training_word_dict),len(training_output_reps))

In [218]:
np.array(train_inputs)

array([[    2,     2, 16511, 11036],
       [16511,     2, 11036, 14077],
       [    2,     2, 16511, 14077],
       ...,
       [    2,     2,  1756, 10219],
       [ 1756,     2, 10219,     2],
       [    2,     2,  1756,     2]])

In [219]:
train_outputs

Unnamed: 0,Outputs
0,0
1,38
2,0
3,84
4,0
...,...
351130,0
351131,38
351132,0
351133,38


In [223]:
model.fit(np.array(train_inputs), encoded_labels, epochs=10, batch_size=1000)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5ee000c2b0>

In [None]:
np.argmax(model.predict(np.array([16511,     2, 11036, 14077]).reshape(1,-1)))



38

In [None]:
model.save('/content/drive/MyDrive/Assessment/Kerasmodel')

In [None]:
# dev_insts = process_token_list(dev_data)

In [None]:
# unique_words = set(word.lower() for word in word_t)
# unique_pos = set(pos_t)

# unique_words = ['UNK','ROOT','NULL']+list(unique_words)
# pos_tags = ['UNK','ROOT','NULL']+list(unique_pos)

In [None]:
# dev_word_dict, dev_pos_dict, dev_dependencies, dev_output_reps, indexed_output =  create_word_Embeds(unique_words,pos_tags,deprel_d)

In [None]:
# s=[]
# labels = []
# for i in dev_insts:
#   # print(i)
#   for j,k in i:
#     instu = select_2_features(j,training_word_dict, training_pos_dict)
#     labels.append(training_output_reps[k])
#     s.append(instu)

In [None]:
# encoded_dev_labels = keras.utils.to_categorical(labels)

In [None]:
len(test_data)

In [None]:
y=[]
for i in dev_data:
    y.append(i)

Now as we already have the model prepared, we now take each instance and predict the ouput and create a dependency relation set(which looks similar to gold dependency tree structure)

In [None]:
sequence_deps = []
j=1
for i in y:

    print(f'*******************currently processing {j} tokenlist sentence****************************************')
    s = struct(i)
    s.seperation()
    
    sequence = []
    state = State(s.word)
    # print(state.buffer)

    # print(state.buffer)   

    j=j+1

    state.stack.append('null')
    while len(state.buffer) >1:

          feat = select_2_features(state, training_word_dict, training_pos_dict)
          model_predictions = model.predict(feat.reshape(1,-1))[0]

    #       # action_list = list(indexed_output.values())
    #       # actions = [x for _, x in sorted(zip(model_predictions, action_list), reverse=True)]

          output = indexed_output[np.argmax(model_predictions)]
          buffer_len = len(state.buffer)
          stack_len = len(state.stack)
          

          action = output[0]
          label = output[1]
          # print(action,label)

          if action == 'shift':
            if (stack_len ==0 and buffer_len == 1) or buffer_len >1:
              state.shift()  
            else: state.left_arc(label)       
          elif action =='L':
            if stack_len > 0 and state.stack[-1]!='null':
              state.left_arc(label)
            else: state.shift()

          elif action =='R':
            if stack_len > 0:
              state.right_arc(label)
            else: state.left_arc(label)

          # else:
          #   state.shift()
    #       # print(state.dep)
        # print(state.dep)

    sequence_deps.append(state.dep)
  
    

In [None]:
total_predicted_deps = []
for i in sequence_deps:
  for j in i:
    total_predicted_deps.append(j)

In [None]:
len(total_predicted_deps), len(gold_deps)

(17328, 26242)

In [None]:
len(set(total_predicted_deps)), len(set(gold_deps))

(14450, 21502)

In [None]:
LAS = [i for i in total_predicted_deps if i in gold_deps]

In [None]:
count = 0
for i in total_predicted_deps:
  if i in gold_deps:
    count= count+1

In [None]:
Labelled_Attachment_Score_forPredicted = len(LAS)/len(total_predicted_deps)

In [None]:
Labelled_Attachment_Score_forGold = len(LAS)/len(gold_deps)

In [None]:
Labelled_Attachment_Score_forPredicted, Labelled_Attachment_Score_forGold

(0.5354916897506925, 0.35359347610700403)

#### The LAS score for the English EWT for total number of predicted sets is 0.53 or 53 percent
#### The LAS score for the English EWT for total number of gold dependency relations is 0.35 or 35 percent

In [None]:
unlabeled=[]

unlabeled_gold=[]
for i in total_predicted_deps:
  unlabeled.append((i[0],i[1]))

for i in gold_deps:
  unlabeled_gold.append((i[0],i[1]))

In [None]:
UAS = [i for i in unlabeled if i in unlabeled_gold]

In [None]:
len(UAS)

11293

In [None]:
len(UAS)/len(unlabeled_gold)

0.4303406752534106

#### The UAS for the predicted deps is 0.43 or 43 percent

#### Evaluating the model on test set.......(Unsuccessful)

In [227]:
z=[]
for i in test_data:
    z.append(i)

In [None]:
sequence_deps = []
j=1
for i in z:

    print(f'*******************currently processing {j} tokenlist sentence****************************************')
    s = struct(i)
    s.seperation()
    
    sequence = []
    state = State(s.word)
    # print(state.buffer)

    # print(state.buffer)   

    j=j+1

    state.stack.append('null')
    while len(state.buffer) >1:

          feat = select_2_features(state, training_word_dict, training_pos_dict)
          model_predictions = model.predict(feat.reshape(1,-1))[0]

    #       # action_list = list(indexed_output.values())
    #       # actions = [x for _, x in sorted(zip(model_predictions, action_list), reverse=True)]

          output = indexed_output[np.argmax(model_predictions)]
          buffer_len = len(state.buffer)
          stack_len = len(state.stack)
          

          action = output[0]
          label = output[1]
          # print(action,label)

          if action == 'shift':
            if (stack_len ==0 and buffer_len == 1) or buffer_len >1:
              state.shift()  
            else: state.left_arc(label)       
          elif action =='L':
            if stack_len > 0 and state.stack[-1]!='null':
              state.left_arc(label)
            else: state.shift()

          elif action =='R':
            if stack_len > 0:
              state.right_arc(label)
            else: state.left_arc(label)

          # else:
          #   state.shift()
    #       # print(state.dep)
        # print(state.dep)

    sequence_deps.append(state.dep)
  
    

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
*******************currently processing 1291 tokenlist sentence****************************************
*******************currently processing 1292 tokenlist sentence****************************************
*******************currently processing 1293 tokenlist sentence****************************************
*******************currently processing 1294 tokenlist sentence****************************************
*******************currently processing 1295 tokenlist sentence****************************************
*******************currently processing 1296 tokenlist sentence****************************************
*******************currently processing 1297 tokenlist sentence****************************************
*******************currently processing 1298 tokenlist sentence****************************************
*******************currently processing 1299 tokenlist sentence****************************************

==============================================================================

### NOW..... running the same model with Another Language from the Universal Dependencies (Estonian EWT)

In [161]:
with open('/content/et_ewt-ud-train.conllu','r') as s:
  training_data_estonian = [a for a in conllu.parse_incr(s)]

In [170]:
iden_t_span, word_t_span, pos_t_span, head_t_span, deprel_t_span = get_info_deps(training_data_estonian)

In [172]:
len(set(deprel_t_span))

39

In [163]:
unique_words_span = set(word.lower() for word in word_t_span)
unique_pos_span = set(pos_t_span)

unique_words_span = ['UNK','ROOT','NULL']+list(unique_words)
pos_tags_span = ['UNK','ROOT','NULL']+list(unique_pos)
        

In [173]:
training_word_dict_span, training_pos_dict_span, training_dependencies_span, training_output_reps_span, indexed_output_span =  create_word_Embeds(unique_words_span,pos_tags_span,deprel_t_span)

In [165]:
training_sequence_span = process_token_list(training_data_spanish)

In [166]:
s_span=[]
labels_span = []
for i in training_sequence_span:
  # print(i)
  for j,k in i:
    instu = select_2_features(j,training_word_dict_span, training_pos_dict_span)
    labels_span.append(training_output_reps_span[k])
    s_span.append(instu)

In [167]:
train_inputs_span = pd.DataFrame(s_span, columns=['stack_1','stack_2','buffer_1','buffer_2'])
train_outputs_span = pd.DataFrame(labels_span, columns=['Outputs'])

In [168]:
print(f"size of train inpus is {train_inputs_span.shape} , and the output shape is {train_outputs_span.shape}")

size of train inpus is (123861, 4) , and the output shape is (123861, 1)


In [180]:
indexed_output_span

{0: ('shift', None),
 1: ('L', 'dep'),
 2: ('R', 'dep'),
 3: ('L', 'csubj'),
 4: ('R', 'csubj'),
 5: ('L', 'ccomp'),
 6: ('R', 'ccomp'),
 7: ('L', 'nsubj:cop'),
 8: ('R', 'nsubj:cop'),
 9: ('L', 'conj'),
 10: ('R', 'conj'),
 11: ('L', 'mark'),
 12: ('R', 'mark'),
 13: ('L', 'orphan'),
 14: ('R', 'orphan'),
 15: ('L', 'parataxis'),
 16: ('R', 'parataxis'),
 17: ('L', 'csubj:cop'),
 18: ('R', 'csubj:cop'),
 19: ('L', 'cop'),
 20: ('R', 'cop'),
 21: ('L', 'cc:preconj'),
 22: ('R', 'cc:preconj'),
 23: ('L', 'acl'),
 24: ('R', 'acl'),
 25: ('L', 'nummod'),
 26: ('R', 'nummod'),
 27: ('L', '_'),
 28: ('R', '_'),
 29: ('L', 'punct'),
 30: ('R', 'punct'),
 31: ('L', 'acl:relcl'),
 32: ('R', 'acl:relcl'),
 33: ('L', 'cc'),
 34: ('R', 'cc'),
 35: ('L', 'discourse'),
 36: ('R', 'discourse'),
 37: ('L', 'advcl'),
 38: ('R', 'advcl'),
 39: ('L', 'obj'),
 40: ('R', 'obj'),
 41: ('L', 'compound'),
 42: ('R', 'compound'),
 43: ('L', 'advmod'),
 44: ('R', 'advmod'),
 45: ('L', 'goeswith'),
 46: ('R', '

In [174]:
training_output_reps_span

{('shift', None): 0,
 ('L', 'dep'): 1,
 ('R', 'dep'): 2,
 ('L', 'csubj'): 3,
 ('R', 'csubj'): 4,
 ('L', 'ccomp'): 5,
 ('R', 'ccomp'): 6,
 ('L', 'nsubj:cop'): 7,
 ('R', 'nsubj:cop'): 8,
 ('L', 'conj'): 9,
 ('R', 'conj'): 10,
 ('L', 'mark'): 11,
 ('R', 'mark'): 12,
 ('L', 'orphan'): 13,
 ('R', 'orphan'): 14,
 ('L', 'parataxis'): 15,
 ('R', 'parataxis'): 16,
 ('L', 'csubj:cop'): 17,
 ('R', 'csubj:cop'): 18,
 ('L', 'cop'): 19,
 ('R', 'cop'): 20,
 ('L', 'cc:preconj'): 21,
 ('R', 'cc:preconj'): 22,
 ('L', 'acl'): 23,
 ('R', 'acl'): 24,
 ('L', 'nummod'): 25,
 ('R', 'nummod'): 26,
 ('L', '_'): 27,
 ('R', '_'): 28,
 ('L', 'punct'): 29,
 ('R', 'punct'): 30,
 ('L', 'acl:relcl'): 31,
 ('R', 'acl:relcl'): 32,
 ('L', 'cc'): 33,
 ('R', 'cc'): 34,
 ('L', 'discourse'): 35,
 ('R', 'discourse'): 36,
 ('L', 'advcl'): 37,
 ('R', 'advcl'): 38,
 ('L', 'obj'): 39,
 ('R', 'obj'): 40,
 ('L', 'compound'): 41,
 ('R', 'compound'): 42,
 ('L', 'advmod'): 43,
 ('R', 'advmod'): 44,
 ('L', 'goeswith'): 45,
 ('R', 'goes

In [175]:
def Depmodel(words, outputs):
  
  model = Sequential()
  model.add(Embedding(words,64, input_length=4))
  model.add(Flatten())
  model.add(Dense(units=128, activation='relu'))
  model.add(Dense(units=10, activation='relu'))
  model.add(Dense(units=79, activation='softmax'))
  model.compile(keras.optimizers.Adam(learning_rate=0.001),loss='categorical_crossentropy')

  return model

In [176]:
encoded_labels_span = keras.utils.to_categorical(torch.tensor(np.array(train_outputs_span)), num_classes=79)

model = Depmodel(len(training_word_dict_span),len(training_output_reps_span))

In [177]:
model.fit(np.array(train_inputs_span), encoded_labels_span, epochs=10, batch_size=1000)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f5e7ae9a040>

In [178]:
with open('/content/et_ewt-ud-dev.conllu','r') as s:
  dev_data_span = [a for a in conllu.parse_incr(s)]

In [181]:
y=[]
for i in dev_data_span:
    y.append(i)

sequence_deps = []
j=1
for i in y:

    print(f'*******************currently processing {j} tokenlist sentence****************************************')
    s = struct(i)
    s.seperation()
    
    sequence = []
    state = State(s.word)
    # print(state.buffer)

    # print(state.buffer)   

    j=j+1

    state.stack.append('null')
    while len(state.buffer) >1:

          feat = select_2_features(state, training_word_dict_span, training_pos_dict_span)
          model_predictions = model.predict(feat.reshape(1,-1))[0]

    #       # action_list = list(indexed_output.values())
    #       # actions = [x for _, x in sorted(zip(model_predictions, action_list), reverse=True)]

          output = indexed_output_span[np.argmax(model_predictions)]
          buffer_len = len(state.buffer)
          stack_len = len(state.stack)
          

          action = output[0]
          label = output[1]
          # print(action,label)

          if action == 'shift':
            if (stack_len ==0 and buffer_len == 1) or buffer_len >1:
              state.shift()  
            else: state.left_arc(label)       
          elif action =='L':
            if stack_len > 0 and state.stack[-1]!='null':
              state.left_arc(label)
            else: state.shift()

          elif action =='R':
            if stack_len > 0:
              state.right_arc(label)
            else: state.left_arc(label)

          # else:
          #   state.shift()
    #       # print(state.dep)
        # print(state.dep)

    sequence_deps.append(state.dep)
  
    



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
*******************currently processing 562 tokenlist sentence****************************************
*******************currently processing 563 tokenlist sentence****************************************
*******************currently processing 564 tokenlist sentence****************************************
*******************currently processing 565 tokenlist sentence****************************************
*******************currently processing 566 tokenlist sentence****************************************
*******************currently processing 567 tokenlist sentence****************************************
*******************currently processing 568 tokenlist sentence****************************************
*******************currently processing 569 tokenlist sentence****************************************
*******************currently processing 570 tokenlist sentence****************************************
********

In [187]:
gold_deps_span=[]
for token in dev_data_span:
  for t in token:
    for deps,id in t['deps']:
      if type(id) is not tuple:
        head = token[id-1]['form'] if id > 0 else 'ROOT'
        # print(head,t,deps)
        gold_deps_span.append((head,t['form'],deps))

In [183]:
len(sequence_deps), len(dev_data_span)

(833, 833)

In [184]:
total_predicted_deps = []
for i in sequence_deps:
  for j in i:
    total_predicted_deps.append(j)

In [188]:
len(set(total_predicted_deps)), len(set(gold_deps_span))

(4688, 9036)

In [190]:
len(set(total_predicted_deps).intersection(set(gold_deps_span)))

570

In [191]:
LAS = [i for i in total_predicted_deps if i in gold_deps_span]

In [193]:
Labelled_Attachment_Score_forPredicted = len(LAS)/len(total_predicted_deps)
Labelled_Attachment_Score_forPredicted

0.12935424730087594

In [195]:
Labelled_Attachment_Score_forGold = len(LAS)/len(gold_deps_span)
Labelled_Attachment_Score_forGold

0.06325961346881849

### LAS for the predictions among the predicted sets is 0.12 or 12 percent
### LAS for the predictions over the total gold deps is 0.06 or 6 percent


In [196]:
unlabeled=[]

unlabeled_gold=[]
for i in total_predicted_deps:
  unlabeled.append((i[0],i[1]))

for i in gold_deps_span:
  unlabeled_gold.append((i[0],i[1]))

In [197]:

UAS = [i for i in unlabeled if i in unlabeled_gold]

len(UAS)/len(unlabeled_gold)

0.14355449292687786

#### UASfor the predictions over the gold dependencies is 0.14 or 14 percent

*Not able to run through test set