In [None]:
import os
import copy
from sklearn import svm
import numpy as np

In [None]:
# Mount Google Drive at /content/drive so the files referenced by later cells
# are reachable. `google.colab` only exists inside a Colab runtime.
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the Drive folder the working directory so later cells can open the
# .conllu files by bare filename; %ls lists the folder as a sanity check.
# NOTE(review): hard-coded, user-specific path -- adjust before running elsewhere.
work_dir = '/content/drive/My Drive/Colab Notebooks/NLP/UD/'
os.chdir(work_dir)
%ls

ca_ancora-ud-dev.conllu    CatalanParser.ipynb      test.conllu
ca_ancora-ud-test.conllu   CatalanParser_old.ipynb  [0m[01;34mudpipe[0m/
ca_ancora-ud-train.conllu  ParserTraining.ipynb


In [None]:
class Token:
    """One tab-separated CoNLL-U token line, read through its column names.

    ``Token()`` with no argument builds the artificial ROOT token
    (index ``'0'``, form ``'ROOT'``, every other column ``'_'``).
    All values are kept as the strings found in the line.
    """

    # Column names, in the order the fields appear in a CoNLL-U line.
    labels = ('index','token','lemma','upos','xpos','morph','head','deprel','enh','other')

    def __init__(self, line=None):
        """Split ``line`` on tabs and pair the fields with ``labels``.

        Args:
            line: a tab-separated token line without trailing newline, or
                ``None`` for the ROOT token.
        """
        if line is None:
            line = "0\tROOT" + "\t_" * 8
        # zip stops at the shorter sequence: extra fields are ignored, and a
        # short line simply leaves the trailing columns undefined.
        self._data = dict(zip(Token.labels, line.split('\t')))

    def __getitem__(self, arg):
        """Return the value of column ``arg``.

        Raises:
            ValueError: ``arg`` is not one of ``Token.labels``.
            KeyError: ``arg`` is a valid column but the line was too short
                to contain it.
        """
        if arg not in Token.labels:
            # str() keeps the message identical for string keys while making
            # non-string keys raise ValueError instead of a TypeError from
            # the concatenation.
            raise ValueError('unknown token key: ' + str(arg))
        return self._data[arg]

    def __str__(self):
        """Render as ``(index,form)``."""
        return '(' + self['index'] + ',' + self['token'] + ')'

    __repr__ = __str__

In [None]:

class Dependency:
    """A directed arc between two tokens: ``head`` governs ``dep``."""

    def __init__(self, head, dep):
        self._head, self._dep = head, dep

    def head_index(self):
        """Return the ``'index'`` entry of the head token."""
        governor = self._head
        return governor['index']

    def __str__(self):
        """Render the arc as ``<head>→<dep>`` using each end's own ``str``."""
        return '→'.join(str(end) for end in (self._head, self._dep))

    __repr__ = __str__

In [None]:
training_features = dict()        #dictionary to store training feature values.
class Configuration:
    """State of an arc-standard transition parser.

    Holds a stack (initialised with an artificial ROOT token), the buffer of
    tokens still to be read, and the arcs built so far.

    ``heads[i]`` receives the index string of the head chosen for the token
    whose CoNLL-U index is ``i``; slot 0 belongs to ROOT and stays ``None``.
    """

    def __init__(self, tokens):
        """Start a parse of ``tokens`` (a sequence of Token-like objects)."""
        # Private copy: shift() pops from the buffer and must not empty the
        # list owned by the caller.
        tokens = list(tokens)

        # How many dependents each head index has in the gold annotation.
        # The oracle uses it to delay RightArc until a token is complete.
        self._depcounts = dict()
        for t in tokens:
            if t['head'] != '_':
                self._depcounts[t['head']] = self._depcounts.get(t['head'],0) + 1

        self._tokens = tokens
        self._stack = [Token()]
        self._deps = []

        # One slot per token plus one for ROOT.
        self.heads = [None] * (len(tokens)+1)

    def leftarc(self):
        """Attach the second stack item as a dependent of the top item."""
        dep = self._stack.pop(-2)
        self._deps.append(Dependency(self._stack[-1],dep))
        self.heads[int(dep['index'])] = self._stack[-1]['index']

    def shift(self):
        """Move the first buffer token onto the stack.

        Raises:
            IndexError: the buffer is empty.
        """
        if len(self._tokens)==0:
            raise IndexError('Trying to shift from configuration:'+str(self))
        self._stack.append(self._tokens.pop(0))

    def rightarc(self):
        """Attach the top stack item as a dependent of the item below it."""
        dep = self._stack.pop(-1)
        self._deps.append(Dependency(self._stack[-1],dep))
        self.heads[int(dep['index'])] = self._stack[-1]['index']

    def _dependents_found(self, headindex):
        """Count the arcs built so far whose head has index ``headindex``."""
        return sum(1 for d in self._deps if d.head_index()==headindex)

    def done_p(self):
        """True when the buffer is empty and only ROOT is left on the stack."""
        return len(self._tokens)==0 and len(self._stack)==1

    #feature extraction code. Features have been taken from Jurafsky martin
    def extract_features(self):
        """Describe the current state as a dict of 12 indicator features.

        Features are the form, tag and form+tag of the two topmost stack
        items (s1, s2) and the first two buffer tokens (b0, b1); absent
        positions use the placeholder "NULL". Insertion order is fixed
        because callers turn the dict into a positional id vector.
        """
        def form_and_tag(items, pos, needed):
            # items[pos] only exists once `items` holds `needed` elements.
            if len(items) >= needed:
                return items[pos]['token'], items[pos]['upos']
            return "NULL", "NULL"

        s1w, s1t = form_and_tag(self._stack, -1, 1)
        s2w, s2t = form_and_tag(self._stack, -2, 2)
        b0w, b0t = form_and_tag(self._tokens, 0, 1)
        b1w, b1t = form_and_tag(self._tokens, 1, 2)

        return {
            "s2w=" + s2w: 1,
            "s1w=" + s1w: 1,
            "s2wt=" + s2w + s2t: 1,
            "s1wt=" + s1w + s1t: 1,
            "b0w=" + b0w: 1,
            "b1w=" + b1w: 1,
            "s2t=" + s2t: 1,
            "s1t=" + s1t: 1,
            "b0wt=" + b0w + b0t: 1,
            "b0t=" + b0t: 1,
            "b1wt=" + b1w + b1t: 1,
            "b1t=" + b1t: 1,
        }

    #get the actual head values
    def get_gold_conf(self,sentence):
        """Return the gold ``'head'`` value of every token in ``sentence``."""
        return [tok['head'] for tok in sentence]

    def _oracle_action(self):
        """Name the transition the gold heads call for, without applying it.

        Returns 'LeftArc', 'RightArc', 'Shift', or 'Fail' when no transition
        is consistent with the gold heads.
        """
        stack = self._stack
        # The len > 2 guard keeps ROOT (bottom of the stack) from ever
        # becoming a dependent.
        if len(stack)>2 and stack[-2]['head'] == stack[-1]['index']:
            return 'LeftArc'
        # A token may only be attached to the right of its head once it has
        # collected all of its own dependents; afterwards it leaves the stack.
        if (len(stack)>1 and stack[-1]['head'] == stack[-2]['index']
                and self._dependents_found(stack[-1]['index']) == self._depcounts.get(stack[-1]['index'],0)):
            return 'RightArc'
        if len(self._tokens)>0:
            return 'Shift'
        return 'Fail'

    def _apply(self, action):
        """Apply the transition called ``action``; 'Fail' is a no-op."""
        if action == 'LeftArc':
            self.leftarc()
        elif action == 'RightArc':
            self.rightarc()
        elif action == 'Shift':
            self.shift()

    #check if tree is projective.
    def check_projective(self):
        """Return True iff the oracle can parse the whole sentence.

        The method drives this configuration to completion, so it consumes
        the state: call it on a copy when the configuration is still needed.
        Arcs are recorded while simulating because the RightArc condition
        depends on the dependents found so far. A sentence without gold
        heads (all ``'_'``) cannot be derived and yields False.
        """
        while not self.done_p():
            action = self._oracle_action()
            if action == 'Fail':
                return False
            self._apply(action)
        return True

    def training_oracle(self):
        """Apply the next gold transition and return ``(label, features)``.

        ``features`` describes the state *before* the transition is applied,
        i.e. the state a classifier sees when it has to predict ``label``.
        When no transition is possible the state is left untouched and
        ``('Fail', [])`` is returned.
        """
        answer = self._oracle_action()
        if answer == 'Fail':
            # Fail; usually because of non-projectivity, or sometimes bug in treebank!
            return answer, []

        features = self.extract_features()
        self._apply(answer)
        return answer, features

    def __str__(self):
        return 'stack='+str(self._stack)+',tokens='+str(self._tokens)+',deps='+str(self._deps)
        

In [None]:
from collections import defaultdict
label_idx = {'Shift':0,'LeftArc':1,'RightArc':2}
idx_label = {0:'Shift',1:'LeftArc',2:'RightArc'}
class ArcParser():
    """Collects classifier samples from oracle parses of gold sentences.

    Attributes:
        feature_dict: feature string -> integer id; id 0 ('UNK') stands for
            any feature that is not in the vocabulary.
        dataset: one list of feature ids per recorded transition.
        labels: the ``label_idx`` id of the transition for each dataset row.
    """

    def __init__(self):
        self.feature_dict = defaultdict(int)
        self.feature_dict['UNK'] = 0
        self.dataset = []
        self.labels = []

    #creating the feature dictionary for each stage
    def create_feature_dict(self,fv):
        """Give every feature in ``fv`` that is not yet known the next free id."""
        for feat in fv:
            if feat not in self.feature_dict:
                self.feature_dict[feat] = len(self.feature_dict)

    #Feature to id mapping
    def featuretoidx(self,fv):
        """Map the features in ``fv`` to ids, in iteration order.

        Unknown features map to the 'UNK' id. ``.get`` is used on purpose:
        indexing a defaultdict would silently add every unseen feature to
        the vocabulary.
        """
        unk = self.feature_dict.get('UNK', 0)
        return [self.feature_dict.get(feat, unk) for feat in fv]

    # sentence s is a list of dictionaries, each representing one token
    # return True iff oracle-training parse succeeded
    def process_sentence(self,s,isTrain,verbose=False):
        """Run the training oracle over ``s`` and record its transitions.

        Samples are buffered and only added to ``dataset``/``labels`` when
        the whole sentence parses, so a sentence the oracle cannot finish
        (typically a non-projective one) contributes nothing.

        Args:
            s: list of Token-like objects with gold heads.
            isTrain: when true, new features extend ``feature_dict``;
                otherwise ids are looked up in the module-level
                ``training_features`` and unknown features map to 'UNK'.
            verbose: print every configuration while parsing.

        Returns:
            ``(ok, feature_vec, gold_conf)``. ``ok`` tells whether the parse
            succeeded, ``gold_conf`` holds the gold heads (empty on failure).
            ``feature_vec`` is always an empty list; it is kept so callers
            that unpack three values keep working.
        """
        config = Configuration(s)
        # Read the gold heads before parsing starts consuming the sentence.
        gold_conf = config.get_gold_conf(s)

        if verbose:
            print('\n\nStarting new parse:')
            print(config)

        steps = []          # (feature dict, transition name) per oracle step
        while not config.done_p():
            label, fv = config.training_oracle()
            if verbose:
                print('Applied '+label+' operation...')
                print(config)
            if label == 'Fail':
                return False, [], []
            steps.append((fv, label))

        for fv, label in steps:
            if isTrain:
                self.create_feature_dict(fv)
                self.dataset.append(self.featuretoidx(fv))
            else:
                unk = training_features.get('UNK', 0)
                self.dataset.append([training_features.get(feat, unk) for feat in fv])
            self.labels.append(label_idx[label])

        return True, [], gold_conf

In [None]:
# Size of the feature vocabulary. `training_features` is only filled by the
# parse_data(train_file, True) call in a later cell, so on a fresh
# top-to-bottom run this cell shows 0.
len(training_features)


235423

In [None]:

def parse_data(corpus,isTrain=False):
    """Read a CoNLL-U file and turn every sentence into classifier samples.

    Comment lines, multiword-token lines (``3-4``) and empty-node lines
    (``8.1``) are skipped; a blank line ends a sentence. The last sentence
    is processed even when the file does not end with a blank line. The
    number of successfully parsed sentences is printed.

    Args:
        corpus: path of the CoNLL-U file (read as UTF-8).
        isTrain: when true the feature vocabulary is built from this file.

    Returns:
        ``(dataset, labels, feature_dict)``. The third element is the
        vocabulary when ``isTrain`` is true and ``None`` otherwise.
    """
    verbose = False
    parser = ArcParser()
    ok = 0
    sentence = []

    def parsed_ok(tokens):
        # Only the first element (the success flag) matters here, whatever
        # else process_sentence returns alongside it.
        return 1 if parser.process_sentence(tokens, isTrain, verbose)[0] else 0

    with open(corpus, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if line == '':
                if sentence:
                    ok += parsed_ok(sentence)
                    sentence = []
            elif line[0] == '#' or re.match('^[0-9]+[-.]', line):
                # skip comments, multiword tokens and empty nodes
                pass
            else:
                sentence.append(Token(line))

    # Flush a final sentence that is not followed by a blank line.
    if sentence:
        ok += parsed_ok(sentence)

    print(ok)
    if isTrain:
        return parser.dataset, parser.labels, parser.feature_dict
    # There is no vocabulary to hand back for held-out data.
    return parser.dataset, parser.labels, None


#print('Successfully parsed',ok,'of',total,'sentences')

In [None]:
import re
# CoNLL-U splits (files named ca_ancora-ud-*), resolved relative to the
# current working directory.
train_file = 'ca_ancora-ud-train.conllu'
dev_file = 'ca_ancora-ud-dev.conllu'
test_file = 'ca_ancora-ud-test.conllu'
# Training pass: the third return value is the feature vocabulary, which is
# stored in the module-level `training_features` for later cells.
x_train ,y_train,training_features = parse_data(train_file,True)
# Dev pass (isTrain defaults to False); the third return value is discarded.
X_val , y_val,_ = parse_data(dev_file)


12542
1641


In [None]:
# Experimental SVM (degree-2 polynomial kernel) fitted on the first 1000
# training samples only.
# NOTE(review): the author reported this cell crashing even on a subset of
# the data; the cause is not established here.
# `model` is rebound to the Keras network in a later cell, so this fit is
# not what the evaluation cells use.
model = svm.SVC(
        kernel="poly",
        degree=2,
        verbose=True,
        C=1.0,
        probability=True,
        )

model.fit(x_train[:1000], y_train[:1000])

[LibSVM]

In [None]:
# Neural transition classifier built with Keras.
from tensorflow.keras.layers import Dense, Embedding, Activation, Dropout,TimeDistributed,LSTM, Input,Convolution1D
from tensorflow.keras import Sequential,Model

vocab_length = len(training_features)

# Layers are listed in the order the data flows through them:
# 12 feature ids -> 12-dim embeddings -> 1D convolution -> LSTM -> dense
# -> softmax over the transition labels.
model = Sequential([
    Embedding(input_dim=vocab_length, output_dim=12, input_length=12),
    Convolution1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    LSTM(100),
    Dense(8, activation='relu'),
    Dense(len(label_idx), activation='softmax'),
])

model.compile(
    optimizer="Adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 12, 12)            2825076   
_________________________________________________________________
conv1d (Conv1D)              (None, 12, 32)            1184      
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 8)                 808       
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 27        
Total params: 2,880,295
Trainable params: 2,880,295
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer transition ids; the model is compiled with
# categorical_crossentropy, which expects one-hot targets.
y_cat = to_categorical(y_train)
y_val_cat =  to_categorical(y_val)

In [None]:
# Train for 10 epochs on all training samples (rows of feature ids) against
# the one-hot transition labels.
model.fit(np.array(x_train),y_cat, batch_size=512, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f56b4cf5f98>

In [None]:
# Per-transition evaluation on the dev samples; returns [loss, accuracy]
# as configured in model.compile.
model.evaluate(np.array(X_val), y_val_cat, batch_size=32, verbose=1)



[0.9276489615440369, 0.8091919422149658]

In [None]:
# Softmax scores for every dev sample, then the id of the highest-scoring
# transition per sample.
pred = model.predict(np.array(X_val))
p = np.argmax(pred,axis=-1)

In [None]:
def real_label_count(sentence):
    """Count the tokens of ``sentence`` that take part in scoring.

    Multiword-token lines (id containing ``-``) and tokens tagged ``PUNCT``
    (column 3) are left out. Each token is a list of CoNLL-U fields.
    """
    return sum(
        1
        for fields in sentence
        if not re.search(r'-', fields[0]) and fields[3] != 'PUNCT'
    )

In [None]:
def evaluate_parse(sentence, parse):
    """Count the scored tokens whose predicted head equals the gold head.

    Both arguments are lists of CoNLL-U field lists with the head in
    column 6. Multiword-token lines and PUNCT tokens are skipped, matching
    what ``real_label_count`` puts in the denominator; without that, the
    unannotated ``'_'`` heads of multiword lines would count as hits.
    Tokens beyond the end of a shorter ``parse`` count as wrong.
    """
    answer = 0
    for gold, predicted in zip(sentence, parse):
        if '-' in gold[0] or gold[3] == 'PUNCT':
            continue
        if gold[6] == predicted[6]:
            answer += 1
    return answer

In [None]:
# a sentence is a list of tokens
# a token is a list of the first six features in CoNNL-U format
# this function should append the head and the dependency label as the 7th and 8th features
# Baseline algorithm labels everything as a determiner with the following token as the head (~13% accuracy!)
def parse_sentence(sentence):
  s= []
  configs =[]
  label = ''
  for tok in sentence:
    if re.search(r'-',tok[0]):
      tok.append('_')
    else:   
      tok.append('_')
      tok.append('_')
      tok.append('_')
      tok.append('_')
      #print(tok)
      s.append(Token("\t".join(tok)))         #Since we are only taking first 6 columns we add a '_' for the rest 
      #print(s)
 
  config = Configuration(s)
  config_copy = copy.deepcopy(config)
  config_check = config_copy.check_projective()
  if not config_check:
      return []
  config.feature_dict = training_features

  #Normally the first two operations are always shift
  config.shift()
  config.shift()

  #using the feature dictionary from training
  parser = ArcParser()
  parser.feature_dict = training_features
  
  while label != 'Fail' and not config.done_p():
      #converting features to indexes
      features = config.extract_features()    
      fv = parser.featuretoidx(features) 
      pred = model.predict([fv])  
      pred_label = np.argmax(pred)
      #This parsing is taken from Dr.Scannell's code and the parts with the known heads is removed.
      if idx_label[pred_label] == 'LeftArc':
        if len(config._stack)>2:
          config.leftarc()
          label = 'LeftArc'
        elif len(config._tokens)>0:       #if stack is too small, shift is always a good choice
          config.shift()
          label = 'Shift'
      elif idx_label[pred_label] == 'RightArc':
        if len(config._stack)>1:
         config.rightarc()
         label = 'RightArc'
        else:                          #if buffer is too small, shift is always a good choice
          if len(config._tokens)>0:      
            config.shift()
           label = 'Shift'
      elif idx_label[pred_label] == 'Shift':
        if len(config._tokens)>0:
          config.shift()
          label = 'Shift'
      else:
          # Fail; usually because of non-projectivity, or sometimes bug in treebank!
          label = 'Fail'
          pass

      print('Applied '+ label +' operation...')
      print(config)
  pred_conf = config.heads
  pred_conf.pop(0)

  return sentence,pred_conf
    

In [None]:
# Oracle transitions for the test split, encoded with the training
# vocabulary (isTrain defaults to False); the third return value is discarded.
x_test , y_test,_ = parse_data(test_file)

In [None]:
#y_test_cat =  to_categorical(y_test)
# Softmax scores for every test sample, then the id of the highest-scoring
# transition per sample.
pred_test = model.predict(np.array(x_test))
test_labels = np.argmax(pred_test,axis=-1)

In [None]:
def _read_sentences(handle):
    """Yield each sentence of an open CoNLL-U file as a list of field lists.

    Comment lines are dropped, blank lines end a sentence, and a final
    sentence without trailing blank line is still yielded.
    """
    sentence = []
    for line in handle:
        line = line.rstrip('\n')
        if line == '':
            if sentence:
                yield sentence
                sentence = []
        elif line[0] != '#':
            sentence.append(line.split('\t'))
    if sentence:
        yield sentence


def evaluate(corpus=None, max_sentences=None):
    """Parse a CoNLL-U file with ``parse_sentence`` and score the heads.

    Args:
        corpus: path of the file to evaluate; defaults to ``dev_file``.
        max_sentences: stop after this many sentences (``None`` = all),
            useful for a quick check since every transition costs one
            model prediction.

    Returns:
        ``correct / total`` where ``correct`` comes from ``evaluate_parse``
        and ``total`` from ``real_label_count``, or 0.0 when nothing was
        scored.
    """
    path = dev_file if corpus is None else corpus
    total = 0
    correct = 0
    with open(path, 'r', encoding='utf-8') as handle:
        for seen, gold in enumerate(_read_sentences(handle)):
            if max_sentences is not None and seen >= max_sentences:
                break
            # Only the first six columns go to the parser so it cannot see
            # the gold heads it is scored against.
            parsed = parse_sentence([tok[:6] for tok in gold])
            correct += evaluate_parse(gold, parsed)
            total += real_label_count(gold)
    return correct / total if total else 0.0

In [None]:
# Run the parser-level evaluation defined above and display the returned
# accuracy (correct / total).
evaluate()