# Token classification
This notebook shows our approach to the data preprocessing. The goal is to have exactly one label per token*, including an "empty" label.
As long as we restrict our data to only one predicate, it should be feasible to determine to which other part of the sentence the role connects.

\* In this step, token refers to the "tokenization" as applied to the PMB, i.e. the tokens in the "en.tok.off" files. E.g., "Alfred Nobel" is one token here.
Our LLM will tokenize our sentence differently, and will create one or more tokens per PMB token. This mapping will be handled later.

In [None]:
# ROLES/LABELS: Agent, Location, Patient, Theme, Topic, Destination, Result, EMPTY
# Tags: 0=EMPTY, 1=Agent, 2=Location, 3=Patient, 4=Theme, 5=Topic, 6=Destination, 7=Result

# sentence = "A brown dog and a grey dog are fighting in the snow"
# The goal is to generate:
# srl_tags = [1,1,1,1,1,1,1,0,0,2,2,2]
# tokens = ['A', 'brown', 'dog', 'and', 'a', 'grey', 'dog', 'are', 'fighting', 'in', 'the', 'snow']

In [33]:
import re
import os
# Example with one sentence:
# Note: forward slashes for Linux and WSL, backward slashes for Windows
# Windows example:
# file_path = r'C:\Users\bikow\Documents\AI\MSc\Computational Semantics\pmb-sample-4.0.0\data\en\gold\p00\d0004'
# WSL example:
# file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-sample-4.0.0/data/en/gold/p00/d0004/'
# file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p01/d2590/' # https://pmb.let.rug.nl/explorer/explore.php?part=01&doc_id=2590&type=der.xml&alignment_language=en
#
# The directory of the PMB document to process. Set the PMB_DOC_DIR environment
# variable to point at your own checkout; otherwise the default below is used.
file_path = os.environ.get(
    'PMB_DOC_DIR',
    r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p03/d0766/',
) # https://pmb.let.rug.nl/explorer/explore.php?part=03&doc_id=0766&type=der.xml&alignment_language=en

In [34]:
# THIS IS THE GOAL
# sentence = "A brown dog and a grey dog are fighting in the snow"

# Semantic role name -> integer tag. Tokens without a role are emitted as 0
# by CCGToken.getTags, so 0 is not listed here.
mapping = {"Agent": 1, "Location": 2, "Patient": 3, "Theme": 4, "Topic":5, "Destination": 6, "Result": 7}

sentence = ""
sentence_id = '0'
tokens = []

# Get the tokens from the tokenized sentence file.
# os.path.join works whether or not file_path ends with a path separator
# (plain string concatenation silently produced a wrong file name without one),
# and the explicit encoding avoids platform-dependent decoding.
with open(os.path.join(file_path, "en.tok.off"), encoding="utf-8") as file:
    for line in file:
        fields = line.split(maxsplit = 3)
        if not fields:
            # Blank line: nothing to extract.
            continue
        # The last of (at most) four whitespace-separated fields is the token
        # text, which may itself contain spaces.
        tokens.append(fields[-1].rstrip())

sentence = ' '.join(tokens)

print(sentence)
print(tokens)

Alfred Nobel invented dynamite in 1866 .
['Alfred Nobel', 'invented', 'dynamite', 'in', '1866', '.']


## Our class-based approach
We take the en.parse.tags file and recreate the CCG structure using custom classes.
This allows us to figure out to what tokens each semantic role label belongs.

In [35]:
class CCGNode:
    """Internal node of the CCG derivation tree.

    A node has a category, the name of the rule that produced it, a parent
    and at most two children (further nodes or leaf tokens).
    """

    def __init__(self, category = 'none', rule_type='none', parent=None, level = 0):
        """Create a node without children.

        Args:
            category: category string of the node, e.g. 's\\np' or 'np'.
            rule_type: name of the rule, e.g. 'fa', 'ba' or 'conj'.
            parent: the enclosing node, or None for the root.
            level: indentation depth; used by __repr__ to indent the node.
        """
        self.category = category # eg s\np or np
        self.rule_type = rule_type # fa or ba or conj
        self.children = []
        self.parent = parent
        self.level = level
        # True until this node is attached as the second child of its parent.
        self.isFirstArgument = True

    def addChild(self, child):
        """Append `child`; the second child is marked as not-first-argument.

        Raises:
            ValueError: if the node already has two children.
        """
        if len(self.children) >= 2:
            raise ValueError(
                'CCGNODE {} {} already has two children'.format(self.category, self.rule_type)
            )
        if len(self.children) == 1:
            child.isFirstArgument = False
        self.children.append(child)

    def getSibling(self):
        """Return the other child of this node's parent."""
        if self.isFirstArgument:
            return self.parent.children[1]
        else:
            return self.parent.children[0]

    def assignTag(self, tag):
        """Propagate `tag` to everything below this node.

        A node without children is a no-op instead of an IndexError.
        """
        for child in self.children:
            child.assignTag(tag)

    def getTags(self, mapping = None):
        """Collect the tags of all descendants, left to right.

        `mapping` is passed through unchanged to the children. A node without
        children contributes an empty list.
        """
        tags = []
        for child in self.children:
            tags.extend(child.getTags(mapping))
        return tags

    def __repr__(self):
        return ''.join([' ' * self.level, 'CCGNODE', ' ', self.category, ' ', self.rule_type, '\n', '\n'.join([repr(child) for child in self.children])])

class CCGToken:
    """Leaf of the CCG derivation tree: one token plus its assigned role tag."""

    def __init__(self, token, category, parent, assignedTag = '', verbnet = None, level = 0):
        """Create a leaf.

        Args:
            token: surface text of the token.
            category: category string of the token.
            parent: the node this token hangs under.
            assignedTag: role assigned to this token; '' means no role.
            verbnet: list of role names carried by this token. Defaults to a
                fresh empty list per instance (a shared mutable default would
                leak roles between tokens).
            level: indentation depth; used by __repr__ to indent the token.
        """
        self.token = token
        self.category = category
        self.parent = parent
        self.assignedTag = assignedTag
        self.verbnet = [] if verbnet is None else verbnet
        self.children = []
        self.level = level
        # True until this token is attached as the second child of its parent.
        self.isFirstArgument = True

    def getSibling(self):
        """Return the other child of this token's parent."""
        if self.isFirstArgument:
            return self.parent.children[1]
        else:
            return self.parent.children[0]

    def assignTag(self, tag):
        """Set the role of this token, overwriting any earlier one."""
        self.assignedTag = tag

    def getTags(self, mapping = None):
        """Return this token's tag as a one-element list.

        Without a mapping the raw tag string is returned. With a mapping the
        tag is translated through it, and an empty tag becomes 0.

        Raises:
            KeyError: if a non-empty tag is missing from `mapping`.
        """
        if mapping is None:
            return [self.assignedTag]
        if self.assignedTag == '':
            return [0]
        return [mapping[self.assignedTag]]

    def __repr__(self):
        return ''.join([' ' * self.level, 'CCGTOKEN', ' ', self.token, ' ', self.category, ' ', self.assignedTag, ' ',' '.join(self.verbnet)])


In [41]:
def getTokens(file_path):
    """Read the tokens of one document from its `en.tok.off` file.

    Args:
        file_path: directory that contains `en.tok.off`; a trailing path
            separator is optional.

    Returns:
        A list with one string per non-blank line: the last of at most four
        whitespace-separated fields, with trailing whitespace removed. The
        token text may itself contain spaces.
    """
    tokens = []
    # os.path.join instead of string concatenation, so a directory without a
    # trailing separator does not yield a wrong file name. The encoding is
    # explicit so decoding does not depend on the platform default.
    with open(os.path.join(file_path, "en.tok.off"), encoding="utf-8") as file:
        for line in file:
            fields = line.split(maxsplit = 3)
            if not fields:
                # Blank line: nothing to extract.
                continue
            tokens.append(fields[-1].rstrip())
    return tokens

In [None]:
# NOTE: a placeholder assignment `file_path = 'blabla'` used to sit here. It
# overwrote the real document directory, so the cells below that open files
# relative to `file_path` failed on Restart & Run All. It has been removed.

def getTokensAndLabels(file_path):
    """Work-in-progress entry point for one document.

    Args:
        file_path: directory of the document, forwarded to getTokens.

    Returns:
        The token list from getTokens.
        TODO: label extraction is not implemented here yet.
    """
    tokens = getTokens(file_path)
    return tokens

In [38]:
# Rebuild the CCG derivation from en.parse.tags as a tree of CCGNode / CCGToken.
# Everything before the first line starting with 'ccg' is ignored.
token_idx = 0           # index into `tokens` of the next 't(' line
topNode = None          # root of the tree, created when the 'ccg' line is seen
currentNode = None      # node that receives the next child
tokensWithVerbnet = []  # leaf tokens that carry at least one role from `mapping`
# os.path.join works with or without a trailing separator on file_path; the
# encoding is explicit so decoding does not depend on the platform default.
with open(os.path.join(file_path, "en.parse.tags"), encoding="utf-8") as file:
    skipping = True
    previousLevel = 0
    for line in file:
        if skipping:
            if line.startswith('ccg'):
                skipping = False
                topNode = CCGNode()
                currentNode = topNode
            continue
        if line == '\n':
            continue
        trimmedLine = line.lstrip()
        # '<nodeType>(<category>, ...': text before the first '(' is the node
        # type, text up to the first comma after it is the category.
        nodeType, content = trimmedLine.split('(', 1)
        category = content.split(',')[0]
        if nodeType == 't':
            if category == '.':
                # The token is left out of the tree, but it still uses up one
                # entry of `tokens`; without this increment every later token
                # would get the wrong surface text.
                token_idx += 1
                continue
            vnSplit = content.split("verbnet:")
            if len(vnSplit) == 1:
                verbnet = []
            else:
                verbnetLiteral = vnSplit[1].split(']')[0] + ']'
                # SECURITY NOTE(review): eval() executes text read from the data
                # file. Only run this on trusted files; consider
                # ast.literal_eval if the literal is always a plain list.
                verbnetUnfiltered = eval(verbnetLiteral)
                # Keep only the roles we have a tag for.
                verbnet = [r for r in verbnetUnfiltered if r in mapping]

            currentNode.addChild(CCGToken(tokens[token_idx], category = category, parent = currentNode, verbnet = verbnet, level = currentNode.level + 1))
            if len(verbnet) > 0:
                tokensWithVerbnet.append(currentNode.children[-1])
            token_idx += 1
        else:
            # Indentation width of the line decides where the node attaches.
            level = len(line) - len(trimmedLine)
            if level > previousLevel: # This is a child of previous node
                currentNode.addChild(CCGNode(category, nodeType, parent=currentNode, level = level))
                currentNode = currentNode.children[-1]
            elif level == previousLevel: # Sibling of the previous node; same parent
                currentNode = currentNode.parent
                currentNode.addChild(CCGNode(category, nodeType, parent=currentNode, level = level))
                currentNode = currentNode.children[-1]
            else: # Go back 1 or more levels
                # Climb past every second child, then one more step up.
                while not currentNode.isFirstArgument:
                    currentNode = currentNode.parent
                currentNode = currentNode.parent
                currentNode.addChild(CCGNode(category, nodeType, parent=currentNode, level = level))
                currentNode = currentNode.children[-1]

            previousLevel = level

print(topNode)

CCGNODE none none
 CCGNODE s:dcl ba
  CCGNODE np lx
   CCGTOKEN Alfred Nobel n  
  CCGNODE s:dcl\np ba
   CCGNODE s:dcl\np fa
    CCGTOKEN invented (s:dcl\np)/np  Result Agent
    CCGNODE np lx
     CCGTOKEN dynamite n  
   CCGNODE (s\np)\(s\np) fa
    CCGTOKEN in ((s\np)\(s\np))/np  
    CCGNODE np rp
     CCGNODE np lx
      CCGTOKEN 1866 n  


In [39]:
# To do: deal with multiple verbnet labels.

def findCorrectLevel(current):
    """Climb from `current` to the tree position whose sibling gets the role.

    First climbs until the category ends in 'np'. The third-to-last character
    of that category picks the direction: with '/' the climb stops at a first
    child whose parent's rule is 'fa', otherwise at a second child whose
    parent's rule is 'ba'. The node where the climb stops is returned.
    """
    node = current

    # first application with non-nps
    while True:
        if node.category.endswith('np'):
            break
        node = node.parent

    if node.category[-3] == '/':
        def isTarget(candidate):
            return candidate.isFirstArgument and candidate.parent.rule_type == 'fa'
    else:
        def isTarget(candidate):
            return (not candidate.isFirstArgument) and candidate.parent.rule_type == 'ba'

    while not isTarget(node):
        node = node.parent
    return node

# For every role-carrying token, hand out its roles one by one: locate the
# matching level, tag that level's sibling, then continue from one level up.
for roleToken in tokensWithVerbnet:
    anchor = roleToken
    for roleName in roleToken.verbnet:
        anchor = findCorrectLevel(anchor)
        anchor.getSibling().assignTag(roleName)
        anchor = anchor.parent


print('The CCG tree with assigned tags (end of CCGTOKENs):')
print(topNode)

The CCG tree with assigned tags (end of CCGTOKENs):
CCGNODE none none
 CCGNODE s:dcl ba
  CCGNODE np lx
   CCGTOKEN Alfred Nobel n Agent 
  CCGNODE s:dcl\np ba
   CCGNODE s:dcl\np fa
    CCGTOKEN invented (s:dcl\np)/np  Result Agent
    CCGNODE np lx
     CCGTOKEN dynamite n Result 
   CCGNODE (s\np)\(s\np) fa
    CCGTOKEN in ((s\np)\(s\np))/np  
    CCGNODE np rp
     CCGNODE np lx
      CCGTOKEN 1866 n  


In [40]:
# Show the PMB tokens next to the role tags read off the tree: first the raw
# role names ('' = no role), then the integer tags from `mapping` (0 = no role).
# NOTE: tokens with category '.' are left out of the tree, so the tag lists can
# be shorter than `tokens`.
print(tokens)
print(topNode.getTags())
print(topNode.getTags(mapping))

['Alfred Nobel', 'invented', 'dynamite', 'in', '1866', '.']
['Agent', '', 'Result', '', '']
[1, 0, 7, 0, 0]
