# Token classification
This notebook shows our approach to the data preprocessing. The goal is to have exactly one label per token*, including an "empty" label.
As long as we restrict our data to only one predicate, it should be feasible to determine to which other part of the sentence each role connects.

\* In this step, token refers to the "tokenization" as applied to the PMB, i.e. the tokens in the "en.tok.off" files. E.g., "Alfred Nobel" is one token here.
Our LLM will tokenize our sentence differently, and will create one or more tokens per PMB token. This mapping will be handled later.

In [None]:
import ast
import os
import re

from datasets import Dataset
# Integer label id for every semantic role that is kept.
# NOTE(review): a later cell assigns `mapping` again with a different role set and
# numbering; when the cells are run top to bottom that later value is the one in use.
mapping = {
    "Agent": 1,
    "Location": 2,
    "Patient": 3,
    "Theme": 4,
    "Destination": 5,
    "Result": 6,
    "Stimulus": 7,
    "Experiencer": 8,
    "Co-Theme": 9,
    "Pivot": 10,
}

In [9]:
# Example with one sentence: file_path is the directory of a single PMB document.
# The functions below read "en.tok.off" and "en.parse.tags" from this directory.
# Note: forward slashes for Linux and WSL, backward slashes for Windows
# NOTE(review): these are absolute, machine-specific paths; adjust to your local PMB copy.
# Windows example:
# file_path = r'C:\Users\bikow\Documents\AI\MSc\Computational Semantics\pmb-sample-4.0.0\data\en\gold\p00\d0004'
# WSL example:
file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d0004/'

## Our class-based approach
We take the en.parse.tags file and recreate the CCG structure using custom classes.
This allows us to figure out to what tokens each semantic role label belongs.

In [3]:
class CCGNode:
    """Inner node of the reconstructed CCG derivation tree.

    A node holds at most two children (other CCGNode objects or leaf tokens).
    The recursive getters concatenate the results of the children from left
    to right, so their output is ordered like the tokens of the sentence.
    """

    def __init__(self, category = 'none', rule_type='none', parent=None, level = 0):
        self.category = category    # CCG category, e.g. s\np or np
        self.rule_type = rule_type  # combinatory rule, e.g. fa, ba or conj
        self.children = []
        self.parent = parent
        self.level = level          # depth in the tree; overwritten by the parent's addChild
        self.isFirstArgument = True # False only for the second child of its parent

    def addChild(self, child):
        """Append ``child`` as the first or second child of this node.

        Sets ``child.level`` to ``self.level + 1``, marks a second child with
        ``isFirstArgument = False`` and points ``child.parent`` at this node.

        Raises:
            Exception: if the node already has two children.
        """
        if len(self.children) >= 2:
            raise Exception(repr(self), 'already has two children')
        if self.children:
            child.isFirstArgument = False
        # Keep the upward link consistent with the actual tree structure, even if
        # the child was constructed with a different (or no) parent.
        child.parent = self
        child.level = self.level + 1
        self.children.append(child)

    def getSibling(self):
        """Return the other child of this node's parent.

        Raises AttributeError when the node has no parent and IndexError when
        the parent has no second child.
        """
        if self.isFirstArgument:
            return self.parent.children[1]
        else:
            return self.parent.children[0]

    def assignTag(self, tag, tagFromTokenIdx):
        """Pass ``tag`` (and the index of the token it came from) down to every leaf.

        Empty tags are ignored. A node without children is a no-op instead of
        raising an IndexError.
        """
        if tag == '': # Empty tag, we don't need to store this
            return
        for child in self.children:
            child.assignTag(tag, tagFromTokenIdx)

    def getTags(self, mapping = None):
        """Return the tags of all leaves below this node, left to right.

        ``mapping`` is handed through unchanged to the children.
        """
        tags = []
        for child in self.children:
            tags += child.getTags(mapping)
        return tags

    def getCategories(self, onlyTokens = False):
        """Return the categories in this subtree in pre-order.

        With ``onlyTokens`` set, the categories of CCGNode objects are left out
        and only what the leaves report is returned.
        """
        categories = [] if onlyTokens else [self.category]
        for child in self.children:
            categories += child.getCategories(onlyTokens)
        return categories

    def getTagsFromTokenIdx(self):
        """Return, per leaf and left to right, the origin index stored with its tag."""
        origins = []
        for child in self.children:
            origins += child.getTagsFromTokenIdx()
        return origins

    def __repr__(self):
        header = ''.join([' ' * self.level, 'CCGNODE', ' ', self.category, ' ', self.rule_type, '\n'])
        return header + '\n'.join([repr(child) for child in self.children])

class CCGToken:
    """Leaf of the reconstructed CCG tree: one token of the sentence.

    Besides the token text and its CCG category, a leaf stores the semantic
    role assigned to it (``assignedTag``, '' when none) and an identifier of
    where that role came from (``tagFromTokenIdx``, None when none).
    """

    def __init__(self, token, category, parent, assignedTag = '', verbnet = None, tokenIdx = 0):
        self.token = token
        self.category = category
        self.parent = parent
        self.assignedTag = assignedTag
        # A fresh list per instance; a shared default list would leak roles
        # between tokens as soon as one of them is mutated.
        self.verbnet = [] if verbnet is None else verbnet
        self.children = []
        self.level = None            # set by the parent's addChild
        self.isFirstArgument = True  # False only for the second child of its parent
        self.tokenIdx = tokenIdx
        self.tagFromTokenIdx = None

    def getSibling(self):
        """Return the other child of this token's parent.

        Raises AttributeError when there is no parent and IndexError when the
        parent has no second child.
        """
        if self.isFirstArgument:
            return self.parent.children[1]
        else:
            return self.parent.children[0]

    def assignTag(self, tag, tagFromTokenIdx):
        """Store ``tag`` and its origin on this token.

        Empty tags are ignored, like CCGNode.assignTag does, so a placeholder
        role neither erases a previously assigned tag nor records an origin.
        """
        if tag == '':
            return
        self.assignedTag = tag
        self.tagFromTokenIdx = tagFromTokenIdx

    def getTags(self, mapping = None):
        """Return a one-element list with this token's tag.

        Without ``mapping`` the raw tag string is returned. With ``mapping`` the
        tag is translated to its id, and an untagged token yields 0.

        Raises:
            KeyError: if the assigned tag is not a key of ``mapping``.
        """
        if mapping is None:
            return [self.assignedTag]
        if self.assignedTag == '':
            return [0]
        return [mapping[self.assignedTag]]

    def getCategories(self, _ = False):
        """Return a one-element list with this token's category (the argument is ignored)."""
        return [self.category]

    def getTagsFromTokenIdx(self):
        """Return a one-element list with the origin stored by assignTag (None if untagged)."""
        return [self.tagFromTokenIdx]

    def __repr__(self):
        # level stays None until the token is attached to a node; indent by 0 then.
        indent = ' ' * (self.level or 0)
        return ''.join([indent, 'CCGTOKEN', ' ', self.token, ' ', self.category, ' ', self.assignedTag, ' ', ' '.join(self.verbnet)])


In [4]:
def getTokens(file_path):
    """Read the tokens of the first sentence from ``en.tok.off`` in ``file_path``.

    Everything after the third whitespace-separated field of a line is taken as
    the token, so tokens containing spaces (e.g. "Alfred Nobel") stay intact.
    Reading stops after the first sentence-final punctuation token, which is
    included in the result. Blank lines are skipped.

    Args:
        file_path: directory that contains the ``en.tok.off`` file.

    Returns:
        list of token strings.
    """
    sentenceEnders = ('.', '?', '!', ';', '...', '!!')
    tokens = []
    # Explicit encoding: the platform default is not UTF-8 everywhere (e.g. Windows).
    with open(os.path.join(file_path, "en.tok.off"), encoding="utf-8") as file:
        for line in file:
            fields = line.split(maxsplit = 3)
            if not fields:
                # Blank line (e.g. at the end of the file): nothing to extract.
                continue
            token = fields[-1].rstrip()
            tokens.append(token)
            if token in sentenceEnders:
                break

    return tokens

In [5]:
def getTree(file_path, tokens):
    """Rebuild the CCG derivation of the first sentence in ``en.parse.tags``.

    Lines before the first line starting with 'ccg' are ignored; a second
    'ccg' line ends the parse. The indentation of a line decides which node it
    is attached to. Lines of type 't' become CCGToken leaves (paired with
    ``tokens`` by position), all other lines become CCGNode objects. Parsing
    stops at the first '.' category or sentence-final punctuation token.

    Reads the module-level ``mapping`` (roles to keep) and updates the
    module-level ``verbnetCounter`` (frequency of every role seen).

    Args:
        file_path: directory that contains the ``en.parse.tags`` file.
        tokens: token strings of the sentence, in order.

    Returns:
        (topNode, tokensWithVerbnet): the root CCGNode (None if the file has no
        'ccg' line) and the list of leaves that carry at least one kept role.
    """
    tokenIdx = 0
    topNode = None
    currentNode = None
    tokensWithVerbnet = []
    # Explicit encoding: the platform default is not UTF-8 everywhere (e.g. Windows).
    with open(os.path.join(file_path, "en.parse.tags"), encoding="utf-8") as file:
        skipping = True
        for line in file:
            if skipping:
                if line.startswith('ccg'):
                    skipping = False
                    topNode = CCGNode()
                    currentNode = topNode
                continue
            if line == '\n':
                continue
            if line.startswith('ccg'): # Second sentence starts, we ignore this
                break
            trimmedLine = line.lstrip()
            nodeType, content = trimmedLine.split('(', 1)
            category = content.split(',')[0]
            level = len(line) - len(trimmedLine)
            # Climb until currentNode is shallower than this line, i.e. its parent.
            while level <= currentNode.level:
                currentNode = currentNode.parent
            if nodeType == 't':
                if category in ['.']:
                    break
                if tokens[tokenIdx] in ['.', '!', '?', ';']:
                    break

                vnSplit = content.split("verbnet:")
                if len(vnSplit) == 1:
                    verbnet = []
                else:
                    # It needs to combine to an np. Verbnet tags looking for a n for example,
                    # often describe adjectives and are not relevant for the main predicate
                    searchingFor = re.split(r'[\\\/]', category, maxsplit=1)
                    if len(searchingFor) > 1 and ("np" in searchingFor[1]):
                        verbnetLiteral = vnSplit[1].split(']')[0] + ']'
                        # The role list comes from a data file, so parse it as a plain
                        # literal; eval() would execute arbitrary expressions.
                        verbnetUnfiltered = ast.literal_eval(verbnetLiteral)
                        for role in verbnetUnfiltered:
                            verbnetCounter[role] = verbnetCounter.get(role, 0) + 1
                        # If first element gets filtered out but not the second, replace with dummy value
                        verbnet = [r if r in mapping else '' for r in verbnetUnfiltered]
                        # Remove trailing dummy values
                        while (verbnet) and (verbnet[-1] == ''):
                            verbnet.pop()
                    else:
                        verbnet = []
                currentNode.addChild(CCGToken(tokens[tokenIdx], category = category, parent = currentNode, verbnet = verbnet, tokenIdx = tokenIdx))
                if len(verbnet) > 0:
                    tokensWithVerbnet.append(currentNode.children[-1])
                tokenIdx += 1
            else:
                currentNode.addChild(CCGNode(category, nodeType, parent=currentNode, level = level))
                currentNode = currentNode.children[-1]


    return topNode, tokensWithVerbnet

In [6]:
def findCorrectLevel(current):
    """Climb from ``current`` to the node whose sibling fills its next np argument.

    First go up to the nearest node (possibly ``current`` itself) whose category
    ends in 'np'. The character before that 'np' tells the direction: with '/'
    the result must be a first child under an 'fa' parent, otherwise a second
    child under a 'ba' parent. Walking past the root raises AttributeError.
    """
    node = current
    while not node.category.endswith('np'): # first application with non-nps
        node = node.parent

    if node.category[-3] == '/':
        wantedRule, mustBeFirst = 'fa', True
    else:
        wantedRule, mustBeFirst = 'ba', False

    while node.isFirstArgument != mustBeFirst or node.parent.rule_type != wantedRule:
        node = node.parent
    return node

def assignTags(tokensWithVerbnet):
    """Hand every role of every role-bearing token to the matching part of the tree.

    For each role, the attachment level is located with findCorrectLevel and the
    role is assigned to the sibling at that level; the search for the next role
    continues from the parent. The origin stored with a role is the token index
    plus 0.1 per preceding role of the same token (8.0, 8.1, ...).
    """
    for verbToken in tokensWithVerbnet:
        cursor = verbToken
        originId = round(verbToken.tokenIdx + 0.0, 1)
        for role in verbToken.verbnet:
            cursor = findCorrectLevel(cursor)
            cursor.getSibling().assignTag(role, originId)
            cursor = cursor.parent
            originId = round(originId + 0.1, 1)

In [7]:
def skipSentence(topNode, tokens):
    """Decide whether a sentence should be left out of the dataset.

    Args:
        topNode: root of the CCG tree; must offer getCategories() and have at
            least one child.
        tokens: token strings of the sentence.

    Returns:
        True if the sentence is a W-question, contains an expletive "there",
        contains "let" directly followed by "'s" or "us", or if the category of
        the root's first child still expects an argument; False otherwise.
    """
    allCategories = topNode.getCategories()

    # Skip W-questions
    if 's:wq' in allCategories:
        return True

    # Skip sentences with "there"
    if 'np:thr' in allCategories:
        return True

    # Skip sentences with "let's" / "let us".
    # Pairing each token with its successor means a sentence-final "let" is
    # simply not matched instead of indexing past the end of the list.
    for token, following in zip(tokens, tokens[1:]):
        if token.lower() == "let" and following in ["'s", "us"]:
            return True

    # Skip sentences that miss a part (like "Think about it")
    firstCategory = topNode.children[0].category
    if ("/" in firstCategory) or ("\\" in firstCategory):
        return True

    return False

In [12]:
# Role -> integer label id used for the dataset; tokens without a role get 0.
# NOTE(review): this replaces the `mapping` from the first code cell (different role
# set and numbering); the example output below uses these ids.
mapping = {"Theme": 1, "Agent": 2, "Patient": 3, "Experiencer": 4, "Co-Theme": 5, "Stimulus": 6, "Location": 7, "Destination": 8}

# Sample documents for manual inspection; exactly one file_path should be active.
# NOTE(review): absolute, machine-specific paths; adjust to your local PMB copy.
# file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d1382' # Who directed the film "Fail Safe"?
# file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p01/d2590/'
file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d0004/' # fighting dogs
# file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d2208' # Tom saw a mouse
# file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d0802' # Let's have sushi
# file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p62/d1213'
# file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p64/d2663' 
# file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p64/d1941' 
def getTokensAndLabels(file_path):
    """Produce tokens, label ids and label origins for one PMB document directory.

    Reads the module-level ``mapping`` and appends to the module-level
    ``skippedDirs`` when role assignment fails.

    Args:
        file_path: directory containing ``en.tok.off`` and ``en.parse.tags``.

    Returns:
        (tokens, labels, origin), three lists of equal length, or
        (None, None, None) if the sentence is skipped.

    Raises:
        Exception: if tokens and labels still differ in length after the
            sentence-final token has been accounted for.
    """
    tokens = getTokens(file_path)
    topNode, tokensWithVerbnet = getTree(file_path, tokens)
    if skipSentence(topNode, tokens):
        return None, None, None
    # Assign the roles exactly once; a failure here means no attachment point
    # was found for some role (the upward search ran past the root).
    try:
        assignTags(tokensWithVerbnet)
    except AttributeError:
        skippedDirs.append(file_path)
        return None, None, None
    labels = topNode.getTags(mapping)
    origin = topNode.getTagsFromTokenIdx()
    # Make sure the sentence ends in a punctuation token ...
    if tokens[-1] not in ['.', '!', '?', ';', '...', '!!']:
        tokens.append('.')
    # ... and give that token, which is not part of the tree, the empty label.
    if len(tokens) > len(labels):
        labels.append(0)
        origin.append(None)
    if len(tokens) != len(labels):
        raise Exception(file_path, 'Length of token and labels does not match up!')
    return tokens, labels, origin
    
# Start with empty accumulators: getTree counts roles in verbnetCounter and
# getTokensAndLabels records failed directories in skippedDirs.
verbnetCounter, skippedDirs = {}, []

# Run the pipeline on the single example and show the three parallel lists.
tokens, labels, origin = getTokensAndLabels(file_path)
print(tokens, labels, origin, sep='\n')

['A', 'brown', 'dog', 'and', 'a', 'grey', 'dog', 'are', 'fighting', 'in', 'the', 'snow', '.']
[2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 7, 7, 0]
[8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, None, None, None, 9.0, 9.0, None]


In [232]:
# Root directory that is searched recursively for PMB documents.
# NOTE(review): absolute, machine-specific path; adjust to your local PMB copy.
folder_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/' 
# Reset the accumulators so the counts of the single-sentence example above are not included.
verbnetCounter = {}
skippedDirs = []

def createDataset(parent_dir):
    """Collect tokens, labels and origins of every usable document below ``parent_dir``.

    Every directory that contains an ``en.parse.tags`` file is processed with
    getTokensAndLabels; skipped sentences are left out. Progress (count and
    current directory) is printed every 100 documents.

    Args:
        parent_dir: root directory that is walked recursively.

    Returns:
        dict with the keys 'tokens', 'labels' and 'origin', each a list with one
        entry per kept sentence.
    """
    i = 0
    dataset = {'tokens': [], 'labels': [], 'origin': []}
    for subdir, dirs, _files in os.walk(parent_dir):
        # os.walk yields directories in filesystem order; sorting in place makes
        # the traversal, and therefore the dataset order, reproducible.
        dirs.sort()
        if not os.path.exists(os.path.join(subdir, 'en.parse.tags')):
            continue
        i += 1
        if (i % 100 == 0):
            print(i)
            print(subdir)
        tokens, labels, origin = getTokensAndLabels(subdir)
        if tokens is None:
            continue
        dataset['tokens'].append(tokens)
        dataset['labels'].append(labels)
        dataset['origin'].append(origin)
    return dataset
        

# Build the dataset from every document below folder_path; progress is printed every 100 documents.
dataset = createDataset(folder_path)


100
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d1470
200
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d1976
300
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d2581
400
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d3351
500
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p01/d1775
600
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p01/d2921
700
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p02/d1705
800
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p02/d2978
900
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p03/d1876
1000
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p03/d3146
1100
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.

8900
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p80/d1834
9000
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p80/d3385
9100
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p82/d0042
9200
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p83/d1556
9300
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p84/d2551
9400
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p85/d3441
9500
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p87/d1551
9600
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p88/d2150
9700
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p89/d2901
9800
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p90/d2255
9900
/mnt/c/Users/perry/Documents/uni/Master/CompSem/project

In [233]:
# Total number of roles in the dataset:
print(len(dataset['tokens']))
# Frequency of roles
print(sorted(verbnetCounter.items(), key=lambda x:x[1], reverse = True))

9243
[('Theme', 5872), ('Agent', 4634), ('Patient', 1456), ('Attribute', 1374), ('Experiencer', 1345), ('Co-Theme', 1188), ('Stimulus', 1030), ('Location', 1025), ('Destination', 569), ('Equal', 472), ('Source', 412), ('Pivot', 367), ('Time', 359), ('Value', 349), ('Result', 309), ('Recipient', 306), ('Name', 284), ('PartOf', 241), ('Co-Agent', 222), ('Topic', 206), ('Beneficiary', 198), ('Instrument', 147), ('User', 145), ('Causer', 143), ('Of', 118), ('Manner', 117), ('Duration', 107), ('Quantity', 104), ('Colour', 91), ('Product', 84), ('SubOf', 81), ('Creator', 74), ('Goal', 71), ('Asset', 62), ('Path', 59), ('InstanceOf', 58), ('Context', 50), ('Bearer', 46), ('Material', 36), ('Frequency', 30), ('Extent', 28), ('AttributeOf', 22), ('Co-Patient', 22), ('Start', 20), ('Finish', 19), ('Owner', 18), ('Unit', 17), ('Part', 14), ('Content', 8), ('ContentOf', 4), ('Sub', 4), ('Similar', 2), ('Instance', 2), ('MadeOf', 1)]


In [128]:
# To see the sentences that have been skipped:
print(skippedDirs)

['/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d0867', '/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d0916', '/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d0948', '/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d1468', '/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d1562', '/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d1622', '/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d1686', '/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d1691', '/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d1712', '/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold/p00/d1730', '/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-4.0.0/data/en/gold

In [224]:
from datasets import Dataset

In [236]:
# Turn the dict of parallel lists into a `datasets.Dataset` and save it under
# "dataset.hf" (relative to the current working directory).
ds = Dataset.from_dict(dataset)
ds.save_to_disk("dataset.hf")