# Token classification

In [None]:
# ROLES/LABELS: Agent, Location, Topic, Patient, Theme, EMPTY
# Tags: 0=EMPTY, 1=Agent, 2=Location, 3=Patient, 4=Theme, 5=Topic

# Generate:
# Whole sentence +
# {'id': '0',
#  'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0],
#  'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
# }

# THIS IS THE GOAL
# Tags: 0=EMPTY, 1=Agent, 2=Location, 3=Patient, 4=Theme, 5=Topic
# ner_tags = [1,1,1,1,1,1,1,0,0,2,2,2]
# tokens = ['A', 'brown', 'dog', 'and', 'a', 'grey', 'dog', 'are', 'fighting', 'in', 'the', 'snow']

In [None]:
import re
import os
# Example with one sentence:
# Note: forward slashes for Linux and WSL, backward slashes for Windows
# Windows example:
# file_path = r'C:\Users\bikow\Documents\AI\MSc\Computational Semantics\pmb-sample-4.0.0\data\en\gold\p00\d0004'
file_path = r'/mnt/c/Users/perry/Documents/uni/Master/CompSem/project/pmb-sample-4.0.0/data/en/gold/p00/d0004/'

In [None]:
# THIS IS THE GOAL
# sentence = "A brown dog and a grey dog are fighting in the snow"
# sentence_id = '0'
# ner_tags = [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 2, 2]
# tokens = ['A', 'brown', 'dog', 'and', 'a', 'grey', 'dog', 'are', 'fighting', 'in', 'the', 'snow']
mapping = {"Agent": 1, "Location": 2, "Patient": 3, "Theme": 4, "Topic":5}

sentence = ""
sentence_id = '0'
ner_tags = [],
tokens = []

# Get the tokens from the tokenized sentence file
with open(file_path+"en.tok.off") as file:
    for line in file:
        tokens.append(line.split()[-1])

sentence = ' '.join(tokens)
# Initially set all the tags as 0 (EMPTY)
ner_tags = [0] * len(tokens)

## New class-based approach

In [73]:
class CCGNode:
    def __init__(self, category = 'none', rule_type='none', parent=None, level = 0):
        self.category = category # eg s\np or np
        self.rule_type = rule_type # fa or ba or conj
        self.children = []
        self.parent = parent
        self.level = level
    
    def __repr__(self):
        return ''.join([' ' * self.level, 'CCGNODE', ' ', self.category, ' ', self.rule_type, '\n', '\n'.join([repr(child) for child in self.children])])
#         return 'CCGNODE: ' + str(id(self)) + '\n ' + '\n '.join([f(child) for child in self.children for f in (lambda child: str(id(child)), lambda child: str(len(child.children)))])

class CCGToken:
    def __init__(self, token: str, parent: CCGNode, assignedTag: str = '', verbnet = [], level = 0):
        self.token = token
        self.parent = parent
        self.assignedTag = assignedTag
        self.verbnet = verbnet
        self.children = []
        self.level = level
    
    def __repr__(self):
        return ''.join([' ' * self.level, 'CCGTOKEN', ' ', self.token, ' ', self.assignedTag, ' ',' '.join(self.verbnet)])


In [78]:
token_idx = 0
del(topNode)
del(currentNode)
topNode = None
currentNode = None
with open(file_path + "en.parse.tags") as file:
    currentNode = None
    skipping = True
    previousLevel = 0
    for line in file:
        if skipping:
            if line.startswith('ccg'):
                skipping = False
                topNode = CCGNode()
                print(topNode)
                currentNode = topNode
            continue
        if line == '\n':
            continue
#         print(line)
        trimmedLine = line.lstrip()
        nodeType, content = trimmedLine.split('(', 1)
        if nodeType == 't':
            verbnet = [r for r in mapping.keys() if r in line]
            currentNode.children.append(CCGToken(tokens[token_idx], parent = currentNode, verbnet = verbnet, level = currentNode.level + 1))
            token_idx += 1
        else:
            level = len(line) - len(trimmedLine)
            category = content.split(',')[0]
            if level > previousLevel: # This is a child of previous node
                currentNode.children.append(CCGNode(category, nodeType, parent=currentNode, level = level))
                currentNode = currentNode.children[-1]
            elif level == previousLevel: # Sibling of the previous node; same parent
                currentNode = currentNode.parent
                currentNode.children.append(CCGNode(category, nodeType, parent=currentNode, level = level))
                currentNode = currentNode.children[-1]
            else: # Go back 1? level
                currentNode = currentNode.parent.parent
                currentNode.children.append(CCGNode(category, nodeType, parent=currentNode, level = level))
                currentNode = currentNode.children[-1]
                
            previousLevel = level
        print("Current state:")
        print(topNode)

print(topNode)

CCGNODE none none

Current state:
CCGNODE none none
 CCGNODE s:dcl ba

Current state:
CCGNODE none none
 CCGNODE s:dcl ba
  CCGNODE np ba

Current state:
CCGNODE none none
 CCGNODE s:dcl ba
  CCGNODE np ba
   CCGNODE np fa

Current state:
CCGNODE none none
 CCGNODE s:dcl ba
  CCGNODE np ba
   CCGNODE np fa
    CCGTOKEN A  
Current state:
CCGNODE none none
 CCGNODE s:dcl ba
  CCGNODE np ba
   CCGNODE np fa
    CCGTOKEN A  
    CCGNODE n fa

Current state:
CCGNODE none none
 CCGNODE s:dcl ba
  CCGNODE np ba
   CCGNODE np fa
    CCGTOKEN A  
    CCGNODE n fa
     CCGTOKEN brown  
Current state:
CCGNODE none none
 CCGNODE s:dcl ba
  CCGNODE np ba
   CCGNODE np fa
    CCGTOKEN A  
    CCGNODE n fa
     CCGTOKEN brown  
     CCGTOKEN dog  
Current state:
CCGNODE none none
 CCGNODE s:dcl ba
  CCGNODE np ba
   CCGNODE np fa
    CCGTOKEN A  
    CCGNODE n fa
     CCGTOKEN brown  
     CCGTOKEN dog  
   CCGNODE np\np conj

Current state:
CCGNODE none none
 CCGNODE s:dcl ba
  CCGNODE np ba
   CCG

In [31]:
topNode = None
currentNode = None

In [37]:
bla = CCGNode('')

print(bla)

CCGNODE: 140033530569680
 140033539640768
 96
 140033539483392
 96
 140033539484016
 96
 140033539640960
 0
 140033539482288
 0
 140033539483440
 0
 140033530558256
 0
 140033530560128
 0
 140033530559984
 0
 140033530560368
 0
 140033530558400
 96
 140033530557248
 0
 140033539485552
 0
 140033530556864
 0
 140033530559024
 0
 140033530558688
 0
 140033539483632
 96
 140033539620576
 96
 140033539483104
 96
 140033539482336
 0
 140033539485120
 0
 140033539482096
 0
 140033600673056
 0
 140033530524576
 0
 140033530526400
 0
 140033530527456
 0
 140033530570112
 96
 140033530571168
 0
 140033530572224
 0
 140033530058016
 0
 140033530059840
 0
 140033530069152
 0
 140033601053984
 96
 140033530571984
 96
 140033530571600
 96
 140033530571360
 0
 140033600674112
 0
 140033530526688
 0
 140033530524768
 0
 140033539738016
 0
 140033539482576
 0
 140033539620240
 0
 140033539738544
 96
 140033539738160
 0
 140033539620432
 0
 140033530525440
 0
 140033530527600
 0
 140033530568768
 0
 14

In [27]:
print(dir())
print(globals())

['CCGNode', 'CCGToken', 'In', 'Out', '_', '__', '___', '__builtin__', '__builtins__', '__doc__', '__loader__', '__name__', '__package__', '__spec__', '_dh', '_i', '_i1', '_i10', '_i11', '_i12', '_i13', '_i14', '_i15', '_i16', '_i17', '_i18', '_i19', '_i2', '_i20', '_i21', '_i22', '_i23', '_i24', '_i25', '_i26', '_i27', '_i3', '_i4', '_i5', '_i6', '_i7', '_i8', '_i9', '_ih', '_ii', '_iii', '_oh', 'bla', 'category', 'content', 'currentNode', 'exit', 'file', 'file_path', 'get_ipython', 'level', 'line', 'mapping', 'ner_tags', 'nodeType', 'os', 'previousLevel', 'quit', 're', 'sentence', 'sentence_id', 'skipping', 'token_idx', 'tokens', 'topNode', 'trimmedLine', 'verbnet']
{'__name__': '__main__', '__doc__': 'Automatically created module for IPython interactive environment', '__package__': None, '__loader__': None, '__spec__': None, '__builtin__': <module 'builtins' (built-in)>, '__builtins__': <module 'builtins' (built-in)>, '_ih': ['', "import re\nimport os\n# Example with one sentence:\n#

In [None]:
print(CCGToken)

## Old approach

In [None]:
# get the roles for each token from the (parse/drs?)
token_idx = 0
with open(file_path + "en.parse.tags") as file:
    # collect all the NP and corresponding words
    NPs = []
    for line in file:
        if re.search('([fb]a\(np),', line):
            NPs.append([re.search('([fb]a\(np),', line).group(1)])
            print("Bla,", NPs[-1])
        
        # Find a CCG for a token
        if token_idx < len(tokens) and line.find(tokens[token_idx]) != -1: # TODO: fix to make sure single letter/small words found correctly
            print(line)
#             print("Contains token:", tokens[token_idx], "at position", line.find(tokens[token_idx]))
            
            # Add the token to its closes NP
            NPs[-1].append(tokens[token_idx])
            
            # Check whether any of the roles are in the sentence
            roles = [r for r in mapping.keys() if r in line]
            if roles:
#                 print(line)
                print("Has role:", roles)
                
                # Find the CCG (using regex after "t(", finding last \np or /np before ,)
                ccg = re.search('([\\/\\\][np]+),', line) # TODO: make sure I didn't think too simple for this
                ccg = ccg.group(1)
                print("CCG (last part):", ccg)
                
                # Find whether role should be placed using forward/backward application
                if ccg[0] == "\\": #backwards
                    print("backwards")
                    # Look at all previous "ba(np" and find the one corresponding to this ccg
                elif ccg[0] == "\/": #forwards
                    print("forwards")
                    # Look at all next "fa(np" and find the one corresponding to this ccg
                    
                # Find place of NP for role ("fa(np" or "ba(np")
                # Find all word positions belonging to NP and assign their tokens the correct role?
                # TODO: make sure it is fine to only look at NPs
            token_idx += 1
    print("NPs and their tokens:", NPs)

print("tokens:", tokens)
print("sentence:", sentence)
print("ner_tags:", ner_tags)

In [None]:
txt = r"t(s:ng\np, 'fighting', [from:31, to:39, pos:'VBG', lemma:'fight', sem:'EXG', wordnet:'fight.v.01', verbnet:['Agent']])),"
print(re.search('([\\/\\\][np]+),', txt))
print(re.search('([\\/\\\][np]+),', txt).group(1))
# print(re.search('([\\/\\\][np]+),', "r\'"+txt+"\'"))

# Graph/DRS data preparation

In [None]:
import re

In [None]:
def preprocess_sentence(file_path, save_path):
    file_path = r'C:\Users\bikow\Documents\AI\MSc\Computational Semantics\pmb-sample-4.0.0\data\en\gold\p00\d0004\en.drs.sbn'
    save_path = r'C:\Users\bikow\Documents\AI\MSc\Computational Semantics\sentences.csv'
    processed_text = ""
    with open(file_path) as file:
        for line in file:
            if line[0] != "%":
                text = line
                # remove extra whitespace
                text = " ".join(text.split()).strip()
                # remove comments
                comment_start = text.find("%")
                if comment_start > -1:
                    text = text[:comment_start]
                processed_text += text
    print("Processed text:", processed_text)
    
    # method to replace roles with "[X]" or [MASK]
    roles = [] # fill in list when only specific roles (so, not Sub etc.)
    if not roles:
        # find roles by using -1 and +2, etc.
        roles = re.findall('\d (\w+) [-+]', processed_text)
        print("The roles masked are:", roles)
        
    final_text = processed_text
    for role in roles:
        final_text = final_text.replace(role, "[X]")
    print("\nFully masked text:", final_text)
    
    print("\nText with [MASK] for roles:")
    instances = [m.start()-1 for m in re.finditer("[X]", final_text)]
    for instance in instances:
        before = final_text[:instance]
        after = final_text[instance:].replace("[X]", "[MASK]", 1)
        masked_text = before + after
        print(masked_text)
        
preprocess_sentence(None, None)

In [None]:
import re

# Need to find: ["Colour", "Sub", "Sub", "Colour", "Agent", "Time", "Location"]
s = 'brown.a.01 dog.n.01 Colour -1 entity.n.01 Sub -1 Sub +2 grey.a.01 dog.n.01 Colour -1 time.n.08 EQU now fight.v.01 Agent -4 Time -1 Location +1 snow.n.02 '

# result = re.findall('\d (.*?) [-+]\d', s)
# print(result)
result = re.findall('\d (\w+) [-+]', s)
print(result)

In [None]:
import os
#letters = ["Alef", "Ayin", "Bet", "Dalet", "Gimel", "He", "Het", "Kaf", "Kaf-final", "Lamed", "Mem", "Mem-medial", "Nun-final", "Nun-medial", "Pe", "Pe-final", "Qof", "Resh", "Samekh", "Shin", "Taw", "Tet", "Tsadi-final", "Tsadi-medial", "Waw", "Yod", "Zayin"]
# os.chdir(cwd)
# r'C:\Users\bikow\Documents\AI\MSc\Computational Semantics\pmb-sample-4.0.0\data\en\gold\p00\d0004\en.drs.sbn'
original_dataset_path = "pmb-sample-4.0.0\data\en\gold"
output_path = "augment"

for classname in os.listdir(original_dataset_path):
    class_input_dir = os.path.join(original_dataset_path, classname)
    class_output_dir = os.path.join(output_path, classname)
    # Create a new directory if it did not yet exist
    if not os.path.exists(class_output_dir):
        os.makedirs(class_output_dir)
    nr = 0
    size = len(os.listdir(class_input_dir))
    for filename in os.listdir(class_input_dir):
        if nr % int(size/10) == 0:
            print(nr, " out of ", size)
        if True:#try:
            f = os.path.join(class_input_dir,filename)
            if os.path.isfile(f):
                nr += 1
                augment(f, classname, nr, class_output_dir)
                cv2.destroyAllWindows()
#                 break
#         except:
#             print("error")
#             break
#             print(f)

# Fill-mask with specified masks?

In [None]:
# import torch
# from torch.utils.data import Dataset

# class MyDataset(Dataset):
#     def __init__(self, sentences, labels):
#         self.sentences = sentences
#         self.labels = labels

#     def __len__(self):
#         return len(self.sentences)

#     def __getitem__(self, idx):
#         input_ids = self.sentences[idx]['input_ids']
#         labels = self.labels[idx]

#         # Convert to tensors
#         input_ids_tensor = torch.tensor(input_ids)
#         labels_tensor = torch.tensor(labels)

#         return {
#             'input_ids': input_ids_tensor,
#             'labels': labels_tensor
#         }


import torch
from transformers import BertTokenizer, BertForMaskedLM
from torch.utils.data import DataLoader, Dataset

# Define your dataset class
class MaskedSentenceDataset(Dataset):
    def __init__(self, sentences, masked_indices, labels):
        self.sentences = sentences
        self.masked_indices = masked_indices
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        self.sentences_idx = tokenizer.encode(self.sentences[idx], add_special_tokens=True)
        self.masked_indices_idx = self.masked_indices[idx]
        self.labels_idx = tokenizer.encode(self.labels[idx], add_special_tokens=True)
        
#         self.sentences_tensor = torch.tensor(self.sentences_idx)
#         self.masked_indices_tensor = torch.tensor(self.masked_indices_idx)
#         self.labels_tensor = torch.tensor(self.labels_idx)
        
        return {
            'input_ids': torch.tensor(self.sentences_idx),
            'masked_indices': torch.tensor(self.masked_indices_idx),
            'labels': torch.tensor(self.labels_idx)
        }

# Example data
sentences = ["This is a sample sentence.", "Another example sentence."]
masked_indices = [[2], [1]]
labels = ["masked", "example"]

# Tokenize input sentences
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create dataset
dataset = MaskedSentenceDataset(sentences, masked_indices, labels)

# Initialize the model
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Define DataLoader and other training parameters


from torch.nn.utils.rnn import pad_sequence

def custom_collate(batch):
    input_ids = [item['input_ids'] for item in batch]
    masked_indices = [item['masked_indices'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Pad sequences to the same length
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
    # Adjust the padding_value according to your tokenizer

    return {
        'input_ids': input_ids_padded,
        'masked_indices': masked_indices,
        'labels': labels
    }

# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)



from torch.utils.data import DataLoader

# Assuming you already have your MaskedSentenceDataset instance 'dataset'
batch_size = 32  # Adjust the batch size according to your preferences

# Create DataLoader
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)


num_epochs = 1
# Training loop
for epoch in range(num_epochs):
    for batch in dataloader:
        inputs = batch['input_ids']
        mask_indices = batch['masked_indices']
        labels = batch['labels']

        outputs = model(inputs, labels=inputs)
        loss = outputs.loss

        # Backpropagation and optimization steps

# Evaluation and inference steps


In [None]:
# # Input sentence with a mask token [MASK]
# input_sentence = "The [MASK] is blue."

# # Tokenize the input sentence
# tokenized_input = tokenizer(input_sentence, return_tensors='pt')


# # Get the position of the masked token in the input
# mask_token_index = (tokenized_input['input_ids'] == tokenizer.mask_token_id).nonzero().item()

# # Forward pass to get predictions
# with torch.no_grad():
#     outputs = model(**tokenized_input)

# # Get the predicted logits for the masked token
# predictions = outputs.logits[:, mask_token_index, :]

# # Get the predicted token ID (argmax or sampling)
# predicted_token_id = torch.argmax(predictions).item()

# # Convert the predicted token ID back to a word
# predicted_word = tokenizer.decode(predicted_token_id)


# print(f"The predicted word for the mask is: {predicted_word}")

# model_inputs = tokenizer(["ما لون السماء؟"], return_tensors="pt")

s1 = "This is [MASK] sample sentence."
s2 = "Another [MASK] sentence."
s3 = "The [MASK] is blue."
model_inputs = tokenizer([s3], return_tensors="pt")

generated_ids = model.generate(**model_inputs)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])

In [None]:
import torch
from transformers import BertForMaskedLM, BertTokenizer

# Load pre-trained model and tokenizer
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Input sentence with [MASK]
# input_sentence = "The cat [MASK] on the mat."
input_sentence = "The cat sat on the mat."

# Tokenize input and get labels
tokenized_input = tokenizer(input_sentence, return_tensors='pt')
labels = tokenized_input['input_ids'].clone()

# Replace a token with [MASK] (e.g., the word "sat")
masked_position = 4
tokenized_input['input_ids'][0, masked_position] = tokenizer.mask_token_id

for i in range(100000):
    if i%100 == 0:
        print("Epoch:", i)
        model_inputs = tokenizer([s4], return_tensors="pt")

        generated_ids = model.generate(**model_inputs)
        print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
    # Forward pass
    outputs = model(**tokenized_input)
    logits = outputs.logits

    # Calculate loss
    criterion = torch.nn.CrossEntropyLoss()
    loss = criterion(logits[:, masked_position, :], labels[:, masked_position])

    # Backward pass and parameter update
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


In [None]:
s1 = "This is [MASK] sample sentence."
s2 = "Another [MASK] sentence."
s3 = "The [MASK] is blue."
s4 = "The cat [MASK] on the mat."
model_inputs = tokenizer([s4], return_tensors="pt")

generated_ids = model.generate(**model_inputs)
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])