In [1]:
import datasets
from datasets import Sequence
from datasets import ClassLabel
from hfmtl.tasks.sequence_classification import SequenceClassification
from hfmtl.tasks.token_classification import TokenClassification
from hfmtl.utils import *
from hfmtl.models import *

from PYEVALB.scorer import Scorer
from PYEVALB.summary import summary

from codelin.models.const_tree import C_Tree
from codelin.models.const_label import C_Label
from codelin.models.linearized_tree import LinearizedTree
from codelin.encs.constituent import *
from codelin.utils.constants import *

import easydict
from frozendict import frozendict
import os
import torch
import pandas as pd


# Load the three Penn Treebank splits; every line of a split file is kept as
# one right-stripped string (later parsed with C_Tree.from_string).
ptb_path = os.path.expanduser("~/Treebanks/const/PENN_TREEBANK/")


def _load_trees(split_file):
    """Return the right-stripped lines of ``split_file`` located in ``ptb_path``."""
    with open(os.path.join(ptb_path, split_file)) as handle:
        return [row.rstrip() for row in handle.read().splitlines()]


# Read order is unchanged: test, dev, train.
ptb_test = _load_trees("test.trees")
ptb_dev = _load_trees("dev.trees")
ptb_train = _load_trees("train.trees")

def generate_dataset_from_codelin(train_dset, dev_dset, test_dset):
    """Wrap three encoded splits into a Hugging Face ``DatasetDict``.

    Each argument is a dict of columns containing a ``"target"`` column with
    one list of label names per example. The label vocabulary is the sorted
    union of the labels seen in all three splits, and the ``"target"`` column
    of every split is cast to ``Sequence(ClassLabel)`` over that vocabulary.

    Returns:
        A ``datasets.DatasetDict`` with the keys ``"train"``, ``"validation"``
        and ``"test"`` (built from ``train_dset``, ``dev_dset`` and
        ``test_dset`` respectively).
    """
    raw_splits = (
        ("train", train_dset),
        ("validation", dev_dset),
        ("test", test_dset),
    )

    # Collect the label inventory shared by the three splits.
    vocabulary = set()
    for _, columns in raw_splits:
        for example_labels in columns["target"]:
            vocabulary.update(example_labels)
    label_names = sorted(vocabulary)

    converted = {}
    for split_name, columns in raw_splits:
        split = datasets.Dataset.from_dict(columns)
        target_feature = Sequence(
            ClassLabel(num_classes=len(label_names), names=label_names)
        )
        converted[split_name] = split.cast_column("target", target_feature)

    return datasets.DatasetDict(converted)

def encode_dset(encoder, dset):
    """Linearize every bracketed tree of ``dset`` with ``encoder``.

    Args:
        encoder: object whose ``encode(tree)`` returns a linearized tree
            exposing ``words`` and ``labels``.
        dset: iterable of tree strings accepted by ``C_Tree.from_string``.

    Returns:
        A ``(columns, longest)`` tuple. ``columns`` is
        ``{"tokens": [...], "target": [...]}`` with one list of words and one
        list of stringified labels per tree; ``longest`` is the largest word
        count seen (0 for an empty ``dset``).
    """
    token_rows = []
    label_rows = []
    longest = 0

    for bracketed in dset:
        linearized = encoder.encode(C_Tree.from_string(bracketed))
        token_rows.append(list(linearized.words))
        label_rows.append(list(map(str, linearized.labels)))
        longest = max(longest, len(linearized.words))

    return {"tokens": token_rows, "target": label_rows}, longest

def gen_dsets():
    """Build the list of CoDeLin constituent encoders to benchmark.

    Despite its name, this function creates encoders, not datasets.

    Returns:
        A list of ``{"name": str, "encoder": encoder}`` dicts in this order:
        the naive absolute, relative and dynamic families (each as plain,
        ``_br``, ``_bl``, then the reversed ``_r``, ``_r_br``, ``_r_bl``
        variants), the two gaps encoders, the three tetratag traversals and
        the three juxtaposed encoders.
    """
    # Keyword arguments common to every encoder.
    shared = {"separator": "[_]", "unary_joiner": "[+]", "binary_marker": "[b]"}

    catalogue = []

    def register(name, encoder):
        catalogue.append({"name": name, "encoder": encoder})

    # naive absolute / relative / dynamic encodings
    naive_families = (
        ("naive_absolute", C_NaiveAbsoluteEncoding),
        ("naive_relative", C_NaiveRelativeEncoding),
        ("naive_dynamic", C_NaiveDynamicEncoding),
    )
    # (name suffix, binary, binary_direction)
    naive_binarizations = (("", False, None), ("_br", True, "R"), ("_bl", True, "L"))
    for family, encoding_cls in naive_families:
        for reverse_tag, reverse in (("", False), ("_r", True)):
            for binary_tag, binary, direction in naive_binarizations:
                register(
                    family + reverse_tag + binary_tag,
                    encoding_cls(
                        reverse=reverse,
                        binary=binary,
                        binary_direction=direction,
                        **shared,
                    ),
                )

    # gaps encodings
    for direction in ("R", "L"):
        register(
            "gaps_" + direction.lower(),
            C_GapsEncoding(binary_direction=direction, **shared),
        )

    # tetratag encodings, one per traversal order
    for traversal in ("preorder", "inorder", "postorder"):
        register("tetratag_" + traversal, C_Tetratag(mode=traversal, **shared))

    # juxtaposed encodings
    juxtaposed_variants = (
        ("juxtaposed", False, None),
        ("juxtaposed_r", True, "R"),
        ("juxtaposed_l", True, "L"),
    )
    for name, binary, direction in juxtaposed_variants:
        register(
            name,
            C_JuxtaposedEncoding(binary=binary, binary_direction=direction, **shared),
        )

    return catalogue

# Model and trainer hyperparameters; passed to Model(...) and Trainer(...)
# further down in this file.
args = easydict.EasyDict({
    "max_seq_length": 128,

    "batch_size": 8,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size":  8,
    "gradient_accumulation_steps": 4,
    # NOTE(review): the recorded run of this notebook hit a CUDA
    # OutOfMemoryError during the per-epoch evaluation. Presumably these keys
    # are forwarded to Hugging Face TrainingArguments, in which case this
    # moves accumulated predictions to the CPU after every eval step instead
    # of keeping them all on the GPU -- confirm that hfmtl forwards it.
    "eval_accumulation_steps": 1,

    # run training and evaluation in 16-bit floating point
    "fp16_full_eval": True,
    "fp16": True,

    "num_train_epochs": 1,

    "learning_rate": 1e-5,
    "weight_decay": 0.01,
    "adam_epsilon": 1e-8,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "do_eval": False,
    "do_predict": False,
    "do_train": True,

    "evaluation_strategy": "epoch",
    "logging_strategy": "epoch",

    "overwrite_output_dir": True,

    "batch_truncation": True,
    "add_cls": True,
    "add_clf": True,
    "drop_probability": 0.1,
    # This key used to be listed twice (True, then False). In a dict literal
    # the last occurrence wins, so the effective value was already False;
    # only the dead first entry has been removed.
    "include_inputs_for_metrics": False,
    "model_name": "roberta-base"
    })

def delete_garbage():
    """Run a full garbage collection and release PyTorch's cached CUDA memory.

    The previous version also walked ``gc.get_objects()`` and executed
    ``del obj`` on every tensor it found. That statement only unbinds the
    loop variable; it never frees the tensor, which stays alive for as long
    as anything else references it. The loop was therefore a slow no-op
    (hidden behind a bare ``except``) and has been removed. Callers must drop
    their own references (``del model`` etc.) before calling this function.

    Returns:
        None.
    """
    # Local import: no top-level ``import gc`` is visible in this file.
    import gc

    # Break reference cycles so unreachable tensors are actually destroyed...
    gc.collect()
    # ...then hand the now-unused cached blocks back to the CUDA driver.
    torch.cuda.empty_cache()


# Train one model per selected encoding, decode its test-set predictions back
# into trees and score them against the gold trees with EVALB (PYEVALB).
encodings = gen_dsets()
results = {}        # rebound to the per-sentence EVALB results of the last scored task
train_limit = None  # set to an int to keep only the first N trees of every split
result_columns = ["encoding", "recall", "precision", "f1", "n_labels"]
result_rows = []
results_df = pd.DataFrame(columns=result_columns)


def _limit(trees):
    """Return the first ``train_limit`` trees, or all of them when no limit is set."""
    return trees[:train_limit] if train_limit else trees


def _first_subword_positions(encoding, words):
    """Return, for every word, the index of its first sub-word token.

    ``encoding`` must be the fast-tokenizer output for ``" ".join(words)``.
    The model emits one prediction per sub-word token while the decoder needs
    exactly one label per word, so each word is mapped to the token covering
    its first character.
    """
    positions = []
    char_start = 0
    for word in words:
        token_index = encoding.char_to_token(char_start)
        if token_index is None:
            # No token covers this character: reuse the previous word's token,
            # or the first token after the leading special token.
            token_index = positions[-1] if positions else 1
        positions.append(token_index)
        char_start += len(word) + 1  # the word plus the joining space
    return positions


delete_garbage()

# NOTE: the [1:2] slice restricts the run to the second encoder returned by
# gen_dsets(); widen it to benchmark more encodings.
for enc in encodings[1:2]:
    print("[GPU] Starting training; Allocated memory:", torch.cuda.memory_allocated()/1e6,"MB")
    print("[GPU] Starting training; Cached memory:", torch.cuda.memory_reserved()/1e6,"MB")
    encoder = enc["encoder"]
    test_trees = _limit(ptb_test)

    print("[DST] Encoding the datasets using CoDeLin")
    train_enc, mlt1 = encode_dset(encoder, _limit(ptb_train))
    dev_enc, mlt2   = encode_dset(encoder, _limit(ptb_dev))
    test_enc, mlt3  = encode_dset(encoder, test_trees)
    print("[DST] Max length of tree found: {}".format(max(mlt1, mlt2, mlt3)))
    print("[DST] Generating the PyTorch dataset object")
    dataset  = generate_dataset_from_codelin(train_enc, dev_enc, test_enc)

    tasks = [TokenClassification(
                dataset = dataset,
                name = enc["name"],
                tokenizer_kwargs = frozendict(padding="max_length", max_length=args.max_seq_length, truncation=True)
            )]

    model   = Model(tasks, args)            # list of models; by default, shared encoder, task-specific CLS token task-specific head
    trainer = Trainer(model, tasks, args)   # tasks are uniformly sampled by default

    # torch.cuda.memory_cached is deprecated; memory_reserved is its replacement.
    print("[GPU] Model created; Total allocated memory", torch.cuda.memory_allocated()/1e6,"MB")
    print("[GPU] Model created; Total cached memory", torch.cuda.memory_reserved()/1e6,"MB")
    device = torch.device("cuda")
    model.to(device)
    print("[GPU] Model sent to device; Total allocated memory", torch.cuda.memory_allocated()/1e6,"MB")
    print("[GPU] Model sent to device; Total cached memory", torch.cuda.memory_reserved()/1e6,"MB")

    trainer.train()
    print("[GPU] Model training finished; Total allocated memory", torch.cuda.memory_allocated()/1e6,"MB")
    print("[GPU] Model training finished; Total cached memory", torch.cuda.memory_reserved()/1e6,"MB")

    # save
    trainer.save_model(output_dir=f"models/{enc['name']}")

    # Inference only from here on: switch off dropout so predictions are deterministic.
    model.eval()

    for i, t in enumerate(tasks):
        dec_trees = []
        scorer = Scorer()

        for gold_tree in test_trees:
            tree = C_Tree.from_string(gold_tree)

            words = tree.get_words()
            postags = tree.get_postags()
            sentence = " ".join(words)

            encoding = trainer.tokenizer(sentence, return_tensors='pt')
            model_inputs = {k: v.to(device) for k, v in encoding.items()}

            # no_grad: do not build an autograd graph for a forward-only pass.
            with torch.no_grad():
                logits = model.task_models_list[i](**model_inputs).logits
            token_predictions = logits.argmax(-1)[0].tolist()

            # One label per word (first sub-word), not one per sub-word token;
            # otherwise words and labels disagree in length whenever a word is
            # split into several pieces.
            positions = _first_subword_positions(encoding, words)
            labels = [
                C_Label.from_string(t.label_names[token_predictions[p]], sep="[_]", uj="[+]")
                for p in positions
            ]

            lin_tree = LinearizedTree(words=words, postags=postags,
                        additional_feats=[], labels=labels, n_feats=0)
            dec_tree = encoder.decode(lin_tree).postprocess_tree(conflict_strat=C_STRAT_MAX, clean_nulls=True, default_root="S")
            dec_trees.append(str(dec_tree))

        results = scorer.score_corpus(test_trees, dec_trees)

        try:
            s = summary(results)

            # NOTE(review): indices kept exactly as in the original. If
            # summary() follows EVALB's report order (recall, precision,
            # f-measure, complete match) precision would be index 5, not 7
            # -- confirm against PYEVALB.
            recall = s[4]
            fscore = s[6]
            precision = s[7]
        except Exception as err:
            # Keep the run alive, but say why the scores are zero.
            print("[EVL] Could not summarise EVALB results for {}: {!r}".format(enc["name"], err))
            recall = 0
            fscore = 0
            precision = 0

        results_dict = {"recall":recall, "precision": precision, "f1": fscore, "n_labels": t.num_labels}
        # DataFrame.append was removed in pandas 2.0: collect plain rows and
        # rebuild the frame so partial results survive a later crash.
        result_rows.append({"encoding":enc["name"], **results_dict})
        results_df = pd.DataFrame(result_rows, columns=result_columns)
        print("****")
        print("Results for {}:".format(enc["name"]))
        print("Recall: {}".format(recall))
        print("Precision: {}".format(precision))
        print("F1: {}".format(fscore))
        print("****")

    # free memory
    del model
    del trainer
    del tasks

    delete_garbage()

    # garbage collection
    print("[GPU] End of training total allocated memory", torch.cuda.memory_allocated()/1e6,"MB")
    print("[GPU] End of training total cached memory", torch.cuda.memory_reserved()/1e6,"MB")


# save as latex
results_latex = results_df.to_latex()
with open("results.tex", "w") as f:
    f.write(results_latex)



[GPU] Starting training; Allocated memory: 0.0 MB
[GPU] Starting training; Cached memory: 0.0 MB
[DST] Encoding the datasets using CoDeLin
[DST] Max length of tree found: 141
[DST] Generating the PyTorch dataset object


Casting the dataset:   0%|          | 0/39832 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1700 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2416 [00:00<?, ? examples/s]

[*] Loaded TokenClassification task with 4702 labels
[MDL] Task 0 => naive_absolute_br with 4702 labels


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

[TRN] Initializing trainer with 1 GPUs
[TRN] Initializing trainer with device_batch_size = 8
[TRN] Initializing trainer with gradient_accumulation_steps = 4
[TRN] Preprocessing task naive_absolute_br


Map:   0%|          | 0/39832 [00:00<?, ? examples/s]

Map:   0%|          | 0/1700 [00:00<?, ? examples/s]

Map:   0%|          | 0/2416 [00:00<?, ? examples/s]

[GPU] Model created; Total allocated memory 512.224768 MB
[GPU] Model created; Total cached memory 549.453824 MB
[GPU] Model sent to device; Total allocated memory 512.224768 MB
[GPU] Model sent to device; Total cached memory 549.453824 MB


  0%|          | 0/1244 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[TRN] Saving model of task 0
[TRN] Saving model of task 0
{'loss': 4.8733, 'learning_rate': 8.038585209003216e-09, 'epoch': 1.0}
[TRN] Evaluating task naive_absolute_br


  0%|          | 0/213 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.73 GiB (GPU 0; 7.92 GiB total capacity; 3.64 GiB already allocated; 1.71 GiB free; 5.53 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF