In [1]:
import datasets
from datasets import Sequence
from datasets import ClassLabel
from hfmtl.tasks.sequence_classification import SequenceClassification
from hfmtl.tasks.token_classification import TokenClassification
from hfmtl.utils import *
from hfmtl.models import *

from codelin.models.const_tree import C_Tree
from codelin.encs.constituent import *
from codelin.utils.constants import *

import easydict
from frozendict import frozendict
import json
import os
import pandas as pd

# Directory holding the Penn Treebank split files, with "~" expanded to the
# current user's home directory.
ptb_path = os.path.expanduser("~/Treebanks/const/PENN_TREEBANK/")


def generate_dataset_from_codelin(train_dset, dev_dset, test_dset):
    """Build a Hugging Face ``DatasetDict`` from three encoded splits.

    Args:
        train_dset: Column-oriented mapping for the training split. Its
            ``"target"`` entry must be an iterable of per-sentence label
            sequences.
        dev_dset: Same structure, for the development split.
        test_dset: Same structure, for the test split.

    Returns:
        A ``datasets.DatasetDict`` with the keys ``"train"``,
        ``"validation"`` and ``"test"``. In every split the ``"target"``
        column is cast to ``Sequence(ClassLabel)`` over one shared label
        vocabulary: the sorted union of the labels found in all three
        splits.
    """
    splits = {"train": train_dset, "validation": dev_dset, "test": test_dset}

    # Collect the label vocabulary over all splits so that a label appearing
    # only in dev/test still receives a class id.
    label_set = set()
    for columns in splits.values():
        for labels in columns["target"]:
            label_set.update(labels)
    label_names = sorted(label_set)

    def _to_dataset(columns):
        # A fresh feature object per split, cast onto the "target" column.
        target_feature = Sequence(ClassLabel(num_classes=len(label_names), names=label_names))
        return datasets.Dataset.from_dict(columns).cast_column("target", target_feature)

    # Convert to Hugging Face DatasetDict format
    return datasets.DatasetDict(
        {split_name: _to_dataset(columns) for split_name, columns in splits.items()}
    )

def encode_dset(encoder, dset):
    """Linearize tree strings into parallel token and label columns.

    Args:
        encoder: Object whose ``encode(tree)`` result exposes ``words`` and
            ``labels`` iterables.
        dset: Iterable of tree strings, each parsed with
            ``C_Tree.from_string``.

    Returns:
        A dict with two lists of equal length: ``"tokens"`` (one list of
        words per tree) and ``"target"`` (one list of ``str``-converted
        labels per tree).
    """
    token_rows = []
    label_rows = []
    for tree_string in dset:
        linearized = encoder.encode(C_Tree.from_string(tree_string))
        token_rows.append(list(linearized.words))
        label_rows.append([str(label) for label in linearized.labels])

    return {"tokens": token_rows, "target": label_rows}

def _read_tree_lines(file_name):
    """Return the right-stripped lines of ``file_name`` located in ``ptb_path``."""
    with open(os.path.join(ptb_path, file_name)) as handle:
        return [row.rstrip() for row in handle.read().splitlines()]


# Each line is later handed to C_Tree.from_string as one tree.
ptb_test = _read_tree_lines("test.trees")
ptb_dev = _read_tree_lines("dev.trees")
ptb_train = _read_tree_lines("train.trees")

# Run configuration passed to Model and Trainer, and read when building the
# tokenizer kwargs for each task.
args = easydict.EasyDict({
    # Batching
    "batch_size": 32,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,

    "num_train_epochs": 1,

    # Optimizer
    "learning_rate": 1e-5,
    "weight_decay": 0.01,
    "adam_epsilon": 1e-8,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,

    # Defined exactly once: a dict literal silently keeps only the last of
    # any duplicated key, so a second entry would be dead configuration.
    "max_seq_length": 64,

    # Which phases to run
    "do_eval": True,
    "do_train": True,
    "do_predict": True,

    # Evaluation / checkpointing / logging cadence
    "evaluation_strategy": "epoch",

    "save_strategy": "epoch",
    "overwrite_output_dir": True,

    "logging_strategy": "steps",
    "logging_steps": 100,

    "load_best_model_at_end": True,

    # Model options
    "batch_truncation": True,
    "add_cls": True,
    "add_clf": True,
    "drop_probability": 0.1,
    "include_inputs_for_metrics": True,
    "model_name": "roberta-base"
    })


# Keyword arguments shared by the absolute and the relative encoder.
encoder_options = dict(
    separator="[_]",
    unary_joiner="[+]",
    reverse=False,
    binary=False,
    binary_direction=None,
    binary_marker="[b]",
)


def _encode_splits(tree_encoder, limit=100):
    """Encode the first ``limit`` trees of train/dev/test with ``tree_encoder``."""
    return generate_dataset_from_codelin(
        encode_dset(tree_encoder, ptb_train[:limit]),
        encode_dset(tree_encoder, ptb_dev[:limit]),
        encode_dset(tree_encoder, ptb_test[:limit]),
    )


# Same trees, two different linearizations.
encoder = C_NaiveAbsoluteEncoding(**encoder_options)
dataset_abs = _encode_splits(encoder)

encoder = C_NaiveRelativeEncoding(**encoder_options)
dataset_rel = _encode_splits(encoder)

# One token-classification task per encoded dataset; both use the same
# tokenizer settings (pad and truncate to args.max_seq_length).
tasks = [
    TokenClassification(
        dataset=task_dataset,
        name=task_name,
        tokenizer_kwargs=frozendict(padding="max_length", max_length=args.max_seq_length, truncation=True),
    )
    for task_name, task_dataset in (
        ("encoding_abs", dataset_abs),
        ("encoding_rel", dataset_rel),
    )
]
        
# Build the multi-task model and its trainer from the task list and the run
# configuration in `args`.
model   = Model(tasks, args) # list of models; by default, shared encoder, task-specific CLS token task-specific head 
trainer = Trainer(model, tasks, args) # tasks are uniformly sampled by default

# Train, evaluate, then save the model under ./saved.
trainer.train()
trainer.evaluate()
trainer.save_model("./saved")
# The result is indexed by task name further down.
# NOTE(review): predict(None) presumably falls back to each task's own test
# split -- confirm against the hfmtl Trainer implementation.
test_prediction = trainer.predict(None)

# Print the decoded label sequence of the first five predictions of each task.
for task_index, task_name in enumerate(model.task_names):
    task_predictions = test_prediction[task_name]
    # factorize() is called before the task's model config is read.
    model.factorize(task_index = task_index)
    id2label = model.task_models_list[task_index].config.id2label
    for predicted_ids in task_predictions[:5]:
        # Entries equal to -100 are skipped (presumably the ignore index for
        # positions without a label -- confirm).
        print([id2label[label_id] for label_id in predicted_ids if label_id != -100])



  from .autonotebook import tqdm as notebook_tqdm
                                                                   

[*] Loaded TokenClassification task with 410 labels: ['10[_]ADJP', '10[_]ADJP[+]QP', '10[_]ADVP', '10[_]NP', '10[_]NP[+]QP', '10[_]NP[_]NP', '10[_]NP[_]NX', '10[_]NP[_]VP', '10[_]PP', '10[_]PP[_]INTJ', '10[_]PRN', '10[_]QP', '10[_]S', '10[_]SBAR', '10[_]SBAR[+]S[_]NP', '10[_]SBAR[_]WHNP', '10[_]S[+]VP', '10[_]S[+]VP[_]NP', '10[_]S[+]VP[_]PRT', '10[_]S[_]NP', '10[_]VP', '10[_]VP[_]NP', '11[_]ADJP', '11[_]ADJP[+]QP', '11[_]ADVP', '11[_]ADVP[_]ADVP', '11[_]NP', '11[_]NP[+]QP', '11[_]NP[_]NP', '11[_]PP', '11[_]PP[_]NP', '11[_]QP', '11[_]S', '11[_]SBAR', '11[_]SBAR[+]S', '11[_]SBAR[+]S[+]VP', '11[_]SBAR[_]WHADVP', '11[_]SBAR[_]WHNP', '11[_]S[+]VP', '11[_]S[_]NP', '11[_]UCP', '11[_]VP', '11[_]VP[_]ADVP', '11[_]VP[_]NP', '11[_]VP[_]PP', '11[_]WHPP', '12[_]ADJP', '12[_]ADVP', '12[_]CONJP', '12[_]NP', '12[_]NP[_]ADVP', '12[_]NP[_]NP', '12[_]PP', '12[_]QP', '12[_]S', '12[_]SBAR[+]S', '12[_]SBAR[+]S[+]VP', '12[_]SBAR[_]WHNP', '12[_]S[+]VP', '12[_]S[+]VP[_]NP', '12[_]S[+]VP[_]VP', '12[_]S[_]NP', '

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

[*] Found task 1 => encoding_rel with 331 labels


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

[*] Preprocessing task 0 => encoding_abs


                                                   

[*] Preprocessing task 1 => encoding_rel


  0%|          | 0/8 [00:00<?, ?it/s]              You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 8/8 [00:02<00:00,  4.83it/s]

[*] Evaluating task 0 => encoding_abs


                                             
100%|██████████| 8/8 [00:02<00:00,  4.83it/s]

{'eval_loss': 6.022686004638672, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.0006236357966947302, 'eval_name': 'encoding_abs', 'eval_size': 100, 'eval_index': 0, 'eval_runtime': 0.4531, 'eval_samples_per_second': 220.712, 'eval_steps_per_second': 8.828, 'epoch': 1.0}
[*] Evaluating task 1 => encoding_rel


                                             
100%|██████████| 8/8 [00:03<00:00,  4.83it/s]

{'eval_loss': 5.805825233459473, 'eval_precision': 0.0038910505836575876, 'eval_recall': 0.0010090817356205853, 'eval_f1': 0.0016025641025641025, 'eval_accuracy': 0.0003118178983473651, 'eval_name': 'encoding_rel', 'eval_size': 100, 'eval_index': 1, 'eval_runtime': 0.4478, 'eval_samples_per_second': 223.308, 'eval_steps_per_second': 8.932, 'epoch': 1.0}
[*] Calling save_model with task_idx 0
[*] Initializing adapter with 2 classifiers


There were missing keys in the checkpoint model loaded: ['Z', 'shared_encoder.embeddings.position_ids', 'shared_encoder.embeddings.word_embeddings.weight', 'shared_encoder.embeddings.position_embeddings.weight', 'shared_encoder.embeddings.token_type_embeddings.weight', 'shared_encoder.embeddings.LayerNorm.weight', 'shared_encoder.embeddings.LayerNorm.bias', 'shared_encoder.encoder.layer.0.attention.self.query.weight', 'shared_encoder.encoder.layer.0.attention.self.query.bias', 'shared_encoder.encoder.layer.0.attention.self.key.weight', 'shared_encoder.encoder.layer.0.attention.self.key.bias', 'shared_encoder.encoder.layer.0.attention.self.value.weight', 'shared_encoder.encoder.layer.0.attention.self.value.bias', 'shared_encoder.encoder.layer.0.attention.output.dense.weight', 'shared_encoder.encoder.layer.0.attention.output.dense.bias', 'shared_encoder.encoder.layer.0.attention.output.LayerNorm.weight', 'shared_encoder.encoder.layer.0.attention.output.LayerNorm.bias', 'shared_encoder.en

{'train_runtime': 11.3269, 'train_samples_per_second': 0.706, 'train_steps_per_second': 0.706, 'train_loss': 5.922636985778809, 'epoch': 1.0}
[*] Evaluating task 0 => encoding_abs


100%|██████████| 4/4 [00:00<00:00, 12.12it/s]


[*] Evaluating task 1 => encoding_rel


100%|██████████| 4/4 [00:00<00:00, 12.57it/s]


[*] Calling save_model with task_idx 0
[*] Initializing adapter with 2 classifiers
[*] Predicting task 0 => encoding_abs


100%|██████████| 4/4 [00:00<00:00, 11.81it/s]


[*] Predicting task 1 => encoding_rel


100%|██████████| 4/4 [00:00<00:00, 12.69it/s]

[*] Initializing adapter with 2 classifiers
['2[_]PP', '3[_]NP', '1[_]S', '1[_]S', '2[_]NP', '1[_]S', '1[_]S', '2[_]VP', '3[_]ADVP', '5[_]QP', '4[_]NP', '1[_]S', '1[_]S']
['1[_]S', '2[_]PP', '3[_]NP', '3[_]NP', '3[_]NP', '1[_]S', '1[_]S', '1[_]S', '1[_]S', '2[_]NP', '2[_]NP', '2[_]NP', '2[_]NP', '1[_]S', '1[_]S', '1[_]S', '2[_]PP', '4[_]QP', '4[_]QP', '3[_]NP', '1[_]S', '2[_]VP', '2[_]VP', '5[_]QP', '4[_]NP', '3[_]NP', '4[_]PP', '6[_]NP', '5[_]NP', '5[_]NP', '6[_]NP', '2[_]VP', '2[_]VP', '4[_]S[+]VP', '4[_]S[+]VP', '4[_]S[+]VP[_]PRT', '6[_]NP', '7[_]ADJP[_]ADJP', '7[_]ADJP[_]ADJP', '7[_]ADJP[_]ADJP', '7[_]ADJP[_]ADJP', '7[_]ADJP[_]ADJP', '7[_]ADJP', '7[_]ADJP', '8[_]ADJP', '8[_]ADJP', '8[_]ADJP', '7[_]ADJP', '6[_]NP', '5[_]NP', '6[_]PP', '7[_]NP', '4[_]S[+]VP', '5[_]PP', '6[_]NP', '6[_]NP', '6[_]NP', '6[_]NP', '6[_]NP', '6[_]NP', '1[_]S', '1[_]S']
['2[_]NP', '1[_]S', '1[_]S', '1[_]S', '1[_]S', '2[_]VP', '2[_]VP[_]ADVP', '2[_]VP[_]ADVP', '1[_]S[_]VP', '1[_]S[_]VP', '1[_]S']
['2[_]S[_]NP


