In [1]:
import datasets

def load_conll_dataset(train_path, dev_path, test_path, target_label="hd"):
    """Load train/dev/test CoNLL-style files into a Hugging Face ``DatasetDict``.

    Every token line must hold ten tab-separated columns, named here (in file
    order) ``idx, word, lem, upos, xpos, morph, hd, rel, deprel, m``. Blank
    lines separate sentences. Lines that start with ``#`` and do not have ten
    columns are treated as comments and skipped.

    Args:
        train_path: Path of the training file.
        dev_path: Path of the development file (returned as ``"validation"``).
        test_path: Path of the test file.
        target_label: Name of the column used as the per-token label. Must be
            one of the ten column names listed above.

    Returns:
        ``datasets.DatasetDict`` with splits ``"train"``, ``"validation"`` and
        ``"test"``. Each split has a ``"tokens"`` column (list of word strings
        per sentence) and a ``"tags"`` column (list of integer label ids per
        sentence).

    Raises:
        ValueError: If ``target_label`` is not a known column name, or a
            non-comment line does not have exactly ten columns.
        OSError: If one of the files cannot be opened.
    """
    # Column names in file order; the position in this tuple is the field index.
    columns = ("idx", "word", "lem", "upos", "xpos", "morph", "hd", "rel", "deprel", "m")

    # Look the column up by name instead of eval()-ing the string: eval would
    # execute any expression passed in as ``target_label``.
    if target_label not in columns:
        raise ValueError(
            f"Unknown target_label {target_label!r}; expected one of {columns}"
        )

    def read_conll_file(file_path, target_label):
        """Parse one file into ``{"tokens": [...], "tags": [...]}`` with string tags."""
        word_col = columns.index("word")
        label_col = columns.index(target_label)

        sentences = [[]]
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                line = line.strip()

                if not line:
                    # A blank line closes the current sentence; consecutive
                    # blank lines must not create empty sentences.
                    if sentences[-1]:
                        sentences.append([])
                    continue

                fields = line.split("\t")
                if len(fields) != len(columns):
                    if line.startswith("#"):
                        # Comment / metadata line, carries no token.
                        continue
                    raise ValueError(
                        f"{file_path}:{line_no}: expected {len(columns)} "
                        f"tab-separated columns, found {len(fields)}"
                    )

                sentences[-1].append((fields[word_col], fields[label_col]))

        # Drop the trailing placeholder left by a final blank line or empty file.
        if not sentences[-1]:
            sentences.pop()

        # Convert sentences to Hugging Face Dataset format
        return {
            "tokens": [[token for token, _ in sentence] for sentence in sentences],
            "tags": [[label for _, label in sentence] for sentence in sentences],
        }

    def label_to_int(dataset, label_to_id):
        """Replace the string tags of ``dataset`` (in place) by their integer ids."""
        dataset["tags"] = [
            [label_to_id[label] for label in labels] for labels in dataset["tags"]
        ]
        return dataset

    train_dset = read_conll_file(train_path, target_label)
    dev_dset = read_conll_file(dev_path, target_label)
    test_dset = read_conll_file(test_path, target_label)

    # Collect the label inventory over all three splits so that a label which
    # only occurs in the test file still receives an id instead of raising
    # KeyError during conversion.
    label_set = set()
    for dset in (train_dset, dev_dset, test_dset):
        for labels in dset["tags"]:
            label_set.update(labels)

    # Sort before numbering: iterating a set of strings directly yields a
    # different order (and therefore different ids) from one run to the next.
    label_to_id = {label: i for i, label in enumerate(sorted(label_set))}

    # labels to int
    train_dset = label_to_int(train_dset, label_to_id)
    dev_dset = label_to_int(dev_dset, label_to_id)
    test_dset = label_to_int(test_dset, label_to_id)

    # Convert to Hugging Face DatasetDict format
    dataset = datasets.DatasetDict({
        "train": datasets.Dataset.from_dict(train_dset),
        "validation": datasets.Dataset.from_dict(dev_dset),
        "test": datasets.Dataset.from_dict(test_dset),
    })

    return dataset


# Build the dataset for the POS-tagging task: per-token labels are read from
# the `upos` column of each file.
postag_dataset = load_conll_dataset(
    train_path="./data/train.conllu",
    dev_path="./data/dev.conllu",
    test_path="./data/test.conllu",
    target_label="upos",
)

In [2]:
from src.tasks.sequence_classification import SequenceClassification
from src.tasks.token_classification import TokenClassification
from src.utils import *
from src.models import *
from datasets import load_dataset

import easydict


# Run configuration; the same object is handed to both Model and Trainer below.
args = easydict.EasyDict(
    dict(
        model_name="roberta-base",
        mtl_architecture="cls_task_embedding",
        learning_rate=3e-5,
        num_train_epochs=10,
    )
)

# Single token-classification task over the dataset built in the previous cell.
# NOTE(review): `tokens` / `y` presumably name the input and target columns of
# that dataset — confirm against TokenClassification.
postag = TokenClassification(
    dataset=postag_dataset,
    name="postag",
    tokens="tokens",
    y="tags",
)

tasks = [postag]

# list of models; by default, shared encoder, task-specific CLS token task-specific head
models = Model(tasks, args)

# tasks are uniformly sampled by default
trainer = Trainer(models, tasks, args)
trainer.train()

[*] Found task 0 => postag


  with safe_open(checkpoint_file, framework="pt") as f:
  return self.fget.__get__(instance, owner)()
  storage = cls(wrap_storage=untyped_storage)
  with safe_open(filename, framework="pt", device=device) as f:
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initi

[*] Preprocessing task 0 => postag


Map:   0%|          | 0/12532 [00:00<?, ? examples/s]

Map:   0%|          | 0/2001 [00:00<?, ? examples/s]

Map:   0%|          | 0/2073 [00:00<?, ? examples/s]

***** Running training *****
  Num examples = 12532
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 15670
  Number of trainable parameters = 133692148


  0%|          | 0/15670 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 