In [1]:
import datasets
from datasets import Sequence
from datasets import ClassLabel
def load_conll_dataset(train_path, dev_path, test_path, token_idx, label_idx, max_lines=10000):
    """Load train/dev/test CoNLL-style files into a Hugging Face ``DatasetDict``.

    Each file is read as one token per line with tab-separated columns; a
    blank line ends the current sentence.

    Args:
        train_path: Path to the training file.
        dev_path: Path to the development file (stored as the "validation" split).
        test_path: Path to the test file.
        token_idx: Zero-based index of the column holding the token text.
        label_idx: Zero-based index of the column holding the label.
        max_lines: Maximum number of physical lines (blank and comment lines
            included) read from each file. Defaults to 10000, the limit that
            used to be hard-coded; pass ``None`` to read whole files. When the
            cap falls in the middle of a sentence, the tokens read so far are
            kept as a truncated final sentence.

    Returns:
        A ``datasets.DatasetDict`` with "train", "validation" and "test"
        splits. Every split has a "tokens" column (list of strings per
        sentence) and a "target" column cast to ``Sequence(ClassLabel)``.
        The label names are the sorted union of the labels found in all
        three files, so the three splits share one label-to-id mapping.

    Raises:
        IndexError: If a non-comment line does not have the columns selected
            by ``token_idx`` / ``label_idx``.
    """

    def read_conll_file(file_path, token_idx, label_idx):
        """Parse one file into ``{"tokens": [...], "target": [...]}`` (one list per sentence)."""
        sentences = [[]]

        # CoNLL-U is defined as UTF-8; do not depend on the platform's locale default.
        with open(file_path, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                if max_lines is not None and line_no > max_lines:
                    break

                line = line.strip()

                if not line:
                    # Blank line: close the current sentence, ignoring repeated blanks.
                    if sentences[-1]:
                        sentences.append([])
                    continue

                columns = line.split('\t')
                try:
                    pair = (columns[token_idx], columns[label_idx])
                except IndexError as err:
                    # CoNLL-U metadata lines ("# sent_id = ...") have no tab-separated
                    # fields; skip them instead of crashing. A '#' row that does carry
                    # the requested columns is still treated as a token.
                    if line.startswith("#"):
                        continue
                    raise IndexError(
                        f"{file_path}, line {line_no}: cannot read token_idx={token_idx} / "
                        f"label_idx={label_idx} from a line with {len(columns)} tab-separated field(s)"
                    ) from err
                sentences[-1].append(pair)

        # Drop the empty placeholder left by a trailing blank line or an empty file.
        if not sentences[-1]:
            sentences.pop()

        # Column-oriented layout expected by datasets.Dataset.from_dict
        return {
            "tokens": [[token for token, label in sentence] for sentence in sentences],
            "target": [[label for token, label in sentence] for sentence in sentences],
        }

    # Insertion order fixes both the processing order and the split order of the result.
    raw_splits = {
        "train": read_conll_file(train_path, token_idx, label_idx),
        "validation": read_conll_file(dev_path, token_idx, label_idx),
        "test": read_conll_file(test_path, token_idx, label_idx),
    }

    # Collect the label vocabulary over all splits so ids are consistent between them.
    label_set = set()
    for columns in raw_splits.values():
        for labels in columns["target"]:
            label_set.update(labels)
    label_names = sorted(label_set)

    # Build each split and encode its string labels as ClassLabel ids.
    splits = {}
    for split_name, columns in raw_splits.items():
        split_dset = datasets.Dataset.from_dict(columns)
        splits[split_name] = split_dset.cast_column(
            "target", Sequence(ClassLabel(names=label_names))
        )

    return datasets.DatasetDict(splits)

# Smoke-test the loader: tokens come from column 1 and labels from column 3
# (the POS tags, as the printed label names show).
raw_dataset = load_conll_dataset(
    train_path="data/train.conllu",
    dev_path="data/dev.conllu",
    test_path="data/test.conllu",
    token_idx=1,
    label_idx=3,
)

# First training sentence: its tokens, then its integer-encoded labels.
print(raw_dataset["train"][0]["tokens"], raw_dataset["train"][0]["target"], sep="\n")
# Label vocabulary; the integers printed above index into this list.
print(raw_dataset["train"].features["target"].feature.names)


Casting the dataset:   0%|          | 0/413 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/665 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/676 [00:00<?, ? examples/s]

['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.']
[11, 12, 11, 12, 0, 7, 15, 11, 11, 11, 12, 11, 12, 5, 7, 1, 5, 7, 1, 5, 7, 1, 11, 12, 1, 5, 0, 7, 12]
['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']


The pipeline should take as input a `.json` file containing the training configuration. The data files are column-based, with columns separated by tabs (`\t`). The JSON file specifies which columns are the 'target' (label) columns; if a task lists more than one target column, the task is duplicated, once per target column.

In [2]:
from src.tasks.sequence_classification import SequenceClassification
from src.tasks.token_classification import TokenClassification
from src.utils import *
from src.models import *

import easydict
from frozendict import frozendict
import json

# Read the run configuration from config.json; EasyDict gives attribute access to its keys.
with open("config.json", "r", encoding="utf-8") as f:
    args = easydict.EasyDict(json.load(f))

# Maps the `task_type` string used in the config to the class implementing that task.
TASK_CLASSES = {
    "token_classification": TokenClassification,
    "sequence_classification": SequenceClassification,
}

tasks = []
for task in args.tasks:
    # Both task kinds read the same config keys (`task_type`, `task_name`); the
    # sequence-classification branch previously read `task.type` / `task.name`,
    # which do not match the keys used by the token-classification branch.
    if task.task_type not in TASK_CLASSES:
        raise ValueError(
            f"Unsupported task_type {task.task_type!r}; expected one of {sorted(TASK_CLASSES)}"
        )
    task_cls = TASK_CLASSES[task.task_type]

    # One task instance per label column: a config entry listing several
    # label columns is duplicated, each copy predicting a different column.
    for l_idx in task.label_idx:
        tasks.append(
            task_cls(
                dataset = load_conll_dataset(task.train_file, task.eval_file, task.test_file, task.tokens_idx, l_idx),
                name = task.task_name,
                tokenizer_kwargs = frozendict(padding="max_length", max_length=args.max_seq_length, truncation=True)
            )
        )


models  = Model(tasks, args) # list of models; by default, shared encoder, task-specific CLS token task-specific head 
trainer = Trainer(models, tasks, args) # tasks are uniformly sampled by default

trainer.train()

Casting the dataset:   0%|          | 0/413 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/665 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/676 [00:00<?, ? examples/s]

Labels for task:
['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']


Casting the dataset:   0%|          | 0/413 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/665 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/676 [00:00<?, ? examples/s]

Labels for task:
['0', '1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '79', '8', '9', '_']
[*] Found task 0 => conllu


  with safe_open(checkpoint_file, framework="pt") as f:
  return self.fget.__get__(instance, owner)()
  storage = cls(wrap_storage=untyped_storage)
  with safe_open(filename, framework="pt", device=device) as f:
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initi

[*] Found task 1 => conllu


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

label_names: ['labels']
[*] Preprocessing task 0 => conllu


Map:   0%|          | 0/413 [00:00<?, ? examples/s]

Map:   0%|          | 0/665 [00:00<?, ? examples/s]

Map:   0%|          | 0/676 [00:00<?, ? examples/s]

[*] Preprocessing task 1 => conllu


Map:   0%|          | 0/413 [00:00<?, ? examples/s]

Map:   0%|          | 0/665 [00:00<?, ? examples/s]

Map:   0%|          | 0/676 [00:00<?, ? examples/s]

***** Running training *****
  Num examples = 826
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 78
  Number of trainable parameters = 124130400


  0%|          | 0/78 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
***** Running Evaluation *****
  Num examples = 665
  Batch size = 8


{'loss': 3.4416, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.0}
[*] Evaluating task 0 => conllu


  0%|          | 0/21 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "conllu" of type <class 'str'> for key "eval/name" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
***** Running Evaluation *****
  Num examples = 665
  Batch size = 8


Computing metrics...
*** example:
labels: [['SCONJ', 'PRON', 'AUX', 'ADJ', 'ADJ', 'PART', 'VERB', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'NOUN', 'ADP'], ['INTJ', 'VERB', 'PRON', 'VERB', 'SCONJ', 'PRON', 'VERB', 'PRON', 'ADJ', 'PUNCT'], ['NOUN', 'NOUN', 'PUNCT'], ['VERB', 'VERB', 'VERB', 'VERB', 'VERB', 'NOUN', 'NUM', 'NUM', 'NUM', 'NOUN', 'NOUN', 'ADV', 'PUNCT'], ['PROPN', 'PROPN', 'PROPN', 'PUNCT']]
predictions: [['NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN'], ['PROPN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'PROPN', 'PROPN', 'NOUN', 'PROPN', 'PROPN'], ['NOUN', 'NOUN', 'NOUN'], ['NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'PROPN'], ['PROPN', 'PROPN', 'PROPN', 'NOUN']]
***
{'eval_loss': 2.4672038555145264, 'eval_precision': 0.04501607717041801, 'eval_recall': 0.014033680834001604, 'eval_f1': 0.021396912731163075, 'eval_accuracy': 0.203514330980844, 'eval_name': 'conllu', 'eval_

  0%|          | 0/21 [00:00<?, ?it/s]

Trainer is attempting to log a value of "conllu" of type <class 'str'> for key "eval/name" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


Computing metrics...
*** example:
labels: [['3', '3', '6', '6', '6', '0', '6', '9', '6', '6'], ['2', '0', '0', '4', '2', '2'], ['0', '0', '1', '1', '1', '1', '5', '5', '7', '7', '7'], ['3', '3', '0', '0', '5', '3', '3', '3', '3'], ['4', '4', '4', '0', '0', '0', '4', '4']]
predictions: [['0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '0', '0', '0', '0', '0', '0', '0'], ['0', '0', '6', '6', '0', '0', '6', '50']]
***
{'eval_loss': 3.733130931854248, 'eval_precision': 0.0032679738562091504, 'eval_recall': 0.0008857395925597874, 'eval_f1': 0.001393728222996516, 'eval_accuracy': 0.13610831052858996, 'eval_name': 'conllu', 'eval_size': 665, 'eval_index': 1, 'eval_runtime': 6.9657, 'eval_samples_per_second': 95.468, 'eval_steps_per_second': 12.059, 'epoch': 1.0}


***** Running Evaluation *****
  Num examples = 665
  Batch size = 8


{'loss': 2.9504, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.0}
[*] Evaluating task 0 => conllu


  0%|          | 0/21 [00:00<?, ?it/s]

KeyboardInterrupt: 