In [1]:
import datasets
from datasets import Sequence
from datasets import ClassLabel
def load_conll_dataset(train_path, dev_path, test_path, token_idx, label_idx):
    """Load three tab-separated CoNLL-style files into a ``DatasetDict``.

    Every non-blank line describes one token; its columns are separated by
    tabs and ``token_idx`` / ``label_idx`` pick the token and label columns.
    Blank lines separate sentences.

    Args:
        train_path: Path of the training file.
        dev_path: Path of the development file (stored as ``"validation"``).
        test_path: Path of the test file.
        token_idx: Index of the column that holds the token text.
        label_idx: Index of the column that holds the label.

    Returns:
        A ``datasets.DatasetDict`` with the splits ``"train"``,
        ``"validation"`` and ``"test"``. Each split has a ``"tokens"`` column
        (one list of strings per sentence) and a ``"target"`` column cast to
        ``Sequence(ClassLabel(...))``. The label names are the sorted union of
        the labels found in all three files, so label ids agree across splits.

    Raises:
        IndexError: If a line that is not a ``#`` comment has too few columns
            for ``token_idx`` / ``label_idx``.
    """

    def read_conll_file(file_path, token_idx, label_idx):
        """Parse one file into ``{"tokens": [...], "target": [...]}``."""
        sentences = []
        current = []

        # Explicit encoding so that parsing does not depend on the platform's
        # locale default.
        with open(file_path, "r", encoding="utf-8") as f:
            for line_no, raw_line in enumerate(f, start=1):
                line = raw_line.strip()

                if not line:
                    # A blank line closes the sentence in progress; runs of
                    # blank lines never produce empty sentences.
                    if current:
                        sentences.append(current)
                        current = []
                    continue

                columns = line.split("\t")
                try:
                    pair = (columns[token_idx], columns[label_idx])
                except IndexError:
                    if line.startswith("#"):
                        # Comment / metadata line (e.g. "# sent_id = ..."):
                        # it carries no token, so skip it instead of failing.
                        continue
                    raise IndexError(
                        f"{file_path}:{line_no}: cannot read columns {token_idx} and "
                        f"{label_idx} from a line with {len(columns)} column(s)"
                    ) from None
                current.append(pair)

        # The file may end without a trailing blank line.
        if current:
            sentences.append(current)

        # Column-oriented layout expected by ``datasets.Dataset.from_dict``.
        return {
            "tokens": [[token for token, _ in sentence] for sentence in sentences],
            "target": [[label for _, label in sentence] for sentence in sentences],
        }

    raw_splits = {
        "train": read_conll_file(train_path, token_idx, label_idx),
        "validation": read_conll_file(dev_path, token_idx, label_idx),
        "test": read_conll_file(test_path, token_idx, label_idx),
    }

    # One label vocabulary shared by all splits; sorting is lexicographic
    # because the labels are strings.
    label_names = sorted(
        {
            label
            for split in raw_splits.values()
            for labels in split["target"]
            for label in labels
        }
    )

    # Build each split and cast its labels to ClassLabel ids.
    return datasets.DatasetDict({
        split_name: datasets.Dataset.from_dict(split).cast_column(
            "target", Sequence(ClassLabel(names=label_names))
        )
        for split_name, split in raw_splits.items()
    })

# Smoke test of the loader: read the three splits, then show the first
# training sentence (tokens and encoded labels) and the label vocabulary.
raw_dataset = load_conll_dataset(
    train_path="data/train.conllu",
    dev_path="data/dev.conllu",
    test_path="data/test.conllu",
    token_idx=1,
    label_idx=3,
)
for column in ("tokens", "target"):
    print(raw_dataset["train"][0][column])
print(raw_dataset["train"].features["target"].feature.names)


  from .autonotebook import tqdm as notebook_tqdm
                                                                                   

['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.']
[11, 12, 11, 12, 0, 7, 15, 11, 11, 11, 12, 11, 12, 5, 7, 1, 5, 7, 1, 5, 7, 1, 11, 12, 1, 5, 0, 7, 12]
['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']




The training setup should be read from a `.json` config file. The input data files are column files whose columns are separated by tabs (`\t`). The config file specifies which columns are the 'target' (label) columns. If a task has more than one target column, the task is duplicated: one copy of the task is created per target column.

In [2]:
from hfmtl.tasks.sequence_classification import SequenceClassification
from hfmtl.tasks.token_classification import TokenClassification
from hfmtl.utils import *
from hfmtl.models import *

import easydict
from frozendict import frozendict
import json

# Read config.json as an EasyDict so that its keys can be accessed as
# attributes (args.tasks, args.max_seq_length, ...).
with open("config.json", "r", encoding="utf-8") as f:
    args = easydict.EasyDict(json.load(f))

# Task class to instantiate for each supported ``task_type`` value.
TASK_CLASSES = {
    "token_classification": TokenClassification,
    "sequence_classification": SequenceClassification,
}

tasks = []
for task in args.tasks:
    # Both task types read the same config keys (task_type, task_name, ...).
    task_cls = TASK_CLASSES.get(task.task_type)
    if task_cls is None:
        # Fail loudly instead of silently dropping a misspelled task type.
        raise ValueError(
            f"Unsupported task_type {task.task_type!r}; expected one of {sorted(TASK_CLASSES)}"
        )

    # One task per label column: a task with several label columns is duplicated.
    # NOTE(review): all copies share the same task name, so they cannot be told
    # apart by name in the training logs; consider adding the column index.
    for l_idx in task.label_idx:
        tasks.append(
            task_cls(
                dataset=load_conll_dataset(task.train_file, task.eval_file, task.test_file, task.tokens_idx, l_idx),
                name=task.task_name,
                tokenizer_kwargs=frozendict(padding="max_length", max_length=args.max_seq_length, truncation=True),
            )
        )

model   = Model(tasks, args) # list of models; by default, shared encoder, task-specific CLS token task-specific head 
trainer = Trainer(model, tasks, args) # tasks are uniformly sampled by default

trainer.train()

                                                                                   

Labels for task:
['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X']


                                                                                   

Labels for task:
['0', '1', '10', '100', '101', '102', '105', '107', '109', '11', '111', '112', '113', '115', '117', '118', '12', '120', '122', '123', '125', '126', '13', '130', '131', '132', '134', '136', '139', '14', '142', '144', '145', '147', '15', '150', '153', '155', '157', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '8', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '9', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '_']
[*] Found task 0 => conllu


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

[*] Found task 1 => conllu


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

Device =  0
[*] Preprocessing task 0 => conllu


                                                                   

[*] Preprocessing task 1 => conllu




asking for single train dataloader
asking for single train dataloader


  0%|          | 0/4704 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 33%|███▎      | 1568/4704 [02:50<05:21,  9.75it/s]

{'loss': 1.4942, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.0}
=> {'ignore_keys': None}
[*] Evaluating task 0 => conllu
asking for single train dataloader




logits&&labels =  2001 2001
Computing metrics...
*** example:
2001 2001
labels: [['INTJ', 'INTJ', 'PUNCT', 'INTJ', 'INTJ', 'PUNCT', 'PRON', 'VERB', 'PART', 'VERB', 'NOUN', 'NOUN', 'NOUN', 'NOUN'], ['DET', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'VERB', 'ADP', 'PROPN'], ['ADV', 'PRON', 'AUX', 'PART', 'VERB', 'VERB', 'DET', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'SCONJ', 'DET', 'VERB'], ['PUNCT', 'PUNCT'], ['NOUN', 'ADP', 'DET', 'ADJ', 'ADJ', 'NOUN', 'CCONJ', 'ADJ', 'NOUN', 'PRON', 'CCONJ', 'PRON', 'NOUN', 'NOUN']]
predictions: [['PRON', 'VERB', 'PUNCT', 'ADV', 'INTJ', 'PUNCT', 'PRON', 'VERB', 'PART', 'VERB', 'NOUN', 'NOUN', 'NOUN', 'NOUN'], ['DET', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'VERB', 'ADP', 'PROPN'], ['ADV', 'PRON', 'AUX', 'PART', 'VERB', 'VERB', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'ADP', 'DET', 'ADJ'], ['PUNCT', 'PUNCT'], ['NOUN', 'ADP', 'DET', 'ADJ', 'ADJ', 'NOUN', 'CCONJ', 'ADJ', 'NOUN', 'PRON', 'CCONJ

                                                   
 33%|███▎      | 1568/4704 [02:53<05:21,  9.75it/s]

{'eval_loss': 0.2710649073123932, 'eval_precision': 0.9086760967342415, 'eval_recall': 0.9196463913445045, 'eval_f1': 0.9141283320764615, 'eval_accuracy': 0.9278480399282841, 'eval_name': 'conllu', 'eval_size': 2001, 'eval_index': 0, 'eval_runtime': 3.2185, 'eval_samples_per_second': 621.715, 'eval_steps_per_second': 39.148, 'epoch': 1.0}
[*] Evaluating task 1 => conllu
asking for single train dataloader




logits&&labels =  2001 2001
Computing metrics...
*** example:
2001 2001
labels: [['2', '0', '0', '4', '2', '4', '17', '8', '9', '17', '12', '12', '8', '17'], ['3', '3', '3', '3', '0', '0', '5', '3', '3', '3'], ['0', '0', '1'], ['0', '1', '4', '1', '1', '8', '8', '8', '1', '13', '13', '12', '13', '8'], ['0', '0', '1']]
predictions: [['2', '0', '0', '3', '2', '2', '10', '10', '9', '8', '11', '11', '10', '13'], ['3', '3', '4', '3', '0', '5', '6', '4', '6', '5'], ['0', '0', '1'], ['0', '2', '4', '1', '2', '7', '7', '8', '1', '13', '12', '12', '12', '13'], ['0', '0', '1']]
***


  _warn_prf(average, modifier, msg_start, len(result))
                                                   
 33%|███▎      | 1569/4704 [02:56<1:20:08,  1.53s/it]

{'eval_loss': 1.5871649980545044, 'eval_precision': 0.23466204506065858, 'eval_recall': 0.2242464392182842, 'eval_f1': 0.22933604336043362, 'eval_accuracy': 0.46537771962979113, 'eval_name': 'conllu', 'eval_size': 2001, 'eval_index': 1, 'eval_runtime': 3.1489, 'eval_samples_per_second': 635.452, 'eval_steps_per_second': 40.013, 'epoch': 1.0}


 67%|██████▋   | 3136/4704 [05:41<02:26, 10.72it/s]  

{'loss': 0.911, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.0}
=> {'ignore_keys': None}
[*] Evaluating task 0 => conllu
asking for single train dataloader




logits&&labels =  2001 2001
Computing metrics...
*** example:
2001 2001
labels: [['DET', 'NOUN', 'NOUN', 'NOUN', 'AUX', 'PART', 'PART', 'AUX', 'VERB', 'ADV', 'ADV', 'PUNCT'], ['PUNCT', 'VERB', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'VERB', 'PRON', 'PRON', 'NOUN', 'NOUN', 'NOUN', 'ADP', 'PROPN'], ['CCONJ', 'SCONJ', 'PRON', 'PRON', 'ADJ', 'AUX', 'VERB', 'VERB', 'PUNCT', 'PRON', 'AUX', 'PRON', 'NOUN', 'NOUN'], ['PRON', 'AUX', 'VERB', 'ADV', 'ADV', 'PUNCT'], ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
predictions: [['DET', 'NOUN', 'NOUN', 'NOUN', 'AUX', 'PART', 'PART', 'AUX', 'VERB', 'ADV', 'ADV', 'PUNCT'], ['PUNCT', 'VERB', 'PUNCT', 'NOUN', 'PROPN', 'PUNCT', 'VERB', 'PRON', 'PRON', 'ADJ', 'ADJ', 'NOUN', 'ADP', 'PROPN'], ['CCONJ', 'SCONJ', 'PRON', 'PRON', 'PRON', 'AUX', 'VERB', 'VERB', 'PUNCT', 'PRON', 'AUX', 'PRON', 'NOUN', 'NOUN'], ['PRON', 'AUX', 'AUX', 'ADV', 'ADV', 'PUNCT'], ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
***


                                                   
 67%|██████▋   | 3136/4704 [05:45<02:26, 10.72it/s]

{'eval_loss': 0.22451576590538025, 'eval_precision': 0.9202553912306991, 'eval_recall': 0.9318511677002244, 'eval_f1': 0.926016979709575, 'eval_accuracy': 0.9376362843436546, 'eval_name': 'conllu', 'eval_size': 2001, 'eval_index': 0, 'eval_runtime': 3.2451, 'eval_samples_per_second': 616.629, 'eval_steps_per_second': 38.828, 'epoch': 2.0}
[*] Evaluating task 1 => conllu
asking for single train dataloader




logits&&labels =  2001 2001
Computing metrics...
*** example:
2001 2001
labels: [['2', '6', '6', '6', '6', '6', '0', '6'], ['0', '0', '1'], ['4', '4', '4', '0', '4'], ['25', '6', '6', '6', '6', '25', '25', '8', '6', '12', '12', '12', '8', '19'], ['3', '3', '0', '5', '3', '12', '12', '12', '12', '12', '10', '8', '8', '8']]
predictions: [['2', '6', '5', '5', '5', '5', '0', '5'], ['0', '0', '1'], ['3', '3', '3', '0', '2'], ['6', '5', '5', '6', '5', '2', '3', '8', '5', '12', '11', '12', '5', '14'], ['3', '3', '0', '5', '3', '10', '10', '8', '8', '8', '11', '8', '8', '10']]
***


                                                   
 67%|██████▋   | 3136/4704 [05:48<02:26, 10.72it/s]

{'eval_loss': 1.274828553199768, 'eval_precision': 0.26749760306807285, 'eval_recall': 0.2772441205697251, 'eval_f1': 0.27228366948601174, 'eval_accuracy': 0.5923826137519989, 'eval_name': 'conllu', 'eval_size': 2001, 'eval_index': 1, 'eval_runtime': 3.1629, 'eval_samples_per_second': 632.643, 'eval_steps_per_second': 39.837, 'epoch': 2.0}


100%|██████████| 4704/4704 [08:38<00:00, 10.26it/s]

{'loss': 0.7752, 'learning_rate': 0.0, 'epoch': 3.0}
=> {'ignore_keys': None}
[*] Evaluating task 0 => conllu
asking for single train dataloader




logits&&labels =  2001 2001
Computing metrics...
*** example:
2001 2001
labels: [['VERB', 'VERB', 'VERB', 'VERB', 'PRON', 'CCONJ', 'VERB', 'PRON', 'ADP', 'PROPN', 'PRON', 'AUX', 'VERB', 'ADJ'], ['ADJ', 'CCONJ', 'ADJ', 'ADJ', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'CCONJ', 'ADJ', 'NOUN', 'PUNCT'], ['ADJ', 'NOUN', 'NOUN', 'NOUN', 'NOUN'], ['PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'ADJ', 'ADJ', 'ADJ', 'NOUN', 'NOUN', 'CCONJ', 'NOUN', 'NOUN', 'ADP'], ['PRON', 'AUX', 'PART', 'VERB', 'VERB', 'PUNCT', 'PRON', 'ADV', 'VERB', 'DET', 'NOUN', 'NOUN', 'ADP', 'PRON']]
predictions: [['VERB', 'VERB', 'VERB', 'VERB', 'PRON', 'CCONJ', 'VERB', 'PRON', 'ADP', 'NOUN', 'PRON', 'AUX', 'VERB', 'VERB'], ['ADJ', 'CCONJ', 'ADJ', 'ADJ', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'CCONJ', 'ADJ', 'NOUN', 'PUNCT'], ['ADJ', 'NOUN', 'NOUN', 'NOUN', 'NOUN'], ['PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'ADJ', 'ADJ', 'ADJ', 'NOUN', 'NOUN', 'CCONJ', 'NOUN', 'NOUN', 'ADP'], ['PRON', 'AUX', 'PART', 'VERB', 'VERB', 'PUNCT',

                                                   
100%|██████████| 4704/4704 [08:41<00:00, 10.26it/s]

{'eval_loss': 0.21349245309829712, 'eval_precision': 0.9263494967978042, 'eval_recall': 0.9351497559044729, 'eval_f1': 0.9307288246881156, 'eval_accuracy': 0.9406890536415177, 'eval_name': 'conllu', 'eval_size': 2001, 'eval_index': 0, 'eval_runtime': 3.1893, 'eval_samples_per_second': 627.415, 'eval_steps_per_second': 39.507, 'epoch': 3.0}
[*] Evaluating task 1 => conllu
asking for single train dataloader




logits&&labels =  2001 2001
Computing metrics...
*** example:
2001 2001
labels: [['2', '0', '0', '7', '7', '7', '7', '7', '7', '7', '2', '2', '9', '7'], ['2', '0', '0', '2', '2', '8', '8', '7', '8', '2', '8', '11', '8', '8'], ['3', '3', '3', '0', '3'], ['4', '4', '4', '0', '4', '4', '4', '4'], ['2', '16', '16', '2', '2', '2', '9', '9', '8', '9', '4', '11', '9', '9']]
predictions: [['2', '0', '0', '7', '7', '7', '7', '8', '8', '8', '2', '2', '9', '7'], ['2', '0', '0', '2', '2', '7', '8', '7', '7', '2', '2', '11', '2', '2'], ['2', '3', '3', '0', '2'], ['4', '4', '4', '0', '4', '4', '4', '4'], ['2', '9', '9', '2', '2', '2', '8', '9', '9', '9', '3', '11', '9', '9']]
***


                                                   
100%|██████████| 4704/4704 [08:45<00:00,  8.96it/s]

{'eval_loss': 1.1973886489868164, 'eval_precision': 0.2958660744789887, 'eval_recall': 0.28684995031467375, 'eval_f1': 0.29128826101580896, 'eval_accuracy': 0.6248970296070165, 'eval_name': 'conllu', 'eval_size': 2001, 'eval_index': 1, 'eval_runtime': 3.1349, 'eval_samples_per_second': 638.298, 'eval_steps_per_second': 40.193, 'epoch': 3.0}
{'train_runtime': 525.1467, 'train_samples_per_second': 8.957, 'train_steps_per_second': 8.957, 'train_loss': 1.0601452159232834, 'epoch': 3.0}





TrainOutput(global_step=4704, training_loss=1.0601452159232834, metrics={'train_runtime': 525.1467, 'train_samples_per_second': 8.957, 'train_steps_per_second': 8.957, 'train_loss': 1.0601452159232834, 'epoch': 3.0})