In [57]:
import numpy as np
from collections import Counter
import pandas as pd
import os
import evaluate
from datasets import Dataset, DatasetDict, ClassLabel, Sequence, Value, Features, load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, get_scheduler
from transformers import DataCollatorForTokenClassification

from sklearn.model_selection import ParameterGrid

import torch
from torch.utils.data import DataLoader
from accelerate import Accelerator
from tqdm.auto import tqdm
from torch.optim import AdamW

# Load toy dataset for testing
Will be replaced with Remco's code for dataset creation, token alignment, tokenization and dataset split creation  
This is code from assignment 2

In [58]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Step 1: Read and parse IOB files
def read_iob_file(file_path):
    """Reads an IOB file from filepath and returns sentences with tokens and tags."""
    sentences = []
    sentence_tokens = []
    sentence_labels = []

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line:  # If line is not empty
                token, _, tag = line.split()
                sentence_tokens.append(token)
                sentence_labels.append(tag)
                
            else:
                # End of a sentence
                if sentence_tokens:
                    sentences.append({"tokens": sentence_tokens, "ner_tags": sentence_labels})
                    sentence_tokens = []
                    sentence_labels = []
        # Add the last sentence if file doesn't end with a newline
        if sentence_tokens:
            sentences.append({"tokens": sentence_tokens, "ner_tags": sentence_labels})
    return sentences

# Step 2: Convert IOB data to Hugging Face dataset format
def create_dataset_from_files(data_dir):
    """Create dataset from train and test files, generates the ClassLabel from unique values in train, val
       and test and returns the DatasetDict

    Args:
        data_dir (Directory): The directory having the train, val and test.txt files. 

    Returns:
        dataset_dict (DatasetDict): The DatasetDict necessary for further training and classification purposes
    """

    # Define paths for train, validation, and test files
    file_paths = {
        "train": os.path.join(data_dir, "train.txt"),
        "validation": os.path.join(data_dir, "val.txt"),
        "test": os.path.join(data_dir, "test.txt"),
    }
    
    # Parse the files
    data = {split: read_iob_file(path) for split, path in file_paths.items()}

    # Define the label names and ClassLabel feature
    unique_labels = sorted(set(tag for split_data in data.values() for d in split_data for tag in d["ner_tags"]))
    label_feature = ClassLabel(names=unique_labels)

    # Define the Features schema for Hugging Face datasets
    features = Features({
        'tokens': Sequence(Value("string")),
        'ner_tags': Sequence(label_feature)
    })

    # Convert data into DatasetDict
    dataset_dict = DatasetDict({
        split: Dataset.from_list(split_data).cast(features)
        for split, split_data in data.items()
    })
    
    return dataset_dict

# Step 3: Create the dataset
data_dir = "example_data"  # Adjust this path to your data directory
dataset = create_dataset_from_files(data_dir)

# Assign the datasets to the correct variables
train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

Casting the dataset:   0%|          | 0/1992 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/850 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/864 [00:00<?, ? examples/s]

In [59]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None

    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)

        elif word_id is None:
            # Special token
            new_labels.append(-100)

        else:
            # Same word as previous token
            label = labels[word_id]

            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, 
        is_split_into_words=True
    )
    all_labels = examples["ner_tags"] 
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

# tokenized_train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=train_dataset.column_names)
# tokenized_val_dataset = val_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=val_dataset.column_names)
# tokenized_test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=test_dataset.column_names)


In [49]:
# data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# tokenized_train_dataset[0]

{'input_ids': [101, 15982, 1407, 119, 102],
 'token_type_ids': [0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1],
 'labels': [-100, 12, 12, 12, -100]}

In [60]:
# Load the evaluation metric
metric = evaluate.load("seqeval")

label_names = train_dataset.features["ner_tags"].feature.names
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", id2label=id2label, label2id=label2id)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Prepping the data for training

In [61]:
def create_dataset_given_model(train, val, test, model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Tokenize and align labels
    tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=train_dataset.column_names)
    tokenized_val = val_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=val_dataset.column_names)
    tokenized_test = test_dataset.map(tokenize_and_align_labels, batched=True, remove_columns=test_dataset.column_names)

    return data_collator, tokenized_train, tokenized_val, tokenized_test

In [63]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

# Training and evaluation

In [65]:
# Define the models and their corresponding sizes
models = {
    "small": "bert-base-cased",
    "medium": "bert-large-cased",
    "large": "roberta-large"
}

# Initialize a dictionary to store the results
results = {}

for size, model_name in models.items():
    print(f"\nTraining and evaluating {model_name} ({size})...")

    # Create datasets and data collator for the current model
    data_collator, tokenized_train, tokenized_val, tokenized_test = create_dataset_given_model(
        train_dataset, val_dataset, test_dataset, model_name
    )

    # Create DataLoaders
    train_dataloader = DataLoader(tokenized_train, batch_size=8, shuffle=True, collate_fn=data_collator)
    val_dataloader = DataLoader(tokenized_val, batch_size=8, collate_fn=data_collator)
    test_dataloader = DataLoader(tokenized_test, batch_size=8, collate_fn=data_collator)

    # Initialize the model for token classification
    model = AutoModelForTokenClassification.from_pretrained(
        model_name, id2label=id2label, label2id=label2id
    )

    # Set up optimizer and learning rate scheduler
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    num_train_epochs = 3
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps
    )

    # Use the accelerator for distributed training
    accelerator = Accelerator()
    model, optimizer, train_dataloader, val_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, val_dataloader
    )

    # Training loop
    for epoch in range(num_train_epochs):
        print(f"Epoch {epoch + 1}/{num_train_epochs}")
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}")
        for batch in progress_bar:
            # Move batch data to the same device as the model
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
    
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
    
            progress_bar.set_postfix(loss=loss.item())
    
        print(f"Epoch {epoch + 1} Loss: {total_loss:.4f}")

        # Validation loop
        model.eval()
        val_predictions, val_labels = [], []
        with torch.no_grad():
            for batch in val_dataloader:
                # Move batch data to the same device as the model
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                logits = outputs.logits
                predictions = logits.argmax(dim=-1)
                labels = batch["labels"]
        
                # Handle padding across processes for multi-GPU
                predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
                labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        
                predictions_gathered = accelerator.gather(predictions)
                labels_gathered = accelerator.gather(labels)
        
                true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
                metric.add_batch(predictions=true_predictions, references=true_labels)
        
        val_results = metric.compute()
        print(f"Validation Metrics (Epoch {epoch + 1}): {val_results}")

    # Test loop
    model.eval()
    test_predictions, test_labels = [], []
    with torch.no_grad():
        for batch in test_dataloader:
            # Move batch data to the same device as the model
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            predictions = logits.argmax(dim=-1)
            labels = batch["labels"]
        
            # Handle padding across processes for multi-GPU
            predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
            labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        
            predictions_gathered = accelerator.gather(predictions)
            labels_gathered = accelerator.gather(labels)
        
            true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
            metric.add_batch(predictions=true_predictions, references=true_labels)
    
    test_results = metric.compute()
    print(f"Test Metrics for {model_name}: {test_results}")

    # Store results for the current model
    results[size] = test_results

# Final results
for size, metrics in results.items():
    print(f"\nFinal Test Metrics for {models[size]} ({size}): {metrics}")



Training and evaluating bert-base-cased (small)...


Map:   0%|          | 0/1992 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/864 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1:   0%|          | 0/249 [00:00<?, ?it/s]

Epoch 1 Loss: 85.2722
Validation Metrics (Epoch 1): {'ART': {'precision': np.float64(0.5465393794749404), 'recall': np.float64(0.324822695035461), 'f1': np.float64(0.40747330960854095), 'number': np.int64(705)}, 'CON': {'precision': np.float64(0.5058139534883721), 'recall': np.float64(0.3425196850393701), 'f1': np.float64(0.40845070422535207), 'number': np.int64(254)}, 'LOC': {'precision': np.float64(0.6115942028985507), 'recall': np.float64(0.5424164524421594), 'f1': np.float64(0.5749318801089919), 'number': np.int64(389)}, 'MAT': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'number': np.int64(0)}, 'PER': {'precision': np.float64(0.7266949152542372), 'recall': np.float64(0.5813559322033899), 'f1': np.float64(0.6459510357815442), 'number': np.int64(590)}, 'SPE': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'number': np.int64(0)}, 'overall_precision': np.float64(0.5807743658210948), 'overall_recall': np.float64(0.44

Training Epoch 2:   0%|          | 0/249 [00:00<?, ?it/s]

Epoch 2 Loss: 28.4917
Validation Metrics (Epoch 2): {'ART': {'precision': np.float64(0.6062052505966588), 'recall': np.float64(0.37797619047619047), 'f1': np.float64(0.465627864344638), 'number': np.int64(672)}, 'CON': {'precision': np.float64(0.5872093023255814), 'recall': np.float64(0.38113207547169814), 'f1': np.float64(0.4622425629290618), 'number': np.int64(265)}, 'LOC': {'precision': np.float64(0.5971014492753624), 'recall': np.float64(0.544973544973545), 'f1': np.float64(0.5698478561549102), 'number': np.int64(378)}, 'MAT': {'precision': np.float64(0.08888888888888889), 'recall': np.float64(0.3076923076923077), 'f1': np.float64(0.13793103448275862), 'number': np.int64(13)}, 'PER': {'precision': np.float64(0.7415254237288136), 'recall': np.float64(0.587248322147651), 'f1': np.float64(0.6554307116104869), 'number': np.int64(596)}, 'SPE': {'precision': np.float64(0.3333333333333333), 'recall': np.float64(0.5357142857142857), 'f1': np.float64(0.41095890410958896), 'number': np.int64

Training Epoch 3:   0%|          | 0/249 [00:00<?, ?it/s]

Epoch 3 Loss: 19.9413
Validation Metrics (Epoch 3): {'ART': {'precision': np.float64(0.594272076372315), 'recall': np.float64(0.40161290322580645), 'f1': np.float64(0.47930702598652547), 'number': np.int64(620)}, 'CON': {'precision': np.float64(0.5872093023255814), 'recall': np.float64(0.3494809688581315), 'f1': np.float64(0.438177874186551), 'number': np.int64(289)}, 'LOC': {'precision': np.float64(0.663768115942029), 'recall': np.float64(0.5518072289156627), 'f1': np.float64(0.6026315789473685), 'number': np.int64(415)}, 'MAT': {'precision': np.float64(0.13333333333333333), 'recall': np.float64(0.375), 'f1': np.float64(0.19672131147540986), 'number': np.int64(16)}, 'PER': {'precision': np.float64(0.7521186440677966), 'recall': np.float64(0.6089193825042881), 'f1': np.float64(0.6729857819905213), 'number': np.int64(583)}, 'SPE': {'precision': np.float64(0.4), 'recall': np.float64(0.5142857142857142), 'f1': np.float64(0.45), 'number': np.int64(35)}, 'overall_precision': np.float64(0.63

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/1992 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/864 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1:   0%|          | 0/249 [00:00<?, ?it/s]

Epoch 1 Loss: 72.0516
Validation Metrics (Epoch 1): {'ART': {'precision': np.float64(0.5751789976133651), 'recall': np.float64(0.3990066225165563), 'f1': np.float64(0.4711632453567937), 'number': np.int64(604)}, 'CON': {'precision': np.float64(0.5930232558139535), 'recall': np.float64(0.24817518248175183), 'f1': np.float64(0.34991423670668953), 'number': np.int64(411)}, 'LOC': {'precision': np.float64(0.672463768115942), 'recall': np.float64(0.48232848232848236), 'f1': np.float64(0.5617433414043583), 'number': np.int64(481)}, 'MAT': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'number': np.int64(0)}, 'PER': {'precision': np.float64(0.7288135593220338), 'recall': np.float64(0.6220614828209765), 'f1': np.float64(0.671219512195122), 'number': np.int64(553)}, 'SPE': {'precision': np.float64(0.2222222222222222), 'recall': np.float64(0.5555555555555556), 'f1': np.float64(0.31746031746031744), 'number': np.int64(18)}, 'overall_precision': np.float64(0.62016

Training Epoch 2:   0%|          | 0/249 [00:00<?, ?it/s]

Epoch 2 Loss: 26.2343
Validation Metrics (Epoch 2): {'ART': {'precision': np.float64(0.5894988066825776), 'recall': np.float64(0.4395017793594306), 'f1': np.float64(0.5035677879714577), 'number': np.int64(562)}, 'CON': {'precision': np.float64(0.5988372093023255), 'recall': np.float64(0.3588850174216028), 'f1': np.float64(0.44880174291938996), 'number': np.int64(287)}, 'LOC': {'precision': np.float64(0.6811594202898551), 'recall': np.float64(0.5904522613065326), 'f1': np.float64(0.6325706594885598), 'number': np.int64(398)}, 'MAT': {'precision': np.float64(0.2), 'recall': np.float64(0.75), 'f1': np.float64(0.31578947368421056), 'number': np.int64(12)}, 'PER': {'precision': np.float64(0.7161016949152542), 'recall': np.float64(0.6167883211678832), 'f1': np.float64(0.6627450980392157), 'number': np.int64(548)}, 'SPE': {'precision': np.float64(0.6444444444444445), 'recall': np.float64(0.6041666666666666), 'f1': np.float64(0.6236559139784946), 'number': np.int64(48)}, 'overall_precision': n

Training Epoch 3:   0%|          | 0/249 [00:00<?, ?it/s]

Epoch 3 Loss: 17.6319
Validation Metrics (Epoch 3): {'ART': {'precision': np.float64(0.5847255369928401), 'recall': np.float64(0.4359430604982206), 'f1': np.float64(0.4994903160040775), 'number': np.int64(562)}, 'CON': {'precision': np.float64(0.5697674418604651), 'recall': np.float64(0.358974358974359), 'f1': np.float64(0.4404494382022472), 'number': np.int64(273)}, 'LOC': {'precision': np.float64(0.6782608695652174), 'recall': np.float64(0.6015424164524421), 'f1': np.float64(0.6376021798365121), 'number': np.int64(389)}, 'MAT': {'precision': np.float64(0.37777777777777777), 'recall': np.float64(0.6296296296296297), 'f1': np.float64(0.47222222222222215), 'number': np.int64(27)}, 'PER': {'precision': np.float64(0.7245762711864406), 'recall': np.float64(0.6107142857142858), 'f1': np.float64(0.6627906976744186), 'number': np.int64(560)}, 'SPE': {'precision': np.float64(0.6666666666666666), 'recall': np.float64(0.5882352941176471), 'f1': np.float64(0.625), 'number': np.int64(51)}, 'overal

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/1992 [00:00<?, ? examples/s]

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

Map:   0%|          | 0/864 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training Epoch 1:   0%|          | 0/249 [00:00<?, ?it/s]

Epoch 1 Loss: 152.5536
Validation Metrics (Epoch 1): {'ART': {'precision': np.float64(0.03341288782816229), 'recall': np.float64(0.7777777777777778), 'f1': np.float64(0.06407322654462243), 'number': np.int64(18)}, 'CON': {'precision': np.float64(0.11627906976744186), 'recall': np.float64(0.4166666666666667), 'f1': np.float64(0.1818181818181818), 'number': np.int64(48)}, 'LOC': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'number': np.int64(0)}, 'MAT': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'number': np.int64(0)}, 'PER': {'precision': np.float64(0.1228813559322034), 'recall': np.float64(0.4603174603174603), 'f1': np.float64(0.19397993311036787), 'number': np.int64(126)}, 'SPE': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'number': np.int64(0)}, 'overall_precision': np.float64(0.06141522029372497), 'overall_recall': np.float64(0.4791666666666667), 'overall_f1': np.float64(0.

Training Epoch 2:   0%|          | 0/249 [00:00<?, ?it/s]

Epoch 2 Loss: 98.0196
Validation Metrics (Epoch 2): {'ART': {'precision': np.float64(0.2911694510739857), 'recall': np.float64(0.4), 'f1': np.float64(0.3370165745856354), 'number': np.int64(305)}, 'CON': {'precision': np.float64(0.3081395348837209), 'recall': np.float64(0.29608938547486036), 'f1': np.float64(0.301994301994302), 'number': np.int64(179)}, 'LOC': {'precision': np.float64(0.028985507246376812), 'recall': np.float64(0.22727272727272727), 'f1': np.float64(0.05141388174807198), 'number': np.int64(44)}, 'MAT': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'number': np.int64(0)}, 'PER': {'precision': np.float64(0.4004237288135593), 'recall': np.float64(0.5220994475138122), 'f1': np.float64(0.45323741007194246), 'number': np.int64(362)}, 'SPE': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'number': np.int64(0)}, 'overall_precision': np.float64(0.24966622162883845), 'overall_recall': np.float64(0.4202247191011

Training Epoch 3:   0%|          | 0/249 [00:00<?, ?it/s]

Epoch 3 Loss: 69.9804
Validation Metrics (Epoch 3): {'ART': {'precision': np.float64(0.31742243436754175), 'recall': np.float64(0.4539249146757679), 'f1': np.float64(0.3735955056179775), 'number': np.int64(293)}, 'CON': {'precision': np.float64(0.3372093023255814), 'recall': np.float64(0.2871287128712871), 'f1': np.float64(0.3101604278074866), 'number': np.int64(202)}, 'LOC': {'precision': np.float64(0.07246376811594203), 'recall': np.float64(0.2976190476190476), 'f1': np.float64(0.11655011655011656), 'number': np.int64(84)}, 'MAT': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'number': np.int64(0)}, 'PER': {'precision': np.float64(0.4258474576271186), 'recall': np.float64(0.5037593984962406), 'f1': np.float64(0.4615384615384615), 'number': np.int64(399)}, 'SPE': {'precision': np.float64(0.0), 'recall': np.float64(0.0), 'f1': np.float64(0.0), 'number': np.int64(3)}, 'overall_precision': np.float64(0.27837116154873165), 'overall_recall': np.float64(0.