## Split the dataset into train, test and validation sets

In [1]:
import random

def read_iob_data(file_path):
    """Read an IOB-formatted text file into a list of sentences.

    Sentences are separated by one or more blank (empty or whitespace-only)
    lines. Every remaining line is kept verbatim as one entry of its sentence.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each sentence being a list of line strings.
        An empty (or whitespace-only) file yields an empty list.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Strip the whole text first so leading/trailing whitespace of the
        # file never ends up inside the first or last line.
        text = file.read().strip()

    sentences = []
    current = []
    for line in text.split('\n'):
        if line.strip():
            current.append(line)
        elif current:
            # A blank line closes the sentence collected so far; consecutive
            # blank lines are ignored instead of producing empty sentences.
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences

def write_iob_data(file_path, data):
    """Write sentences to a UTF-8 text file in IOB layout.

    Each entry of a sentence is written on its own line, and every sentence
    is followed by one empty line.

    Args:
        file_path: Path of the file to create or overwrite.
        data: Iterable of sentences, each an iterable of line strings.
    """
    with open(file_path, 'w', encoding='utf-8') as handle:
        for tokens in data:
            # Newline-terminate every entry, then add the separator line.
            block = ''.join(entry + '\n' for entry in tokens)
            handle.write(block + '\n')

def split_data(data, train_ratio=0.8, test_ratio=0.1, val_ratio=0.1, seed=None):
    """Randomly partition ``data`` into train, test and validation subsets.

    The input sequence is left untouched; a shuffled copy is sliced instead.

    Args:
        data: Sequence of items (e.g. sentences) to split.
        train_ratio: Fraction of the items assigned to the training subset.
        test_ratio: Fraction of the items assigned to the test subset.
        val_ratio: Accepted for interface compatibility only. The validation
            subset always receives every item left over after the train and
            test subsets are taken, so this value is not used.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A ``(train_data, test_data, val_data)`` tuple of lists.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    total_size = len(shuffled)
    train_end = int(total_size * train_ratio)
    test_end = train_end + int(total_size * test_ratio)

    train_data = shuffled[:train_end]
    test_data = shuffled[train_end:test_end]
    val_data = shuffled[test_end:]

    return train_data, test_data, val_data

# Load every sentence from the IOB source file.
data = read_iob_data('output.txt')

# Partition the sentences into the three subsets.
train_data, test_data, val_data = split_data(data)

# Write each subset to its own IOB file.
for out_path, subset in (
    ('train_iob_data.txt', train_data),
    ('test_iob_data.txt', test_data),
    ('val_iob_data.txt', val_data),
):
    write_iob_data(out_path, subset)


## Fine-tuning on a local dataset

In [3]:
# Map each dataset role to the file written by the split step. The role names
# now match the file names: the validation role reads the validation file and
# the test role reads the test file (they were previously swapped).
# NOTE(review): a checkpoint trained with the old mapping used
# test_iob_data.txt as its validation file, so retrain before evaluating.
local_dataset = {
    "train": "train_iob_data.txt",
    "validation": "val_iob_data.txt",
    "test": "test_iob_data.txt",
}

In [2]:
from tner import GridSearcher, TransformersNER

# Settings handed to the grid searcher, gathered in one place so they are easy
# to tweak. NOTE(review): the list-valued entries are presumably the candidate
# values the search iterates over - confirm against the tner documentation.
search_settings = dict(
    checkpoint_dir='./ckpt_bert_custom_dataset',
    local_dataset=local_dataset,
    model="distilbert-base-cased",  # language model to fine-tune
    epoch=2,  # total number of epochs
    epoch_partial=1,  # number of epochs run in the first stage
    n_max_config=1,  # number of models passed on to the second stage
    batch_size=4,
    gradient_accumulation_steps=[1],
    crf=[True],
    lr=[1e-4],
    weight_decay=[None],
    random_seed=[42],
    lr_warmup_step_ratio=[0.1],
    max_grad_norm=[None, 10],
)

# Build the searcher from the settings above and start training.
searcher = GridSearcher(**search_settings)
searcher.train()

ImportError: DLL load failed while importing _multiarray_umath: 找不到指定的模块。

ImportError: numpy._core.multiarray failed to import

In [4]:
from tner import GridSearcher, TransformersNER

# Load the model stored under the grid-search checkpoint directory.
best_model_dir = "ckpt_bert_custom_dataset/best_model"
model = TransformersNER(best_model_dir)

# Evaluate it on the split registered under the 'test' key of `local_dataset`.
metric = model.evaluate(
    local_dataset=local_dataset,
    dataset_split='test',
    batch_size=16,
)

2024-08-01 23:50:18 INFO     initialize language model with `ckpt_bert_custom_dataset/best_model`
2024-08-01 23:50:18 INFO     use CRF
2024-08-01 23:50:18 INFO     loading pre-trained CRF layer
2024-08-01 23:50:18 INFO     label2id: {'B-CAPEXCovenantCondition': 0, 'B-CAPEXCovenantDefinition': 1, 'B-CAPEXDefinition': 2, 'B-CAPEXStartingCovenantThreshold': 3, 'B-DividendCovenantCondition': 4, 'B-DividendCovenantDefinition': 5, 'B-DividendDefinition': 6, 'B-DividendStartingCovenantThreshold': 7, 'B-DtECovenantCondition': 41, 'B-DtECovenantDefinition': 42, 'B-DtEDefinition': 43, 'B-DtEStartingCovenantDate': 69, 'B-DtEStartingCovenantThreshold': 44, 'B-FCCRCovenantCondition': 45, 'B-FCCRCovenantDefinition': 8, 'B-FCCRDefinition': 9, 'B-FCCRStartingCovenantThreshold': 10, 'B-ICRCovenantCondition': 46, 'B-ICRCovenantDefinition': 47, 'B-ICRDefinition': 70, 'B-ICRStartingCovenantDate': 48, 'B-ICRStartingCovenantThreshold': 49, 'B-LeverageCovenantCondition': 50, 'B-LeverageCovenantDefinition': 1

In [5]:
# Display the evaluation result as the cell output.
# NOTE(review): the recorded output shows 0.0 for every score, and the recorded
# output of the training cell is an ImportError - confirm that the checkpoint
# comes from a completed training run before relying on these numbers.
metric

{'micro/f1': 0.0,
 'micro/f1_ci': {},
 'micro/recall': 0.0,
 'micro/precision': 0.0,
 'macro/f1': 0.0,
 'macro/f1_ci': {},
 'macro/recall': 0.0,
 'macro/precision': 0.0,
 'per_entity_metric': {'CAPEXCovenantCondition': {'f1': 0.0,
   'f1_ci': {},
   'precision': 0.0,
   'recall': 0.0},
  'CAPEXCovenantDefinition': {'f1': 0.0,
   'f1_ci': {},
   'precision': 0.0,
   'recall': 0.0},
  'CAPEXDefinition': {'f1': 0.0, 'f1_ci': {}, 'precision': 0.0, 'recall': 0.0},
  'CAPEXStartingCovenantThreshold': {'f1': 0.0,
   'f1_ci': {},
   'precision': 0.0,
   'recall': 0.0},
  'DividendCovenantCondition': {'f1': 0.0,
   'f1_ci': {},
   'precision': 0.0,
   'recall': 0.0},
  'DividendCovenantDefinition': {'f1': 0.0,
   'f1_ci': {},
   'precision': 0.0,
   'recall': 0.0},
  'DividendDefinition': {'f1': 0.0,
   'f1_ci': {},
   'precision': 0.0,
   'recall': 0.0},
  'DividendStartingCovenantThreshold': {'f1': 0.0,
   'f1_ci': {},
   'precision': 0.0,
   'recall': 0.0},
  'FCCRCovenantDefinition': {'f1':