## Split the dataset into train, test, and validation sets

In [5]:
import random

def read_iob_data(file_path):
    """Read an IOB-formatted text file and group its lines into sentences.

    A sentence is a run of consecutive non-blank lines. One or more blank
    (empty or whitespace-only) lines separate sentences, so stray extra
    blank lines never produce empty sentences or empty token lines.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each a list of that sentence's lines without
        the trailing newline. An empty or blank-only file yields ``[]``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Strip the whole text first so leading/trailing blank lines (and
        # the final newline) cannot start or end a sentence.
        lines = file.read().strip().split('\n')

    sentences = []
    current = []
    for line in lines:
        if line.strip():
            current.append(line)
        elif current:
            # A blank line closes the open sentence; further blank lines
            # are ignored because `current` is already empty.
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences

def write_iob_data(file_path, data):
    """Write sentences to ``file_path`` as UTF-8 text.

    Each line of a sentence is written on its own row, and every sentence
    (including the last one) is followed by a single empty row.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Iterable of sentences, each an iterable of string lines.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        for sentence in data:
            out.writelines(token_line + '\n' for token_line in sentence)
            # Empty row that terminates the sentence.
            out.write('\n')

def split_data(data, train_ratio=0.8, test_ratio=0.1, val_ratio=0.1, seed=None):
    """Shuffle ``data`` and split it into train, test and validation parts.

    The input sequence is left untouched; a shuffled copy is partitioned.
    The train and test sizes are the truncated products of the total size
    and their ratios; the validation part receives every remaining item,
    so no item is lost to rounding.

    Args:
        data: Sequence of items (e.g. sentences) to split.
        train_ratio: Fraction of items for the training part.
        test_ratio: Fraction of items for the test part.
        val_ratio: Fraction of items for the validation part.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A ``(train_data, test_data, val_data)`` tuple of lists.

    Raises:
        ValueError: If a ratio is negative or the ratios do not sum to 1.
    """
    ratios = (train_ratio, test_ratio, val_ratio)
    # Tolerance absorbs float round-off such as 0.8 + 0.1 + 0.1 != 1.0.
    if any(ratio < 0 for ratio in ratios) or abs(sum(ratios) - 1.0) > 1e-9:
        raise ValueError(
            "train_ratio, test_ratio and val_ratio must be non-negative "
            f"and sum to 1, got {ratios}"
        )

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(data)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    total_size = len(shuffled)
    train_size = int(total_size * train_ratio)
    test_end = train_size + int(total_size * test_ratio)

    train_data = shuffled[:train_size]
    test_data = shuffled[train_size:test_end]
    val_data = shuffled[test_end:]  # remainder, absorbs rounding

    return train_data, test_data, val_data

# Load the sentence-grouped corpus from disk.
data = read_iob_data('output.txt')

# Shuffle and partition using split_data's default ratios.
train_data, test_data, val_data = split_data(data)

# Persist each partition to its own file.
for out_path, subset in (
    ('train_iob_data.txt', train_data),
    ('test_iob_data.txt', test_data),
    ('val_iob_data.txt', val_data),
):
    write_iob_data(out_path, subset)


## Finetuning on a local dataset

In [1]:
# Split name -> IOB file. Each split points at the file of the same name
# (previously "validation" and "test" were swapped).
local_dataset = {
    "train": "train_iob_data.txt",
    "validation": "val_iob_data.txt",
    "test": "test_iob_data.txt",
}

In [3]:
from tner import GridSearcher, TransformersNER

# Settings given as a single value.
run_settings = dict(
    checkpoint_dir='./ckpt_bert_custom_dataset',
    local_dataset=local_dataset,
    model="distilbert-base-cased",  # language model to fine-tune
    epoch=2,  # total number of epochs (`L` in the figure)
    epoch_partial=1,  # epochs run in the 1st stage (`M` in the figure)
    n_max_config=1,  # models passed on to the 2nd stage (`K` in the figure)
    batch_size=4,
)

# Settings given as lists; presumably each list holds the candidate values
# the search tries -- confirm against the tner documentation.
search_grid = dict(
    gradient_accumulation_steps=[1],
    crf=[True],
    lr=[1e-4],
    weight_decay=[None],
    random_seed=[42],
    lr_warmup_step_ratio=[0.1],
    max_grad_norm=[None, 10],
)

searcher = GridSearcher(**run_settings, **search_grid)
searcher.train()

ModuleNotFoundError: No module named 'tner'

In [None]:
# Load the model stored under the checkpoint directory used for training
# (presumably written there by the grid search -- confirm).
model = TransformersNER("ckpt_bert_custom_dataset/best_model")

# Evaluate on the "test" entry of the local dataset mapping.
metric = model.evaluate(
    local_dataset=local_dataset,
    dataset_split='test',
    batch_size=16,
)