In [1]:
from transformers import Trainer, TrainingArguments
from custom_tokenizers.jieba_tokenizer import JiebaLikeTokenizer
from model_instancies import model1
from datasets import load_dataset

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/cq/3ryty8mx6qz8cqtn1ghkrvmm0000gn/T/jieba.cache
Loading model cost 0.246 seconds.
Prefix dict has been built successfully.


In [2]:
# Load the first 10,000 rows of the training corpus and carve out an 80/20
# train/validation split.
dataset = load_dataset("parquet", data_files="../data/training_data/training_corpus.parquet", split="train[:10000]")
dataset = dataset.shuffle(seed=42)
# train_test_split reshuffles by default; without its own seed the membership of
# the train/validation sets would change on every run, so pin it explicitly.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_data = dataset['train']
val_data = dataset['test']

In [4]:
# Shared tokenizer instance: read as a global by preprocess() and bound as the
# default `tokenizer` argument of add_attention_mask().
tokenizer = JiebaLikeTokenizer()

def preprocess(example):
    """Convert one raw example into id form.

    Reads ``example["input"]`` and ``example["label"]`` and passes both through
    the module-level tokenizer's ``convert_tokens_to_ids``.

    Returns:
        dict with two keys:
        - "input_ids": the converted ``example["input"]``.
        - "labels": the first element of the conversion of the one-element
          list ``[example["label"]]``, requested with ``padding=False``.
    """
    input_ids = tokenizer.convert_tokens_to_ids(example["input"])
    # The label is wrapped in a list for the call and unwrapped afterwards.
    label_ids = tokenizer.convert_tokens_to_ids([example["label"]], padding=False)
    return {"input_ids": input_ids, "labels": label_ids[0]}

def add_attention_mask(example, tokenizer=tokenizer):
    """Derive an attention mask from an example's ``input_ids``.

    Args:
        example: mapping that contains an "input_ids" iterable.
        tokenizer: object exposing ``pad_token_id``; defaults to the
            module-level tokenizer (bound when this function is defined).

    Returns:
        dict with a single key "attention_mask": a list holding 0 at every
        position whose id equals ``tokenizer.pad_token_id`` and 1 elsewhere.
    """
    mask = []
    for token_id in example["input_ids"]:
        is_padding = token_id == tokenizer.pad_token_id
        mask.append(0 if is_padding else 1)
    return {"attention_mask": mask}

In [5]:
# Pass 1: convert every example to ids, dropping the original columns.
# The generator is consumed in order, so the train split is mapped before the
# validation split.
train_data, val_data = (
    split.map(preprocess, num_proc=8, remove_columns=split.column_names)
    for split in (train_data, val_data)
)
# Pass 2: add the attention mask computed from the freshly created input_ids.
train_data, val_data = (
    split.map(add_attention_mask, num_proc=8)
    for split in (train_data, val_data)
)

# Sanity check: show one fully processed training example.
print(train_data[0])

Map (num_proc=8):   0%|          | 0/8000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/8000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/2000 [00:00<?, ? examples/s]

{'input_ids': [7097, 37425, 28417, 21461, 13548, 31783, 17178, 17002, 8354, 19434, 36854, 4319, 21461, 42646, 38468, 21461, 35765, 30849, 21461, 2232, 41982, 41735, 15385, 36095, 7616, 16504, 14466, 41412, 15485, 2404, 11061, 27428], 'labels': 31218, 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [6]:
# Build the model to train by calling model1 (imported from model_instancies).
model = model1()

In [11]:
# Training configuration: evaluate every 20 steps and, at the end, reload the
# checkpoint with the lowest validation loss.
training_args = TrainingArguments(
    output_dir="../models",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    save_strategy="steps",
    # Checkpoint on the same cadence as evaluation. With the default
    # save_steps (500) only a single checkpoint would exist in a 750-step run,
    # so load_best_model_at_end could not restore the step that actually had
    # the best eval_loss. save_total_limit keeps disk usage bounded.
    save_steps=20,
    save_total_limit=3,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=20,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Wire the model, arguments and the two preprocessed splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)

In [9]:
# Run training; the value displayed for this cell is the TrainOutput returned by train().
trainer.train()



Step,Training Loss,Validation Loss
20,6.7588,6.927484
40,6.7528,6.878438
60,6.7919,6.840289
80,6.6736,6.806548
100,6.6991,6.777026
120,6.6793,6.752817
140,6.5718,6.732944
160,6.6926,6.7169
180,6.7567,6.69916
200,6.7115,6.678845




TrainOutput(global_step=750, training_loss=6.5060505777994795, metrics={'train_runtime': 25.4181, 'train_samples_per_second': 944.208, 'train_steps_per_second': 29.506, 'total_flos': 0.0, 'train_loss': 6.5060505777994795, 'epoch': 3.0})