In [1]:
from datasets import load_dataset

# One seed drives every random step in this cell so the train/val split is
# identical after Restart Kernel -> Run All.
SEED = 42

# Load only the first 1000 examples of the parquet corpus.
dataset = load_dataset("parquet", data_files="../data/training_data/training_corpus.parquet", split="train[:1000]")

# Shuffle, then hold out 20% for validation. train_test_split shuffles again by
# default, so it needs its own seed; without one the split changes on every run
# even though the preceding shuffle is seeded.
dataset = dataset.shuffle(seed=SEED)
dataset = dataset.train_test_split(test_size=0.2, seed=SEED)

train_data = dataset['train']
val_data = dataset['test']

# Print dataset sizes and one example
print("Train size:", len(train_data))
print("Val size:", len(val_data))
print(train_data[0])

Train size: 800
Val size: 200
{'input': ['这', '就是', '藏花', '9', '4', '4', '8', '2'], 'label': '喜欢'}


In [2]:
from custom_tokenizers.jieba_tokenizer import JiebaLikeTokenizer

# Notebook-wide tokenizer instance: it is bound as the default `tokenizer`
# argument of `preprocess` below and later read for `token_to_id` when sizing
# the model vocabulary.
tokenizer = JiebaLikeTokenizer()


def preprocess(example, tokenizer=tokenizer):
    """Convert one raw example into the id fields used for training.

    The strings in ``example["input"]`` are joined with single spaces and the
    resulting text is passed to ``tokenizer``; the label token is looked up
    separately through ``tokenizer.convert_tokens_to_ids``.

    Args:
        example: mapping with an ``"input"`` sequence of strings and a
            ``"label"`` token.
        tokenizer: tokenizer to use. Defaults to the notebook-level instance,
            which is bound once, when this function is defined.

    Returns:
        dict with ``"input_ids"`` (the ``"input_ids"`` entry of the tokenizer
        output) and ``"label_id"`` (the first id returned for the one-element
        label lookup).
    """
    text = " ".join(example["input"])
    encoded = tokenizer(text)
    input_ids = encoded["input_ids"]
    # The lookup API takes a list of tokens, so wrap the single label and
    # take the only element of the result.
    label_ids = tokenizer.convert_tokens_to_ids([example["label"]])
    return {"input_ids": input_ids, "label_id": label_ids[0]}

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/cq/3ryty8mx6qz8cqtn1ghkrvmm0000gn/T/jieba.cache
Loading model cost 0.250 seconds.
Prefix dict has been built successfully.


In [3]:
# Run the same preprocessing over both splits (train first, then val), each
# with 8 worker processes, and rebind the split names to the mapped datasets.
train_data, val_data = (
    split.map(preprocess, num_proc=8) for split in (train_data, val_data)
)

# Show one processed row as a quick sanity check.
print(train_data[0])

Map (num_proc=8):   0%|          | 0/800 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/200 [00:00<?, ? examples/s]

{'input': ['这', '就是', '藏花', '9', '4', '4', '8', '2'], 'label': '喜欢', 'input_ids': [12067, 24803, 37305, 15883, 31307, 31307, 8699, 11071, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'label_id': 25092}


In [4]:
import torch

# For each split, collect the "input_ids" and "label_id" fields row by row and
# turn each collected list into a tensor (inputs first, labels second).
x_train, y_train = (
    torch.tensor([row[field] for row in train_data])
    for field in ("input_ids", "label_id")
)
x_val, y_val = (
    torch.tensor([row[field] for row in val_data])
    for field in ("input_ids", "label_id")
)

# Show the first training pair as a sanity check.
print(f"x_train example: {x_train[0]}")
print(f"y_train example: {y_train[0]}")

x_train example: tensor([12067, 24803, 37305, 15883, 31307, 31307,  8699, 11071,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1])
y_train example: 25092


In [5]:
# Pick the best available accelerator: CUDA first, then Apple-silicon MPS,
# otherwise CPU. The getattr guard keeps this cell working on torch builds
# that do not provide the `torch.backends.mps` module.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: mps


In [7]:
from tuner_setup import run_tuning

# The vocabulary size handed to the tuner is the number of entries in the
# tokenizer's token -> id table.
vocab_size = len(tokenizer.token_to_id)

# run_tuning comes from the project's tuner_setup module; whatever it returns
# is kept as `best_model` for inspection in the next cell.
best_model = run_tuning(
    x_train,
    y_train,
    x_val,
    y_val,
    vocab_size=vocab_size,
)

Reloading Tuner from tuner_dir/bert_t9_tune/tuner0.json
Search space summary
Default search space size: 5
embed_dim (Choice)
{'default': 64, 'conditions': [], 'values': [64, 128, 256], 'ordered': True}
num_heads (Choice)
{'default': 2, 'conditions': [], 'values': [2, 4, 8], 'ordered': True}
ff_dim (Choice)
{'default': 128, 'conditions': [], 'values': [128, 256, 512], 'ordered': True}
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 4, 'step': 1, 'sampling': 'linear'}
batch_size (Choice)
{'default': 16, 'conditions': [], 'values': [16, 32, 64], 'ordered': True}
Best hyperparameters found:
embed_dim: 256
num_heads: 4
ff_dim: 128
num_layers: 2
batch_size: 32


In [8]:
# Bare expression: displays the repr of the model returned by run_tuning.
best_model

BertNextTokenModel(
  (embedding): Embedding(43325, 256)
  (position_embed): Embedding(32, 256)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=128, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (cls_head): Linear(in_features=256, out_features=43325, bias=True)
)