In [None]:

from transformers import BertTokenizer, BertForMaskedLM
import torch

# Sanity check: let the published SinoNomBERT checkpoint fill in one masked character.
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('btqkhai/SinoNomBERT')
# Load the model
model = BertForMaskedLM.from_pretrained('btqkhai/SinoNomBERT')

text = '大 [MASK] 百 官 其 𢮿 花 供 饌 皆 用 新 禮'

inputs = tokenizer(text, return_tensors="pt")
# Column indices (token positions) at which the input holds the [MASK] token id
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
# Ground Truth: 宴
# Inference only: no gradients are needed, so skip autograd bookkeeping
with torch.no_grad():
    logits = model(**inputs).logits
# Vocabulary scores at the masked position(s) of the first (only) sequence
mask_token_logits = logits[0, mask_token_index, :]

# Highest-scoring vocabulary entry for the first masked position
print("Predicted word:",  tokenizer.decode(mask_token_logits[0].argmax()))


Predicted word: 宴


In [None]:
# Unpack the uploaded archive (my_dataset/ splits and NomBertTokenizer/) into the working directory.
# -o overwrites existing files without prompting, so re-running this cell does not stall on a prompt.
!unzip -o /content/train_concate.zip

Archive:  /content/train_concate.zip
   creating: my_dataset/
  inflating: my_dataset/dataset_dict.json  
   creating: my_dataset/test/
  inflating: my_dataset/test/data-00000-of-00001.arrow  
  inflating: my_dataset/test/dataset_info.json  
  inflating: my_dataset/test/state.json  
   creating: my_dataset/train/
  inflating: my_dataset/train/data-00000-of-00001.arrow  
  inflating: my_dataset/train/dataset_info.json  
  inflating: my_dataset/train/state.json  
   creating: my_dataset/validation/
  inflating: my_dataset/validation/data-00000-of-00001.arrow  
  inflating: my_dataset/validation/dataset_info.json  
  inflating: my_dataset/validation/state.json  
   creating: NomBertTokenizer/
  inflating: NomBertTokenizer/added_tokens.json  
  inflating: NomBertTokenizer/special_tokens_map.json  
  inflating: NomBertTokenizer/tokenizer_config.json  
  inflating: NomBertTokenizer/vocab.txt  


In [None]:
# Install Hugging Face `datasets` into the kernel's own environment (%pip rather than !pip),
# pinned to the release this notebook was last executed with.
%pip install -q datasets==3.2.0

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

## *Training*

In [None]:
import torch
from transformers import (BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments,
                          DataCollatorForLanguageModeling, EarlyStoppingCallback)
from datasets import Dataset
import math
import os
# Upper bound on tokens kept per example; longer inputs are truncated by the tokenizer call
MAX_LENGTH = 256

# 1. Custom tokenizer, read from the local NomBertTokenizer directory
tokenizer = BertTokenizer.from_pretrained("NomBertTokenizer")

# 2. Pre-trained checkpoint; its token embeddings are resized to the tokenizer's length
model = BertForMaskedLM.from_pretrained("Jihuai/bert-ancient-chinese")
model.resize_token_embeddings(len(tokenizer))

# 3. Dataset splits previously saved to disk
from datasets import load_from_disk
loaded_dataset = load_from_disk("my_dataset")
train_dataset, val_dataset, test_texts = (
    loaded_dataset[split_name] for split_name in ("train", "validation", "test")
)

# Peek at one raw (untokenized) training record
print("Sample Training Data:", train_dataset[0])
# 5. Tokenization Function
def tokenize_function(examples):
    """Tokenize the ``text`` field, truncating to at most ``MAX_LENGTH`` tokens.

    No padding is requested here. This function is mapped over one example at
    a time, where ``padding=True`` (pad to the longest sequence in the call)
    never adds anything; batches are padded later by the MLM data collator.

    Args:
        examples: A dataset record exposing a ``"text"`` entry.

    Returns:
        The tokenizer's encoding for that text.
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)

# Tokenize every split; the raw "text" column is dropped from each result
train_dataset = train_dataset.map(
    function=tokenize_function,
    remove_columns=["text"],
)
val_dataset = val_dataset.map(
    function=tokenize_function,
    remove_columns=["text"],
)
test_texts = test_texts.map(
    function=tokenize_function,
    remove_columns=["text"],
)
# Columns that remain after tokenization
print(train_dataset.column_names)
# Debug helper: turn a tokenized sample back into text
def decode_tokenized(example):
    """Decode ``example["input_ids"]`` to a string, keeping special tokens."""
    token_ids = example["input_ids"]
    return tokenizer.decode(token_ids, skip_special_tokens=False)

# Debug: show the first training example as stored, then decoded back to text
print("Sample Encoded:", train_dataset[0])
print("Sample Decoded:", decode_tokenized(train_dataset[0]))

# 6. Data Collator for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.2,  # 20% of tokens are masked
)

# 7. Training Arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    overwrite_output_dir=True,
    eval_strategy="epoch",  # current spelling of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    per_device_train_batch_size=32,  # Adjust batch size based on memory
    per_device_eval_batch_size=32,  # Adjust batch size based on memory
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=20,
    save_total_limit=2,
    logging_steps=100,  # Log every 100 steps
    logging_dir="./logs",  # Logs will be saved here for visualization
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    report_to=["tensorboard"],  # Enable TensorBoard integration
    load_best_model_at_end=True,
    optim="adamw_torch",
    dataloader_num_workers=8
)

# Stop once the monitored evaluation metric has failed to improve by more than
# 0.01 for 2 consecutive evaluations (evaluation runs once per epoch).
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.01
)
# 8. Trainer Initialization
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping_callback]
)
# 9. Train and Save the Model
trainer.train()
trainer.save_model("./fine_tuned_model")
# Keep the tokenizer next to the weights so the directory can be loaded on its own
tokenizer.save_pretrained("./fine_tuned_model")

# 10. Evaluate and Calculate Perplexity on the Test Dataset
test_results = trainer.evaluate(test_texts)

# Calculate Perplexity for Test Set: exp of the reported evaluation loss
if "eval_loss" in test_results:
    test_perplexity = math.exp(test_results["eval_loss"])
    print("Test Perplexity:", test_perplexity)
else:
    print("No eval_loss found for the test set.")

# Ensure TensorBoard directory exists
os.makedirs(training_args.logging_dir, exist_ok=True)

# Launch TensorBoard (if running locally, open TensorBoard in your browser)
print(f"Launch TensorBoard with the following command:\ntensorboard --logdir {training_args.logging_dir}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/462M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Sample Training Data: {'text': '盎 那 代 保 侖 庄 隊 律 [SEP] 空 如 固 固 如 空 [SEP] 免 戈 夏 免 侖 冬 [SEP] 矮 短 悶 希 㙮 侖 [SEP] 鰥 寡 𪪳 化 婆 侖 [SEP] 典 𫢩 計 \U000f02c7 𠁀 侖 [SEP] 生 𦋦 李 賁 奇 坤 欣 𠊛 [SEP] 嗔 使 君 哿 𩝺 威 嚴 添 \U000f167c 例 法 [SEP] 旦 最 些 \U000f16a2 王 府 滥 图 例 \U000f0742 補 㭲 多 [SEP] 縁 \U000f0179 刁 等 庫 庒 赦 [SEP] 𡲤 吘 𥙩 制 詵 羅 例 代 初 牢 可 補 丕 [SEP] 麻 群 鳩 魂 庄 曾 割 例 [SEP] 忍 典 㐌 哿 乙 群 鳩 汝 卢'}


Map:   0%|          | 0/7232 [00:00<?, ? examples/s]

Map:   0%|          | 0/402 [00:00<?, ? examples/s]

Map:   0%|          | 0/402 [00:00<?, ? examples/s]

## *Inference*

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
from datasets import load_from_disk

# Maximum sequence length used when tokenizing
MAX_LENGTH = 256

# Tokenizer and model weights are both read from the fine-tuning output directory
tokenizer = BertTokenizer.from_pretrained("fine_tuned_model")
model = BertForMaskedLM.from_pretrained("fine_tuned_model")

# Only the test split of the saved dataset is used in this cell
loaded_dataset = load_from_disk("my_dataset")
test_dataset = loaded_dataset['test']

# 5. Tokenization function
def tokenize_function(examples):
    """Tokenize the ``text`` field, truncating to at most ``MAX_LENGTH`` tokens.

    No padding is requested here. This function is mapped over one example at
    a time, where ``padding=True`` (pad to the longest sequence in the call)
    never adds anything; batches are padded later by the MLM data collator.

    Args:
        examples: A dataset record exposing a ``"text"`` entry.

    Returns:
        The tokenizer's encoding for that text.
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)
# Tokenize the test split and drop the raw "text" column
test_dataset = test_dataset.map(
    function=tokenize_function,
    remove_columns=["text"],
)

# DEBUG: show the first tokenized test example
print("Sample from Tokenized Test Dataset:", test_dataset[0])

# 6. Data collator: masked-LM batches with a 0.2 masking probability
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)

# 7. Calculate masked language modeling token accuracy
def calculate_token_accuracy(model, dataset, tokenizer, data_collator, batch_size=32):
    """
    Calculate token accuracy for masked language modeling.

    The model is put in eval mode and moved to CUDA when available (CPU
    otherwise). Every batch produced by ``data_collator`` must contain
    ``input_ids``, ``attention_mask`` and ``labels``. Positions whose label is
    ``-100`` are skipped; every other position is scored by comparing the
    argmax of the model's logits with the label.

    Args:
        model: Pretrained MLM model; called with ``input_ids`` and
            ``attention_mask`` and expected to return an object with ``.logits``.
        dataset: Tokenized dataset.
        tokenizer: Not used by the computation; kept so existing callers that
            pass it keep working.
        data_collator: Data collator for batching (passed as ``collate_fn``).
        batch_size: Batch size for processing.

    Returns:
        Token accuracy as a percentage (0-100), or 0.0 when no position was scored.
    """
    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    correct_predictions = 0
    total_masked = 0

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    with torch.no_grad():
        for batch in dataloader:
            # Move batch to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass; labels are deliberately not passed, only logits are needed
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)

            # Positions to score: everything whose label is not the ignore value -100
            mask_positions = (labels != -100)

            # Compare predictions with labels only at the scored positions
            correct_predictions += torch.sum(
                (predictions == labels) & mask_positions
            ).item()

            total_masked += torch.sum(mask_positions).item()

    # Guard against division by zero when nothing was scored
    accuracy = (correct_predictions / total_masked) * 100 if total_masked > 0 else 0.0

    return accuracy

# 8. Calculate and print token accuracy
# The MLM collator chooses which tokens to mask at random; seeding torch's
# global RNG first makes the reported accuracy repeatable from run to run.
torch.manual_seed(42)
print("Calculating Test Accuracy...")
test_accuracy = calculate_token_accuracy(model, test_dataset, tokenizer, data_collator)
print(f"Test Set Token Accuracy: {test_accuracy:.2f}%")

Map:   0%|          | 0/402 [00:00<?, ? examples/s]

Sample from Tokenized Test Dataset: {'input_ids': [101, 2668, 7626, 984, 746, 5186, 5811, 3143, 2621, 5811, 4373, 46147, 1350, 3613, 4374, 2723, 4374, 5811, 3143, 2184, 2128, 1235, 4374, 1538, 1751, 1062, 2548, 1751, 1062, 3346, 1751, 1062, 3152, 1751, 1062, 2566, 1751, 1062, 4886, 1751, 1062, 5023, 984, 6538, 5445, 2347, 102, 1063, 3299, 5059, 1169, 6972, 3351, 2345, 6328, 2199, 7526, 1070, 1146, 2127, 677, 3825, 678, 3825, 2571, 2336, 7518, 2128, 2705, 2255, 5023, 2424, 102, 2541, 808, 6258, 1392, 5238, 3837, 3696, 1726, 6518, 2541, 3511, 102, 673, 3299, 753, 3189, 2498, 3215, 6210, 3346, 3175, 2900, 3346, 1266, 6121, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Sanity check: let the locally fine-tuned model fill in one masked character.
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('fine_tuned_model')
# Load the model
model = BertForMaskedLM.from_pretrained('fine_tuned_model')

text = '大 [MASK] 百 官 其 𢮿 花 供 饌 皆 用 新 禮'

inputs = tokenizer(text, return_tensors="pt", add_special_tokens=True)
# Column indices (token positions) at which the input holds the [MASK] token id
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
# Ground Truth: 宴
# Inference only: no gradients are needed, so skip autograd bookkeeping
with torch.no_grad():
    logits = model(**inputs).logits
# Vocabulary scores at the masked position(s) of the first (only) sequence
mask_token_logits = logits[0, mask_token_index, :]

# Highest-scoring vocabulary entry for the first masked position
print("Predicted word:",  tokenizer.decode(mask_token_logits[0].argmax()))

Predicted word: 宴


## *Saving the logs and the model*


In [None]:
# Recursively archive everything under /content into concate.zip.
# NOTE(review): the recorded output shows this also sweeps in Colab's .config directory — confirm that is intended.
!zip -r concate.zip /content

  adding: content/ (stored 0%)
  adding: content/.config/ (stored 0%)
  adding: content/.config/.last_opt_in_prompt.yaml (stored 0%)
  adding: content/.config/default_configs.db (deflated 98%)
  adding: content/.config/.last_survey_prompt.yaml (stored 0%)
  adding: content/.config/gce (stored 0%)
  adding: content/.config/config_sentinel (stored 0%)
  adding: content/.config/.last_update_check.json (deflated 22%)
  adding: content/.config/logs/ (stored 0%)
  adding: content/.config/logs/2024.12.19/ (stored 0%)
  adding: content/.config/logs/2024.12.19/14.20.18.151587.log (deflated 58%)
  adding: content/.config/logs/2024.12.19/14.20.29.520330.log (deflated 57%)
  adding: content/.config/logs/2024.12.19/14.20.16.940511.log (deflated 87%)
  adding: content/.config/logs/2024.12.19/14.19.43.316528.log (deflated 93%)
  adding: content/.config/logs/2024.12.19/14.20.30.129972.log (deflated 57%)
  adding: content/.config/logs/2024.12.19/14.20.05.781718.log (deflated 58%)
  adding: content/.con

## *Optional saving*



In [None]:
# Recursively archive the fine-tuned model directory (including its checkpoint-* subfolders) into output.zip.
!zip -r output.zip ./fine_tuned_model/

  adding: fine_tuned_model/ (stored 0%)
  adding: fine_tuned_model/vocab.txt (deflated 42%)
  adding: fine_tuned_model/tokenizer.json (deflated 91%)
  adding: fine_tuned_model/checkpoint-738/ (stored 0%)
  adding: fine_tuned_model/checkpoint-738/generation_config.json (deflated 8%)
  adding: fine_tuned_model/checkpoint-738/scheduler.pt (deflated 56%)
  adding: fine_tuned_model/checkpoint-738/trainer_state.json (deflated 72%)
  adding: fine_tuned_model/checkpoint-738/optimizer.pt (deflated 9%)
  adding: fine_tuned_model/checkpoint-738/model.safetensors (deflated 8%)
  adding: fine_tuned_model/checkpoint-738/training_args.bin (deflated 52%)
  adding: fine_tuned_model/checkpoint-738/rng_state.pth (deflated 25%)
  adding: fine_tuned_model/checkpoint-738/config.json (deflated 54%)
  adding: fine_tuned_model/generation_config.json (deflated 8%)
  adding: fine_tuned_model/checkpoint-984/ (stored 0%)
  adding: fine_tuned_model/checkpoint-984/generation_config.json (deflated 8%)
  adding: fine_

In [None]:
# Recursively archive the ./logs directory (TensorBoard event files) into logs.zip.
!zip -r logs.zip ./logs


  adding: logs/ (stored 0%)
  adding: logs/events.out.tfevents.1734751731.fb39d44d2143.1227.0 (deflated 62%)
  adding: logs/events.out.tfevents.1734752966.fb39d44d2143.1227.1 (deflated 44%)
