In [37]:
!pip3 uninstall accelerate transformers -y
!pip3 install "transformers[torch]" datasets seqeval
!pip3 install accelerate==0.26.0

Found existing installation: accelerate 1.3.0
Uninstalling accelerate-1.3.0:
  Successfully uninstalled accelerate-1.3.0
Found existing installation: transformers 4.48.1
Uninstalling transformers-4.48.1:
  Successfully uninstalled transformers-4.48.1
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers[torch]
  Using cached transformers-4.48.1-py3-none-any.whl (9.7 MB)
Collecting accelerate>=0.26.0
  Using cached accelerate-1.3.0-py3-none-any.whl (336 kB)
Installing collected packages: transformers, accelerate
Successfully installed accelerate-1.3.0 transformers-4.48.1
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate==0.26.0
  Downloading accelerate-0.26.0-py3-none-any.whl (270 kB)
[K     |████████████████████████████████| 270 kB 6.0 MB/s eta 0:00:

In [24]:
import os
from datasets import Dataset

# Set before any tokenizer is created later in the notebook.
# NOTE(review): presumably this turns off the Hugging Face `tokenizers`
# internal parallelism (and its fork-related warning) — confirm against the
# library documentation.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def _read_sentences(file_path):
    """Parse one tab-separated annotation file into sentence records.

    Each non-blank line is expected to hold exactly four tab-separated
    columns: word, POS tag, NER tag, clause tag. Blank lines separate
    sentences. Lines with any other column count are skipped.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of ``{"tokens": [...], "ner_tags": [...]}`` dicts, one per
        non-empty sentence, in file order.
    """
    sentences = []
    tokens, current_tags = [], []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:  # Sentence boundary
                if tokens:
                    sentences.append({"tokens": tokens, "ner_tags": current_tags})
                    tokens, current_tags = [], []
                continue
            parts = line.split("\t")
            if len(parts) != 4:
                # Rows that do not have exactly four columns are ignored.
                continue
            word, _pos, ner_tag, _clause = parts  # POS and clause tags are unused
            tokens.append(word)
            current_tags.append(ner_tag)
    if tokens:  # The file may not end with a blank line
        sentences.append({"tokens": tokens, "ner_tags": current_tags})
    return sentences


def load_datasets(folder_path):
    """Load every ``.txt`` file in a folder into one Hugging Face ``Dataset``.

    Args:
        folder_path: Directory whose ``.txt`` files are read; other files are
            ignored.

    Returns:
        ``Dataset.from_list`` of the sentence records from all files, with
        ``tokens`` and ``ner_tags`` columns.
    """
    sentences = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting example order reproducible across machines and runs.
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.endswith(".txt"):  # Only process .txt files
            sentences.extend(_read_sentences(os.path.join(folder_path, file_name)))

    # Convert sentences into a Hugging Face Dataset
    return Dataset.from_list(sentences)

# Read the three splits (train, then test, then validation) from their folders.
train_dataset, test_dataset, val_dataset = (
    load_datasets(split_dir)
    for split_dir in ("train/train", "test/test", "eval/eval")
)


In [25]:
# Gather every distinct NER tag that occurs in the training split.
unique_tags = {tag for example in train_dataset for tag in example["ner_tags"]}

# Number the tags in sorted order, then build the reverse lookup.
tag2id = dict(zip(sorted(unique_tags), range(len(unique_tags))))
id2tag = dict(zip(tag2id.values(), tag2id.keys()))
print("Tag to ID mapping:", tag2id)

def encode_tags(examples):
    """Add a ``labels`` list of integer tag IDs to a single example.

    Each entry of ``examples["ner_tags"]`` is looked up in the module-level
    ``tag2id``; a tag that is not in the mapping gets the ID of ``"O"``.
    The example dict is updated in place and returned.
    """
    label_ids = []
    for tag in examples["ner_tags"]:
        label_ids.append(tag2id.get(tag, tag2id["O"]))
    examples["labels"] = label_ids
    return examples

# Attach integer ``labels`` to each split (train, validation, test in turn).
train_dataset, val_dataset, test_dataset = (
    split.map(encode_tags)
    for split in (train_dataset, val_dataset, test_dataset)
)


Tag to ID mapping: {'B': 0, 'B_BRN': 1, 'B_DES': 2, 'B_DTM': 3, 'B_D`TM': 4, 'B_LOC': 5, 'B_MEA': 6, 'B_NAME': 7, 'B_NUM': 8, 'B_ORG': 9, 'B_PER': 10, 'B_TRM': 11, 'B_TTL': 12, 'DDEM': 13, 'E_BRN': 14, 'E_DES': 15, 'E_DTM': 16, 'E_LOC': 17, 'E_MEA': 18, 'E_NUM': 19, 'E_ORG': 20, 'E_PER': 21, 'E_TRM': 22, 'E_TTL': 23, 'I': 24, 'I_BRN': 25, 'I_DES': 26, 'I_DTM': 27, 'I_LOC': 28, 'I_MEA': 29, 'I_NUM': 30, 'I_ORG': 31, 'I_PER': 32, 'I_TRM': 33, 'I_TTL': 34, 'MEA_BI': 35, 'O': 36, 'OBRN_B': 37, 'ORG_I': 38, 'PER_I': 39, '__': 40}


Map: 100%|██████████| 3794/3794 [00:02<00:00, 1472.01 examples/s]
Map: 100%|██████████| 474/474 [00:00<00:00, 2296.91 examples/s]


In [26]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded.
TOKENIZER_CHECKPOINT = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and expand labels to token level.

    ``examples["tokens"]`` is passed to the module-level ``tokenizer`` with
    truncation enabled. For every produced token, ``word_ids`` gives the index
    of the source word: tokens with no source word (``None``) get ``-100``,
    every other token gets the label of its word. The per-token lists replace
    ``labels`` in the returned encoding.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )

    aligned_batch = []
    for row, word_labels in enumerate(examples["labels"]):
        row_labels = []
        for word_index in encoded.word_ids(batch_index=row):
            if word_index is None:
                # NOTE(review): -100 is presumably the loss ignore index — confirm.
                row_labels.append(-100)
            else:
                row_labels.append(word_labels[word_index])
        aligned_batch.append(row_labels)

    encoded["labels"] = aligned_batch
    return encoded

# Tokenize each split in batches (train, validation, test in turn).
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]


Map: 100%|██████████| 3794/3794 [00:35<00:00, 107.50 examples/s]
Map: 100%|██████████| 474/474 [00:03<00:00, 133.97 examples/s]


In [40]:
from transformers import (
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# Load the pretrained encoder with a new token-classification head sized to the
# tag set. Passing the id<->tag mappings stores them in the model config, so
# the saved model reports real tag names instead of generic LABEL_<n>.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id,
)

training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
)

# Sentences tokenize to different lengths, so both the inputs *and* the
# per-token `labels` must be padded per batch. The collator Trainer picks by
# default pads only the tokenizer's own fields and fails on ragged `labels`;
# this one pads `labels` as well.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` is the current name of the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [39]:
import torch
import transformers
import accelerate

# Report the versions of the libraries that are importable in this kernel.
print(
    "\n".join(
        f"{label} version: {module.__version__}"
        for label, module in (
            ("Torch", torch),
            ("Transformers", transformers),
            ("Accelerate", accelerate),
        )
    )
)


Torch version: 2.5.1
Transformers version: 4.47.0
Accelerate version: 1.3.0


In [None]:
# Run evaluation with the trainer and show the returned results.
results = trainer.evaluate()
print(results)

# Write the model and its tokenizer to the same directory so they can be
# loaded together.
NER_MODEL_DIR = "./ner_model"
model.save_pretrained(NER_MODEL_DIR)
tokenizer.save_pretrained(NER_MODEL_DIR)

In [None]:
from transformers import pipeline

# Build an inference pipeline from the saved model/tokenizer directory.
# NOTE(review): the tag set printed earlier in this notebook uses `B_`/`I_`/`E_`
# style prefixes; confirm that aggregation_strategy="simple" groups such tags
# the way you expect.
saved_model_dir = "./ner_model"
ner_pipeline = pipeline(
    "ner",
    model=saved_model_dir,
    tokenizer=saved_model_dir,
    aggregation_strategy="simple",
)

# Run the pipeline on a sample sentence and show the result.
text = "ประเทศไทยมีศาลที่สำคัญ"
print(ner_pipeline(text))