In [1]:
!pip3 uninstall accelerate transformers -y
!pip3 install "transformers[torch]" datasets seqeval
!pip3 install accelerate==0.26.0

Found existing installation: accelerate 0.26.0
Uninstalling accelerate-0.26.0:
  Successfully uninstalled accelerate-0.26.0
Found existing installation: transformers 4.48.1
Uninstalling transformers-4.48.1:
  Successfully uninstalled transformers-4.48.1
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers[torch]
  Using cached transformers-4.48.1-py3-none-any.whl (9.7 MB)
Collecting accelerate>=0.26.0
  Using cached accelerate-1.3.0-py3-none-any.whl (336 kB)
Installing collected packages: transformers, accelerate
Successfully installed accelerate-1.3.0 transformers-4.48.1
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate==0.26.0
  Using cached accelerate-0.26.0-py3-none-any.whl (270 kB)
Installing collected packages: accelerate
  Attempting uninstall

In [3]:
import os
from datasets import Dataset

# Export TOKENIZERS_PARALLELISM="false" for this process before any tokenizer is
# created. NOTE(review): presumably meant to silence the Hugging Face tokenizers
# fork/parallelism warning during Dataset.map and training — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load all dataset files in a folder
def load_datasets(folder_path):
    """Read every ``.txt`` file in ``folder_path`` into one Hugging Face Dataset.

    Each non-blank line is expected to carry four tab-separated columns
    (word, POS tag, NER tag, clause tag); only the word and the NER tag are
    kept. A blank line ends the current sentence, and the end of a file ends
    the sentence in progress.

    Files are read in sorted name order so the resulting example order is the
    same on every run and every machine (``os.listdir`` returns entries in
    arbitrary order).

    Args:
        folder_path: Directory containing the ``.txt`` files.

    Returns:
        A ``Dataset`` with one row per sentence and the columns ``"tokens"``
        and ``"ner_tags"`` (parallel lists of strings).
    """
    sentences = []

    # Sorted for a reproducible example order across runs and file systems
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".txt"):  # Only process .txt files
            continue
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", encoding="utf-8") as f:
            tokens, current_tags = [], []
            for line in f:
                line = line.strip()
                if not line:  # Sentence boundary
                    if tokens:  # Save the current sentence
                        sentences.append({"tokens": tokens, "ner_tags": current_tags})
                        tokens, current_tags = [], []
                    continue
                parts = line.split("\t")
                # Lines that do not have exactly four columns are skipped
                if len(parts) == 4:
                    word, _pos, ner_tag, _clause = parts
                    tokens.append(word)
                    current_tags.append(ner_tag)
            if tokens:  # Save the last sentence in the file
                sentences.append({"tokens": tokens, "ner_tags": current_tags})

    # Convert sentences into a Hugging Face Dataset
    return Dataset.from_list(sentences)

# Load the three splits (train, test, validation) from their own folders,
# in that order.
train_dataset, test_dataset, val_dataset = (
    load_datasets(split_dir)
    for split_dir in ("train/train", "test/test", "eval/eval")
)


In [13]:
# Tag inventory for the corpus. A tag's position in this tuple is its integer
# class ID, so the order must not change.
_TAG_NAMES = (
    "O",
    "B_ORG", "B_PER", "B_LOC", "B_MEA",
    "I_DTM", "I_ORG", "E_ORG", "I_PER",
    "B_TTL", "E_PER", "B_DES", "E_LOC",
    "B_DTM", "B_NUM", "I_MEA", "E_DTM",
    "E_MEA", "I_LOC", "I_DES", "E_DES",
    "I_NUM", "E_NUM", "B_TRM", "B_BRN",
    "I_TRM", "E_TRM", "I_TTL", "I_BRN",
    "E_BRN", "E_TTL", "B_NAME",
)

# Tag name -> class ID
tag2id = {name: idx for idx, name in enumerate(_TAG_NAMES)}

# Class ID -> tag name (reverse lookup, e.g. for decoding model output)
id2tag = dict(enumerate(_TAG_NAMES))

# Turn string NER tags into integer class IDs via the tag2id mapping
def encode_tags(examples):
    """Add a ``"labels"`` entry holding the class ID of every NER tag.

    Reads ``examples["ner_tags"]`` and looks each tag up in ``tag2id``; a tag
    missing from the mapping gets the ID of ``"O"``. ``examples`` is updated
    in place and also returned.
    """
    label_ids = []
    for tag_name in examples["ner_tags"]:
        label_ids.append(tag2id.get(tag_name, tag2id["O"]))
    examples["labels"] = label_ids
    return examples

# Add the integer "labels" column to each split (train, validation, test)
train_dataset, val_dataset, test_dataset = [
    split.map(encode_tags)
    for split in (train_dataset, val_dataset, test_dataset)
]

print("Custom Tag to ID mapping:", tag2id)


Map: 100%|██████████| 3794/3794 [00:05<00:00, 717.02 examples/s] 
Map: 100%|██████████| 474/474 [00:00<00:00, 1927.23 examples/s]

Custom Tag to ID mapping: {'O': 0, 'B_ORG': 1, 'B_PER': 2, 'B_LOC': 3, 'B_MEA': 4, 'I_DTM': 5, 'I_ORG': 6, 'E_ORG': 7, 'I_PER': 8, 'B_TTL': 9, 'E_PER': 10, 'B_DES': 11, 'E_LOC': 12, 'B_DTM': 13, 'B_NUM': 14, 'I_MEA': 15, 'E_DTM': 16, 'E_MEA': 17, 'I_LOC': 18, 'I_DES': 19, 'E_DES': 20, 'I_NUM': 21, 'E_NUM': 22, 'B_TRM': 23, 'B_BRN': 24, 'I_TRM': 25, 'E_TRM': 26, 'I_TTL': 27, 'I_BRN': 28, 'E_BRN': 29, 'E_TTL': 30, 'B_NAME': 31}





In [14]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "bert-base-multilingual-cased" checkpoint
# (the same checkpoint name the model is loaded from further down).
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align labels to sub-tokens.

    ``examples["tokens"]`` is a batch of word lists and ``examples["labels"]``
    the matching per-word label IDs. Every produced token receives the label
    of the word it maps to; positions with no source word (``word_ids``
    returns ``None`` for them) receive -100.
    NOTE(review): -100 is presumably chosen because it is the ignore index of
    the training loss — confirm.

    Returns the tokenizer output with a ``"labels"`` entry added.
    """
    # Truncate to at most 512 tokens and pad within the batch being tokenized
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        padding=True,
    )

    aligned = []
    for row, word_labels in enumerate(examples["labels"]):
        row_labels = []
        for word_idx in encoding.word_ids(batch_index=row):
            if word_idx is None:
                row_labels.append(-100)
            else:
                row_labels.append(word_labels[word_idx])
        aligned.append(row_labels)

    # Replace the per-word labels with the per-token ones
    encoding["labels"] = aligned
    return encoding

# Tokenize every split in batches (train, validation, test)
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]


Map: 100%|██████████| 3794/3794 [00:37<00:00, 101.90 examples/s]
Map: 100%|██████████| 474/474 [00:03<00:00, 148.16 examples/s]


In [15]:
from transformers import (
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Seed before the model is built: the classifier head is newly (randomly)
# initialised, so without this each run starts from different weights.
set_seed(42)

# Load the model and store the tag names in its config, so the saved model and
# any pipeline built from it report real tag names instead of generic LABEL_n.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(tag2id),
    id2label=id2tag,
    label2id=tag2id,
)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` is the current name of the deprecated `tokenizer` argument
    processing_class=tokenizer,
    # Pads the label sequences (with -100) together with the inputs. The default
    # collator pads only the tokenizer's own fields, so examples whose label
    # lists differ in length cannot be stacked into one batch.
    data_collator=DataCollatorForTokenClassification(tokenizer),
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Score the model on the evaluation split given to the Trainer and show the metrics
results = trainer.evaluate()
print(results)

# Write the model and the tokenizer into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./ner_model")

In [None]:
from transformers import pipeline

# Build an inference pipeline from the directory the model and tokenizer were saved to
ner_pipeline = pipeline(
    task="ner",
    model="./ner_model",
    tokenizer="./ner_model",
    aggregation_strategy="simple",
)

# Smoke test on one short sentence
text = "ประเทศไทยมีศาลที่สำคัญ"
print(ner_pipeline(text))