In [2]:
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorForTokenClassification
)
from seqeval.metrics import classification_report, accuracy_score
import evaluate
import torch
import time

  from .autonotebook import tqdm as notebook_tqdm


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'




In [7]:
#!pip install seqeval evaluate
#!ls /kaggle/input/

In [8]:
# Sanity check: print whether PyTorch reports an available CUDA device.
import torch
print(torch.cuda.is_available())

True


In [3]:
# -------------------
# 1. Parse CoNLL file
# -------------------
def parse_conll(file_path):
    """Read a tab-separated CoNLL-style file into sentence records.

    Every non-blank line is split on tabs. When it has at least three
    columns, the first is taken as the token and the third as the NER tag
    (the second column is ignored). Blank lines close the current sentence.
    Lines with fewer than three columns are reported on stdout and skipped.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A pair ``(sentences, labels)``: ``sentences`` is a list of
        ``{"tokens": [...], "ner_tags": [...]}`` dicts in file order, and
        ``labels`` is the sorted list of distinct tags that were read.
    """
    sentences = []
    seen_tags = set()
    current = {"tokens": [], "ner_tags": []}

    with open(file_path, encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            line = line.strip()

            if line == "":
                # Sentence boundary: emit the pending sentence, if there is one.
                if current["tokens"]:
                    sentences.append(current)
                    current = {"tokens": [], "ner_tags": []}
                continue

            columns = line.split("\t")
            if len(columns) >= 3:
                word, tag = columns[0], columns[2]
                current["tokens"].append(word)
                current["ner_tags"].append(tag)
                seen_tags.add(tag)
            else:
                print(f"Line {i} skipped (not enough columns): {line}")

    # The file may not end with a blank line; keep the trailing sentence.
    if current["tokens"]:
        sentences.append(current)

    return sentences, sorted(seen_tags)

In [11]:
# -------------------
# 2. Load train, val, test
# -------------------
dataset_path = "/kaggle/input/myner-mmdt/"

# Each call returns the sentence records of one split plus the sorted list of
# distinct tags found in that split.
train_data, label_list = parse_conll(dataset_path + "ner_train.conll")
val_data, val_labels = parse_conll(dataset_path + "ner_val.conll")
test_data, test_labels = parse_conll(dataset_path + "ner_test.conll")

# The label vocabulary is built from the training split only. A tag that occurs
# only in val/test has no entry in label2id and would otherwise surface much
# later as a bare KeyError while the datasets are being tokenized, so fail here
# with an explicit message instead.
unseen_labels = sorted((set(val_labels) | set(test_labels)) - set(label_list))
if unseen_labels:
    raise ValueError(
        f"Labels present in val/test but missing from train: {unseen_labels}"
    )

# Bidirectional tag <-> integer ID maps, numbered in sorted tag order.
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

In [None]:
# IPython shell escape: list the contents of /kaggle/input/ (debugging aid).
!ls /kaggle/input/

# -----------------------------------------------
# Tokenize words and align NER labels for DistilBERT
# -----------------------------------------------
Example:
Sentence (word-level): ["I", "love", "Myanmar"]
NER tags: [O, O, B-LOC]

Step 1: Tokenize with DistilBERT tokenizer (subword tokenization)
"I"       -> ["I"]
"love"    -> ["love"]
"Myanmar" -> ["My", "##anmar"]

Step 2: Map subwords to original word index
word_ids = [None, 0, 1, 2, 2, None]  # each subword points to its word; None marks [CLS]/[SEP] (and padding)

Step 3: Align labels
   - First subword of a word: assign the word's original label
   - Continuation subwords: assign the same label as the first subword (the choice made in this notebook; -100 is the alternative if they should be ignored)
   - Special tokens ([CLS]/[SEP]) or padding: assign -100 (ignored in the loss)
   labels = [-100, O, O, B-LOC, B-LOC, -100]

 Step 4: Add aligned labels to tokenized inputs
   - tokenized_inputs["labels"] = labels
   - Now ready for model training (Trainer/DataLoader)

 Result: Each subword has a correct label aligned with its word,
         padding/special tokens are ignored with -100

tokenized_inputs now contains:

| Key              | Description                                                  |
| ---------------- | ------------------------------------------------------------ |
| `input_ids`      | IDs of subword tokens for DistilBERT                         |
| `attention_mask` | Mask for real tokens vs padding                              |
| `labels`         | NER labels aligned with subwords (`-100` for ignored tokens) |

Illustrative example (the exact numbers depend on the tokenizer vocabulary and on label2id):

tokens = ["I", "love", "Myanmar"]
ner_tags = ["O", "O", "B-LOC"]
input_ids      = [101, 1045, 2293, 12950, 22289, 102, 0, 0, ...]  # subword IDs
attention_mask = [1, 1, 1, 1, 1, 1, 0, 0, ...]                     # 0 = padding
labels         = [-100, 0, 0, 3, 3, -100, -100, -100, ...]         # aligned NER IDs (here O -> 0, B-LOC -> 3)


In [None]:
# -------------------
# 3. Tokenizer (DistilBERT)
# -------------------
# Checkpoint name used to load the tokenizer.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and attach subword-aligned label IDs.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (the matching list of tag strings per sentence).

    Returns:
        The tokenizer output with an added "labels" entry: one list of label
        IDs per sentence. Every subword carries the ID of its source word's
        tag; positions with no source word (word id ``None``, i.e. special
        tokens and padding) carry -100 so the loss ignores them.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,  # the input is already split into words
        truncation=True,
        max_length=128,
        padding="max_length",      # pad every sequence to max_length
    )

    def align(batch_index, tag_seq):
        # First and continuation subwords of a word receive the same label,
        # so only "has a source word or not" needs to be distinguished.
        return [
            -100 if word_idx is None else label2id[tag_seq[word_idx]]
            for word_idx in encoded.word_ids(batch_index=batch_index)
        ]

    encoded["labels"] = [
        align(batch_index, tag_seq)
        for batch_index, tag_seq in enumerate(examples["ner_tags"])
    ]
    return encoded


# Run the tokenize/align step over every split in batched mode.
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

In [None]:
# -------------------
# 4. Model (DistilBERT)
# -------------------
from transformers import AutoModelForTokenClassification

model_name = "distilbert-base-multilingual-cased"
retries = 5

# Load the pretrained checkpoint with a token-classification head sized to the
# label vocabulary. The first attempt may reuse the local cache; a re-download
# is forced only on retries (in case a cached file was the cause of the
# failure), and each retry waits a little longer before trying again.
for i in range(retries):
    try:
        model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(label_list),
            id2label=id2label,  # mapping between numeric IDs and label names
            label2id=label2id,
            force_download=(i > 0),
        )
        print("Model downloaded successfully!")
        break
    except Exception as e:
        print(f"Attempt {i+1} failed: {e}")
        if i == retries - 1:
            # Out of attempts: surface the last error to the caller.
            raise
        time.sleep(2 ** i)  # exponential backoff: 1s, 2s, 4s, 8s


# -------------------
# 5. Training Arguments (mixed precision when a GPU is available)
# -------------------
# Data-loading workers are forked (dataloader_num_workers below). Configuring
# tokenizer parallelism explicitly avoids the repeated "process just got
# forked" warnings from huggingface/tokenizers. setdefault keeps any value the
# user has already exported.
import os
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # evaluate at the end of each epoch
    save_strategy="epoch",  # save a checkpoint at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    num_train_epochs=5,
    weight_decay=0.01,  # regularization
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # key returned by compute_metrics
    greater_is_better=True,
    seed=42,
    dataloader_num_workers=4,  # speed up data loading
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present
    report_to="none",
    disable_tqdm=False,
)


# -------------------
# 6. Metrics
# -------------------
# Load the "seqeval" metric through the `evaluate` library; compute_metrics calls its .compute().
seqeval_metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Convert raw evaluation output into seqeval summary scores.

    Args:
        p: Pair ``(predictions, labels)``. The arg-max of ``predictions`` is
            taken over axis 2; ``labels`` holds label IDs, with -100 marking
            positions that are left out of the comparison.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        corresponding ``overall_*`` entries of the seqeval result.
    """
    raw_scores, gold_ids = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    true_labels, pred_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        kept_true, kept_pred = [], []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                continue  # special token / padding position
            kept_true.append(id2label[gold_id])
            kept_pred.append(id2label[pred_id])
        true_labels.append(kept_true)
        pred_labels.append(kept_pred)

    scores = seqeval_metric.compute(predictions=pred_labels, references=true_labels)
    summary = {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }
    return summary


# -------------------
# 7. Trainer
# -------------------
# Collator for token classification: pads input IDs and labels when batching.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Stop training when the monitored metric has not improved for 2 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# NOTE(review): the captured output shows a warning pointing at this call,
# possibly about the `tokenizer=` argument; confirm against the installed
# transformers version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)


# -------------------
# 8. Training
# -------------------
torch.set_num_threads(4)  # limit PyTorch's CPU thread count to 4

# Wall-clock timing of the whole training run (includes the per-epoch evaluations
# configured in training_args).
start_time = time.time()
trainer.train()
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

# -------------------
# 9. Save best model
# -------------------
# Save the trainer's current model and the tokenizer files into the same directory.
# With load_best_model_at_end=True in training_args this is expected to be the
# best checkpoint by F1 — TODO confirm against the Trainer docs for the installed version.
trainer.save_model("./distilmbert_epoch5")
tokenizer.save_pretrained("./distilmbert_epoch5")


config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model downloaded successfully!


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avo

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.048,0.052664,0.87433,0.865639,0.869963,0.982738
2,0.042,0.043816,0.902672,0.885757,0.894135,0.985757
3,0.0326,0.04305,0.894939,0.908679,0.901757,0.986428
4,0.0226,0.046046,0.893932,0.913941,0.903826,0.986779
5,0.0158,0.05017,0.890586,0.919008,0.904574,0.986679


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Training time: 2600.93 seconds


('./distilmbert_epoch5/tokenizer_config.json',
 './distilmbert_epoch5/special_tokens_map.json',
 './distilmbert_epoch5/vocab.txt',
 './distilmbert_epoch5/added_tokens.json',
 './distilmbert_epoch5/tokenizer.json')

In [None]:
# -------------------
# 10. Evaluation
# -------------------
from sklearn.metrics import classification_report, accuracy_score

# Predict on the test split and take the arg-max class for every token position.
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=2)

# Decode IDs to tag strings, dropping positions whose gold label is -100
# (special tokens / padding). Defined here so this cell runs on a fresh kernel
# without depending on variables created by any other evaluation cell.
true_labels = [
    [id2label[label_id] for label_id in label_seq if label_id != -100]
    for label_seq in labels
]
pred_labels = [
    [id2label[pred_id] for pred_id, label_id in zip(pred_seq, label_seq) if label_id != -100]
    for pred_seq, label_seq in zip(predictions, labels)
]

# Flatten token-level labels: sklearn's report works on flat label lists.
true_labels_flat = [label for seq in true_labels for label in seq]
pred_labels_flat = [label for seq in pred_labels for label in seq]

print(classification_report(true_labels_flat, pred_labels_flat, digits=3))
print("Accuracy:", accuracy_score(true_labels_flat, pred_labels_flat))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

              precision    recall  f1-score   support

      B-DATE      0.947     0.957     0.952     10397
       B-LOC      0.898     0.924     0.911     45861
      B-TIME      0.923     0.945     0.934      1439
      I-DATE      0.952     0.956     0.954      8437
       I-LOC      0.832     0.856     0.844     23790
      I-TIME      0.926     0.973     0.949      1870
           O      0.994     0.992     0.993   1162839

    accuracy                          0.986   1254633
   macro avg      0.925     0.943     0.934   1254633
weighted avg      0.987     0.986     0.987   1254633

Accuracy: 0.9864151508847607


In [None]:
from sklearn.metrics import classification_report

# Predict on the test split and take the arg-max class for every token position.
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=2)

# Decode IDs to tag strings, skipping positions whose gold label is -100.
true_labels, pred_labels = [], []
for pred_seq, label_seq in zip(predictions, labels):
    true_row, pred_row = [], []
    for pred_id, label_id in zip(pred_seq, label_seq):
        if label_id != -100:
            true_row.append(id2label[label_id])
            pred_row.append(id2label[pred_id])
    true_labels.append(true_row)
    pred_labels.append(pred_row)

# Flatten the per-sentence lists into one token-level list each.
flat_true = [lab for seq in true_labels for lab in seq]
flat_pred = [lab for seq in pred_labels for lab in seq]

# --- Keep BIO tags ---
print(classification_report(flat_true, flat_pred, digits=3))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

              precision    recall  f1-score   support

      B-DATE      0.947     0.957     0.952     10397
       B-LOC      0.898     0.924     0.911     45861
      B-TIME      0.923     0.945     0.934      1439
      I-DATE      0.952     0.956     0.954      8437
       I-LOC      0.832     0.856     0.844     23790
      I-TIME      0.926     0.973     0.949      1870
           O      0.994     0.992     0.993   1162839

    accuracy                          0.986   1254633
   macro avg      0.925     0.943     0.934   1254633
weighted avg      0.987     0.986     0.987   1254633



In [None]:
import shutil

# Zip the model directory; the returned archive path is shown as the cell output.
# NOTE(review): the model was saved to the relative path "./distilmbert_epoch5";
# this assumes the working directory is /kaggle/working — confirm.
shutil.make_archive("/kaggle/working/distilmbert_epoch5", 'zip', "/kaggle/working/distilmbert_epoch5")


'/kaggle/working/distilmbert_epoch5.zip'