In [1]:
%%capture
!pip install datasets --no-build-isolation
!pip install seqeval
!pip install transformers[torch]
!pip install evaluate


In [2]:
import os
from getpass import getpass

from huggingface_hub import login

# Never store an access token in the notebook itself: anything committed or
# shared with the file is readable by everyone who receives it. Read the token
# from the HF_TOKEN environment variable, and fall back to a hidden prompt.
# SECURITY: a token that was ever hard-coded in this cell must be treated as
# leaked and revoked at https://huggingface.co/settings/tokens.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, EarlyStoppingCallback
import torch
from torch.utils.data import DataLoader
import numpy as np
import evaluate

In [4]:
# Fetch the NER dataset from the Hub, then shuffle every split with one fixed
# seed so the row order is the same on every run.
SHUFFLE_SEED = 12

dataset = load_dataset("procit008/NER_trainig_dataset_without_chunk_tags")
for split_name in ("train", "validation", "test"):
    dataset[split_name] = dataset[split_name].shuffle(seed=SHUFFLE_SEED)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.95M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/644k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/619k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/155178 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/20892 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/21098 [00:00<?, ? examples/s]

In [5]:
# Display the splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 155178
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 20892
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 21098
    })
})

In [6]:
# Name of the pre-trained checkpoint; reused below for both the tokenizer and the model.
checkpoint = "bert-base-uncased"

# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
# Tokenize pre-split words and project word-level tags onto sub-tokens (no fixed padding)
def tokenize_and_align_labels(examples):
    """Tokenize a batch of word lists and build per-token label ids.

    Args:
        examples: batch mapping with "tokens" (one list of words per example)
            and "ner_tags" (one list of integer tag ids per example).

    Returns:
        The tokenizer output with an added "labels" entry. For every example
        the first sub-token of each word carries that word's tag id; every
        other position (tokens without a word id, and later sub-tokens of the
        same word) is set to -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def first_subtoken_labels(word_tags, word_ids):
        # Walk the sub-tokens once, remembering which word the previous one belonged to.
        aligned = []
        last_word = None
        for current_word in word_ids:
            starts_new_word = current_word is not None and current_word != last_word
            aligned.append(word_tags[current_word] if starts_new_word else -100)
            last_word = current_word
        return aligned

    encoded["labels"] = [
        first_subtoken_labels(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Run the tokenize-and-align step over every split in batches; the result keeps the
# original columns and gains the tokenizer outputs plus a "labels" column.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/155178 [00:00<?, ? examples/s]

Map:   0%|          | 0/20892 [00:00<?, ? examples/s]

Map:   0%|          | 0/21098 [00:00<?, ? examples/s]

In [8]:
# Load the pre-trained encoder with a fresh token-classification head.
# The head size and the id<->name mappings are taken from the dataset's own tag
# vocabulary instead of a hard-coded count, so the head always matches the data
# and the saved/pushed model reports real tag names rather than LABEL_0, LABEL_1, ...
ner_label_names = dataset["train"].features["ner_tags"].feature.names

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    num_labels=len(ner_label_names),
    id2label=dict(enumerate(ner_label_names)),
    label2id={name: idx for idx, name in enumerate(ner_label_names)},
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Load the seqeval metric through the `evaluate` package.
# (An earlier revision of this cell called `load_metric("seqeval")`; that call
# is no longer used.)

metric = evaluate.load("seqeval")

# Metric callback handed to the Trainer
def compute_metrics(p):
    """Score one evaluation pass with seqeval.

    Args:
        p: pair ``(predictions, labels)``. The predictions are reduced with an
            argmax over axis 2 (assumes per-token class scores on the last
            axis -- TODO confirm); labels hold integer tag ids, with -100 at
            positions that must be ignored.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        ``overall_*`` entries of the seqeval result.
    """
    scores_per_token, gold_ids = p
    predicted_ids = np.argmax(scores_per_token, axis=2)

    def to_names(tag_ids):
        # Map integer tag ids to their string names.
        return [label_list[tag_id] for tag_id in tag_ids]

    # Drop every position whose gold label is the ignore index (-100).
    true_labels = [
        to_names(gold for gold in gold_row if gold != -100)
        for gold_row in gold_ids
    ]
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = (pred for pred, gold in zip(predicted_row, gold_row) if gold != -100)
        true_predictions.append(to_names(kept))

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=results["overall_precision"],
        recall=results["overall_recall"],
        f1=results["overall_f1"],
        accuracy=results["overall_accuracy"],
    )

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [10]:
# Tag names of the "ner_tags" column, taken from the training split's feature schema
ner_tags_feature = dataset["train"].features["ner_tags"]
label_list = ner_tags_feature.feature.names

# Collator used for dynamic (per-batch) padding
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [11]:
# Training configuration

training_args = TrainingArguments(
    # Output location and seed
    output_dir="./NER_training_base_uncased_without_chunk_tags",
    seed=12,
    # Optimisation hyperparameters
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing share the same "epoch" schedule
    eval_strategy="epoch",
    save_strategy="epoch",
    # Best-checkpoint selection: lower eval_loss is better
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# NOTE(review): with num_train_epochs=3 and one evaluation per epoch, a patience
# of 3 looks unreachable, so this callback would never stop training early --
# confirm the intended patience / epoch count.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

In [12]:
import math

# `transformers.AdamW` is deprecated and has been dropped from recent releases;
# the PyTorch implementation is the supported replacement.
from torch.optim import AdamW
from transformers import get_scheduler

# NOTE: the Trainer built later in this notebook is not given
# `optimizers=(optimizer, lr_scheduler)`, so as written these two objects are
# not used by `trainer.train()`. Pass them to the Trainer explicitly if they
# are meant to replace its defaults.

# Optimizer over all model parameters, using the rates from the training arguments
optimizer = AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=training_args.weight_decay,
)

# Optimizer steps per epoch: round UP so the final partial batch is counted.
# Assumes a single device and no gradient accumulation -- TODO confirm.
steps_per_epoch = math.ceil(
    len(tokenized_datasets["train"]) / training_args.per_device_train_batch_size
)

# Linear decay schedule without warmup
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,  # Number of warmup steps (you can change this)
    num_training_steps=int(steps_per_epoch * training_args.num_train_epochs),
)



In [13]:
# # Check if validation split exists, if not fallback to test split or create one
# if "validation" not in tokenized_datasets:
#     # Split the training set if validation doesn't exist
#     train_test_split = tokenized_datasets["train"].train_test_split(test_size=0.1)  # 10% for validation
#     tokenized_datasets["train"] = train_test_split["train"]
#     tokenized_datasets["validation"] = train_test_split["test"]


In [14]:
# Build the Trainer: model, training configuration, train/validation splits,
# padding collator, seqeval metrics and the early-stopping callback.
# The tokenizer is passed as `processing_class`; the older `tokenizer=` keyword
# is deprecated in current transformers releases and scheduled for removal.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

  trainer = Trainer(


In [None]:
# Train model (fine-tunes `model` on the train split configured in the Trainer above).
# NOTE(review): the recorded output shows an interactive wandb API-key prompt at this
# step, so the cell may block when run unattended.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0592,0.051421,0.95741,0.959475,0.958441,0.985628


In [None]:
# Evaluate model
# No dataset is passed here, so this presumably scores the Trainer's configured
# eval_dataset (the validation split); the test split is not evaluated in this
# notebook -- TODO confirm that is intended.
results = trainer.evaluate()
print(results)

In [None]:
# Target repository on the Hugging Face Hub for both the model and the tokenizer.
HUB_REPO_ID = "Procit008/NER_training_base_uncased_without_chunk_tags"

# `Trainer.push_to_hub` takes the COMMIT MESSAGE as its first positional
# argument, not a repo id; the destination comes from `args.hub_model_id`
# (falling back to the output_dir name). Set the repo explicitly and pass a
# real commit message.
trainer.args.hub_model_id = HUB_REPO_ID
trainer.push_to_hub(commit_message="End of training")

# `PreTrainedTokenizer.push_to_hub` does take the repo id as its first argument.
tokenizer.push_to_hub(HUB_REPO_ID)

In [None]:
from transformers import BertConfig

In [None]:
# Load the configuration stored with `checkpoint` (the base checkpoint name set
# earlier, not the fine-tuned output directory) for inspection.
config = BertConfig.from_pretrained(checkpoint)

In [None]:
# Display the loaded configuration.
config