In [None]:
!pip install evaluate seqeval transformers datasets

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downl

In [None]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification, TrainingArguments, AutoModelForTokenClassification
from torch.utils.data import DataLoader
import torch
import numpy as np
from sklearn.metrics import classification_report
from evaluate import load

# Sanity check: this line is only reached if every import above succeeded.
print("Libraries imported successfully.")

Libraries imported successfully.


In [None]:
# Locations of the train/test splits (token<TAB>tag files read by read_tsv below).
train_file, test_file = "/content/train.tsv", "/content/test.tsv"

# Echo both paths so the run log records which files were used.
print(f"Train file: {train_file}", f"Test file: {test_file}", sep="\n")

Train file: /content/train.tsv
Test file: /content/test.tsv


In [None]:
def read_tsv(file_path):
    """Read a two-column, tab-separated token/tag file into parallel lists.

    Each non-blank line must hold exactly one ``token<TAB>tag`` pair. Blank
    (or whitespace-only) lines separate sentences; consecutive blank lines do
    not produce empty sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read. A leading byte-order
            mark is tolerated and discarded.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the list of tokens
        of the i-th sentence and ``labels[i]`` the equally long list of tags.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields. The message names the file and line number.
    """
    sentences, labels = [], []
    sentence, label = [], []

    # "utf-8-sig" decodes plain UTF-8 too, but strips a BOM that would
    # otherwise be glued to the first token of the file.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:  # Empty line separates sentences
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                sentence, label = [], []
                continue

            fields = line.split("\t")
            if len(fields) != 2:
                # Same exception type as a failed tuple-unpack, but with
                # enough context to locate the offending row.
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'token<TAB>tag', "
                    f"got {raw_line!r}"
                )
            token, tag = fields
            sentence.append(token)
            label.append(tag)

    # Add the last sentence if the file does not end with a blank line
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Parse the train split first, then the test split, with the same reader.
(train_sentences, train_labels), (test_sentences, test_labels) = (
    read_tsv(split_path) for split_path in (train_file, test_file)
)

# Show the first training example (tokens and their tags) for verification.
print("Sample Sentence:", train_sentences[0])
print("Sample Labels:", train_labels[0])

Sample Sentence: ['Immunohistochemical', 'staining', 'was', 'positive', 'for', 'S', '-', '100', 'in', 'all', '9', 'cases', 'stained', ',', 'positive', 'for', 'HMB', '-', '45', 'in', '9', '(', '90', '%', ')', 'of', '10', ',', 'and', 'negative', 'for', 'cytokeratin', 'in', 'all', '9', 'cases', 'in', 'which', 'myxoid', 'melanoma', 'remained', 'in', 'the', 'block', 'after', 'previous', 'sections', '.']
Sample Labels: ['O', 'O', 'O', 'O', 'O', 'B-GENE', 'I-GENE', 'E-GENE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GENE', 'I-GENE', 'E-GENE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'S-GENE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
# Collect every tag that occurs in either split so both share one id space.
unique_labels = {tag for label_seq in train_labels + test_labels for tag in label_seq}

# Ids follow the sorted tag order, which keeps the assignment deterministic.
id_to_label = dict(enumerate(sorted(unique_labels)))
label_to_id = {label: idx for idx, label in id_to_label.items()}

print("Label to ID mapping:", label_to_id)

Label to ID mapping: {'B-GENE': 0, 'E-GENE': 1, 'I-GENE': 2, 'O': 3, 'S-GENE': 4}


In [None]:
from datasets import Dataset

def preprocess_data(sentences, labels, label_to_id):
    """Build a ``Dataset`` with a ``tokens`` column and a numeric ``labels`` column.

    Args:
        sentences: Token lists, one per sentence.
        labels: Tag-string lists, paired with ``sentences`` by position.
        label_to_id: Mapping from tag string to integer id; a tag missing from
            it raises ``KeyError``.

    Returns:
        Whatever ``Dataset.from_dict`` returns for a dict with the keys
        ``"tokens"`` and ``"labels"``.
    """
    # zip() pairs by position and stops at the shorter input.
    paired = list(zip(sentences, labels))
    columns = {
        "tokens": [sentence for sentence, _ in paired],
        # Convert IOBES tag strings to their integer ids.
        "labels": [[label_to_id[tag] for tag in tag_seq] for _, tag_seq in paired],
    }
    return Dataset.from_dict(columns)

# Convert both splits with the shared tag -> id mapping (train first, then test).
train_data, test_data = (
    preprocess_data(split_sentences, split_labels, label_to_id)
    for split_sentences, split_labels in (
        (train_sentences, train_labels),
        (test_sentences, test_labels),
    )
)

# Print dataset info (column names and row count) for the training split.
print(train_data)

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 12574
})


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level labels to positions.

    Uses the module-level ``tokenizer`` with truncation and padding to 128
    positions. For each position of each example the aligned label is:

    * the word's label, when the position's word id differs from the previous
      position's word id and indexes into the example's label list;
    * ``-100`` otherwise (word id is ``None``, a repeated word id, or a word
      id beyond the end of the label list).

    Args:
        examples: Batch mapping with ``"tokens"`` (list of word lists) and
            ``"labels"`` (list of integer label lists).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label list per example.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )

    def _align(word_ids, word_labels):
        # Pair every word id with its predecessor (None before the first one)
        # so "first position of a word" is a simple inequality check.
        previous_ids = [None, *word_ids]
        return [
            word_labels[current]
            if current is not None and current != previous and current < len(word_labels)
            else -100
            for current, previous in zip(word_ids, previous_ids)
        ]

    encoded["labels"] = [
        _align(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["labels"])
    ]
    return encoded

# Initialize the tokenizer from the checkpoint named here; the same
# `model_name` is reused further down when the model itself is loaded.
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize both splits in batches (train first, then test).
tokenized_train_data, tokenized_test_data = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_data, test_data)
)

# Inspect the first tokenized training example.
print(tokenized_train_data[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Map:   0%|          | 0/12574 [00:00<?, ? examples/s]

Map:   0%|          | 0/5038 [00:00<?, ? examples/s]

{'tokens': ['Immunohistochemical', 'staining', 'was', 'positive', 'for', 'S', '-', '100', 'in', 'all', '9', 'cases', 'stained', ',', 'positive', 'for', 'HMB', '-', '45', 'in', '9', '(', '90', '%', ')', 'of', '10', ',', 'and', 'negative', 'for', 'cytokeratin', 'in', 'all', '9', 'cases', 'in', 'which', 'myxoid', 'melanoma', 'remained', 'in', 'the', 'block', 'after', 'previous', 'sections', '.'], 'labels': [-100, 3, -100, -100, -100, -100, -100, 3, -100, 3, 3, 3, 0, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3, 0, -100, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, -100, -100, -100, -100, 3, 3, 3, 3, 3, 3, 3, -100, -100, 3, -100, -100, 3, 3, 3, 3, 3, 3, 3, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100

In [None]:
# Load the "seqeval" metric via evaluate.load; compute_metrics below calls metric.compute on it.
metric = load("seqeval")

def compute_metrics(p):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``np.argmax(..., axis=2)``; ``labels`` holds integer label ids
            with ``-100`` at positions to ignore.

    Returns:
        Dict with ``"precision"``, ``"recall"``, ``"f1"`` and ``"accuracy"``
        taken from the metric's ``overall_*`` results.
    """
    scores_per_class, gold_ids = p
    predicted_ids = np.argmax(scores_per_class, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Drop positions marked -100 in the gold labels, keeping prediction
        # and gold id paired so both output lists stay aligned.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([id_to_label[pred] for pred, _ in kept])
        true_labels.append([id_to_label[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return dict(
        precision=results["overall_precision"],
        recall=results["overall_recall"],
        f1=results["overall_f1"],
        accuracy=results["overall_accuracy"],
    )

# Seed torch before loading: the classifier head is not part of the checkpoint
# and is freshly initialised, so without a seed each run starts from different
# head weights.
torch.manual_seed(42)

# Load model. Passing the label maps stores the tag names in the model config,
# so saved checkpoints report e.g. "B-GENE" rather than generic "LABEL_0".
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_to_id),
    id2label=id_to_label,
    label2id=label_to_id,
)

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Import the Trainer class
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument, which newer transformers releases no longer accept.
    eval_strategy="epoch",  # Handles evaluation at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,  # Keeps only the last 2 checkpoints
    # Disable experiment-tracker integrations: with the default, an installed
    # wandb client stops the run at an interactive API-key prompt, which
    # blocks an unattended "Run all".
    report_to="none",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_test_data,
    # `processing_class` replaces the deprecated `tokenizer` argument
    # (requires transformers >= 4.46).
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0834,0.082674,0.789709,0.78432,0.787005,0.971976
2,0.0445,0.081725,0.775411,0.837804,0.805401,0.974124
3,0.0267,0.094591,0.787004,0.830344,0.808093,0.973984
4,0.0135,0.126589,0.801743,0.832408,0.816787,0.974845
5,0.0048,0.145244,0.81185,0.830662,0.821148,0.975209


TrainOutput(global_step=7860, training_loss=0.039384300062674606, metrics={'train_runtime': 2143.8303, 'train_samples_per_second': 29.326, 'train_steps_per_second': 3.666, 'total_flos': 4107043165555200.0, 'train_loss': 0.039384300062674606, 'epoch': 5.0})

In [None]:
from sklearn.metrics import classification_report

# Run the trained model over the test split; the third returned element is unused here.
predictions, labels, _ = trainer.predict(tokenized_test_data)
predictions = np.argmax(predictions, axis=2)

# Pair each prediction with its gold label, dropping positions whose gold
# label is -100, then split the pairs back into two aligned nested lists.
aligned_pairs = [
    [
        (id_to_label[pred], id_to_label[gold])
        for pred, gold in zip(pred_row, gold_row)
        if gold != -100
    ]
    for pred_row, gold_row in zip(predictions, labels)
]
true_predictions = [[pred_tag for pred_tag, _ in pairs] for pairs in aligned_pairs]
true_labels = [[gold_tag for _, gold_tag in pairs] for pairs in aligned_pairs]

# Flatten to one tag per position and print the per-tag classification report
# (gold tags first, predicted tags second).
report = classification_report(
    [tag for seq in true_labels for tag in seq],
    [tag for seq in true_predictions for tag in seq],
    zero_division=0,
)
print(report)

              precision    recall  f1-score   support

      B-GENE       0.86      0.88      0.87      3492
      E-GENE       0.85      0.86      0.85      3492
      I-GENE       0.83      0.85      0.84      5268
           O       0.99      0.99      0.99    127814
      S-GENE       0.88      0.86      0.87      2809

    accuracy                           0.98    142875
   macro avg       0.88      0.89      0.88    142875
weighted avg       0.98      0.98      0.98    142875

