In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/genia-biomedical-event-dataset/train_data.csv
/kaggle/input/genia-biomedical-event-dataset/test_data.csv
/kaggle/input/genia-biomedical-event-dataset/dev_data.csv
/kaggle/input/genia-biomedical-event-dataset/GE11-LICENSE


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW
from sklearn.metrics import classification_report
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import f1_score

In [3]:
# Preprocessing function
def preprocess_data(data, tokenizer, max_len):
    sentences = []
    labels = []

    for _, row in data.iterrows():
        sentence = row['Sentence'].split()
        trigger_words = set(row['TriggerWord'].split(';')) if pd.notna(row['TriggerWord']) else set()
        label = [1 if word in trigger_words else 0 for word in sentence]

        sentences.append(sentence)
        labels.append(label)

    return sentences, labels


In [4]:
# Dummy dataset class
class WordClassificationDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            sentence,
            is_split_into_words=True,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )

        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']

        # Map labels to tokens
        label_ids = [-100] * len(input_ids)
        word_ids = encoding.word_ids()
        for i, word_id in enumerate(word_ids):
            if word_id is not None and i < len(label):
                label_ids[i] = label[word_id]

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }

In [5]:
# Prepare data
file_path = '/kaggle/input/genia-biomedical-event-dataset/train_data.csv'
data = pd.read_csv(file_path)

In [6]:
# Parameters
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 5e-5

In [7]:
# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-base-cased-v1.2')

# Preprocess data
sentences, labels = preprocess_data(data, tokenizer, MAX_LEN)

# Create dataset and dataloader
dataset = WordClassificationDataset(sentences, labels, tokenizer, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)


vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

In [8]:
# Model
model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.2', num_labels=2)
model = model.to('cuda')

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Training loop
def train(model, dataloader, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Training Epoch {epoch + 1}", unit="batch")
        
        for batch in progress_bar:
            input_ids = batch['input_ids'].to('cuda')
            attention_mask = batch['attention_mask'].to('cuda')
            labels = batch['labels'].to('cuda')

            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

        # Evaluate after each epoch
        evaluate(model, dataloader)

In [10]:
# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to('cuda')
            attention_mask = batch['attention_mask'].to('cuda')
            labels = batch['labels'].to('cuda')

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            label_ids = labels.cpu().numpy()

            for i, label in enumerate(label_ids):
                true_labels.extend(label[label != -100])
                predictions.extend(preds[i][label != -100])

    print(classification_report(true_labels, predictions, digits=4))

In [11]:
# Run training
#train(model, dataloader, optimizer, EPOCHS)

# Test model

In [15]:
# Prepare test data
file_path = '/kaggle/input/genia-biomedical-event-dataset/dev_data.csv'
test_data = pd.read_csv(file_path)

In [27]:
# Initialize tokenizer
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-base-cased-v1.2')

# Preprocess test data
test_sentences, test_labels = preprocess_data(test_data, tokenizer, MAX_LEN)

# Create dataset and dataloader
test_dataset = WordClassificationDataset(test_sentences, test_labels, tokenizer, MAX_LEN)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle = True)

In [29]:
model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.2', num_labels=2)
model = model.to('cuda')
model.load_state_dict(torch.load('/kaggle/input/ee-pretrain-biobert/trigger-detection-biobert.pt', weights_only=True))
model.eval()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [30]:
evaluate(model, test_dataloader)

              precision    recall  f1-score   support

           0     0.9901    0.9937    0.9919     70036
           1     0.7933    0.7073    0.7478      2388

    accuracy                         0.9843     72424
   macro avg     0.8917    0.8505    0.8699     72424
weighted avg     0.9836    0.9843    0.9838     72424



# Save model

In [12]:
torch.save(model.state_dict(), "/kaggle/working/model_checkpoint.pt")

In [13]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name):
    os.chdir('/kaggle/working/')
    zip_name = f"/kaggle/working/{download_file_name}.zip"
    command = f"zip {zip_name} {path} -r"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))

In [None]:
download_file('/kaggle/working/model_checkpoint.pt', 'model')

dmis-lab/biobert-base-cased-v1.2