In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/genia-biomedical-event-dataset/train_data.csv
/kaggle/input/genia-biomedical-event-dataset/test_data.csv
/kaggle/input/genia-biomedical-event-dataset/dev_data.csv
/kaggle/input/genia-biomedical-event-dataset/GE11-LICENSE


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.metrics import f1_score
from tqdm import tqdm

In [3]:
# Read the training split of the GENIA event dataset into a DataFrame.
TRAIN_CSV_PATH = '/kaggle/input/genia-biomedical-event-dataset/train_data.csv'
data = pd.read_csv(TRAIN_CSV_PATH)

In [4]:
# Training hyper-parameters
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 2e-5

# Checkpoint file name (not referenced by the other cells visible in this notebook)
MODEL_SAVE_PATH = "bert_classification_model.pt"

# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Dataset class
class EventDataset(Dataset):
    """
    Map-style dataset that pairs each word with its label and tokenizes lazily.

    Args:
        words: Indexable collection of input strings, one per sample.
        labels: Indexable collection of integer class ids, aligned with ``words``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=..., padding=...,
            truncation=..., return_tensors="pt")`` whose result exposes
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_len: Length every sample is padded / truncated to.
    """
    def __init__(self, words, labels, tokenizer, max_len):
        self.words, self.labels = words, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One sample per word.
        return len(self.words)

    def __getitem__(self, index):
        """Tokenize the ``index``-th word and return it together with its label tensor."""
        text, target = self.words[index], self.labels[index]
        tokenized = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer output carries a leading batch axis of size 1; drop it
        # so the DataLoader can stack samples itself.
        sample = {
            key: tokenized[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

In [6]:
# Prepare data: one (word, label) sample per distinct whitespace token of each sentence
all_words = []
labels = []

# "No Event" is pinned to index 0; every event type receives the next free
# index (1, 2, ...) the first time it is encountered.
label_to_index = {"No Event": 0}
current_index = 1
for row in data.itertuples():
    # Missing cells yield empty lists, so the sentence contributes only "No Event" words.
    if pd.notna(row.TriggerWord):
        trigger_words = str(row.TriggerWord).split(';')
    else:
        trigger_words = []
    if pd.notna(row.EventType):
        event_types = str(row.EventType).split(';')
    else:
        event_types = []
    # zip stops at the shorter list; a repeated trigger word keeps its last event type.
    word_to_class = dict(zip(trigger_words, event_types))

    # dict.fromkeys drops repeated tokens while keeping first-occurrence order.
    for word in dict.fromkeys(row.Sentence.split()):
        all_words.append(word)
        event_class = word_to_class.get(word)
        if event_class is None:
            labels.append(label_to_index["No Event"])
            continue
        if event_class not in label_to_index:
            label_to_index[event_class] = current_index
            current_index += 1
        labels.append(label_to_index[event_class])

# Split all (word, label) samples 80/20 with a fixed seed
train_words, val_words, train_labels, val_labels = train_test_split(
    all_words, labels, test_size=0.2, random_state=42
)

# Filter out "No Event" samples from training and validation data
no_event_index = label_to_index["No Event"]
train_filtered = [(w, l) for w, l in zip(train_words, train_labels) if l != no_event_index]
val_filtered = [(w, l) for w, l in zip(val_words, val_labels) if l != no_event_index]

train_words, train_labels = zip(*train_filtered)
val_words, val_labels = zip(*val_filtered)

# Reset labels to be consecutive integers.
# The mapping is built from BOTH splits: building it from the training labels
# alone raises KeyError below whenever an event class happens to land only in
# the validation split. When every class occurs in the training split the
# resulting mapping is identical to the train-only one.
unique_labels = sorted(set(train_labels) | set(val_labels))
label_remap = {old_label: new_index for new_index, old_label in enumerate(unique_labels)}
train_labels = [label_remap[label] for label in train_labels]
val_labels = [label_remap[label] for label in val_labels]

# Tokenizer matching the BioBERT checkpoint
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")

# Wrap each split in an EventDataset; every sample is padded/truncated to 32 tokens
train_dataset = EventDataset(
    words=list(train_words),
    labels=list(train_labels),
    tokenizer=tokenizer,
    max_len=32,
)
val_dataset = EventDataset(
    words=list(val_words),
    labels=list(val_labels),
    tokenizer=tokenizer,
    max_len=32,
)

# Only the training loader shuffles; validation keeps dataset order
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]



In [7]:
# Define model
class BertClassifier(nn.Module):
    """
    Event classifier: pretrained BioBERT encoder, dropout, then a linear head.

    Args:
        num_classes: Number of output logits produced per sample.
    """
    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
        self.dropout = nn.Dropout(0.3)
        # The head maps the encoder's hidden size to one logit per class.
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoded.pooler_output))

In [8]:
# Build the classifier with one output per remapped event class, wrap it in
# DataParallel and move it to the selected device
num_classes = len(label_remap)
model = nn.DataParallel(BertClassifier(num_classes)).to(device)

# Cross-entropy loss over the class logits, optimized with AdamW
criterion = nn.CrossEntropyLoss()
trainable_parameters = model.parameters()
optimizer = torch.optim.AdamW(trainable_parameters, lr=LEARNING_RATE)

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [9]:
# Training and evaluation
def train_epoch(model, data_loader, criterion, optimizer, epoch=None):
    """
    Run one optimization pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns
            per-class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
            ``None`` batches are skipped.
        criterion: Loss callable invoked as ``criterion(logits, targets)``.
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Optional zero-based epoch number, used only for the progress-bar
            label. When omitted, a notebook-level ``epoch`` variable is used if
            one exists (this keeps existing call sites working); otherwise the
            bar is labelled plain "Training".

    Returns:
        Tuple ``(mean_loss, accuracy)``: the loss averaged over the batches
        actually processed and the fraction of correctly predicted samples.
        ``(0.0, 0.0)`` when no batch was processed.
    """
    if epoch is None:
        # Previously the function read the global `epoch` unconditionally and
        # raised NameError when called outside the notebook's training loop.
        epoch = globals().get("epoch")
    description = "Training" if epoch is None else f"Training Epoch {epoch + 1}"

    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    progress_bar = tqdm(data_loader, desc=description, unit="batch")

    for batch in progress_bar:
        if batch is None:
            continue

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `targets` so the notebook-level `labels` list is not shadowed.
        targets = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == targets).sum().item()
        total += targets.size(0)

    # Guard against an empty loader (or one yielding only None batches).
    if num_batches == 0 or total == 0:
        return 0.0, 0.0

    return total_loss / num_batches, correct / total


In [10]:
def evaluate_epoch(model, data_loader, criterion):
    """
    Evaluate ``model`` on one pass over ``data_loader`` without updating weights.

    Reads the notebook-level ``device``, ``label_to_index`` and ``label_remap``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns
            per-class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        criterion: Loss callable invoked as ``criterion(logits, targets)``.

    Returns:
        Tuple ``(mean_loss, accuracy, report)``: the loss averaged over batches,
        the fraction of correctly predicted samples, and the text produced by
        ``classification_report``. ``(0.0, 0.0, "")`` when the loader yields no
        samples.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Named `targets` so the notebook-level `labels` list is not shadowed.
            targets = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            total_loss += criterion(outputs, targets).item()
            num_batches += 1

            preds = torch.argmax(outputs, dim=1)
            correct += (preds == targets).sum().item()
            total += targets.size(0)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(targets.cpu().tolist())

    # Nothing to score: avoid dividing by zero and skip the report.
    if num_batches == 0 or total == 0:
        return 0.0, 0.0, ""

    # Generate classification report excluding "No Event".
    # Names and remapped ids are built together so they stay aligned, and
    # classes that have no entry in label_remap are skipped rather than
    # raising KeyError.
    report_classes = [
        (name, label_remap[old_index])
        for name, old_index in label_to_index.items()
        if name != "No Event" and old_index in label_remap
    ]
    report = classification_report(
        all_labels, all_preds,
        labels=[new_index for _, new_index in report_classes],
        target_names=[name for name, _ in report_classes]
    )

    return total_loss / num_batches, correct / total, report

In [11]:
# Training loop: one training pass and one validation pass per epoch.
# NOTE: the loop variable must stay named `epoch`; train_epoch reads the
# notebook-level `epoch` for its progress-bar label.
for epoch in range(EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc, report = evaluate_epoch(model, val_loader, criterion)

    summary_lines = (
        f"Epoch {epoch + 1}/{EPOCHS}",
        f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}",
        f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}",
        report,
    )
    print(*summary_lines, sep="\n")

Training Epoch 1: 100%|██████████| 174/174 [00:38<00:00,  4.48batch/s]


Epoch 1/10
Train Loss: 0.7248, Train Accuracy: 0.7879
Val Loss: 0.3325, Val Accuracy: 0.9095
                     precision    recall  f1-score   support

Negative_regulation       0.95      0.93      0.94       174
    Gene_expression       0.88      0.94      0.91       335
         Regulation       0.81      0.92      0.86       161
      Transcription       0.91      0.70      0.79        90
Positive_regulation       0.93      0.92      0.93       450
            Binding       0.98      0.95      0.96       128
       Localization       0.94      0.74      0.83        43
    Phosphorylation       0.94      1.00      0.97        30
 Protein_catabolism       0.93      0.87      0.90        15

           accuracy                           0.91      1426
          macro avg       0.92      0.88      0.90      1426
       weighted avg       0.91      0.91      0.91      1426



Training Epoch 2: 100%|██████████| 174/174 [00:38<00:00,  4.56batch/s]


Epoch 2/10
Train Loss: 0.3026, Train Accuracy: 0.9163
Val Loss: 0.3298, Val Accuracy: 0.9067
                     precision    recall  f1-score   support

Negative_regulation       0.94      0.94      0.94       174
    Gene_expression       0.87      0.95      0.91       335
         Regulation       0.89      0.85      0.87       161
      Transcription       0.98      0.63      0.77        90
Positive_regulation       0.94      0.92      0.93       450
            Binding       0.92      0.95      0.93       128
       Localization       0.68      0.84      0.75        43
    Phosphorylation       0.94      1.00      0.97        30
 Protein_catabolism       0.88      0.93      0.90        15

           accuracy                           0.91      1426
          macro avg       0.89      0.89      0.89      1426
       weighted avg       0.91      0.91      0.91      1426



Training Epoch 3: 100%|██████████| 174/174 [00:39<00:00,  4.46batch/s]


Epoch 3/10
Train Loss: 0.2660, Train Accuracy: 0.9205
Val Loss: 0.3261, Val Accuracy: 0.9011
                     precision    recall  f1-score   support

Negative_regulation       0.96      0.93      0.95       174
    Gene_expression       0.91      0.92      0.91       335
         Regulation       0.82      0.87      0.84       161
      Transcription       0.91      0.70      0.79        90
Positive_regulation       0.89      0.92      0.91       450
            Binding       0.98      0.93      0.95       128
       Localization       0.76      0.81      0.79        43
    Phosphorylation       0.94      1.00      0.97        30
 Protein_catabolism       0.93      0.93      0.93        15

           accuracy                           0.90      1426
          macro avg       0.90      0.89      0.89      1426
       weighted avg       0.90      0.90      0.90      1426



Training Epoch 4: 100%|██████████| 174/174 [00:38<00:00,  4.49batch/s]


Epoch 4/10
Train Loss: 0.2542, Train Accuracy: 0.9235
Val Loss: 0.3269, Val Accuracy: 0.9039
                     precision    recall  f1-score   support

Negative_regulation       0.97      0.93      0.95       174
    Gene_expression       0.89      0.93      0.91       335
         Regulation       0.80      0.89      0.84       161
      Transcription       0.88      0.71      0.79        90
Positive_regulation       0.91      0.91      0.91       450
            Binding       0.98      0.92      0.95       128
       Localization       0.92      0.79      0.85        43
    Phosphorylation       0.94      1.00      0.97        30
 Protein_catabolism       0.88      0.93      0.90        15

           accuracy                           0.90      1426
          macro avg       0.91      0.89      0.90      1426
       weighted avg       0.91      0.90      0.90      1426



Training Epoch 5: 100%|██████████| 174/174 [00:38<00:00,  4.47batch/s]


Epoch 5/10
Train Loss: 0.2382, Train Accuracy: 0.9289
Val Loss: 0.3457, Val Accuracy: 0.9053
                     precision    recall  f1-score   support

Negative_regulation       0.92      0.96      0.94       174
    Gene_expression       0.89      0.93      0.91       335
         Regulation       0.82      0.88      0.85       161
      Transcription       0.90      0.71      0.80        90
Positive_regulation       0.94      0.90      0.92       450
            Binding       0.93      0.96      0.95       128
       Localization       0.83      0.79      0.81        43
    Phosphorylation       0.94      1.00      0.97        30
 Protein_catabolism       0.93      0.87      0.90        15

           accuracy                           0.91      1426
          macro avg       0.90      0.89      0.89      1426
       weighted avg       0.91      0.91      0.90      1426



Training Epoch 6: 100%|██████████| 174/174 [00:39<00:00,  4.46batch/s]


Epoch 6/10
Train Loss: 0.2258, Train Accuracy: 0.9282
Val Loss: 0.3478, Val Accuracy: 0.9011
                     precision    recall  f1-score   support

Negative_regulation       0.94      0.94      0.94       174
    Gene_expression       0.88      0.94      0.91       335
         Regulation       0.87      0.80      0.83       161
      Transcription       0.90      0.70      0.79        90
Positive_regulation       0.89      0.94      0.91       450
            Binding       0.98      0.91      0.95       128
       Localization       0.97      0.70      0.81        43
    Phosphorylation       0.94      1.00      0.97        30
 Protein_catabolism       0.93      0.93      0.93        15

           accuracy                           0.90      1426
          macro avg       0.92      0.87      0.89      1426
       weighted avg       0.90      0.90      0.90      1426



Training Epoch 7: 100%|██████████| 174/174 [00:38<00:00,  4.46batch/s]


Epoch 7/10
Train Loss: 0.2240, Train Accuracy: 0.9300
Val Loss: 0.3314, Val Accuracy: 0.9102
                     precision    recall  f1-score   support

Negative_regulation       0.93      0.94      0.94       174
    Gene_expression       0.88      0.95      0.92       335
         Regulation       0.88      0.86      0.87       161
      Transcription       0.86      0.71      0.78        90
Positive_regulation       0.92      0.92      0.92       450
            Binding       0.97      0.95      0.96       128
       Localization       0.97      0.77      0.86        43
    Phosphorylation       0.94      1.00      0.97        30
 Protein_catabolism       0.93      0.93      0.93        15

           accuracy                           0.91      1426
          macro avg       0.92      0.89      0.90      1426
       weighted avg       0.91      0.91      0.91      1426



Training Epoch 8: 100%|██████████| 174/174 [00:38<00:00,  4.48batch/s]


Epoch 8/10
Train Loss: 0.2189, Train Accuracy: 0.9297
Val Loss: 0.3260, Val Accuracy: 0.9081
                     precision    recall  f1-score   support

Negative_regulation       0.96      0.93      0.94       174
    Gene_expression       0.89      0.94      0.92       335
         Regulation       0.86      0.87      0.87       161
      Transcription       0.86      0.71      0.78        90
Positive_regulation       0.91      0.93      0.92       450
            Binding       0.98      0.92      0.95       128
       Localization       0.87      0.77      0.81        43
    Phosphorylation       0.94      1.00      0.97        30
 Protein_catabolism       0.93      0.93      0.93        15

           accuracy                           0.91      1426
          macro avg       0.91      0.89      0.90      1426
       weighted avg       0.91      0.91      0.91      1426



Training Epoch 9: 100%|██████████| 174/174 [00:39<00:00,  4.46batch/s]


Epoch 9/10
Train Loss: 0.2126, Train Accuracy: 0.9297
Val Loss: 0.3318, Val Accuracy: 0.9004
                     precision    recall  f1-score   support

Negative_regulation       0.95      0.94      0.94       174
    Gene_expression       0.90      0.92      0.91       335
         Regulation       0.88      0.80      0.84       161
      Transcription       0.86      0.71      0.78        90
Positive_regulation       0.89      0.94      0.91       450
            Binding       0.94      0.95      0.95       128
       Localization       0.78      0.74      0.76        43
    Phosphorylation       0.94      1.00      0.97        30
 Protein_catabolism       0.93      0.87      0.90        15

           accuracy                           0.90      1426
          macro avg       0.90      0.87      0.88      1426
       weighted avg       0.90      0.90      0.90      1426



Training Epoch 10: 100%|██████████| 174/174 [00:38<00:00,  4.48batch/s]


Epoch 10/10
Train Loss: 0.2094, Train Accuracy: 0.9349
Val Loss: 0.3424, Val Accuracy: 0.9046
                     precision    recall  f1-score   support

Negative_regulation       0.94      0.94      0.94       174
    Gene_expression       0.90      0.93      0.91       335
         Regulation       0.88      0.80      0.83       161
      Transcription       0.86      0.71      0.78        90
Positive_regulation       0.89      0.95      0.92       450
            Binding       0.96      0.97      0.96       128
       Localization       0.94      0.74      0.83        43
    Phosphorylation       0.94      1.00      0.97        30
 Protein_catabolism       0.93      0.87      0.90        15

           accuracy                           0.90      1426
          macro avg       0.92      0.88      0.89      1426
       weighted avg       0.90      0.90      0.90      1426



# Save model

In [12]:
# `model` is wrapped in nn.DataParallel, whose state_dict prefixes every key
# with "module.". Save the wrapped module's weights instead so the checkpoint
# loads directly into a plain BertClassifier.
model_to_save = model.module if isinstance(model, nn.DataParallel) else model
torch.save(model_to_save.state_dict(), "/kaggle/working/model_checkpoint.pt")

In [13]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name):
    """
    Zip ``path`` into ``/kaggle/working/<download_file_name>.zip`` and display a link to it.

    Args:
        path: File or directory to archive (added recursively).
        download_file_name: Archive name without the ``.zip`` extension.

    Returns:
        None. On failure an error message and the captured details are printed
        and no link is displayed.

    Side effects:
        Changes the process working directory to ``/kaggle/working/``; the
        displayed ``FileLink`` uses a relative path, presumably resolved
        against that directory.
    """
    os.chdir('/kaggle/working/')
    zip_name = f"/kaggle/working/{download_file_name}.zip"
    # Argument list without a shell: paths containing spaces or shell
    # metacharacters are passed to zip verbatim instead of being re-parsed.
    command = ["zip", "-r", zip_name, str(path)]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError as exc:
        # Without a shell, a missing `zip` executable raises instead of
        # producing a non-zero return code; report it the same way.
        print("Unable to run zip command!")
        print(exc)
        return
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))

In [14]:
# Archive the saved checkpoint as model.zip and show a download link for it.
download_file(
    path='/kaggle/working/model_checkpoint.pt',
    download_file_name='model',
)

In [15]:
# Pretrained checkpoint used for both the tokenizer and the encoder above:
# dmis-lab/biobert-base-cased-v1.2