In [None]:
!pip install -U sacremoses  #biogpt tokenizer

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.4/897.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import re
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm.auto import tqdm

In [2]:
import os
from google.colab import drive
drive.mount('/content/gdrive')

path = "/content/gdrive/MyDrive/"
os.listdir(path)
data = pd.read_csv(path+'comback_rev.csv')
data.head()

Mounted at /content/gdrive


Unnamed: 0,text,readmission_30
0,Subdural hematoma ETOH abuse Wernicke's ence...,1.0
1,Primary:eosinophilic enteropathy on IL-5 inhib...,1.0
2,PROSTATE CANCER,1.0
3,clotted brachiocephalic fistulaesrdhtnDM II,1.0
4,DyspneaAsthmaCOPDCHFDOE,1.0


In [None]:
def extract_and_combine_conditions(text):
    parts = text.split('Secondary:')
    primary_conditions = parts[0].replace('Primary:', '').strip()
    secondary_conditions = parts[1].strip() if len(parts) > 1 else ''
    combined_conditions = primary_conditions + ', ' + secondary_conditions if secondary_conditions else primary_conditions
    return combined_conditions

In [None]:
def clean_text(text):
    text = text.lower() 
    text = re.sub(r'\b(patientname|patientid):\s*\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = extract_and_combine_conditions(text)
    return text

In [None]:
class ReadmissionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings  
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(self.encodings[key][idx]) for key in self.encodings}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
data['clean_text']=data['text'].apply(clean_text)

In [None]:
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    data['clean_text'], data['readmission_30'], test_size=0.2, random_state=42)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts, train_val_labels, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12')

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
# 토크나이저 사용
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=128)

In [None]:
# 데이터셋 생성
train_dataset = ReadmissionDataset(train_encodings, train_labels.tolist())
val_dataset = ReadmissionDataset(val_encodings, val_labels.tolist())
test_dataset = ReadmissionDataset(test_encodings, test_labels.tolist())

In [None]:
# DataLoader 설정
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
model = AutoModelForSequenceClassification.from_pretrained('bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12', num_labels=data['readmission_30'].nunique())

pytorch_model.bin:   0%|          | 0.00/441M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-5)

## earlyStopping

In [None]:
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif self.best_loss - val_loss > self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

In [None]:
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
import torch

def evaluate(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    average_loss = total_loss / len(test_loader)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')

    return average_loss, f1, recall, accuracy, precision


In [None]:
early_stopper = EarlyStopping(patience=3, min_delta=0.001)
epoch = 7
for epoch in range(epoch):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}', leave=False)

    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

    average_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch} finished - Avg Train Loss: {average_train_loss:.4f}')

    # Validation Step
    val_loss, val_f1, val_recall, val_acc, val_prec = evaluate(model, val_loader, criterion, device)
    print(f'Validation - Loss: {val_loss:.4f}, F1 Score: {val_f1:.4f}, Recall: {val_recall:.4f}, Accuracy: {val_acc:.4f}, Precision: {val_prec:.4f}')

    early_stopper(val_loss)
    if early_stopper.early_stop:
        print("Early stopping triggered")
        break

Epoch 1:   0%|          | 0/3750 [00:00<?, ?it/s]

Epoch 0 finished - Avg Train Loss: 0.6076
Validation - Loss: 0.5940, F1 Score: 0.5918, Recall: 0.6910, Accuracy: 0.6910, Precision: 0.6878


Epoch 2:   0%|          | 0/3750 [00:00<?, ?it/s]

Epoch 1 finished - Avg Train Loss: 0.5933
Validation - Loss: 0.5964, F1 Score: 0.6493, Recall: 0.6952, Accuracy: 0.6952, Precision: 0.6657


Epoch 3:   0%|          | 0/3750 [00:00<?, ?it/s]

Epoch 2 finished - Avg Train Loss: 0.5762
Validation - Loss: 0.5947, F1 Score: 0.6307, Recall: 0.6967, Accuracy: 0.6967, Precision: 0.6725


Epoch 4:   0%|          | 0/3750 [00:00<?, ?it/s]

Epoch 3 finished - Avg Train Loss: 0.5657
Validation - Loss: 0.6013, F1 Score: 0.6530, Recall: 0.6884, Accuracy: 0.6884, Precision: 0.6578
Early stopping triggered


In [None]:
test_loss, test_f1, test_recall, test_acc, test_prec = evaluate(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}, F1 Score: {test_f1:.4f}, Recall: {test_recall:.4f}, Accuracy: {test_acc:.4f}, Precision: {test_prec:.4f}')

Test Loss: 0.6080, F1 Score: 0.6475, Recall: 0.6825, Accuracy: 0.6825, Precision: 0.6527


In [None]:
model.save_pretrained('test/blue-bert-readmission_2nd')
model.config.to_json_file("config.json")
tokenizer.save_pretrained('test/blue-bert-readmission_2nd')
!tar -cvf blue-bert_finetuned_2nd.tar.gz test/
!mv ./blue-bert_finetuned_2nd.tar.gz /content/gdrive/MyDrive/blue-bert_finetuned_2nd.tar.gz

test/
test/blue-bert-readmission_2nd/
test/blue-bert-readmission_2nd/model.safetensors
test/blue-bert-readmission_2nd/config.json
test/blue-bert-readmission_2nd/tokenizer.json
test/blue-bert-readmission_2nd/vocab.txt
test/blue-bert-readmission_2nd/special_tokens_map.json
test/blue-bert-readmission_2nd/tokenizer_config.json
