In [None]:
import os
import spacy
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F
from tqdm import tqdm
from ast import literal_eval
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.metrics import f1_score, precision_recall_fscore_support

# !pip install -q "../input/nbme-huggingface-datasets-pip-wheels/datasets-2.1.0-py3-none-any.whl"
os.system("python -m pip install --no-index --find-links=../input/nbme-huggingface-datasets-pip-wheels datasets")

import transformers
from datasets import Dataset 
from transformers import AutoModel, AutoTokenizer

# Data

In [None]:
DATA_PATH = "../input/nbme-score-clinical-patient-notes"  # directory holding the competition CSV files

# Read the three tables used to build the training frame.
feats_df = pd.read_csv(f"{DATA_PATH}/features.csv")
notes_df = pd.read_csv(f"{DATA_PATH}/patient_notes.csv")
train_df = pd.read_csv(f"{DATA_PATH}/train.csv")

def change_location(x):
    """Turn a list of raw location strings into integer ``[start, end]`` pairs.

    ``x`` is the already-parsed list from one ``location`` cell, for example
    ``["4 12", "24 29;33 36"]``. A single entry may pack several spans
    separated by ``;`` and every span is written as ``"<start> <end>"``.
    For the example above the result is ``[[4, 12], [24, 29], [33, 36]]``.
    An empty list (no annotation) yields an empty list.
    """
    if len(x) == 0:
        return []

    # Step 1: break every entry on ";" so each piece describes exactly one span.
    pieces = [piece for entry in x for piece in entry.split(";")]

    # Step 2: convert "<start> <end>" into [start, end] with integer bounds.
    spans = []
    for piece in pieces:
        bounds = piece.split(" ")
        spans.append([int(bounds[0]), int(bounds[1])])
    return spans

# Attach the note text and the feature description to every annotated row; with no
# `on=` argument pandas joins on the columns the two frames have in common.
merged_df = train_df.merge(notes_df, how="left")
patient_df = merged_df.merge(feats_df, how="left")

# "location" and "annotation" are stored as stringified Python lists: parse them, then
# expand the location strings into integer [start, end] pairs.
patient_df["location"] = patient_df["location"].map(literal_eval).map(change_location)
patient_df["annotation"] = patient_df["annotation"].map(literal_eval)

# Replace newline / carriage-return / tab characters in the notes with single spaces
# (one character for one character, so character positions are unchanged).
patient_df["pn_history"] = patient_df["pn_history"].map(
    lambda note: note.replace("\n", " ").replace("\r", " ").replace("\t", " ")
)

# Make the feature names read like plain text: "-OR-" becomes " or ", other hyphens become spaces.
patient_df["feature_text"] = patient_df["feature_text"].map(
    lambda feature: feature.replace("-OR-", " or ").replace("-", " ")
)
print(patient_df.shape)
patient_df.head(2)

In [None]:
# Random 80/20 row-level hold-out; random_state=42 pins the split so re-runs give the same rows.
# NOTE(review): the split is per row. If several rows share one patient note (TODO confirm
# against train.csv), the same note text can land on both sides of the split.
patient_train_df, patient_test_df = train_test_split(patient_df, test_size=0.2, random_state=42)
print(patient_train_df.shape, patient_test_df.shape)

In [None]:
def create_submission_df():
    """Build the inference frame from ``test.csv``.

    Reads ``features.csv``, ``patient_notes.csv`` and ``test.csv`` from
    ``DATA_PATH``, left-merges them on their shared columns and cleans the two
    text columns exactly as the training frame is cleaned, so the model sees
    the same kind of input at inference time as during training.

    Returns:
        pandas.DataFrame: the merged test rows plus two placeholder columns,
        ``location`` (``[[-1, -1]]`` for every row) and ``annotation`` (empty
        string), so the frame has the same columns the tokenization step reads.
    """
    feats = pd.read_csv(f"{DATA_PATH}/features.csv")
    notes = pd.read_csv(f"{DATA_PATH}/patient_notes.csv")
    test = pd.read_csv(f"{DATA_PATH}/test.csv")

    merged = test.merge(notes, how="left")
    merged = merged.merge(feats, how="left")

    # Same feature clean-up as the training frame ("-OR-" -> " or ", "-" -> " ").
    merged["feature_text"] = [
        text.replace("-OR-", " or ").replace("-", " ") for text in merged["feature_text"]
    ]
    # Notes only get one-for-one whitespace replacement. Anything that changes the
    # note's length would shift the character offsets that predictions are reported in.
    merged["pn_history"] = [
        text.replace("\n", " ").replace("\r", " ").replace("\t", " ") for text in merged["pn_history"]
    ]
    # One placeholder span per row, whatever the size of test.csv (a fixed-length
    # literal raises as soon as the test set does not have exactly that many rows).
    merged["location"] = [[[-1, -1]] for _ in range(len(merged))]
    merged["annotation"] = ""

    return merged

# Build the inference frame and preview its first rows.
patient_submission_df = create_submission_df()
patient_submission_df.head()

In [None]:
# Wrap each pandas frame in a `datasets.Dataset` so it can be tokenized row by row with `.map`.
patient_train = Dataset.from_pandas(patient_train_df)
patient_test = Dataset.from_pandas(patient_test_df)
patient_submission = Dataset.from_pandas(patient_submission_df)
patient_train, patient_test, patient_submission  # display the three datasets

In [None]:
# Local checkpoint directory; the same path is the default `checkpoint` of PatientModel.
checkpoint = "../input/distilbertbaseuncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # tokenizer used by tokenize_function

In [None]:
def tokenize_function(x):
    """Tokenize one row as a (feature_text, pn_history) pair and label every token.

    The pair is padded/truncated to 512 tokens; only the second sequence
    (the patient note) may be truncated. Besides the tokenizer output the
    returned mapping carries:

    * ``location``: the row's annotated ``[start, end]`` spans, copied from ``x``;
    * ``sequence_ids``: per token ``None`` (special/padding), ``0`` (feature text)
      or ``1`` (patient note);
    * ``labels``: per token ``1.0`` if the token's character range lies entirely
      inside one annotated span, ``0.0`` for other note tokens and ``-100.0`` for
      every token outside the note (a sentinel the training loop masks out);
    * ``offset_mapping``: flattened to ``[start0, end0, start1, end1, ...]``;
      ``np.array(flat).reshape(-1, 2)`` restores the ``(start, end)`` rows.
    """
    tokenized = tokenizer(
        x["feature_text"],            # sequence 0
        x["pn_history"],              # sequence 1
        truncation="only_second",     # never cut the feature text, only the note
        padding="max_length",         # always pad up to max_length
        max_length=512,
        return_offsets_mapping=True,  # character (start, end) of each token
    )

    labels = [0.0] * len(tokenized["input_ids"])
    tokenized["location"] = x["location"]
    tokenized["sequence_ids"] = tokenized.sequence_ids()

    annotated_spans = tokenized["location"]
    token_info = zip(tokenized["sequence_ids"], tokenized["offset_mapping"])
    for position, (seq_id, char_range) in enumerate(token_info):
        outside_note = seq_id is None or seq_id == 0 or np.isnan(seq_id)
        if outside_note:
            # Special tokens, padding and feature-text tokens are never scored.
            labels[position] = -100.0
            continue
        tok_start, tok_end = char_range
        # Positive when the token is fully contained in any annotated span.
        if any(tok_start >= span_start and tok_end <= span_end
               for span_start, span_end in annotated_spans):
            labels[position] = 1.0

    tokenized["labels"] = labels
    # Flatten so every example stores a flat list of the same length.
    tokenized["offset_mapping"] = np.array(tokenized["offset_mapping"]).flatten()

    return tokenized

# Tokenize and label every row of the three datasets (tokenize_function runs once per row).
tokenized_patient_train = patient_train.map(tokenize_function) # training rows
tokenized_patient_test = patient_test.map(tokenize_function) # hold-out rows
tokenized_patient_submission = patient_submission.map(tokenize_function) # competition test rows

cols_all = tokenized_patient_train.column_names # every column present after tokenization
cols_to_use = ["input_ids", "attention_mask", "offset_mapping", "sequence_ids", "labels"] # columns the model / evaluation code reads
cols_to_remove = list(set(cols_all) - set(cols_to_use)) # everything else (set difference, so order is arbitrary)

# Return the kept columns as PyTorch tensors when the datasets are indexed.
tokenized_patient_train.set_format(type="torch", columns=cols_to_use)
tokenized_patient_test.set_format(type="torch", columns=cols_to_use)
tokenized_patient_submission.set_format(type="torch", columns=cols_to_use)

# Drop the columns that are no longer needed; remove_columns returns the result, hence the re-binding.
tokenized_patient_train = tokenized_patient_train.remove_columns(cols_to_remove)
tokenized_patient_test = tokenized_patient_test.remove_columns(cols_to_remove)
tokenized_patient_submission = tokenized_patient_submission.remove_columns(cols_to_remove)

tokenized_patient_train, tokenized_patient_test, tokenized_patient_submission # display the three datasets

In [None]:
batch_size = 1  # examples per optimisation step; also used by the training progress log

# All three loaders iterate in dataset order (shuffling is off everywhere).
# Train and hold-out loaders drop a trailing incomplete batch; the submission
# loader keeps every row so each test id receives a prediction.
dataloader_train = DataLoader(
    tokenized_patient_train,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
)
dataloader_test = DataLoader(
    tokenized_patient_test,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
)
dataloader_submission = DataLoader(
    tokenized_patient_submission,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
)

# Modeling

In [None]:
class PatientModel(nn.Module):
    """Token-level binary classifier: pretrained encoder -> dropout -> linear head.

    Args:
        checkpoint: path or name handed to ``AutoModel.from_pretrained``.

    ``forward`` takes a batch mapping with ``input_ids`` and ``attention_mask``
    and returns one raw logit per token (the trailing singleton dimension of the
    head output is squeezed away). No sigmoid is applied here.
    """

    def __init__(self, checkpoint="../input/distilbertbaseuncased"):
        super(PatientModel, self).__init__()

        self.bert = AutoModel.from_pretrained(checkpoint)  # pretrained encoder
        # To train only the head, freeze the encoder:
        # for param in self.bert.parameters(): param.requires_grad = False

        # Size the head from the encoder's configuration instead of assuming 768,
        # so other checkpoints work; 768 stays as the fallback.
        hidden_size = getattr(self.bert.config, "hidden_size", 768)

        self.dropout = nn.Dropout(0.2)       # regularises the token representations
        self.fc = nn.Linear(hidden_size, 1)  # one logit per token

    def forward(self, batch):
        ids = batch["input_ids"]
        mask = batch["attention_mask"]
        output = self.bert(ids, attention_mask=mask)  # the encoder receives only these two inputs
        x = output[0]  # first element of the encoder output: per-token hidden states

        # Dropout goes on the hidden states, before the head. Applying it to the
        # head's output instead would randomly zero and rescale the logits themselves.
        x = self.fc(self.dropout(x)).squeeze(-1)

        return x

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # first GPU when available, otherwise CPU
model = PatientModel().to(device) # fresh model (default checkpoint) moved to the chosen device
criterion = torch.nn.BCEWithLogitsLoss(reduction="none") # per-token loss, left unreduced so it can be masked below
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001) # AdamW over all model parameters, lr = 1e-5
scheduler = StepLR(optimizer, step_size=1, gamma=0.7) # multiplies the lr by 0.7 on every scheduler.step()

num_epochs = 2 # number of passes over dataloader_train
for epoch in range(num_epochs):
    for i, batch in enumerate(tqdm(dataloader_train)): # one batch of batch_size examples per step
        batch = {k: v.to(device) for k, v in batch.items()} # move every tensor of the batch to the device

        optimizer.zero_grad() # clear gradients left from the previous step
        outputs = model(batch) # per-token logits
        loss = criterion(outputs, batch["labels"]) # criterion(y_pred, y_true), one value per token
        loss = torch.masked_select(loss, batch["labels"] > -1).mean() # keep only tokens labelled 0/1 (the -100 sentinel is dropped), then average
        loss.backward() # back-propagate
        optimizer.step() # update model parameters

        if i % 800 == 0: # progress log every 800 steps
            print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch + 1,
                    i * batch_size, 
                    len(dataloader_train.dataset),
                    100. * i / len(dataloader_train), 
                    loss.item()
                )
            )

    scheduler.step() # decay the learning rate once per epoch

torch.save(model.state_dict(), "patient_model.pth") # persist the trained weights

# Evaluation

In [None]:
def predictions_to_locations(predictions_batch, offsets_batch, seq_ids_batch, test=False):
    """Convert per-token logits into character spans of the patient note.

    Args:
        predictions_batch: iterable of 1-D logit arrays, one per example.
        offsets_batch: iterable of flattened offset arrays laid out as
            ``[start0, end0, start1, end1, ...]`` (one pair per token).
        seq_ids_batch: iterable of per-token sequence ids; tokens whose id is
            ``None``, ``NaN`` or ``0`` are not part of the note and are skipped.
        test: when True each example's spans are rendered as one string of
            ``"start end"`` items joined by ``"; "``; otherwise a list of
            ``(start, end)`` integer tuples is returned per example.

    Returns:
        list: one entry per example, in input order.

    Consecutive note tokens whose sigmoid probability exceeds 0.5 are merged
    into a single span running from the first token's start to the last
    token's end.
    """
    sigmoid = lambda z: 1 / (1 + np.exp(-z))  # maps logits to probabilities in (0, 1)
    all_predictions = []
    for predictions, offsets, seq_ids in zip(predictions_batch, offsets_batch, seq_ids_batch):
        # The flat layout is [start0, end0, start1, end1, ...]; a plain reshape restores
        # the (start, end) rows. Stacking the even and odd slices and reshaping would
        # pair starts with starts and ends with ends.
        offsets = np.asarray(offsets).reshape(-1, 2)
        probabilities = sigmoid(np.asarray(predictions, dtype=float))

        spans = []
        start_idx = None  # start of the span currently being built, if any
        end_idx = None    # end of the last positive token in that span
        for probability, (token_start, token_end), seq_id in zip(probabilities, offsets, seq_ids):
            if seq_id is None or seq_id == 0 or np.isnan(seq_id):
                continue  # special/padding token or feature-text token
            if probability > 0.5:
                if start_idx is None:
                    start_idx = int(token_start)
                end_idx = int(token_end)
            elif start_idx is not None:
                # A negative token closes the open span. Always reset, including when the
                # span starts at character 0, and skip only empty ranges such as (0, 0).
                if end_idx > start_idx:
                    spans.append((start_idx, end_idx))
                start_idx = None
        # A span still open after the last note token would otherwise be lost.
        if start_idx is not None and end_idx > start_idx:
            spans.append((start_idx, end_idx))

        if test:
            all_predictions.append("; ".join(f"{start} {end}" for start, end in spans))
        else:
            all_predictions.append(spans)
    return all_predictions

In [None]:
def calculate_charwise_metrics(predictions_batch, offsets_batch, seq_ids_batch, labels_batch):
    """Score predicted spans against token labels, character by character.

    Args:
        predictions_batch: per example, a list of ``(start, end)`` character spans.
        offsets_batch: per example, the flattened offsets
            ``[start0, end0, start1, end1, ...]`` of its tokens.
        seq_ids_batch: per example, the per-token sequence ids; tokens whose id
            is ``None``, ``NaN`` or ``0`` are ignored.
        labels_batch: per example, the per-token labels; a token counts as
            annotated when ``int(label) == 1``.

    Returns:
        dict with ``micro_f1`` (``f1_score`` over all characters of all
        examples) and ``precision``, ``recall``, ``fbeta_score``, ``support``
        from ``precision_recall_fscore_support(..., average="binary")``.

    The reference character mask is rebuilt from the token labels and token
    offsets, not from the original annotation spans.
    """
    y_true = []  # character-level reference, all examples concatenated
    y_pred = []  # character-level predictions, all examples concatenated
    for predictions, offsets, seq_ids, labels in zip(predictions_batch, offsets_batch, seq_ids_batch, labels_batch):
        # The flat layout is [start0, end0, start1, end1, ...]; a plain reshape restores
        # the (start, end) rows. Stacking the even and odd slices and reshaping would
        # pair starts with starts and ends with ends.
        offsets = np.asarray(offsets).reshape(-1, 2)

        # The largest offset seen is used as the number of characters to score.
        num_chars = max(0, int(offsets.max())) if offsets.size else 0

        char_true = np.zeros((num_chars))
        for (token_start, token_end), seq_id, label in zip(offsets, seq_ids, labels):
            if seq_id is None or seq_id == 0 or np.isnan(seq_id):
                continue  # not a patient-note token
            if int(label) == 1:
                char_true[int(token_start):int(token_end)] = 1

        char_preds = np.zeros((num_chars))
        for start_idx, end_idx in predictions:
            char_preds[start_idx:end_idx] = 1

        y_true.extend(char_true)
        y_pred.extend(char_preds)

    micro_f1 = f1_score(y_true, y_pred)
    results = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "micro_f1": micro_f1,
        "precision": results[0],
        "recall": results[1],
        "fbeta_score": results[2],
        "support": results[3],
    }

In [None]:
# Optional: rebuild the model and reload the saved weights instead of reusing the in-memory one.
# model = PatientModel().to(device)
# model.load_state_dict(torch.load("patient_model.pth", map_location=device))

model.eval() # evaluation mode (disables dropout)
predictions = [] # per-batch logits
offsets = [] # per-batch flattened token offsets
seq_ids = [] # per-batch sequence ids
labels = [] # per-batch token labels
with torch.no_grad(): # inference only, no gradients needed
    for batch in tqdm(dataloader_test):
        batch = {k: v.to(device) for k, v in batch.items()} # move the batch to the device

        outputs = model(batch) # per-token logits

        predictions.append(outputs.cpu().numpy()) # collect everything as numpy arrays on the CPU
        offsets.append(batch["offset_mapping"].cpu().numpy())
        seq_ids.append(batch["sequence_ids"].cpu().numpy())
        labels.append(batch["labels"].cpu().numpy())

predictions = np.concatenate(predictions) # stack the batches along axis 0: one row per example
offsets = np.concatenate(offsets)
seq_ids = np.concatenate(seq_ids)
labels = np.concatenate(labels)

# Turn logits into character spans, then score them on the hold-out split (the dict is the cell output).
location_predictions = predictions_to_locations(predictions, offsets, seq_ids)
calculate_charwise_metrics(location_predictions, offsets, seq_ids, labels)

In [None]:
# Optional: rebuild the model and reload the saved weights instead of reusing the in-memory one.
# model = PatientModel().to(device)
# model.load_state_dict(torch.load("patient_model.pth", map_location=device))

model.eval() # evaluation mode (disables dropout)
predictions = [] # per-batch logits
offsets = [] # per-batch flattened token offsets
seq_ids = [] # per-batch sequence ids
labels = [] # per-batch token labels (collected but not used further in this cell)
with torch.no_grad(): # inference only, no gradients needed
    for batch in tqdm(dataloader_submission):
        batch = {k: v.to(device) for k, v in batch.items()} # move the batch to the device

        outputs = model(batch) # per-token logits

        predictions.append(outputs.cpu().numpy()) # collect everything as numpy arrays on the CPU
        offsets.append(batch["offset_mapping"].cpu().numpy())
        seq_ids.append(batch["sequence_ids"].cpu().numpy())
        labels.append(batch["labels"].cpu().numpy())

predictions = np.concatenate(predictions) # stack the batches along axis 0: one row per example
offsets = np.concatenate(offsets)
seq_ids = np.concatenate(seq_ids)
labels = np.concatenate(labels)

# test=True renders each example's spans as a single string for the CSV.
location_predictions = predictions_to_locations(predictions, offsets, seq_ids, test=True)
test_df = create_submission_df() # rebuilt from disk to get the ids in the same row order
test_df["location"] = location_predictions
test_df[["id", "location"]].to_csv("submission.csv", index=False) # only id and location are written
pd.read_csv("submission.csv").head() # read the file back as a sanity check

# References

1. https://pytorch.org/
1. https://huggingface.co/
1. https://en.wikipedia.org/wiki/F-score
1. https://huggingface.co/bert-base-uncased
1. https://huggingface.co/docs/datasets/index
1. https://huggingface.co/distilbert-base-uncased
1. https://en.wikipedia.org/wiki/Sigmoid_function
1. https://huggingface.co/docs/transformers/index
1. https://www.kaggle.com/code/odins0n/nbme-detailed-eda
1. https://towardsdatascience.com/the-f1-score-bec2bbc38aa6
1. https://www.kaggle.com/c/nbme-score-clinical-patient-notes
1. https://www.kaggle.com/code/nbroad/qa-ner-hybrid-train-nbme
1. https://pytorch.org/docs/stable/generated/torch.nn.BCELoss.html
1. https://www.kaggle.com/code/utcarshagrawal/nbme-complete-eda
1. https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
1. https://analyticsindiamag.com/ultimate-guide-to-pytorch-optimizers/
1. https://www.kaggle.com/code/theoviel/evaluation-metric-folds-baseline
1. https://www.kaggle.com/code/tomohiroh/nbme-bert-for-beginners/notebook
1. https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html
1. https://www.kaggle.com/code/ruchi798/score-clinical-patient-notes-spacy-w-b
1. https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.StepLR.html
1. https://www.analyticsvidhya.com/blog/2021/03/binary-cross-entropy-log-loss-for-binary-classification/
1. https://www.analyticsvidhya.com/blog/2021/06/nlp-application-named-entity-recognition-ner-in-python-with-spacy/
1. https://medium.com/dejunhuang/learning-day-57-practical-5-loss-function-crossentropyloss-vs-bceloss-in-pytorch-softmax-vs-bd866c8a0d23