Original thanks to Jude Tchaye: https://www.kaggle.com/code/tchaye59/nbme-tensorflow-bert-baseline

Changes so far: 
cased version (to possibly better recognize names and abbreviations)

Changes planned: split rain-test, larger batches, GPU - move from Tensorflow 2.15 to PyTorch: https://towardsdatascience.com/the-subtleties-of-converting-a-model-from-tensorflow-to-pytorch-e9acc199b8bb

In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import  LabelEncoder
from tqdm.auto import tqdm
import random
import os
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import dill
import tensorflow.keras.backend as K
from tqdm.auto import tqdm
from tensorflow.keras import mixed_precision
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import AutoTokenizer, AutoConfig,TFAutoModel
import json




In [2]:
# NEW on TPU in TensorFlow 24: shorter cross-compatible TPU/GPU/multi-GPU/cluster-GPU detection code

try: # detect TPUs
    tpu  = tf.distribute.cluster_resolver.TPUClusterResolver.connect() # TPU detection
    tf.config.experimental_connect_to_cluster(tpu )
    tf.tpu.experimental.initialize_tpu_system(tpu )
    strategy = tf.distribute.TPUStrategy(tpu )
    print('Using TPU')
except ValueError: # detect GPUs
    tpu = None
    strategy = tf.distribute.MirroredStrategy() # for GPU or multi-GPU machines
    #strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() # for clusters of multi-GPU machines

print("Number of accelerators: ", strategy.num_replicas_in_sync)


AUTO = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS: {REPLICAS}')

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


Number of accelerators:  1
REPLICAS: 1


In [3]:
seed=999
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
print('Mixed precision enabled')

Mixed precision enabled


In [4]:
# fine-tunes all layers with 10 epochs and low learning rate
TRAIN = True
# fine tunes only last layer for efficiency, TODO
# https://stackoverflow.com/questions/56028464/fine-tuning-last-x-layers-of-bert
LAST_ONLY = False

 # Load dataframes

In [5]:
features = pd.read_csv("./data/features.csv")
patient_notes = pd.read_csv("./data/patient_notes.csv")
test = pd.read_csv("./data/test.csv")
train= pd.read_csv("./data/train.csv")
sample_submission= pd.read_csv("./data/sample_submission.csv")

In [6]:
test = test.merge(patient_notes,on=['case_num','pn_num']).merge(features,on=['case_num','feature_num'])
train = train.merge(patient_notes,on=['case_num','pn_num']).merge(features,on=['case_num','feature_num'])

In [7]:
train.head(5)

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,pn_history,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],HPI: 17yo M presents with palpitations. Patien...,Family-history-of-MI-OR-Family-history-of-myoc...
1,00041_000,0,41,0,[],[],17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...,Family-history-of-MI-OR-Family-history-of-myoc...
2,00046_000,0,46,0,['father: heart attack'],['824 844'],Mr. Cleveland is a 17yo M who was consented by...,Family-history-of-MI-OR-Family-history-of-myoc...
3,00082_000,0,82,0,['Father MI'],['622 631'],17 yo M w/ no cardiac or arrhythmia PMH presen...,Family-history-of-MI-OR-Family-history-of-myoc...
4,00100_000,0,100,0,['Dad-MI'],['735 741'],HPI: Dillon Cleveland is an otherwise healthy ...,Family-history-of-MI-OR-Family-history-of-myoc...


# Tokenizer

In [8]:
MODEL_NAME = 'bert-base-uncased'
DATA_PATH = "./bert_input"
DATA_EXISTS = os.path.exists(DATA_PATH)
DATA_EXISTS = False
SEQUENCE_LENGTH = 512

In [9]:
if DATA_EXISTS:
    tokenizer = AutoTokenizer.from_pretrained(DATA_PATH+"/my_tokenizer/",normalization=True)
    config = AutoConfig.from_pretrained(DATA_PATH+"/my_tokenizer/config.json")
else:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME,normalization=True)
    config = AutoConfig.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained('my_tokenizer')
    config.save_pretrained('my_tokenizer')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

# Encode the label

In [10]:
EMPTY =  'EMPTY'
CLASSES = [EMPTY,]+features.feature_num.unique().tolist()

if DATA_EXISTS:
    label_encoder = dill.load(open(DATA_PATH+"/label_encoder.dill",'rb'))
else:
    # label_encoder
    label_encoder = LabelEncoder()
    # Encode labels
    label_encoder.fit(CLASSES)
    dill.dump(label_encoder,open('label_encoder.dill','wb'))
# TODO assign some test_data from train
train['TARGET']= label_encoder.transform(train['feature_num'])
test['TARGET']= label_encoder.transform(test['feature_num'])
print(test)
N_CLASSES = len(label_encoder.classes_)
EMPTY_IDX = label_encoder.transform([EMPTY,]) [0]

          id  case_num  pn_num  feature_num   
0  00016_000         0      16            0  \
1  00016_001         0      16            1   
2  00016_002         0      16            2   
3  00016_003         0      16            3   
4  00016_004         0      16            4   

                                          pn_history   
0  HPI: 17yo M presents with palpitations. Patien...  \
1  HPI: 17yo M presents with palpitations. Patien...   
2  HPI: 17yo M presents with palpitations. Patien...   
3  HPI: 17yo M presents with palpitations. Patien...   
4  HPI: 17yo M presents with palpitations. Patien...   

                                        feature_text  TARGET  
0  Family-history-of-MI-OR-Family-history-of-myoc...       0  
1                 Family-history-of-thyroid-disorder       1  
2                                     Chest-pressure      18  
3                              Intermittent-symptoms      36  
4                                        Lightheaded      53  


In [11]:
def decode_location(locations):
    for x in ["[","]","'"]:
        locations = locations.replace(x,'')
    locations = locations.replace(',',';')
    locations = locations.split(";")
    res = []
    for location in locations:
        if location:
            x,y = location.split()
            res.append((int(x),int(y)))
    return sorted(res,key=lambda x:x[0])
    

In [12]:
if DATA_EXISTS:
    sequences = np.load(open(DATA_PATH+"/sequences.npy",'rb'))
    masks = np.load(open(DATA_PATH+"/masks.npy",'rb'))
    labels = np.load(open(DATA_PATH+"/labels.npy",'rb'))
else:
    sequences, labels, masks = [], [], []
    for g1 in tqdm(train.groupby('pn_num')):
        gdf = g1[1]
        pn_history  = gdf.iloc[0].pn_history

        tokens = tokenizer.encode_plus(pn_history, max_length=SEQUENCE_LENGTH, padding='max_length',truncation=True, return_offsets_mapping=True)
        sequence = tokens['input_ids']
        attention_mask = tokens['attention_mask']
        label = np.array([EMPTY_IDX for _ in range(SEQUENCE_LENGTH)])

        # BUILD THE TARGET ARRAY
        offsets = tokens['offset_mapping']
        label_empty = True
        for index, row in gdf.iterrows():
            TARGET = row.TARGET
            for i, (w_start, w_end) in enumerate(offsets):
                for start,end in decode_location(row.location):
                    if w_start < w_end and (w_start >= start) and (end >= w_end):
                        label[i] = TARGET
                        label_empty = False
                    if w_start >= w_end:
                        break
        if not label_empty:
            print(sequence)
            sequences.append(sequence)
            masks.append(attention_mask)
            labels.append(label)

    sequences = np.array(sequences).astype(np.int32)
    masks = np.array(masks).astype(np.uint8)
    labels = np.array(tf.keras.utils.to_categorical(labels,N_CLASSES)).astype(np.uint8)

    np.save(open("sequences.npy",'wb'), sequences)
    np.save(open("masks.npy",'wb'), masks)
    np.save(open("labels.npy",'wb'), labels)

  0%|          | 0/1000 [00:00<?, ?it/s]

[101, 6522, 2072, 1024, 2459, 7677, 1049, 7534, 2007, 14412, 23270, 10708, 1012, 5776, 4311, 1017, 1011, 1018, 2706, 1997, 23852, 4178, 1997, 1000, 2540, 6012, 1013, 9836, 2041, 1997, 2026, 3108, 1012, 1000, 1016, 2420, 3283, 2076, 1037, 4715, 2208, 2018, 2019, 2792, 1010, 2021, 2023, 2051, 2018, 3108, 3778, 1998, 2371, 2004, 2065, 2002, 2020, 2183, 2000, 3413, 2041, 1006, 2106, 2025, 4558, 9530, 18436, 2791, 1007, 1012, 1997, 3602, 5776, 2203, 5668, 2229, 8273, 7741, 5587, 21673, 2140, 1010, 3952, 2000, 2817, 1006, 1015, 1011, 1017, 2335, 2566, 2733, 1007, 1012, 2077, 3522, 4715, 2208, 1010, 2165, 5587, 2121, 7941, 2140, 2305, 2077, 1998, 2851, 1997, 2208, 1012, 23439, 2460, 2791, 1997, 3052, 1010, 22939, 8458, 16610, 2483, 1010, 9016, 2015, 1010, 10720, 2015, 1010, 14978, 1010, 16342, 1010, 3431, 1999, 3637, 1010, 3431, 1999, 4432, 1013, 4994, 1010, 21419, 29025, 2078, 1010, 3431, 1999, 6812, 2884, 2030, 24471, 3981, 2854, 14243, 1012, 7610, 2232, 2595, 1024, 3904, 1054, 2595, 1024, 

# Define Model

In [13]:
def build_model():
    
    tokens = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name = 'tokens', dtype=tf.int32)
    attention = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name = 'attention', dtype=tf.int32)
    
    if DATA_EXISTS:
        config = AutoConfig.from_pretrained(DATA_PATH+"/my_tokenizer/config.json")
        backbone = TFAutoModel.from_config(config)
    else:
        config = AutoConfig.from_pretrained(MODEL_NAME)
        backbone = TFAutoModel.from_pretrained(MODEL_NAME,config=config)
    
    out = backbone(tokens, attention_mask=attention)[0]
    out = tf.keras.layers.Dropout(0.2)(out)
    out = tf.keras.layers.Dense(N_CLASSES, activation='softmax')(out)
    
    model = tf.keras.Model([tokens,attention],out)
    
    return model

# Model training

In [14]:
#From TensorFlow 2.11 onwards, the only way to get GPU support on Windows is to use WSL2.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available:  0


In [15]:
if LAST_ONLY:
    ...
    # freeze all but one layer
    # https://stackoverflow.com/questions/56028464/fine-tuning-last-x-layers-of-bert


In [16]:
# Include the epoch in the file name (uses `str.format`)

# Saving weights: https://medium.com/analytics-vidhya/tensorflow-2-0-save-and-restore-models-4708ed3f0d8
checkpoint_path = "bert_training/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)


if TRAIN:
    # creating a model under the TPUStrategy will place the model in a replicated (same weights on each of the cores)
    # manner on the TPU and will keep the replica weights in sync by adding appropriate collective communications 
    # (all reducing the gradients).
    with strategy.scope():
        model = build_model()

        callback = tf.keras.callbacks.EarlyStopping(monitor='loss',mode='min', patience=3)
        # Create a callback that saves the model's weights every 5 epochs
        cp_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path, 
            verbose=1, 
            save_weights_only=True,
            period=5)# previously, there was no period defined 

        # Compile the model
        model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                      loss=tf.keras.losses.categorical_crossentropy,metrics=['acc',])

        ''' More extensive .fit function call:
            history = model.fit(
            x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
            #x={'input_ids': x['input_ids']},
            y={'outputs': train_y},
            validation_split=0.1,
            batch_size=32,
            epochs=1)
        '''
        history = model.fit((sequences,masks),labels,
                            batch_size=12,
                            epochs=10,
                            callbacks=[callback,cp_callback])
        # validation_data=(x_val, y_val),

        model.save_weights(f'model.h5')
    # Display the model's architecture
    # model.summary()




Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
 9/84 [==>...........................] - ETA: 25:11 - loss: 0.4939 - acc: 0.9158

# Submit

In [None]:
if not TRAIN:
    model = build_model()
    #model.load_weights(DATA_PATH+"/model.h5")

In [None]:
test_sequences, test_masks, test_offsets = [], [],[]
row_ids = []
targets = []

for g1 in tqdm(test.groupby('pn_num')):
    gdf = g1[1]
    pn_history  = gdf.iloc[0].pn_history
    targets.append([])
    row_ids.append([])
    
    test_tokens = tokenizer.encode_plus(pn_history, max_length=SEQUENCE_LENGTH, padding='max_length',truncation=True, return_offsets_mapping=True)
    test_sequence = test_tokens['input_ids']
    test_attention_mask = test_tokens['attention_mask'] 

    # BUILD THE TARGET ARRAY
    offset = test_tokens['offset_mapping']
    
    for index, row in gdf.iterrows():
        targets[-1].append(row.TARGET)
        row_ids[-1].append(row.id)
         
    test_sequences.append(test_sequence)
    test_masks.append(test_attention_mask)
    test_offsets.append(offset)

test_sequences = np.array(test_sequences).astype(np.int32)
test_masks = np.array(test_masks).astype(np.uint8)
targets_to_row_ids = [dict(zip(a,b)) for a,b in zip(targets,row_ids)]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
preds = model.predict((test_sequences,test_masks),batch_size=16)
preds = np.argmax(preds,axis=-1)



In [None]:
def decode_position(pos):
    return ";".join([" ".join(np.array(p).astype(str)) for p in pos])


def translate(preds,targets_to_row_ids,offsets):
    all_ids = []
    all_pos = []

    for k in range(len(preds)):
        offset = offsets[k]
        pred = preds[k]
        targets_to_ids = targets_to_row_ids[k]
        
        prediction = {targets_to_ids[t]:[] for t in targets_to_ids}
        i = 0
        while i<SEQUENCE_LENGTH:
            label = pred[i]
            
            if label == EMPTY_IDX:
                i += 1
                continue
            if label in targets_to_ids:
                key = targets_to_ids[label]
                start = offset[i][0]
                while i<SEQUENCE_LENGTH:
                    if pred[i] != label:
                        break
                    else:
                        end = max(offset[i])
                    i += 1
                if  end == 0:
                    break
                prediction[key].append((start,end))
            else:
                i+=1
        for key in prediction:
            all_ids.append(key)
            all_pos.append(decode_position(prediction[key]))
    df = pd.DataFrame({
        "id":all_ids,
        "location": all_pos
    })
    return df

In [None]:
sub = translate(preds,targets_to_row_ids,test_offsets)
sub.to_csv('submission.csv',index=False)
sub.head(50)

Unnamed: 0,id,location
0,00016_000,696 724
1,00016_001,668 693
2,00016_002,203 217
3,00016_003,70 91
4,00016_004,222 258


In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                      loss=tf.keras.losses.categorical_crossentropy,metrics=['acc',])
# TODO: larger test-mask
model.evaluate(x = (sequences, masks), y = labels, batch_size=12)
# f1:
# https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
# https://stackoverflow.com/questions/70589698/tensorflow-compute-precision-recall-f1-score
'''
evaluate(
    x=None,
    y=None,
    batch_size=None,
    verbose='auto',
    sample_weight=None,
    steps=None,
    callbacks=None,
    max_queue_size=10,
    workers=1,
    use_multiprocessing=False,
    return_dict=False,
    **kwargs
)
        More extensive .fit function call:
            history = model.fit(
            x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
            #x={'input_ids': x['input_ids']},
            y={'outputs': train_y},
            validation_split=0.1,
            batch_size=32,
            epochs=1)

        history = model.fit((sequences,masks),labels,
                            batch_size=12,
                            epochs=10,
                            callbacks=[callback,])
'''



"\nevaluate(\n    x=None,\n    y=None,\n    batch_size=None,\n    verbose='auto',\n    sample_weight=None,\n    steps=None,\n    callbacks=None,\n    max_queue_size=10,\n    workers=1,\n    use_multiprocessing=False,\n    return_dict=False,\n    **kwargs\n)\n        More extensive .fit function call:\n            history = model.fit(\n            x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},\n            #x={'input_ids': x['input_ids']},\n            y={'outputs': train_y},\n            validation_split=0.1,\n            batch_size=32,\n            epochs=1)\n\n        history = model.fit((sequences,masks),labels,\n                            batch_size=12,\n                            epochs=10,\n                            callbacks=[callback,])\n"

In [None]:
print(np.array(test_sequences).shape)

(1, 512)
