**Imports**

In [None]:
import ast
import json
import re
from collections import Counter

import numpy as np
import pandas as pd
import tensorflow as tf
import tqdm
import transformers
from tqdm import tqdm
from transformers import RobertaTokenizer, TFRobertaModel

**Settings**

In [None]:
# Widen pandas' display limits so long token tables and full note texts are
# shown without truncation. Options are spelled out in full: abbreviated names
# only work through pandas' regex matching and break if they become ambiguous.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.expand_frame_repr', True)

**Load**

In [None]:
# Pretrained encoder and its tokenizer, both loaded from local dataset folders.
roberta_model = TFRobertaModel.from_pretrained('../input/score-clinical-patient/roberta_large/roberta_large', config='../input/score-clinical-patient/roberta_large/roberta_large/config.json')
tokenizer = RobertaTokenizer.from_pretrained('../input/score-clinical-patient/roberta_large_tokenizer/roberta_large_tokenizer')

# Annotation rows; `num` keeps the original row position as an explicit column.
data = pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
data['num'] = data.index
patient_notes = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')

# vocab.json maps token string -> id; invert it so ids can be turned back into
# their token strings. The file holds non-ASCII token strings (e.g. 'Ġ'), so
# the encoding is fixed instead of relying on the platform default.
with open('../input/score-clinical-patient/roberta_large_tokenizer/roberta_large_tokenizer/vocab.json', encoding='utf-8') as json_file:
    json_data = json.load(json_file)
    vocab = {v: k for k, v in json_data.items()}
    
def decode(token_id):
    """Return the vocabulary string stored for `token_id` in the module-level `vocab`.

    Raises:
        KeyError: if `token_id` is not a key of `vocab`.
    """
    return vocab[token_id]

In [None]:
# `rgroups` is only created by the read_csv call in the "Rgroups" section below,
# so evaluating it here raised NameError on a fresh kernel (Restart & Run All).
# That section already ends with display(rgroups), which shows the same frame.

**Rgroups**

In [None]:
rgroups = pd.read_csv('../input/score-clinical-patient/rgroups_version2.csv')

# Manual corrections, keyed by row position. They are written with a single
# positional .iloc[row, col] assignment: the chained form
# `rgroups.iloc[i]['name'] = ...` assigns into a temporary row object that may
# be a copy, in which case the correction never reaches `rgroups`.
rgroup_name_overrides = {
    3: 'water',
    6: 'irregular',
    7: 'medication',
    8: 'abdominal',
    9: 'migraine',
    10: 'thyroid',
    11: 'abdomen',
    12: 'tight',
    13: 'heart',
    14: 'pain',
    15: 'appetite',
    16: 'decrease',
    17: 'increase',
}
rgroup_name_col = rgroups.columns.get_loc('name')
for rgroup_row, rgroup_name in rgroup_name_overrides.items():
    rgroups.iloc[rgroup_row, rgroup_name_col] = rgroup_name

# Replacement word list for group 7, kept as the same stringified-list format
# the CSV column uses.
rgroups.iloc[7, rgroups.columns.get_loc('words')] = "['aderal', 'aderall', 'aderals', 'aderel', 'aderil', 'adernal', 'aderol', 'aderole', 'aderoll', 'aderolol', 'aderols', 'aderral', 'aderrall', 'aderrol', 'aderroll', 'adheral', 'adherall', 'adherol', 'addyrall', 'addril', 'addrolls', 'addrrel', 'addral', 'addrall', 'addreal', 'addreall', 'addrell', 'addrella', 'addrelle', 'addreral', 'addira', 'addirall', 'addheral', 'adder', 'adderaal', 'adderal', 'adderall', 'adderalla', 'adderalll', 'adderalls', 'adderallto', 'adderally', 'adderals', 'addereall', 'adderell', 'adderil', 'adderol', 'adderoll', 'adderral', 'adderrall', 'adderrol', 'adderroll', 'adderrral', 'addearall', 'addarol', 'addarral', 'addderal', 'addderall', 'adarol', 'adaral', 'adarall', 'adarell', 'adaril', 'dsadderall', 'usesadderall', 'alderal', 'anderal', 'amphatamine', 'amphatamnine', 'ampheatmine', 'amphentamine', 'amphetaine', 'amphetamine', 'amphetamines', 'amphetimine', 'amphitamine', 'anfetamine', 'gogomol', 'gogogogomolen']"

# Parse every group's stringified word list exactly once. ast.literal_eval
# replaces eval: the column only needs to yield a list literal, and text read
# from a CSV should not be executed as arbitrary code.
rgroup_members = [
    (row['name'], ast.literal_eval(row['words']))
    for _, row in rgroups.iterrows()
]

# Every distinct word that belongs to at least one group.
words = set()
for _, members in rgroup_members:
    words.update(members)

# Collect, per word, the names of the groups containing it (in row order).
# One pass over the groups replaces re-parsing every group for every word.
rgroup_names_by_word = {}
for name, members in tqdm(rgroup_members, total=len(rgroup_members)):
    for word in set(members):  # a word listed twice in one group counts once
        rgroup_names_by_word.setdefault(word, []).append(name)

# word -> "</s> name1 name2</s>": the tag text inserted after the word in a note.
rgroups_dict = dict()
for word, names in rgroup_names_by_word.items():
    tag = ''
    for name in names:
        tag = tag + ' ' + name
    rgroups_dict[word] = "</s>" + tag + "</s>"

display(rgroups)

In [None]:
# Load the feature table, then overwrite its `feature_text` column with the
# hand-written phrasings below. The list is assigned positionally, one entry
# per row of features.csv in file order, so it has to contain exactly as many
# entries as the CSV has rows (pandas raises ValueError on a length mismatch).
# Every entry starts with a space.
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
features['feature_text'] = [' family history of myocardial infarction',
 ' family history of thyroid disorder',
 ' chest pressure',
 ' intermittent symptoms',
 ' lightheaded',
 ' no hair changes no nail changes no temperature intolerance',
 ' adderall use',
 ' shortness of breath',
 ' caffeine use',
 ' heart pounding heart racing',
 ' 2-3-4 months duration',
 ' age in years , year',
 ' male female',
 ' no vaginal discharge',
 ' weight loss',
 ' not sexually active',
 ' prior episodes of diarrhea',
 ' age in years , year',
 ' no bloody bowel movements',
 ' recurrent bouts over past few months',
 ' right lower quadrant abdominal pain',
 ' no urinary symptoms',
 ' diminished appetite',
 ' normal last menstrual period few weeks ago',
 ' few to few hours of acute pain',
 ' male female',
 ' prior normal periods',
 ' last pap smear i year ago',
 ' iud',
 ' sexually active',
 ' vaginal dryness',
 ' irregular menses',
 ' recent nausea vomiting recent flulike symptoms',
 ' no premenstrual symptoms',
 ' male female',
 ' has stress , has some stress',
 ' last menstrual period few months ago',
 ' hot flashes',
 ' irregular flow irregular frequency irregular intervals',
 ' onset few years ago',
 ' heavy sweating',
 ' sleep disturbance early awakenings',
 ' age in years , year',
 ' family history of peptic ulcer disease',
 ' epigastric discomfort',
 ' darker bowel movements',
 ' nsaid use nonsteroidal anti inflammatory drug use',
 ' burning gnawing or burning and gnawing',
 ' post prandial bloating fullness with meals',
 ' getting worse progressive symptoms now daily',
 ' few to few beers a week',
 ' male female',
 ' duration 2 months',
 ' awakens at night',
 ' no blood in stool',
 ' intermittent',
 ' minimal to no change with tums',
 ' nausea',
 ' age in years , year',
 ' lack of other thyroid symptoms',
 ' anxious nervous',
 ' stress due to caring for elderly parents',
 ' heavy caffeine use',
 ' no depressed mood',
 ' weight stable',
 ' insomnia',
 ' male female',
 ' decreased appetite',
 ' age in years , year',
 ' onset few years ago',
 ' male female',
 ' no caffeine use',
 ' associated shortness of breath',
 ' episodes of heart racing',
 ' recent visit to emergency department with negative workup',
 ' no chest pain',
 ' no illicit drug use',
 ' associated nausea',
 ' increased frequency recently',
 ' associated feeling of impending doom',
 ' episodes last few to few minutes',
 ' associated throat tightness',
 ' feels hot feels clammy',
 ' episode hand numbness episode of finger numbness',
 ' fatigue difficulty concentrating',
 ' increased stress',
 ' age in years , year',
 ' subjective fevers',
 ' male female',
 ' age in years , year',
 ' recent upper respiratory symptoms',
 ' worse with deep breath or pleuritic',
 ' exercise induced asthma',
 ' chest pain',
 ' duration few day',
 ' no shortness of breath',
 ' recent heavy lifting at work recent rock climbing',
 ' no relief with asthma inhaler',
 ' sharp stabbing few to few out of few on pain scale',
 ' male female',
 ' weight gain',
 ' heavy periods irregular periods',
 ' last menstrual period few months ago',
 ' unprotected sex',
 ' fatigue',
 ' infertility hx infertility history',
 ' age in years , year',
 ' symptoms for 6 months',
 ' increased appetite',
 ' son died few weeks ago',
 ' male female',
 ' auditory hallucination once',
 ' tossing and turning',
 ' age in years , year',
 ' difficulty falling asleep',
 ' hallucinations after taking ambien',
 ' duration few weeks',
 ' unsuccessful napping',
 ' sleeping medication ineffective',
 ' diminished energy feeling drained',
 ' loss of interest',
 ' visual hallucination once',
 ' fhx of depression family history of depression',
 ' early wakening',
 ' no suicidal ideations',
 ' difficulty with sleep',
 ' no relief with motrin no relief with tylenol',
 ' age in years , year',
 ' few day duration few days duration',
 ' myalgias',
 ' global headache diffuse headache',
 ' neck pain',
 ' vomiting',
 ' no rash',
 ' nausea',
 ' viral symptoms rhinorrhea scratchy throat',
 ' shares an apartment',
 ' meningococcal vaccine status unknown',
 ' family history of migraines',
 ' male female',
 ' photophobia',
 ' no known illness contacts',
 ' subjective fever']

In [None]:
# Attach the feature descriptions and then the note texts to every annotation
# row (both merges join on whatever columns the two frames share), and keep the
# row ids aside.
data = data.merge(features).merge(patient_notes)
submission_ids = data['id']

In [None]:
# Each of these characters is replaced by one space. The mapping is strictly
# one-to-one so character positions in the text are not shifted.
_PUNCTUATION_TO_SPACE = str.maketrans(dict.fromkeys('-(){}[]:;"\'+/\\&_=*<>%~', ' '))


def text_transform(text):
    """Lower-case `text` and blank out bracket/operator/quote punctuation.

    The characters - ( ) { } [ ] : ; " ' + / \\ & _ = * < > % ~ each become a
    single space; sentence punctuation (. , ! ?) is left untouched. A single
    translation pass replaces the former chain of 22 regex substitutions.

    Args:
        text: input string.

    Returns:
        The transformed string.
    """
    return text.lower().translate(_PUNCTUATION_TO_SPACE)

In [None]:
# Every example is padded to MAX_TEXT_LEN token positions; there is one example
# per row of `data`.
MAX_TEXT_LEN = 512
train_examples_len = data.shape[0]

# Last axis of `input_data`:
#   0 -> input ids
#   1 -> attention mask
#   2 -> token type ids
#   3 -> text mask
input_data = np.zeros(shape=(train_examples_len, MAX_TEXT_LEN, 4), dtype=np.int32)
targets_data = np.zeros(shape=(train_examples_len, MAX_TEXT_LEN), dtype=np.int32)

In [None]:
def add_text(pn_history, text, position, side, pn_history_targets=None, target=None):
    """Insert `text` into `pn_history`, keeping a per-character target array in sync.

    Args:
        pn_history: string to insert into.
        text: string to insert.
        position: character index the insertion is relative to.
        side: 'left' inserts before `position` (prepends when `position <= 0`);
            'right' inserts after `position` (appends when `position` is at or
            past the last character). Any other value inserts at `position`.
        pn_history_targets: optional NumPy array of per-character labels.
        target: label assigned to each inserted character.

    Returns:
        `(new_text, new_targets)`; `new_targets` is None when
        `pn_history_targets` is not a NumPy array.
    """
    has_targets = isinstance(pn_history_targets, np.ndarray)

    # Work out the cut point once instead of repeating the splice per case.
    if side == 'left' and position <= 0:
        text_cut = 0
        target_cut = 0
    elif side == 'right' and position >= len(pn_history) - 1:
        # Append: each sequence is extended at its own end.
        text_cut = len(pn_history)
        target_cut = len(pn_history_targets) if has_targets else None
    else:
        text_cut = position + 1 if side == 'right' else position
        target_cut = text_cut

    pn_history = pn_history[:text_cut] + text + pn_history[text_cut:]
    if not has_targets:
        return pn_history, None

    pn_history_targets = np.concatenate((
        pn_history_targets[:target_cut],
        [target] * len(text),
        pn_history_targets[target_cut:],
    ))
    return pn_history, pn_history_targets

def get_example(pn_history, feature_text, pn_history_targets=None):
    """Turn one (note, feature) pair into fixed-length model input arrays.

    Steps:
      1. Normalise the note with `text_transform` and prepend one space.
      2. After every word found in `rgroups_dict` (ignoring one trailing
         . , ! or ?), insert that word's tag text.
      3. Tokenize note and feature and lay them out as
         `[0] + note ids + [2, 2] + feature ids + [2]`, padded with id 1.

    Args:
        pn_history: raw note text.
        feature_text: feature description.
        pn_history_targets: optional NumPy array of per-character 0/1 labels
            for `pn_history`; enables the target output.

    Returns:
        Without targets: an int32 array of shape (MAX_TEXT_LEN, 4) whose last
        axis holds input ids, attention mask, token type ids (left at zero) and
        the note-token mask.
        With targets: `(inputs, targets)`, where `targets` is an int32 array of
        shape (MAX_TEXT_LEN,) set to 1 on note tokens overlapping a labelled
        character and on every feature token.

    Raises:
        ValueError: if the encoded sequence is longer than MAX_TEXT_LEN.
    """
    has_targets = isinstance(pn_history_targets, np.ndarray)

    pn_history = text_transform(pn_history)
    pn_history, pn_history_targets = add_text(pn_history=pn_history,
                                              text=' ',
                                              position=0,
                                              side='left',
                                              pn_history_targets=pn_history_targets,
                                              target=0)

    # Walk the space-delimited words right-to-left: inserting at the rightmost
    # positions first keeps the offsets of the words still to come valid.
    additions = []
    for match in reversed(list(re.finditer(r'[^ ]+', pn_history))):
        word = match.group()
        start, end = match.span()
        # Look the word up without one trailing punctuation mark.
        reduction = len(word) >= 2 and word[-1] in ('.', ',', '!', '?')
        if reduction:
            word = word[:-1]
        tag = rgroups_dict.get(word, False)
        if not tag:
            continue
        # Insert after the word's last letter, i.e. before stripped punctuation.
        position = end - 1
        if reduction:
            position -= 1
        if has_targets:
            # The tag inherits the label of the word it annotates.
            tag_target = 1 if np.sum(pn_history_targets[start:end]) > 0 else 0
        else:
            tag_target = None
        additions.append((tag, position, tag_target))

    for tag, position, tag_target in additions:
        pn_history, pn_history_targets = add_text(pn_history=pn_history,
                                                  text=tag,
                                                  position=position,
                                                  side='right',
                                                  pn_history_targets=pn_history_targets,
                                                  target=tag_target)

    text_ids = tokenizer.encode(pn_history, add_special_tokens=False)
    feature_ids = tokenizer.encode(feature_text, add_special_tokens=False)
    main_ids = [0] + text_ids + [2, 2] + feature_ids + [2]

    # Fail with a readable message instead of NumPy's broadcast error.
    if len(main_ids) > MAX_TEXT_LEN:
        raise ValueError(
            'encoded example has %d tokens, which exceeds MAX_TEXT_LEN=%d'
            % (len(main_ids), MAX_TEXT_LEN)
        )

    # The input array is the same with or without targets, so build it once.
    input_data_example = np.zeros((MAX_TEXT_LEN, 4), dtype='int32')
    input_data_example[:, 0] = 1
    input_data_example[:len(main_ids), 0] = main_ids
    input_data_example[:len(main_ids), 1] = [1] * len(main_ids)
    input_data_example[1:len(text_ids)+1, 3] = 1

    if not has_targets:
        return input_data_example

    # Map tokens back to character ranges by accumulating the length of each
    # token's vocabulary string, then label a token 1 if its range contains
    # any labelled character.
    text_ids_targets = []
    idx = 0
    for text_id in text_ids:
        token_end = idx + len(decode(text_id))
        text_ids_targets.append(1 if np.sum(pn_history_targets[idx:token_end]) > 0 else 0)
        idx = token_end

    main_ids_targets = [0] + text_ids_targets + [0, 0] + [1] * len(feature_ids) + [0]

    targets_data_example = np.zeros((MAX_TEXT_LEN), dtype='int32')
    targets_data_example[:len(main_ids)] = main_ids_targets

    return input_data_example, targets_data_example
        
def get_pn_history_targets(pn_history, locations):
    """Build a per-character 0/1 label array from a stringified location list.

    `locations` looks like "['1 3', '5 6;8 10']": a bracketed, comma-separated
    list of quoted entries, each holding one or more "start end" pairs joined
    by ';'. Characters in every [start, end) range are set to 1. A string of
    length 2 or less (e.g. "[]") yields all zeros.

    Returns:
        int32 NumPy array with one entry per character of `pn_history`.
    """
    pn_history_targets = np.zeros(len(pn_history), dtype='int32')
    if len(locations) <= 2:
        return pn_history_targets

    entries = [entry.strip() for entry in locations[1:-1].split(',')]
    for entry in entries:
        for span in entry.split(';'):
            bounds = [int(number) for number in span.replace("'", '').split()]
            pn_history_targets[bounds[0]:bounds[1]] = 1
    return pn_history_targets
    
# Encode every (note, feature) row into its slot of `input_data`; the slot is
# addressed by the row's index label.
rows_to_encode = zip(data.index, data['pn_history'], data['feature_text'])
for count, pn_history, feature_text in tqdm(rows_to_encode, total=len(data)):
    input_data[count] = get_example(pn_history, feature_text)

In [None]:
def build_model():
    """Wrap `roberta_model` in a Keras model with a per-token sigmoid output.

    The model takes three int32 inputs of length MAX_TEXT_LEN (input ids,
    attention mask, token type ids), feeds them to `roberta_model` and applies
    a single-unit sigmoid Dense layer to the first element of its output.
    Prints the model summary as a side effect.

    Returns:
        The (uncompiled) `tf.keras.models.Model`.
    """
    def make_token_input():
        return tf.keras.layers.Input((MAX_TEXT_LEN,), dtype=tf.int32)

    ids = make_token_input()
    att = make_token_input()
    tok = make_token_input()

    encoder_outputs = roberta_model(ids, attention_mask=att, token_type_ids=tok)
    token_scores = tf.keras.layers.Dense(1, activation='sigmoid')(encoder_outputs[0])

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=token_scores)
    model.summary()

    return model

# Build the network, compile it, then restore the saved weights.
model = build_model()
optimizer = tf.keras.optimizers.Adam(learning_rate=1.5e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.load_weights('../input/score-clinical-patient/model_best(3).h5')

In [None]:
def _prediction_run_ids(flags, text_nums):
    """Give each token the id of the positive run it belongs to (-1 if its flag is 0).

    The counter advances whenever the 0/1 flag flips or the note number
    changes, so consecutive positive tokens of one note share an id.
    """
    run_ids = []
    counter = -1
    previous_flag = 'Start'  # sentinel that never equals a 0/1 flag
    previous_num = 0
    for flag, text_num in zip(flags, text_nums):
        if previous_flag != flag:
            counter += 1
            previous_flag = flag
        if previous_num != text_num:
            counter += 1
            previous_num = text_num
        run_ids.append(counter if flag else -1)
    return pd.Series(run_ids, index=flags.index)


def _tag_labels(decode_ids):
    """Mark tokens that belong to an inserted `</s> ... </s>` tag with 1, others 0.

    Every `'</s>'` token toggles the inside/outside state and is itself
    marked 1.
    """
    labels, inside = [], 0
    for decode_id in decode_ids:
        if decode_id == "'</s>'":
            inside = 1 - inside
            labels.append(1)
        else:
            labels.append(inside)
    return pd.Series(labels, index=decode_ids.index)


def model_stats(input_data, stats_nums, show_tokens=True):
    """Run the model on selected examples and turn token predictions into spans.

    Args:
        input_data: int array of shape (n, MAX_TEXT_LEN, 4) as produced by
            `get_example` (ids, attention mask, token type ids, text mask).
        stats_nums: indices into the first axis of `input_data` to evaluate.
        show_tokens: when True (default, the previous behaviour) display the
            per-token table of every evaluated example.

    Returns:
        DataFrame with one row per evaluated example: `num` (the entry of
        `stats_nums`) and `location`, a ';'-joined list of "start end"
        character spans for each run of tokens predicted above 0.5.

    Previously the function read `text_offsets` / `text_offsets_shift` columns
    that were never created (KeyError as soon as any token was positive) and
    returned nothing.
    """
    preds = model.predict([input_data[stats_nums, :, 0], input_data[stats_nums, :, 1], input_data[stats_nums, :, 2]], verbose=1)

    # Flatten to one row per note token (positions where the text mask is set).
    nums, encode_ids, decode_ids, predictions = [], [], [], []
    for (count, ids, predictions_0, text_mask_0) in zip(stats_nums, input_data[stats_nums, :, 0], preds, input_data[stats_nums, :, 3]):
        for encode_id, prediction, text_mask in zip(ids, predictions_0, text_mask_0):
            if text_mask:
                nums.append(count)
                encode_ids.append(encode_id)
                decode_ids.append("'" + decode(encode_id) + "'")
                predictions.append(prediction[0])

    predictions_csv = pd.DataFrame({'num': nums,
                                    'encode_id': encode_ids,
                                    'decode_id': decode_ids,
                                    'prediction': predictions,
    })
    predictions_csv['prediction1'] = (predictions_csv['prediction'] > 0.5).astype(int)
    predictions_csv['prediction1_id'] = _prediction_run_ids(predictions_csv['prediction1'], predictions_csv['num'])

    nums, predictions = [], []
    for num in tqdm(pd.unique(predictions_csv['num'])):
        predictions_csv_num = predictions_csv[predictions_csv['num'] == num].copy()
        predictions_csv_num['tag'] = _tag_labels(predictions_csv_num['decode_id'])
        if show_tokens:
            display(predictions_csv_num)

        # Drop the inserted tag tokens so the remaining token lengths add up
        # to positions in the note text without the tags.
        predictions_csv_num = predictions_csv_num[predictions_csv_num['tag'] == 0].copy()
        # decode_id is the token wrapped in two quote characters.
        token_lengths = predictions_csv_num['decode_id'].str.len() - 2
        # get_example prepends one space to the note, hence the -1 shift back
        # to the original text. `text_offsets` is a token's exclusive end,
        # `text_offsets_shift` its start.
        predictions_csv_num['text_offsets'] = token_lengths.cumsum() - 1
        predictions_csv_num['text_offsets_shift'] = predictions_csv_num['text_offsets'] - token_lengths

        predictions_csv_num = predictions_csv_num[predictions_csv_num['prediction1_id'] >= 0]
        locations = []
        for _, run in predictions_csv_num.groupby('prediction1_id', sort=False):
            start = run['text_offsets_shift'].values[0]
            # A first token starting with 'Ġ' (presumably the tokenizer's
            # leading-space marker) begins one character before the word.
            if run['decode_id'].values[0][1] == 'Ġ':
                start += 1
            start = max(start, 0)
            end = run['text_offsets'].values[-1]
            locations.append(str(start) + ' ' + str(end))
        nums.append(num)
        predictions.append(';'.join(locations))

    return pd.DataFrame({'num': nums, 'location': predictions})

#submit = model_stats(input_data, range(len(data)))
#submit.to_csv('submission.csv', index=False)
#display(submit)

In [None]:
# List the distinct feature phrasings. `pd.udata` does not exist in pandas and
# raised AttributeError; this looks like an unfinished `pd.unique(data[...])`.
pd.unique(data['feature_text'])

In [None]:
# Second-to-last note among the rows annotated for the ' adderall use' feature.
data.loc[data['feature_text'] == ' adderall use', 'pn_history'].iloc[-2]

In [None]:
# Hand-edited sample note for a manual check of the ' adderall use' feature.
# It contains the made-up word 'gogogogomolen', which is also the last entry
# of the word list assigned to rgroups row 7 in the "Rgroups" section.
text = "Mr. Cleveland is a 17 yo male who presents with a chief complaint of heart pounding. Patient has experienced this sensation for the past 2-3 months, 5-6 times total, roughly 2x/month. The last episode was 2 days ago while playing basketball. Patient describes heart racing, pressure, shortness of breath, and light headedness. Episodes have occured variably, some while exercising, some while resting. He reports the use of a friend's gogogogomolen roughly 2x/week for the past 7 months to help with concenctration. He also endorses drinking 3-4 cups of coffee/day and energy drinks as well. Patient is a freshman in collee, sexually active with his long term girlfriend, uses condoms, but denies use of tobacco products and ilicit drugs. He reports drinking 3-4 alcholoic drinks on weekends. Family hx is positive for thyroid dx in his mother and a MI in his father. He has no significiant medical/surgical hx. No allergies or weight/appetite changes."

In [None]:
# Encode the sample note, add a leading batch axis so it looks like a
# one-example dataset, and run the token-level inspection on it.
example = get_example(text, ' adderall use')[np.newaxis, ...]
model_stats(example, [0])

In [None]:
# Show the encoded sample array built in the previous cell.
example