# All the important imports

In [1]:
import numpy as np
import pandas as pd

import transformers
import tokenizers
import torch
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.autonotebook import tqdm

from ast import literal_eval
import time

In [2]:
torch.cuda.empty_cache()

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [6]:
BASE_PATH = "../data/"
features_df = pd.read_csv(BASE_PATH + "features.csv")
patient_notes_df = pd.read_csv(BASE_PATH + "patient_notes.csv")
train_df = pd.read_csv(BASE_PATH +"train.csv")
test_df = pd.read_csv(BASE_PATH + "test.csv")
#submission_df = pd.read_csv(BASE_PATH"sample_submission.csv")

In [9]:
test_df.shape

(5, 4)

In [10]:
features_df.tail()

Unnamed: 0,feature_num,case_num,feature_text
138,912,9,Family-history-of-migraines
139,913,9,Female
140,914,9,Photophobia
141,915,9,No-known-illness-contacts
142,916,9,Subjective-fever


In [11]:
features_df.nunique()

feature_num     143
case_num         10
feature_text    131
dtype: int64

In [7]:
pd.set_option('max_colwidth', None)
patient_notes_df.head()

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment\r\n-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav\r\n-associated with dispnea on exersion and rest,stressed out about school\r\n-reports fe feels like his heart is jumping out of his chest\r\n-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam\r\n-pmh:non,meds :aderol (from a friend),nkda\r\n-fh:father had MI recently,mother has thyroid dz\r\n-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school\r\n-sh:no std"
1,1,0,"17 yo male with recurrent palpitations for the past 3 mo lasting about 3 - 4 min, it happened about 5 - 6 times since the beginning. One time durign a baskeball game two days ago light headedness, pressure in the chest, catching breath, but no fainting. During teh episodes no sweating. No diarrhea, no heat intolerance, no weight loss. Has tried aterol to be able to better concentrate, has received it from his roommate. ."
2,2,0,"Dillon Cleveland is a 17 y.o. male patient with no significant PMH who presents with complaints of heart pounding. This has been going on for a few months and happens once or twice a month. He cannot think of any triggers, and it has occurred both with activity and at rest. Occasionally, it is accompanied by chest pressure but not pain that is located at the center of his chest. On one instance, he experienced chest pressure, lightheadedness and shortness of breath for 10 minutes with the heart pounding. \r\n\r\nOtherwise, he has not had shortness of breath, chest pain, anxiety, \r\n\r\nMedication: adderall twice a week as study aid (not prescribed)\r\nFH: mother - thyroid disease, father - heart attack at age 52, both living\r\nSH: no smoking; 3-4 drinks on the weekend per sitting; marijuanna once recently, no other recreational drugs"
3,3,0,"a 17 yo m c/o palpitation started 3 mos ago; \r\nNOTHING IMPROVES OR EXACERBATES THE SYMPTOMS ACCORDING TO HIM; IT CAN HAPPEN ANY TIME; MAY TAKE A FEW MINUTES; LAST TIME HAPPENED 2 DAYS AGO DURING PLAYING A GAME AND IT WAS ASSOCIATED WITH RETROSTERNAL PRESSURE LIKE DISCOMFORT; AND HE FELT LIGHTHEADED. BUT HE DID NOT LOSE CONCIOUSNESS AND DID NOT FALL. \r\nNOT ASSOCIATED WITH NAUSEA VOMITING; HEADACHE; ABDOMINAL PAIN; CHANGES IN URINATION OR BOWEL HABITS, OR TREMOR OR SKIN OR HAIR CHANGE OR INTOLERABC\r\n\r\nPMH NONE\r\nPSHH: NONE\r\nMEDS: ADEROL TO STAY AWAKE\r\nHOSP: NONE\r\nFH: MOTHER HAS THYROID DISEASE; FATHER HAS CAD X 1 YR\r\nSH: NO SMOKING; DRINKING ON WEEKENDS CAGE 0/4; ONE TIME USE OF MARIJUANA\r\n\r\n\r\n"
4,4,0,"17yo male with no pmh here for evaluation of palpitations. States for the last 3-4mo he has felt that his heart with intermittently ""beat out of his chest,"" with some associated difficulty catching his breath. States that the most recent event was 2 days ago, and during activity at a soccer game. He does not seem to note any specific precipitatinig factors at this time. He also states that he feels as if he will faint during these events, but has not lost consciousness at any point. Furthermore, he does endorse theses attacks occuring 1-2 times a month and peak at 4 mins. He denies any stressors at home. ROS: denies weight loss, fevers, recnet illness, change in bowel habits. PMH: negative, PSH negative, FHX mom with thyroid disorder, dad with heart condition and MI at 52yo. SHX no tobacco, ETOH on weekends, Marijuana tried a month ago. Med: is taking some of roommates Adderoll intermittently (last was 2 days ago prior to event). KNDA"


In [13]:
patient_notes_df.nunique()

pn_num        42146
case_num         10
pn_history    42146
dtype: int64

In [14]:
train_df

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']
...,...,...,...,...,...,...
14295,95333_912,9,95333,912,[],[]
14296,95333_913,9,95333,913,[],[]
14297,95333_914,9,95333,914,['photobia'],['274 282']
14298,95333_915,9,95333,915,['no sick contacts'],['421 437']


In [15]:
train_df.nunique()

id             14300
case_num          10
pn_num          1000
feature_num      143
annotation      5201
location        7534
dtype: int64

In [8]:
df = pd.merge(train_df, features_df, on=['feature_num','case_num'], how='inner')
df =pd.merge(df, patient_notes_df, on=['pn_num','case_num'], how='inner')
df = df.head(100)

In [17]:
df.nunique()

id              14300
case_num           10
pn_num           1000
feature_num       143
annotation       5201
location         7534
feature_text      131
pn_history       1000
dtype: int64

In [18]:
df['feature_text'].value_counts()

Female                                   700
Male                                     300
20-year                                  200
Nausea                                   200
35-year                                  200
                                        ... 
44-year                                  100
Sleep-disturbance-OR-Early-awakenings    100
Heavy-sweating                           100
Onset-3-years-ago                        100
Subjective-fever                         100
Name: feature_text, Length: 131, dtype: int64

##### The 'annotation' is picked form the 'pn_history' and the text location is mentiond in 'location' column. Location:Character spans indicating the location(s) of the feature within the note.



In [19]:
df['pn_history'][1]

'HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\nPMHx: none\r\nRx: uses friends adderrall\r\nFHx: mom with "thyroid disease," dad with recent heart attcak\r\nAll: none\r\nImmunizations: up to date\r\nSHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms'

In [20]:
print(df['pn_history'][1][668 : 693])
print(df['pn_history'][1][203 : 217])
print(df['pn_history'][1][696 : 724])

mom with "thyroid disease
chest pressure
dad with recent heart attcak


In [9]:
df["annotation"] = [literal_eval(x) for x in df["annotation"]]
df["location"] = [literal_eval(x) for x in df["location"]]
df

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myocardial-infarction,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\nPMHx: none\r\nRx: uses friends adderrall\r\nFHx: mom with ""thyroid disease,"" dad with recent heart attcak\r\nAll: none\r\nImmunizations: up to date\r\nSHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms"
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\nPMHx: none\r\nRx: uses friends adderrall\r\nFHx: mom with ""thyroid disease,"" dad with recent heart attcak\r\nAll: none\r\nImmunizations: up to date\r\nSHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms"
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\nPMHx: none\r\nRx: uses friends adderrall\r\nFHx: mom with ""thyroid disease,"" dad with recent heart attcak\r\nAll: none\r\nImmunizations: up to date\r\nSHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms"
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\nPMHx: none\r\nRx: uses friends adderrall\r\nFHx: mom with ""thyroid disease,"" dad with recent heart attcak\r\nAll: none\r\nImmunizations: up to date\r\nSHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\nPMHx: none\r\nRx: uses friends adderrall\r\nFHx: mom with ""thyroid disease,"" dad with recent heart attcak\r\nAll: none\r\nImmunizations: up to date\r\nSHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms"
...,...,...,...,...,...,...,...,...
95,00211_004,0,211,4,[lightheaded],[374 385],Lightheaded,"HPI: Patient is a 17 yo m with a c/o of palpitations. Palpitations began a few months ago. States that palpitations are sudden, unpredictable and feel like his heart is pounding fast/jumping out of his chest. Typically these episodes last 3-4 minutes and resolve on their own. His most recent episode was 2 days ago and lasted about 10 minutes. During this epsiode he felt lightheaded, Short of breath and had chest pressure located in the middle of his chest. Denies any sweating, changes in hair or bowel movements.\r\nROS: Negative except as stated above\r\nPMH: None\r\nMeds: Takes his roommates Adderall to help study\r\nAllergies: NKDA\r\nPSHx: None\r\nFH: Mother has a thyroid problem, Father had a MI this past year at age 53\r\nSH: denies to"
96,00211_005,0,211,5,[Denies sweating],[462 468;473 481],No-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance,"HPI: Patient is a 17 yo m with a c/o of palpitations. Palpitations began a few months ago. States that palpitations are sudden, unpredictable and feel like his heart is pounding fast/jumping out of his chest. Typically these episodes last 3-4 minutes and resolve on their own. His most recent episode was 2 days ago and lasted about 10 minutes. During this epsiode he felt lightheaded, Short of breath and had chest pressure located in the middle of his chest. Denies any sweating, changes in hair or bowel movements.\r\nROS: Negative except as stated above\r\nPMH: None\r\nMeds: Takes his roommates Adderall to help study\r\nAllergies: NKDA\r\nPSHx: None\r\nFH: Mother has a thyroid problem, Father had a MI this past year at age 53\r\nSH: denies to"
97,00211_006,0,211,6,[Adderall],[595 603],Adderall-use,"HPI: Patient is a 17 yo m with a c/o of palpitations. Palpitations began a few months ago. States that palpitations are sudden, unpredictable and feel like his heart is pounding fast/jumping out of his chest. Typically these episodes last 3-4 minutes and resolve on their own. His most recent episode was 2 days ago and lasted about 10 minutes. During this epsiode he felt lightheaded, Short of breath and had chest pressure located in the middle of his chest. Denies any sweating, changes in hair or bowel movements.\r\nROS: Negative except as stated above\r\nPMH: None\r\nMeds: Takes his roommates Adderall to help study\r\nAllergies: NKDA\r\nPSHx: None\r\nFH: Mother has a thyroid problem, Father had a MI this past year at age 53\r\nSH: denies to"
98,00211_007,0,211,7,[Short of breath],[387 402],Shortness-of-breath,"HPI: Patient is a 17 yo m with a c/o of palpitations. Palpitations began a few months ago. States that palpitations are sudden, unpredictable and feel like his heart is pounding fast/jumping out of his chest. Typically these episodes last 3-4 minutes and resolve on their own. His most recent episode was 2 days ago and lasted about 10 minutes. During this epsiode he felt lightheaded, Short of breath and had chest pressure located in the middle of his chest. Denies any sweating, changes in hair or bowel movements.\r\nROS: Negative except as stated above\r\nPMH: None\r\nMeds: Takes his roommates Adderall to help study\r\nAllergies: NKDA\r\nPSHx: None\r\nFH: Mother has a thyroid problem, Father had a MI this past year at age 53\r\nSH: denies to"


In [10]:
frames = []
df_split = np.array_split(df, 5)
for split in range(0, 5):
    df_split[split]['kfold'] = split
    frames.append(df_split[split])
dfx = pd.concat(frames)
dfx = dfx.head(100)

In [11]:
max_len = df['pn_history'].map(lambda x: len(x)).max()
max_len

950

# Configrations | Hyperparameters

In [12]:
from transformers import AutoTokenizer


class config:

    MAX_LEN = 416
    TRAIN_BATCH_SIZE = 8
    VALID_BATCH_SIZE = 8
    EPOCHS = 1
    #tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    #model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    BERT_PATH = "bert-base-uncased" 
    MODEL_PATH = "model.bin"
    
    TOKENIZER = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    DROPOUT = 0.2
    MAX_GRAD_NORM = 1.0
    LEARNING_RATE = 1e-5


# Data Processing

In [13]:
first = df.loc[3]
example = {
    "feature_text": first.feature_text,
    "pn_history": first.pn_history,
    "location": first.location,
    "annotation": first.annotation
}
for key in example.keys():
    print(key)
    print(example[key])
    print("=" * 100)

feature_text
Intermittent-symptoms
pn_history
HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. 
PMHx: none
Rx: uses friends adderrall
FHx: mom with "thyroid disease," dad with recent heart attcak
All: none
Immunizations: up to date
SHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms
location
['70 91', '1

In [14]:
def loc_list_to_ints(loc_list):
    to_return = []
    for loc_str in loc_list:
        loc_strs = loc_str.split(";")
        for loc in loc_strs:
            start, end = loc.split()
            to_return.append((int(start), int(end)))
    return to_return

example_loc_ints = loc_list_to_ints(example["location"])


In [15]:
def process_data_tokenize(pn_history, feature_text, annotation, location, tokenizer, max_len):    ##X , Y, selected_text  
    
    location_list = loc_list_to_ints(location)        
    char_targets = [0] * len(pn_history) #creating empty list(all zeros) of character;it will be made 1 if annotation in text   
    
    for loc,anno in zip(location_list,annotation): 
      len_st = loc[1] - loc[0]

      idx0 = None
      idx1 = None
      for ind in (i for i, e in enumerate(pn_history) if (e == anno[0] and i == loc[0])):
        if pn_history[ind: ind+len_st] == anno:

            idx0 = ind
            idx1 = ind + len_st - 1
            if idx0 != None and idx1 != None:
                for ct in range(idx0, idx1 + 1):
                    char_targets[ct] = 1  #replacing zeros with 1 if that part of the text is selected text
    
            break
      
    tokenized_input = tokenizer.encode_plus(feature_text,pn_history, 
                      return_offsets_mapping=True, 
                      max_length=max_len, 
                      pad_to_max_length=True, truncation=True)
        
    input_ids = tokenized_input.input_ids
    mask = tokenized_input.attention_mask
    token_type_ids = tokenized_input.token_type_ids
    offsets = tokenized_input.offset_mapping
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)
            
    #padding
    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        offsets = offsets + ([(0, 0)] * padding_length)
       
    #creating label
    ignore_idxes = np.where(np.array(token_type_ids) != 1)[0]

    label = np.zeros(len(offsets))
    label[ignore_idxes] = -1
    label[target_idx] = 1

    
    return {
    'ids': input_ids,
    'mask': mask,
    'token_type_ids': token_type_ids,
    'labels': label,
    'offsets': offsets
}

In [16]:
output = process_data_tokenize(example["pn_history"],example["feature_text"],example["annotation"],example["location"],config.TOKENIZER,config.MAX_LEN)

for key in output.keys():
    print(key)
    print(output[key])
    print("=" * 100)
    

ids
[101, 27946, 118, 8006, 102, 6857, 1182, 131, 1542, 7490, 182, 8218, 1114, 185, 1348, 18965, 6006, 119, 5351, 3756, 124, 118, 125, 1808, 1104, 27946, 3426, 1104, 107, 1762, 5405, 120, 9683, 1149, 1104, 1139, 2229, 119, 107, 123, 1552, 2403, 1219, 170, 5862, 1342, 1125, 1126, 2004, 117, 1133, 1142, 1159, 1125, 2229, 2997, 1105, 1464, 1112, 1191, 1119, 1127, 1280, 1106, 2789, 1149, 113, 1225, 1136, 3857, 14255, 9589, 1757, 114, 119, 1104, 3805, 5351, 1322, 18649, 1116, 170, 7441, 1158, 5194, 21716, 1233, 117, 3120, 1106, 2025, 113, 122, 118, 124, 1551, 1679, 1989, 114, 119, 1196, 2793, 5862, 1342, 117, 1261, 5194, 1200, 4412, 1233, 1480, 1196, 1105, 2106, 1104, 1342, 119, 26360, 1603, 1757, 1104, 2184, 117, 4267, 25890, 12238, 1548, 117, 10880, 1116, 117, 11824, 1116, 117, 16320, 117, 18418, 117, 2607, 1107, 2946, 117, 2607, 1107, 4152, 120, 4510, 117, 24716, 185, 3984, 1179, 117, 2607, 1107, 7125, 1883, 1137, 190, 9324, 1616, 15640, 119, 9852, 1324, 1775, 131, 3839, 187, 1775, 131, 



# Data Loader

In [17]:
class NBMEDataset:
    def __init__(self, pn_history, feature_text, annotation, location):   #text(X) #label(Y), #selected_text #start:end
        self.pn_history = pn_history
        self.feature_text = feature_text
        self.annotation = annotation
        self.location = location
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN
        
    def __len__(self):
        return len(self.pn_history)
        
    def __getitem__(self, item):
        data = process_data_tokenize(
            self.pn_history[item],
            self.feature_text[item],
            self.annotation[item],
            self.location[item],
            self.tokenizer,
            self.max_len
        )
        
        return {
            'ids': torch.tensor(data["ids"]), #input_ids
            'mask': torch.tensor(data["mask"], dtype=torch.long), #attention_mask
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long), #segment_ids
            'labels': torch.tensor(data["labels"], dtype=torch.long), 
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }


# The Model

In [18]:
class NBMEModel(transformers.BertPreTrainedModel):    #torch.nn.Module
    def __init__(self,conf):
        super(NBMEModel,self).__init__(conf)
        self.bert = transformers.BertModel.from_pretrained(config.BERT_PATH, config=conf)
        self.dropout = torch.nn.Dropout(config.DROPOUT)
        self.classifier = torch.nn.Linear(768, 1)
        torch.nn.init.normal_(self.classifier.weight, std=0.02) 
        
    def forward(self, ids, mask, token_type_ids):
        sequence_out = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)[0] #last_hidden_state
        batch_size,max_len,feat_dim = sequence_out.shape
        sequence_output = self.dropout(sequence_out)
        logits = self.classifier(sequence_output)
        logits = logits.squeeze(-1) 
        return logits

# Utility Function

In [19]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        

# Loss Function

In [20]:
def loss_fn(logits, labels):
    loss_fct = torch.nn.BCEWithLogitsLoss(reduction = "none")
    loss = loss_fct(logits,labels)
    return loss


# Training Function

In [21]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
def train_fn(dataloader, model, optimizer, scheduler=None):
    model.train()
    losses = AverageMeter() # Computes and stores the average and current value
    tk = tqdm(dataloader, total=len(dataloader)) #tqdm is a Python library for adding progress bar. 
    
    for batch, data in enumerate(tk):

      print(data)
      print('*********')
      ids = data['ids']
      token_type_ids = data["token_type_ids"]
      mask = data["mask"]
      labels = data["labels"]
      offsets = data["offsets"]
        
      #adding the below data to device ;device enables you to specify the device type responsible to load a tensor into memory.
      ids = ids.to(DEVICE, dtype=torch.long)
      token_type_ids = token_type_ids.to(DEVICE, dtype=torch.long)
      mask = mask.to(DEVICE, dtype=torch.long)
      labels = labels.to(DEVICE, dtype=torch.float64)

      model.zero_grad()
      logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids) #last_hidden_state

      loss = loss_fn(logits, labels)
      loss = torch.masked_select(loss, labels > -1.0).mean()
      losses.update(loss.item(),ids.size(0))
      loss.backward()

      torch.nn.utils.clip_grad_norm_(model.parameters(), config.MAX_GRAD_NORM)
      optimizer.step()
      scheduler.step() ## Update learning rate schedule
      
      #output = torch.argmax(torch.softmax(logits, dim=2),dim=2).cpu().detach().numpy()
      tk.set_postfix(loss=losses.avg)
      return losses.avg

# Evaluation Functions

In [23]:
def eval_fn(dataloader, model):
    model.eval()
    losses = AverageMeter() # Computes and stores the average and current value

    with torch.no_grad():
        tk = tqdm(dataloader, total=len(dataloader)) 
        for batch, data in enumerate(tk):

            ids = data['ids']
            token_type_ids = data["token_type_ids"]
            mask = data["mask"]
            labels = data["labels"]
            offsets = data["offsets"]

            ids = ids.to(DEVICE, dtype=torch.long)
            token_type_ids = token_type_ids.to(DEVICE, dtype=torch.long)
            mask = mask.to(DEVICE, dtype=torch.long)
            labels = labels.to(DEVICE, dtype=torch.float64)

            logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids) #last_hidden_state
            
            loss = loss_fn(logits, labels)
            loss = torch.masked_select(loss, labels > -1.0).mean()
            losses.update(loss.item(),ids.size(0))
            tk.set_postfix(loss=losses.avg)
        
        return losses.avg


# Training

In [24]:
def run(fold):
    
    train_loss_data, valid_loss_data = [], []
    best_loss = np.inf
    since = time.time()
   
    df_train = dfx[dfx.kfold != fold].reset_index(drop=True) 
    df_valid = dfx[dfx.kfold == fold].reset_index(drop=True)
    
    train_dataset = NBMEDataset(
        pn_history=df_train.pn_history.values,
        feature_text=df_train.feature_text.values,
        annotation=df_train.annotation.values,
        location=df_train.location.values
        
    )
    
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4
    )

    valid_dataset = NBMEDataset(
        pn_history=df_valid.pn_history.values,
        feature_text=df_valid.feature_text.values,
        annotation=df_valid.annotation.values,
        location=df_valid.location.values
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = NBMEModel(conf=model_config)
    model.to(DEVICE)
    
    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=0, 
        num_training_steps=num_train_steps
    )

    best_loss = np.inf
    
    for i in range(config.EPOCHS):
        print("Epoch: {}/{}".format(i + 1, config.EPOCHS))
    
        # train model
        train_loss = train_fn(train_data_loader, model, optimizer, scheduler=scheduler)
        train_loss_data.append(train_loss)
        print(f"Train loss: {train_loss}")

        # evaluate model
        valid_loss = eval_fn(valid_data_loader, model)
        valid_loss_data.append(valid_loss)
        print(f"Valid loss: {valid_loss}")


        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model.state_dict(), "model_fold1.bin")


        time_elapsed = time.time() - since
        print('Training completed in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
    
    

In [37]:
run(fold=0)

  cpuset_checked))


Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: 1/1




0it [00:00, ?it/s]

Train loss: None


  0%|          | 0/13 [00:00<?, ?it/s]

Valid loss: 0.751505089274985
Training completed in 0m 32s


In [38]:
#run(fold=1)

In [39]:
#run(fold=2)

In [40]:
#run(fold=3)

In [41]:
#run(fold=4)

# Do the evauation on test data
##### [inference in progress...]

In [58]:
sentence = "HuggingFace is a company based in New York,  he is sick , headach but is also has employees working in Paris"
#tokenizer = config.TOKENIZER,config.MAX_LEN
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
tokenized = tokenizer.encode_plus(sentence, max_length=512, pad_to_max_length=True,return_offsets_mapping=True, return_token_type_ids=True) #return_tensors="pt"
input_ids = torch.tensor([tokenized["input_ids"]]).to(DEVICE)
attention_mask = torch.tensor([tokenized["attention_mask"]]).to(DEVICE)
token_type_ids = torch.tensor([tokenized["token_type_ids"]]).to(DEVICE)
offsets = torch.tensor([tokenized["offset_mapping"]]).to(DEVICE)
print(offsets)

model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
model_config.output_hidden_states = True

# #similarly this can be done for all 5 models
model = NBMEModel(conf=model_config)
model.load_state_dict(torch.load("../models/model_fold1.bin",  map_location=torch.device('cpu')))

model.to(DEVICE)
  
with torch.no_grad():
    logits = model(ids=input_ids, mask=attention_mask, token_type_ids=token_type_ids) #last_hidden_state

def get_predictions(logits):
    preds = torch.sigmoid(logits).cpu().detach().numpy()
    preds = np.where(preds > 0.5, 1, 0)
    return preds

def get_prediction_locations(preds, offsets):
    locations = []
    for pred, offset in zip(preds, offsets):
        for i in range(len(pred)):
            if pred[i] == 1:
                locations.append(offset[i])
    return locations

def get_prediction_keywords(preds, offsets, sentence):
    keywords = []
    for pred, offset in zip(preds, offsets):
        #print(len(pred))
        for i in range(len(pred)):
            #print(pred[i])
            if pred[i] == 1:
                #print('yes')
                keywords.append(sentence[offset[i][0]:offset[i][1]])
    return keywords


preds = get_predictions(logits)
#offsets = tokenized["offset_mapping"]
keyword = get_prediction_keywords(preds, offsets, sentence)

def get_labels(preds):
    labels = []
    for pred in preds:
        labels.append(np.where(pred == 1)[0])
    return labels


labels = get_labels(preds)
print(sentence)
print(keyword)
print(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


tensor([[[ 0,  0],
         [ 0,  7],
         [ 7, 11],
         ...,
         [ 0,  0],
         [ 0,  0],
         [ 0,  0]]])


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HuggingFace is a company based in New York,  he is sick , headach but is also has employees working in Paris
['', 'Face', 'based', 'in', 'New', 'he', 'is', 'sick', 'ach', 'but', 'is', 'has', 'employees', 'in', 'Par', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
[array([  0,   2,   6,   7,   8,  12,  13,  14,  17,  18,  19,  21,  22,
        24,  25,  29,  34,  35,  50,  51, 

In [None]:
def get_location_predictions(preds, offset_mapping, sequence_ids, test=False):
    all_predictions = []
    for pred, offsets, seq_ids in zip(preds, offset_mapping, sequence_ids):
        start_idx = None
        current_preds = []
        for p, o, s_id in zip(pred, offsets, seq_ids):
            if s_id is None or s_id == 0:
                continue
            if p > 0.5:
                if start_idx is None:
                    start_idx = o[0]
                end_idx = o[1]
            elif start_idx is not None:
                if test:
                    current_preds.append(f"{start_idx} {end_idx}")
                else:
                    current_preds.append((start_idx, end_idx))
                start_idx = None
        if test:
            all_predictions.append("; ".join(current_preds))
        else:
            all_predictions.append(current_preds)
    return all_predictions



def predict_location_preds(tokenizer, model, feature_text, pn_history):

    test_ds = NBMETestData(feature_text, pn_history, tokenizer)
    test_dl = torch.utils.data.DataLoader(
        test_ds, 
        batch_size=config.batch_size, 
        pin_memory=True, 
        shuffle=False, 
        drop_last=False
    )

    all_preds = None
    offsets = []
    seq_ids = []

    preds = []

    with torch.no_grad():
        for batch in tqdm(test_dl):

            for k, v in batch.items():
                if k not in  ['offset_mapping', 'sequence_id']:
                    batch[k] = v.to(config.device)

            logits = model(batch)['logits']
            preds.append(logits.cpu().numpy())

            offset_mapping = batch['offset_mapping']
            sequence_ids = batch['sequence_ids']
            offsets.append(offset_mapping.cpu().numpy())
            seq_ids.append(sequence_ids.cpu().numpy())

    preds = np.concatenate(preds, axis=0)
    if all_preds is None:
        all_preds = np.array(preds).astype(np.float32)
    else:
        all_preds += np.array(preds).astype(np.float32)
    torch.cuda.empty_cache()

    all_preds = all_preds.squeeze()

    offsets = np.concatenate(offsets, axis=0)
    seq_ids = np.concatenate(seq_ids, axis=0)

    print(all_preds.shape, offsets.shape, seq_ids.shape)

    location_preds = get_location_predictions([all_preds], offsets, seq_ids, test=False)[0]
    
    x = []
    
    for location in location_preds:
        x.append(pn_history[0][location[0]: location[1]])
    
    return location_preds, ', '.join(x)

def get_predictions(feature_text, pn_history):
    location_preds, pred_string = predict_location_preds(tokenizer, model, [feature_text], [pn_history])
    print('pred string', pred_string)
    return pred_string

# def get_predictions_from_text(text):
#     feature_text, pn_history = get_feature_text(text)
#     return get_predictions(feature_text, pn_history)

# def get_predictions_from_text_list(text_list):
#     feature_text, pn_history = get_feature_text(text_list)
#     return get_predictions(feature_text, pn_history)



tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_path)#['tokenizer_path']) # config.TOKENIZER_PATH
path ='model.pth'

device = torch.device("cuda")
model = NBMEModel()
#model.to(device)#(torch.device)#['device'])
model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))#torch.device(config.device)))#['model'])
model.to(device)#(torch.device)#['device'])
model.eval()

input_text = create_sample_test()
feature_text = input_text.feature_text[0].lower()
pn_history = input_text.pn_history[0].lower()
get_predictions(feature_text, pn_history)  

In [38]:

    #    
#input_ids = tokenized_input.input_ids
#mask = tokenized_input.attention_mask
#token_type_ids = tokenized_input.token_type_ids
#offsets = tokenized_input.offset_mapping
    
print(tokenized_input["input_ids"].shape, tokenized_input["attention_mask"].shape)
# move to gpu
ids = tokenized_input["input_ids"].to(DEVICE)
mask = tokenized_input["attention_mask"].to(DEVICE)
token_type_ids = tokenized_input["token_type_ids"]
# forward pass
model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
model_config.output_hidden_states = True

# #similarly this can be done for all 5 models
model1 = NBMEModel(conf=model_config)
#model1.to(config.DEVICE)
model1.load_state_dict(torch.load("../models/model_fold1.bin", map_location=torch.device('cpu')))


outputs = model1(ids, mask, token_type_ids )
logits = outputs.logits
print(outputs.shape)
#logits = outputs[0]
print(logits)


torch.Size([1, 500]) torch.Size([1, 500])


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: 'Tensor' object has no attribute 'logits'

In [69]:
#active_logits = logits.view(-1, model1.num_labels) # shape (batch_size * seq_len, num_labels)
active_logits = outputs.view(-1)
print(active_logits.shape)
print(outputs.shape)

flattened_predictions = torch.argmax(outputs, axis=1) # shape (batch_size*seq_len,) - predictions at the token level
print(flattened_predictions)

torch.Size([500])
torch.Size([1, 500])
tensor([7])


In [64]:
import pandas as pd



ids_l = ids.squeeze().tolist()
words = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
df = pd.DataFrame({'ids': ids_l,'words': words})
df.head()

Unnamed: 0,ids,words
0,101,[CLS]
1,19558,hugging
2,10931,##face
3,1110,is
4,170,a


In [61]:
tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
tokens
#token_predictions = [i for i in flattened_predictions.cpu().numpy()]
#wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)


['[CLS]',
 'hugging',
 '##face',
 'is',
 'a',
 'company',
 'based',
 'in',
 'new',
 'yo',
 '##rk',
 ',',
 'he',
 'is',
 'sick',
 ',',
 'head',
 '##ach',
 'but',
 'is',
 'also',
 'has',
 'employees',
 'working',
 'in',
 'par',
 '##is',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PA

In [56]:
from transformers import AutoTokenizer, BertForTokenClassification

model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

text = "Obama was the president of the United States and he was born in Hawai. I have a broken leg"

encoding = tokenizer(text, return_tensors="pt")

# forward pass
outputs = model(**encoding)

logits = outputs.logits
print(logits.shape)
predicted_label_classes = logits.argmax(-1)
print(predicted_label_classes)
predicted_labels = [model.config.id2label[id] for id in predicted_label_classes.squeeze().tolist()]
print(predicted_labels)

for id, label in zip(encoding.input_ids.squeeze().tolist(), predicted_labels):
  print(tokenizer.decode([id]), label)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

torch.Size([1, 23, 9])
tensor([[0, 3, 0, 0, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 7, 7, 0, 0, 0, 0, 0, 0, 0]])
['O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[CLS] O
Obama B-PER
was O
the O
president O
of O
the O
United B-LOC
States I-LOC
and O
he O
was O
born O
in O
Ha B-LOC
##wai B-LOC
. O
I O
have O
a O
broken O
leg O
[SEP] O
