In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from simpletransformers.ner import NERModel,NERArgs

In [2]:
# load the raw medical text data
data = pd.read_json('data/medical-ner.json')
data.head()

Unnamed: 0,examples
0,"{'id': '18c2f619-f102-452f-ab81-d26f7e283ffe',..."
1,"{'id': '487c93e3-0d45-4088-a378-cf3a01c8953d',..."
2,"{'id': 'd5056874-895a-4a7f-9e0f-828d414d65d9',..."
3,"{'id': '20c792c7-0c4b-42d0-8127-0e04113db384',..."
4,"{'id': 'f5359e0d-4d4a-4707-95a3-4c627fc4a83b',..."


In [3]:
def preprocess_text(text):
    pattern = r'\[\d+\]'
    text = re.sub(pattern, "",text)
    text = re.sub(r"\[.*?]","",text)
    text = re.sub("[^a-zA-Z0-9]", " ", text)
    return text.strip().lower()

In [4]:
# prepare medical tag data
dfm = pd.DataFrame()
for i,d in data.iterrows():
    # from each record, get annotations
    a_ls = [[x.get("value"),x.get("tag_name")] for x  in d["examples"].get("annotations")]
    dfm = pd.concat([dfm,pd.DataFrame(a_ls,columns=["words","tags"])])

dfm["length"] = dfm.words.apply(lambda x:len(x))
dfm = dfm.loc[dfm.length<50]
dfm = dfm.reset_index(drop=True)
dfm["words"] = dfm.words.apply(lambda x:preprocess_text(x))

In [5]:
dfm.head()

Unnamed: 0,words,tags,length
0,diosmectite,Medicine,11
1,aluminomagnesium silicate,Medicine,25
2,diarrhea,MedicalCondition,8
3,kaopectate,Medicine,10
4,bismuth compounds,Medicine,17


In [6]:
dfw = pd.DataFrame()

for i,d in data.iterrows():
    # get contents for record
    s_id = d["examples"].get("id")
    w_ls = [x.lower() for x in d["examples"].get("content").split()]
    dfwi = pd.DataFrame(w_ls,columns=["words"])
    dfwi["text_id"] = s_id
    dfw = pd.concat([dfw,dfwi])    

dfw["length"] = dfw.words.apply(lambda x:len(x))
dfw["words"] = dfw.words.apply(lambda x:preprocess_text(x))

In [7]:
dfw.head()

Unnamed: 0,words,text_id,length
0,while,18c2f619-f102-452f-ab81-d26f7e283ffe,5
1,bismuth,18c2f619-f102-452f-ab81-d26f7e283ffe,7
2,compounds,18c2f619-f102-452f-ab81-d26f7e283ffe,9
3,pepto bismol,18c2f619-f102-452f-ab81-d26f7e283ffe,14
4,decreased,18c2f619-f102-452f-ab81-d26f7e283ffe,9


In [8]:
dfw = dfw.drop("length",axis=1)
dfm = dfm.drop("length",axis=1)

In [9]:
# merge the text, text id and tags for each word in the text
df = pd.merge(dfw,dfm,on=["words"],how="left")
df["tags"] = df.tags.fillna("Others")
df["text_id"] = LabelEncoder().fit_transform(df["text_id"] )
df.rename(columns={"text_id":"sentence_id","tags":"labels"}, inplace =True)

In [10]:
X = df[["sentence_id","words"]]
Y = df["labels"]

In [11]:
x_train,y_train = X[X.sentence_id<=24],Y[X.sentence_id<=24]
x_test,y_test = X[X.sentence_id>24],Y[X.sentence_id>24]

In [12]:
x_train.head()

Unnamed: 0,sentence_id,words
0,1,while
1,1,bismuth
2,1,compounds
3,1,pepto bismol
4,1,decreased


In [13]:
label = df["labels"].unique().tolist()
label

['Others', 'Medicine', 'MedicalCondition', 'Pathogen']

In [14]:
OUTPUT_DIR = "output_dir"
args = NERArgs()
args.num_train_epochs = 30
args.learning_rate = 1e-4
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32
args.save_steps = 10
args.output_dir = OUTPUT_DIR
args.resume_from_checkpoint=True

In [15]:
model = NERModel('bert', 'bert-base-cased',labels=label,args =args)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [17]:
train_data.head()

Unnamed: 0,sentence_id,words,labels
0,1,while,Others
1,1,bismuth,Others
2,1,compounds,Others
3,1,pepto bismol,Medicine
4,1,decreased,Others


In [18]:
model.train_model(train_data,eval_data = test_data,acc=accuracy_score,verbose=True)

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 3 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 4 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 5 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 6 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 7 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 8 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 9 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 10 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 11 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 12 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 13 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 14 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 15 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 16 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 17 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 18 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 19 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 20 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 21 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 22 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 23 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 24 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 25 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 26 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 27 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 28 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 29 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 30 of 30:   0%|          | 0/1 [00:00<?, ?it/s]

(30, 0.2630932913937916)

In [19]:
result, model_outputs, preds_list = model.eval_model(test_data)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


In [20]:
result

{'eval_loss': 0.39668184518814087,
 'precision': 0.8,
 'recall': 0.6511627906976745,
 'f1_score': 0.7179487179487181}

In [21]:
test_input = """
While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' 
diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective 
at reducing the number of stools but not the duration of disease.[8] 
These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural 
aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] 
and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and 
chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate. 
Racecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability 
than loperamide, as it causes less constipation and flatulence.[94]
"""

In [22]:
prediction, model_output = model.predict([test_input])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


In [47]:
final_ouput = ""
for word_tag_pair in prediction[0]:
    tag = list(word_tag_pair.values())[0]
    word = list(word_tag_pair.keys())[0]
    
    if tag!="Others":
        final_ouput+=f" {preprocess_text(word)} [{tag.upper()}]"
    else:
        final_ouput+=f" {preprocess_text(word)}"
final_ouput = final_ouput.strip()
final_ouput

'while bismuth compounds pepto bismol decreased the number of bowel movements in those with travelers diarrhea [MEDICALCONDITION] they do not decrease the length of illness anti motility agents like loperamide [MEDICINE] are also effective at reducing the number of stools but not the duration of disease these agents should be used only if bloody diarrhea [MEDICALCONDITION] is not present diosmectite [MEDICINE] a natural aluminomagnesium [MEDICINE] silicate [MEDICALCONDITION] clay is effective in alleviating symptoms of acute diarrhea [MEDICALCONDITION] in children'