# Use environment: transformer

In [14]:
# One-time setup: uncomment the two lines below to install stanza with the
# Python interpreter that runs this kernel.
# import sys
# !{sys.executable} -m pip install stanza

In [15]:
from datetime import datetime

# Run-date stamp in YYMMDD form.
date = datetime.now().strftime('%y%m%d')
print(f'Last modified by Xiaoqing: {date}')

Last modified by Xiaoqing: 211207


# Overview
Allergies are frequently mentioned in clinical trial eligibility criteria and in patient EHRs.
However, most NLP packages, including Stanza and AWS Comprehend, cannot recognize allergies. I built this notebook to recognize allergy as an entity.

More specifically, stanza can recognize "seasonal allergy" as a problem, but it cannot recognize "allergy to [a chemical]".

To fix this, here are the general rules:

- If a criterion contains ‘allergy to’ (or a similar phrase): relabel all entities detected in it as allergy
- If a ‘problem’ entity contains the word allergy: relabel that entity as allergy


In [16]:
import stanza
import pandas as pd

In [17]:
# Build an English Stanza pipeline from the 'mimic' package, with the NER
# processor set to the 'i2b2' model.
# If the model files have not been downloaded yet, run this line once first:
# stanza.download('en', package='mimic', processors={'ner': 'i2b2'})
nlp = stanza.Pipeline(
    lang='en',
    package='mimic',
    processors={'ner': 'i2b2'},
)

2021-12-07 12:08:04 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | mimic   |
| pos       | mimic   |
| lemma     | mimic   |
| depparse  | mimic   |
| ner       | i2b2    |

2021-12-07 12:08:04 INFO: Use device: cpu
2021-12-07 12:08:04 INFO: Loading: tokenize
2021-12-07 12:08:04 INFO: Loading: pos
2021-12-07 12:08:04 INFO: Loading: lemma
2021-12-07 12:08:04 INFO: Loading: depparse
2021-12-07 12:08:04 INFO: Loading: ner
2021-12-07 12:08:05 INFO: Done loading processors!


# Read data: real clinical trial eligibility criteria containing allergies

In [41]:
# Load the criteria table and lower-case the criteria text in one chain.
df = (
    pd.read_csv('allergy_input.csv')
    .assign(criteria=lambda frame: frame['criteria'].str.lower())
)

In [42]:
# Preview the first five rows of the loaded criteria.
df.head()

Unnamed: 0,sentence_id,criteria
0,1,seasonal allergies with significant effect on ...
1,2,"patients with diabetic retinopathy, collagen, ..."
2,3,"has a history of relevant drug allergies, food..."
3,4,"babies who have a skin condition, allergies, o..."
4,5,gadolinium allergy


# Iterate through df sentences, record entities

In [43]:
# Start from an empty frame that carries only the entity column headers,
# so rows can be added to it afterwards.
df1 = pd.DataFrame(
    columns=['sentence_id', 'ent_text', 'ent_type'],
)

In [44]:
# Run the NER pipeline on every criterion and collect one record per detected
# entity. Records are gathered in a plain list and turned into a DataFrame
# once at the end: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies the whole frame on every iteration.
entity_records = []
for sentence_id, criteria in zip(df['sentence_id'], df['criteria']):
    doc = nlp(criteria)
    # One record per entity found in this criterion.
    entity_records.extend(
        {'sentence_id': sentence_id, 'ent_text': ent.text, 'ent_type': ent.type}
        for ent in doc.entities
    )

# Passing `columns` keeps the three expected columns even when no entity was found.
df1 = pd.DataFrame(entity_records, columns=['sentence_id', 'ent_text', 'ent_type'])

In [45]:
# Show the first ten extracted entities.
df1.head(10)

Unnamed: 0,sentence_id,ent_text,ent_type
0,1,seasonal allergies,PROBLEM
1,2,diabetic retinopathy,PROBLEM
2,2,diagnosed autoimmune disease,PROBLEM
3,2,lupus,PROBLEM
4,2,rheumatoid arthritis,PROBLEM
5,2,fibromylagia),PROBLEM
6,2,immunodeficiency,PROBLEM
7,2,hiv),PROBLEM
8,2,connective tissue disease,PROBLEM
9,2,clinically significant atopic syndrome,PROBLEM


# Note: some sentences contain no detectable entities, like sentence_id = 2

In [46]:
# Left-join the entities onto the criteria so that every criterion is kept;
# a criterion without entity rows gets NaN in the entity columns.
df2 = df.merge(df1, how='left', on='sentence_id')
df2.head(15)

Unnamed: 0,sentence_id,criteria,ent_text,ent_type
0,1,seasonal allergies with significant effect on ...,seasonal allergies,PROBLEM
1,2,"patients with diabetic retinopathy, collagen, ...",diabetic retinopathy,PROBLEM
2,2,"patients with diabetic retinopathy, collagen, ...",diagnosed autoimmune disease,PROBLEM
3,2,"patients with diabetic retinopathy, collagen, ...",lupus,PROBLEM
4,2,"patients with diabetic retinopathy, collagen, ...",rheumatoid arthritis,PROBLEM
5,2,"patients with diabetic retinopathy, collagen, ...",fibromylagia),PROBLEM
6,2,"patients with diabetic retinopathy, collagen, ...",immunodeficiency,PROBLEM
7,2,"patients with diabetic retinopathy, collagen, ...",hiv),PROBLEM
8,2,"patients with diabetic retinopathy, collagen, ...",connective tissue disease,PROBLEM
9,2,"patients with diabetic retinopathy, collagen, ...",clinically significant atopic syndrome,PROBLEM


# Rule 1
If a criterion contains ‘allergy to’ (or a similar phrase such as ‘allergic to’ or ‘hypersensitivity to’): relabel all entities detected in it as allergy


In [47]:
# Phrases that mark a criterion as describing an allergy to something.
a = [
    'allergy to',
    'allergies to',
    'allergic to',
    'hypersensitivity to',
    'hypersensitivities to',
    'serious adverse reaction to',
]

In [48]:
# Flag each row whose criterion text contains any trigger phrase from `a`,
# then relabel all flagged rows in a single assignment.
mentions_allergy_to = df2['criteria'].map(
    lambda text: any(phrase in text for phrase in a)
).astype(bool)
df2.loc[mentions_allergy_to, 'ent_type'] = 'ALLERGY'

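Now we delete artifact entities: rows whose entity text is just the allergy wording itself (such as "known allergy") rather than the thing the patient is allergic to.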

In [49]:
# Entity texts treated as artifacts: the allergy wording on its own.
delete = [
    'allergy',
    'allergies',
    'allergic',
    'hypersensitivity',
    'hypersensitivities',
    'known allergy',
    'a known allergy',
    'known hypersensitivity',
    'known allergies',
    'known hypersensitivities',
    'serious adverse reaction',
    'sensitivity',
]

In [50]:
# Drop entity rows whose text is exactly one of the strings in `delete`;
# rows with a missing ent_text are kept. A single isin() mask replaces the
# repeated per-string filtering, and .copy() makes df2 an independent frame
# so that later .loc assignments on it do not raise SettingWithCopyWarning.
df2 = df2[~df2['ent_text'].isin(delete)].copy()


# Rule 2
If a ‘problem’ entity contains the word allergy (or a related term such as ‘allergic’ or ‘hypersensitivity’): relabel that entity as allergy


In [52]:
# Terms that identify an entity's text as an allergy mention.
b = [
    'allergy',
    'allergies',
    'allergic',
    'hypersensitivity',
    'hypersensitivities',
]

In [53]:
# Relabel every entity whose text contains one of the terms in `b`.
# After a left merge, a sentence without entities has NaN in ent_text, and
# `term in NaN` raises TypeError, so non-string values count as "no match".
# NOTE(review): the rule text mentions 'problem' entities, but ent_type is not
# checked here - confirm whether other entity types should be relabelled too.
has_allergy_term = df2['ent_text'].map(
    lambda text: isinstance(text, str) and any(term in text for term in b)
).astype(bool)
df2.loc[has_allergy_term, 'ent_type'] = 'ALLERGY'

# Save

In [54]:
# Write the labelled entities to a date-stamped CSV, leaving out the index.
df2.to_csv(f'allergy_output_{date}.csv', index=False)