In [2]:
# import sys
# !{sys.executable} -m pip install stanza

In [10]:
from datetime import datetime
date = datetime.today().strftime('%y%m%d')
print ('Last modified by Xiaoqing: ' + date)

Last modified by Xiaoqing: 211112


# Data source: 10 synthetic patients and 5 real clinical trials from google Drive

# Use environment: transformer

In [2]:
import stanza
import pandas as pd

In [6]:
# download and initialize a mimic pipeline with an i2b2 NER model
# stanza.download('en', package='mimic', processors={'ner': 'i2b2'})
nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'})

2021-11-12 16:15:53 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | mimic   |
| pos       | mimic   |
| lemma     | mimic   |
| depparse  | mimic   |
| ner       | i2b2    |

2021-11-12 16:15:53 INFO: Use device: cpu
2021-11-12 16:15:53 INFO: Loading: tokenize
2021-11-12 16:15:53 INFO: Loading: pos
2021-11-12 16:15:53 INFO: Loading: lemma
2021-11-12 16:15:53 INFO: Loading: depparse
2021-11-12 16:15:53 INFO: Loading: ner
2021-11-12 16:15:54 INFO: Done loading processors!


# Named entity recognition: clinical trials

In [31]:
df = pd.read_csv('trials.csv')

In [32]:
# annotate clinical text; this is trial 1 from "ten_trials"
doc = nlp('Patient has history of a history of hypertension, and after antihypertensive treatment, systolic blood pressure ≥ 160 mmHg or diastolic blood pressure ≥ 100 mmHg ')
# print out all entities
for ent in doc.entities:
    print(f'{ent.text}\t{ent.type}')

hypertension	PROBLEM
antihypertensive treatment	TREATMENT
systolic blood pressure	TEST
diastolic blood pressure	TEST


In [13]:
ent.text

'diastolic blood pressure'

# One bullet point can have multiple problems, treatments, and tests

In [14]:
doc.entities

[{
   "text": "hypertension",
   "type": "PROBLEM",
   "start_char": 36,
   "end_char": 48
 },
 {
   "text": "antihypertensive treatment",
   "type": "TREATMENT",
   "start_char": 60,
   "end_char": 86
 },
 {
   "text": "systolic blood pressure",
   "type": "TEST",
   "start_char": 88,
   "end_char": 111
 },
 {
   "text": "diastolic blood pressure",
   "type": "TEST",
   "start_char": 126,
   "end_char": 150
 }]

# Iterate through df sentences, record entities

In [38]:
df1 = pd.DataFrame(columns=['sentence_id','ent_text', 'ent_type']) # create empty df with column names only so we can append rows to it

In [39]:
for index, row in df.iterrows():
    doc = nlp(row['sentence'])
    for ent in doc.entities:
        df1 = df1.append({'sentence_id': row['sentence_id'], 'ent_text': ent.text, 'ent_type': ent.type}, ignore_index = True)

In [40]:
df1.head(10)

Unnamed: 0,sentence_id,ent_text,ent_type
0,1,type 2 diabetes mellitus,PROBLEM
1,3,HbA1C,TEST
2,3,screening,TEST
3,3,HbA1C,TEST
4,4,type 1 diabetes mellitus,PROBLEM
5,5,ketoacidosis,PROBLEM
6,6,severe unconscious hypoglycemosis,PROBLEM
7,7,acute and chronic pancreatitis,PROBLEM
8,7,pancreatic injury,PROBLEM
9,7,pancreatitis,PROBLEM


# Note: some sentences contain no detectible entities, like sentence_id = 2

In [44]:
df2 = pd.merge(df, df1, on='sentence_id',  how='left')
df2.head(15)

Unnamed: 0,trial_id,sentence_id,sentence,inclusion,ent_text,ent_type
0,1,1,Patients diagnosed with type 2 diabetes mellitus,1,type 2 diabetes mellitus,PROBLEM
1,1,2,Patients have treated with diet/exercise at le...,1,,
2,1,3,"7.5% ≤HbA1C ≤11.0% at screening,7.0% ≤HbA1C ≤1...",1,HbA1C,TEST
3,1,3,"7.5% ≤HbA1C ≤11.0% at screening,7.0% ≤HbA1C ≤1...",1,screening,TEST
4,1,3,"7.5% ≤HbA1C ≤11.0% at screening,7.0% ≤HbA1C ≤1...",1,HbA1C,TEST
5,1,4,Patient has history of type 1 diabetes mellitus,0,type 1 diabetes mellitus,PROBLEM
6,1,5,Patient has history of ketoacidosis,0,ketoacidosis,PROBLEM
7,1,6,Patient has history of severe unconscious hypo...,0,severe unconscious hypoglycemosis,PROBLEM
8,1,7,Patient has history of acute and chronic pancr...,0,acute and chronic pancreatitis,PROBLEM
9,1,7,Patient has history of acute and chronic pancr...,0,pancreatic injury,PROBLEM


In [49]:
df2.to_csv(('trials_ner_'+ date + '.csv'),index = False)

# To do: extract test minimum and maximum

# To do: how to compare sentences like "Patients have treated with diet/exercise at least 3 months"?

The only place we may find this type statement in a medical record will be physician ‘clinical notes’. If we label these entities as “other”, “we can potentially compare patient’s “other” entities with a trial’s “other” entities


# Named entity recognition: patient

In [3]:
df = pd.read_csv('patients_synthetic.csv')

In [14]:
# need to remove some artifacts like (disorder)
remove_words = ['disorder', 'finding', 'pls']
pat = r'\b(?:{})\b'.format('|'.join(remove_words))
df['sentence'] = df['sentence'].str.replace(pat, '')

df.tail()

  df['sentence'] = df['sentence'].str.replace(pat, '')


Unnamed: 0,patient_id,sentence_id,sentence
35,9,36,hispanic 32 year old white Male
36,10,37,Acetaminophen 325 MG Oral Tablet
37,10,38,111.0 mm[Hg] of Systolic 82.0 mm[Hg] of Diasto...
38,10,39,Acute viral pharyngitis () Viral sinusitis () ...
39,10,40,nonhispanic 25 year old white Male


In [15]:
df1 = pd.DataFrame(columns=['sentence_id','ent_text', 'ent_type']) # create empty df with column names only so we can append rows to it

In [16]:
for index, row in df.iterrows():
    doc = nlp(row['sentence'])
    for ent in doc.entities:
        df1 = df1.append({'sentence_id': row['sentence_id'], 'ent_text': ent.text, 'ent_type': ent.type}, ignore_index = True)

In [17]:
df2 = pd.merge(df, df1, on='sentence_id',  how='left')
df2.head(15)

Unnamed: 0,patient_id,sentence_id,sentence,ent_text,ent_type
0,1,1,Ibuprofen 200 MG Oral Tablet Naproxen sodium 2...,Ibuprofen,TREATMENT
1,1,1,Ibuprofen 200 MG Oral Tablet Naproxen sodium 2...,Naproxen sodium,TREATMENT
2,1,1,Ibuprofen 200 MG Oral Tablet Naproxen sodium 2...,Acetaminophen,TREATMENT
3,1,1,Ibuprofen 200 MG Oral Tablet Naproxen sodium 2...,Dextromethorphan Hydrobromide,TREATMENT
4,1,1,Ibuprofen 200 MG Oral Tablet Naproxen sodium 2...,doxylamine succinate,TREATMENT
5,1,1,Ibuprofen 200 MG Oral Tablet Naproxen sodium 2...,Metformin hydrochloride,TREATMENT
6,1,2,118.0 mm[Hg] of Systolic 74.0 mm[Hg] of Diasto...,Hg,TEST
7,1,2,118.0 mm[Hg] of Systolic 74.0 mm[Hg] of Diasto...,Systolic,TEST
8,1,2,118.0 mm[Hg] of Systolic 74.0 mm[Hg] of Diasto...,Hg,TEST
9,1,2,118.0 mm[Hg] of Systolic 74.0 mm[Hg] of Diasto...,Diastolic,TEST


In [18]:
df2.to_csv(('patients_ner_'+ date + '.csv'),index = False)

# summary: 10 patients produced 55 diagnosis. 5 clinical trials produced 65 diagnoses. 

# unused notes

In [7]:
# annotate clinical text; this is patient 1 from "Patients"
doc = nlp('Ibuprofen 200 MG Oral Tablet Naproxen sodium 220 MG Oral Tablet Acetaminophen 21.7 MG/ML / Dextromethorphan Hydrobromide 1 MG/ML / doxylamine succinate 0.417 MG/ML Oral Solution 24 HR Metformin hydrochloride 500 MG Extended Release Oral Tablet. 118.0 mm[Hg] of Systolic 74.0 mm[Hg] of Diastolic 30.1 BMI 7.5 HA1c total in blood. Diabetes Anemia Hypertriglyceridemia Metabolic syndrome X Whiplash injury to neck Viral sinusitis Facial laceration Acute bronchitis')
# print out all entities
for ent in doc.entities:
    print(f'{ent.text}\t{ent.type}')

Ibuprofen	TREATMENT
Naproxen sodium	TREATMENT
Acetaminophen	TREATMENT
Dextromethorphan Hydrobromide	TREATMENT
doxylamine succinate	TREATMENT
Metformin hydrochloride	TREATMENT
Hg	TEST
Systolic	TEST
Hg	TEST
Diastolic	TEST
BMI	TEST
HA1c	TEST
total in blood	TEST
Diabetes Anemia	PROBLEM
Hypertriglyceridemia	PROBLEM
Metabolic syndrome	PROBLEM
Whiplash injury to neck	PROBLEM
Viral sinusitis	PROBLEM
Facial laceration	PROBLEM
Acute bronchitis	PROBLEM


In [8]:
# get the relationships
doc.sentences[0].print_dependencies

<bound method Sentence.print_dependencies of [
  {
    "id": 1,
    "text": "Ibuprofen",
    "lemma": "Ibuprofen",
    "upos": "PROPN",
    "xpos": "NNP",
    "head": 0,
    "deprel": "root",
    "start_char": 0,
    "end_char": 9,
    "ner": "S-TREATMENT"
  },
  {
    "id": 2,
    "text": "200",
    "lemma": "200",
    "upos": "NUM",
    "xpos": "CD",
    "head": 3,
    "deprel": "nummod",
    "start_char": 10,
    "end_char": 13,
    "ner": "O"
  },
  {
    "id": 3,
    "text": "MG",
    "lemma": "mg",
    "upos": "NOUN",
    "xpos": "NN",
    "head": 1,
    "deprel": "list",
    "start_char": 14,
    "end_char": 16,
    "ner": "O"
  },
  {
    "id": 4,
    "text": "Oral",
    "lemma": "Oral",
    "upos": "PROPN",
    "xpos": "NNP",
    "head": 5,
    "deprel": "compound",
    "start_char": 17,
    "end_char": 21,
    "ner": "O"
  },
  {
    "id": 5,
    "text": "Tablet",
    "lemma": "Tablet",
    "upos": "PROPN",
    "xpos": "NNP",
    "head": 1,
    "deprel": "list",
    "start_ch