Draft notebook for getting NER labels.

Modified from MedRed's `NER_labels_from_AMT.py`

---

In [5]:
import pandas as pd
from collections import OrderedDict
import re
import nltk
from nltk import word_tokenize
# THIS needs to be RUN once, for the first time
# nltk.download('punkt')
import string

In [6]:
# # included for convenience to help find correct paths
# import os
# os.getcwd()
# os.listdir("..")

Constants.

In [7]:
# paths
MEDRED_REPRODUCIBLE_DIR = "../"
#   MedRed data
MEDRED_IN = MEDRED_REPRODUCIBLE_DIR + 'data/AMT/labels/MedRed_AMT_labels.csv'
MEDRED_TAGGED_TOKENS_OUT = MEDRED_REPRODUCIBLE_DIR + 'data/AMT/labels/NER_Reddit_AMT_labels_href.csv'
MEDRED_FILTERED_OUT = MEDRED_REPRODUCIBLE_DIR + 'data/AMT/labels/filtered_Reddit_AMT_labels.csv'
#   Micromed data
MICROMED_IN = MEDRED_REPRODUCIBLE_DIR + 'data/Micromed/Micromed_labels.csv'
MICROMED_TAGGED_TOKENS_OUT = MEDRED_REPRODUCIBLE_DIR + 'data/Micromed/NER_Micromed_labels.csv'
MICROMED_FILTERED_OUT = MEDRED_REPRODUCIBLE_DIR + 'data/Micromed/filtered_Micromed_labels.csv'
#   AskAPatient/CADEC data
CADEC_IN = MEDRED_REPRODUCIBLE_DIR + 'data/cadec/CADEC_labels.csv'
CADEC_TAGGED_TOKENS_OUT = MEDRED_REPRODUCIBLE_DIR + 'data/cadec/NER_CADEC_labels.csv'
CADEC_FILTERED_OUT = MEDRED_REPRODUCIBLE_DIR + 'data/cadec/filtered_CADEC_labels.csv'

Read in data.

In [8]:
def read_data(path):
    return pd.read_csv(path)

dfMedRed = read_data(MEDRED_IN)
dfMicromed = read_data(MICROMED_IN)
dfCadec = read_data(CADEC_IN)

dfMicromed

Unnamed: 0,id,Locations.diseases,Locations.drugs,Locations.symptoms,Answer.diseases,Answer.drugs,Answer.symptoms,post
0,466073644209156096,"[[{'start': 20, 'end': 40}], [{'start': 71, 'e...",,,small-vessel disease;Cerebral small-vessel dis...,,,"Association between small-vessel disease, Alzh..."
1,466091939730030592,,,"[[{'start': 50, 'end': 55}], [{'start': 121, '...",,,blind;pain,I don't wanna drown in the rain. I don't wanna...
2,466095676883873792,"[[{'start': 75, 'end': 82}], [{'start': 88, 'e...",,,besity.;bese ;verweight,,,RT @DhesiBahaRaja: ⚠️Important Message;\nMalay...
3,466112043049701376,"[[{'start': 127, 'end': 135}]]",,,migraine,,,@rachjohnson0 @DailyDose248 @LoraRoule previou...
4,466130938393403392,"[[{'start': 60, 'end': 66}], [{'start': 38, 'e...",,,scurvy;malnourished,,,@el_diabl0_cake and you'd probably be malnouri...
...,...,...,...,...,...,...,...,...
269,471682384954675201,,,"[[{'start': 51, 'end': 57}], [{'start': 35, 'e...",,,stress;pain,"Sleeping is a cure to forget about pain, probl..."
270,471709203300495360,,,"[[{'start': 77, 'end': 83}]]",,,stress,"Edible greens such as kale, spinach, bok choy ..."
271,471723585577308160,,,"[[{'start': 15, 'end': 20}]]",,,tired,Mood: cold and tired
272,471725712093626368,,,"[[{'start': 35, 'end': 40}]]",,,tired,"Day 2 is winding down. We are hot, tired, dirt..."


Clean up terms (`Answers._` columns).

Note, reformatting is redundant to some parts of Micromed parsing in `micromed_prep.ipynb`; set up this way to make project more modular.

In [9]:
def parse_answers(df, column_name):
    '''
    Cleans up terms of interest, removign hyperlinks and splitting string into list.
    Unmodified from __parse_answers in NER_labels_from_AMT in MedRed.
    '''
    # maybe just do NLTK tokenize here
    def parse_row(row):
        row = [ elem.strip().strip(string.punctuation) for elem in row ]
        return [ elem for elem in row if elem != '' ]
    df[column_name] = df[column_name].apply(
        #lambda row: re.sub(r'^https?:\/\/.*[\r\n]*', '', str(row), flags=re.MULTILINE)
        lambda row: re.sub(r"http\S+", "", str(row))
        )
    df[column_name] = df[column_name].apply(
        #lambda row: row.split(';')
        lambda row: re.split(';|,|/|\.',str(row) )
        )
    df[column_name] = df[column_name].apply(
        lambda row: parse_row(row)
        )

parse_answers(dfMedRed, "Answer.symptoms")
parse_answers(dfMedRed, "Answer.drugs")
parse_answers(dfMicromed, "Answer.symptoms")
parse_answers(dfMicromed, "Answer.drugs")
parse_answers(dfCadec, "Answer.symptoms")
parse_answers(dfCadec, "Answer.drugs")
dfMicromed

Unnamed: 0,id,Locations.diseases,Locations.drugs,Locations.symptoms,Answer.diseases,Answer.drugs,Answer.symptoms,post
0,466073644209156096,"[[{'start': 20, 'end': 40}], [{'start': 71, 'e...",,,small-vessel disease;Cerebral small-vessel dis...,[nan],[nan],"Association between small-vessel disease, Alzh..."
1,466091939730030592,,,"[[{'start': 50, 'end': 55}], [{'start': 121, '...",,[nan],"[blind, pain]",I don't wanna drown in the rain. I don't wanna...
2,466095676883873792,"[[{'start': 75, 'end': 82}], [{'start': 88, 'e...",,,besity.;bese ;verweight,[nan],[nan],RT @DhesiBahaRaja: ⚠️Important Message;\nMalay...
3,466112043049701376,"[[{'start': 127, 'end': 135}]]",,,migraine,[nan],[nan],@rachjohnson0 @DailyDose248 @LoraRoule previou...
4,466130938393403392,"[[{'start': 60, 'end': 66}], [{'start': 38, 'e...",,,scurvy;malnourished,[nan],[nan],@el_diabl0_cake and you'd probably be malnouri...
...,...,...,...,...,...,...,...,...
269,471682384954675201,,,"[[{'start': 51, 'end': 57}], [{'start': 35, 'e...",,[nan],"[stress, pain]","Sleeping is a cure to forget about pain, probl..."
270,471709203300495360,,,"[[{'start': 77, 'end': 83}]]",,[nan],[stress],"Edible greens such as kale, spinach, bok choy ..."
271,471723585577308160,,,"[[{'start': 15, 'end': 20}]]",,[nan],[tired],Mood: cold and tired
272,471725712093626368,,,"[[{'start': 35, 'end': 40}]]",,[nan],[tired],"Day 2 is winding down. We are hot, tired, dirt..."


Clean up post text.

In [10]:
def parse_posts(df, column_name):
	'''
	Removes hyperlinks and replaces non-standard symbols with spaces in post text.
	Unmodified from __parse_answers in NER_labels_from_AMT.py in MedRed.
	'''
	df[column_name] = df[column_name].apply(
		lambda row: re.sub(r'^https?:\/\/.*[\r\n]*', '', str(row), flags=re.MULTILINE)
		)
	df[column_name] = df[column_name].apply(
		lambda row: re.sub(r"http\S+", "", str(row))
		)
	df[column_name] = df[column_name].apply(
		lambda row: re.sub('[^a-zA-Z0-9-_*.,;]', ' ', str(row))
		)

parse_posts(dfMedRed, "post")
parse_posts(dfMicromed, "post")
parse_posts(dfCadec, "post")
dfMicromed

Unnamed: 0,id,Locations.diseases,Locations.drugs,Locations.symptoms,Answer.diseases,Answer.drugs,Answer.symptoms,post
0,466073644209156096,"[[{'start': 20, 'end': 40}], [{'start': 71, 'e...",,,small-vessel disease;Cerebral small-vessel dis...,[nan],[nan],"Association between small-vessel disease, Alzh..."
1,466091939730030592,,,"[[{'start': 50, 'end': 55}], [{'start': 121, '...",,[nan],"[blind, pain]",I don t wanna drown in the rain. I don t wanna...
2,466095676883873792,"[[{'start': 75, 'end': 82}], [{'start': 88, 'e...",,,besity.;bese ;verweight,[nan],[nan],RT DhesiBahaRaja Important Message; Malays...
3,466112043049701376,"[[{'start': 127, 'end': 135}]]",,,migraine,[nan],[nan],rachjohnson0 DailyDose248 LoraRoule previou...
4,466130938393403392,"[[{'start': 60, 'end': 66}], [{'start': 38, 'e...",,,scurvy;malnourished,[nan],[nan],el_diabl0_cake and you d probably be malnouri...
...,...,...,...,...,...,...,...,...
269,471682384954675201,,,"[[{'start': 51, 'end': 57}], [{'start': 35, 'e...",,[nan],"[stress, pain]","Sleeping is a cure to forget about pain, probl..."
270,471709203300495360,,,"[[{'start': 77, 'end': 83}]]",,[nan],[stress],"Edible greens such as kale, spinach, bok choy ..."
271,471723585577308160,,,"[[{'start': 15, 'end': 20}]]",,[nan],[tired],Mood cold and tired
272,471725712093626368,,,"[[{'start': 35, 'end': 40}]]",,[nan],[tired],"Day 2 is winding down. We are hot, tired, dirt..."


Set up post tagger.

In [11]:
def find_indices_in_text(text, token):
	'''Get list of character indices where token starts in a string.'''
	return [m.start() for m in re.finditer(re.escape(token), text)]


def init_tags_for_all_words(text, split_text):
	'''
	get two indices for each token in text:
	1. position in the list, for tagging
	2. character position in the text, for output comparison
	'''
	#   1. position in list
	indices_TAGS = OrderedDict({ (i, token):'O' for i, token in enumerate( split_text ) })
	#     validate size
	assert(len(indices_TAGS) == len(split_text))
	#
	#   2. position in text
	double_indices_TAGS_lst = OrderedDict() # will contain all appearances of token within text
	#     for each token, get positions of all appearances in the text
	for (i,token) in indices_TAGS:
		try:
			all_ii_found = find_indices_in_text(text, token)
			all_ii = frozenset(all_ii_found)
		except:
			all_ii = frozenset([])
	#     mark all double-indices for each token with an "other" tag (neither symptom nor drug)
		double_indices_TAGS_lst[(i, all_ii, token)] = 'O'
	#   validate size
	assert(len(double_indices_TAGS_lst) == len(split_text))
	return indices_TAGS, double_indices_TAGS_lst


def init_tag_at_first_appearance(double_indices_TAGS_lst, split_text):
	'''
	chunk function not 100% clear, but seems like should
	  mark the double-index for the first appearance of a token in the
	  text with an "other" tag
	original comment:
	  "we now select from the list of possible indices for each token
	  the right one -- because we know its token's position,
	  i.e., the first index in the pair"
	'''
	double_indices_TAGS = OrderedDict()     # will contain 1st(?) appearance of token within text
	prev_ii = -1
	for (i, all_ii, token) in double_indices_TAGS_lst:
		all_ii_lst = sorted(list(all_ii))
		#print(tag, all_ii_lst)
		if all_ii_lst == []:
			this_ii = prev_ii+1
		for possible_ii in all_ii_lst:
			if possible_ii > prev_ii:
				this_ii = possible_ii
				break
		prev_ii = this_ii
		double_indices_TAGS[(i, this_ii, token)] = 'O'
	# validate size
	assert(len(double_indices_TAGS) == len(split_text))
	return double_indices_TAGS

def assign_tags_to_text(text, entity_type='DIS', entities=None):

	# sort the entities by size (length of tokens)
	# because we will prioritise longer among overlapping entities
	entities = sorted(entities, key=lambda x: len(x.split()), reverse=True)
	
	# counter for marked entities not found in text
	cnt_not_found = 0

	# text into tokens, removing punctuation
	try:
		split_text = word_tokenize(text)
	# ...or if empty, assign nothing and exit
	except TypeError:
		print (text, entities)
		return {}, [], 0, 0

	# tag everything in the text with 'other' ('O'), producing several index structure
	#   indices_TAGS            - {positions among tokens, token)} = 'O'
	#   double_indices_TAGS_lst - {(positions among tokens, [positions in text], token)} = 'O'
	#   double_indices_TAGS     - {(positions among tokens, 1st position in text, token)} = 'O'
	indices_TAGS, double_indices_TAGS_lst = init_tags_for_all_words(text, split_text)
	double_indices_TAGS = init_tag_at_first_appearance(double_indices_TAGS_lst, split_text)


	# retag entities from 'O' to appropriate tags
	entity_missmatch_with_AMT = 0
	entity_missmatched = []
	kept_entities = []
	# go from the longest to the shortest entities
	for e in entities:
		if e == 'nan':
			continue
		found_i = -1
		# if they are found in the raw text
		if text.find(e) != -1:
			# get the index of its FIRST position
			found_ii_list = find_indices_in_text(text, e)
			n_entity_tokens = len(e.split())
			# now find the corresponding tokens
			# for each token
			for (i, ii, token) in double_indices_TAGS:
				# if the token's text position is the same as the first word's position of an entity
				if ii in found_ii_list:
					# tag the token as the entity's beginning 'B-'
					found_i = i
					if indices_TAGS[(found_i, token)] == 'O': 
						indices_TAGS[(found_i, token)] = 'B-' + entity_type
						kept_entities.append(e)
						# find the rest of the tokens and mark them with inside-entity tags	'I-'
						s = 1
						while s < n_entity_tokens:
							if indices_TAGS[( found_i+s, split_text[found_i+s] )] == 'O':
								indices_TAGS[( found_i+s, split_text[found_i+s] )] = 'I-' + entity_type
							# note cases where entities overlap
							else:
								print ('XXXXX  UNSOLVED OVERLAPP  XXXXX')
							s += 1
						break
			# this is to be solved -- how you clean AMT text and parse tokens here should be the same
			if found_i == -1:
				entity_missmatch_with_AMT += 1
				entity_missmatched.append(e)
				continue
		# if entity is missing from text, ID as such - dirty data
		else:
			cnt_not_found += 1
	return indices_TAGS, kept_entities, entity_missmatch_with_AMT, cnt_not_found

# test
# assign_tags_to_text("wow my head still hurts after that", entity_type='DIS', entities=["head still hurts"])[0]

In [12]:
def tag_post(df_row, open_fl, total_entities, total_missmatched_entities, total_unkept_entities, all_sym_kept, all_drug_kept):
    '''
    Tags a single post `df_row` input as a series (row of a pd.DataFrame) and writes into the open file `open_fl`.
    Other arguments are accumulators used for diagnostics.
    '''
    syms = df_row['Answer.symptoms']
    drugs = df_row['Answer.drugs']
    text = df_row['post']
    # DIS stands for symptom, not disease; based on original
    post_tags_sym, sym_kept, sym_missmatch_with_AMT, sym_unkept = assign_tags_to_text(text, entity_type='DIS', entities=syms)
    post_tags_drug, drug_kept, drug_missmatch_with_AMT, drug_unkept = assign_tags_to_text(text, entity_type='DRUG', entities=drugs)
    # merge the two types of tags, prioritizing symptom tags over drug ones
    #   result is only one tag per token in the text
    post_tags = OrderedDict({ (index, token): sym_tag if sym_tag != 'O' else post_tags_drug[(index, token)] for (index, token), sym_tag in post_tags_sym.items() })
    # adjust accumulators
    all_sym_kept.append(';'.join(sym_kept))
    all_drug_kept.append(';'.join(drug_kept))
    total_entities += len(sym_kept) + len(drug_kept)
    total_missmatched_entities += sym_missmatch_with_AMT + drug_missmatch_with_AMT
    total_unkept_entities += sym_unkept + drug_unkept
    # write to output
    i_post_len = 0
    for (index, token),tag in post_tags.items():
        if token != 'null':
            i_post_len += 1
            open_fl.write(token + '\t' + tag + '\n')
            # split long posts into smaller chunks
            #   could be cleaner since can break up entities,
            #   but left as originally implemented
            if i_post_len >= 300:
                i_post_len = 0
                open_fl.write('\t\n')
    open_fl.write('\t\n')
    return [total_entities, total_missmatched_entities, total_unkept_entities, all_sym_kept, all_drug_kept]


def tag_dataset(df, tagged_token_out_fl, filtered_out_fl):
    
    # init accumulators
    total_entities, total_missmatched_entities, total_unkept_entities = 0, 0.0, 0.0
    all_sym_kept, all_drug_kept = [], []

    with open(tagged_token_out_fl, 'w') as fl:
        for i, line in df.iterrows():
            accumulators = tag_post(line, fl, total_entities, total_missmatched_entities, total_unkept_entities, all_sym_kept, all_drug_kept)
            total_entities, total_missmatched_entities, total_unkept_entities, all_sym_kept, all_drug_kept = accumulators
        print("Processed {} posts. Percent of entities not matched {:.2f}%, and percent of entities discounted {:.2f}%, of total {} accepted.".\
				format(i, total_missmatched_entities/total_entities*100, total_unkept_entities/total_entities*100, total_entities))
    df['new_sym'] = all_sym_kept
    df['new_drug'] = all_drug_kept
    df.to_csv(filtered_out_fl, columns=['Answer.symptoms', 'new_sym','Answer.drugs', 'new_drug', 'post'])

print("MedRed")
tag_dataset(dfMedRed, MEDRED_TAGGED_TOKENS_OUT, MEDRED_FILTERED_OUT)
print("Micromed")
tag_dataset(dfMicromed, MICROMED_TAGGED_TOKENS_OUT, MICROMED_FILTERED_OUT)
print("CADEC")
tag_dataset(dfCadec, CADEC_TAGGED_TOKENS_OUT, CADEC_FILTERED_OUT)

MedRed
Processed 1975 posts. Percent of entities not matched 0.18%, and percent of entities discounted 11.51%, of total 3788 accepted.
Micromed
Processed 273 posts. Percent of entities not matched 6.45%, and percent of entities discounted 0.25%, of total 403 accepted.
CADEC
Processed 819 posts. Percent of entities not matched 0.05%, and percent of entities discounted 1.67%, of total 2042 accepted.


Split into train, dev, and test sets.

In [13]:
def split_train_test(tagged_token_out_fl):
    '''
    50-25-25 split for training, validation (dev), and test sets.
    '''
    all_labels = pd.read_csv(tagged_token_out_fl, sep='\t', header=None)
    N = len(all_labels)

    train = all_labels.iloc[:int(N*0.5)]
    dev = all_labels.iloc[int(N*0.5):int(N*0.75)]
    test = all_labels.iloc[int(N*0.75): ]
    # validate splits make expected total
    assert (len(train)+len(test)+len(dev) == N)
    train.to_csv(tagged_token_out_fl.replace('.csv', '_train.csv'), sep=' ', index=None, header=None)
    dev.to_csv(tagged_token_out_fl.replace('.csv', '_dev.csv'), sep=' ', index=None, header=None)
    test.to_csv(tagged_token_out_fl.replace('.csv', '_test.csv'), sep=' ', index=None, header=None)

split_train_test(MEDRED_TAGGED_TOKENS_OUT)
split_train_test(MICROMED_TAGGED_TOKENS_OUT)
split_train_test(CADEC_TAGGED_TOKENS_OUT)