This notebook is used to generate named entity recognition (NER) labels from the CADEC dataset.

---

Environment config.

In [77]:
import os
from itertools import chain
import pandas as pd

In [78]:
# # included for convenience to help find correct paths
# import os
# os.getcwd()
# os.listdir()

Constants.

In [79]:
# Input/output locations, all built from the directory above the working directory.
#   Directory constants keep their trailing slash because file names are
#   appended to them with plain string concatenation.
MEDRED_REPRODUCIBLE_DIR = "../"
CADEC_IN_DIR = f"{MEDRED_REPRODUCIBLE_DIR}data/cadec/input/"
CADEC_IN_ORIGINAL = f"{CADEC_IN_DIR}cadec/original/"  # tag files
CADEC_IN_TEXT = f"{CADEC_IN_DIR}cadec/text/"  # post text files
NER_OUT = f"{MEDRED_REPRODUCIBLE_DIR}data/cadec/CADEC_labels.csv"

Load the tag and text file listings and check that both directories contain the same number of files.

In [80]:
# os.listdir() returns entries in arbitrary order, and the parsing cell pairs
# tag and text files by position. Sorting both listings by file stem (name
# without extension) makes that pairing, and the ids derived from it,
# deterministic.
tag_files = sorted(os.listdir(CADEC_IN_ORIGINAL), key=lambda f: os.path.splitext(f)[0])
text_files = sorted(os.listdir(CADEC_IN_TEXT), key=lambda f: os.path.splitext(f)[0])

# displayed as the cell output; must be True for the positional pairing to be valid
len(text_files) == len(tag_files)

True

Parse the tag files into a DataFrame with one row per tagged term.

In [81]:
def parse_tag_line(line):
    '''Parse one tab-separated line of a tag file.

    The second tab-separated field starts with the annotation type, followed
    by a space and further details, e.g.:
        "AnnotatorNotes T1" -> type "AnnotatorNotes"
        "Drug 93 102"       -> type "Drug"
    The last field holds the annotated term.

    Args:
        line: a single line of a tag file, with or without a trailing newline.

    Returns:
        ``[annotation_type, [term]]`` when the type is "Drug", "Symptom" or
        "Disease" (the term is wrapped in a list to match micromed), otherwise
        ``None``. Blank lines and lines with fewer than three fields also
        yield ``None``.
    '''
    # strip only the line terminator; slicing off the last character would
    # truncate the term on a final line that has no trailing newline
    fields = line.rstrip("\r\n").split("\t")
    if len(fields) < 3:
        return None
    # keep only the text before the first space of the 2nd column
    annotation_type = fields[1].split(" ", 1)[0]
    # subset to only relevant tags
    if annotation_type in ("Drug", "Symptom", "Disease"):
        return [annotation_type, [fields[-1]]]  # term formatted this way to match micromed
    return None

# Pair every tag file with its text file and collect one record per relevant tag.
#   Both listings are sorted by file stem (name without extension) so the
#   pairing does not depend on directory listing order, and each pair is
#   checked so that labels can never be attached to the wrong post.
tags_sorted = sorted(tag_files, key=lambda f: os.path.splitext(f)[0])
texts_sorted = sorted(text_files, key=lambda f: os.path.splitext(f)[0])
if len(tags_sorted) != len(texts_sorted):
    raise ValueError(
        f"found {len(tags_sorted)} tag files but {len(texts_sorted)} text files"
    )

list_for_df = []
# i is the position of the file pair and serves as the post key; it advances
# for every pair, including posts that contribute no relevant tags
for i, (tag_fname, text_fname) in enumerate(zip(tags_sorted, texts_sorted)):
    if os.path.splitext(tag_fname)[0] != os.path.splitext(text_fname)[0]:
        raise ValueError(
            f"tag file {tag_fname!r} does not match text file {text_fname!r}"
        )
    with open(CADEC_IN_ORIGINAL + tag_fname, 'r') as fl:
        parsed = [parse_tag_line(line) for line in fl]
    with open(CADEC_IN_TEXT + text_fname, 'r') as fl:
        text = fl.read()
    # drop lines that produced no relevant tag; the key is stored twice
    # ("id" becomes the index, "tid" stays a column) for convenient later parsing
    list_for_df.extend(row + [text, i, i] for row in parsed if row)
df = pd.DataFrame.from_records(list_for_df, columns=["type", "terms", "post", "id", "tid"], index="id")
df

Unnamed: 0_level_0,type,terms,post,tid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Drug,[Arthrotec],I feel a bit drowsy & have a little blurred vi...,0
0,Disease,[arthritis],I feel a bit drowsy & have a little blurred vi...,0
0,Symptom,[agony],I feel a bit drowsy & have a little blurred vi...,0
0,Symptom,[pains],I feel a bit drowsy & have a little blurred vi...,0
1,Symptom,[hip pain],"Hunger pangs.\nBrilliant, I have a new lease o...",1
...,...,...,...,...
1249,Drug,[Zipsor],I was prescribed Zipsor for bone and joint pai...,1249
1249,Symptom,[Lupus pain],I was prescribed Zipsor for bone and joint pai...,1249
1249,Symptom,[pain from the herniated discs],I was prescribed Zipsor for bone and joint pai...,1249
1249,Drug,[Zipsor],I was prescribed Zipsor for bone and joint pai...,1249


Reformat the data.

In [82]:
# collapse lists of values within a post by type into a single cell
#   e.g. below needs to be made into one row, keyed on id and type
# df[(df.index==0) & (df.type == "Symptom")]

#   common procedure to group and format, abstracting away how to handle values
def agg_col(df, col, aggfunc):
    '''Aggregate one column per (post id, term type) pair.

    Groups ``df`` by its "tid" and "type" columns, applies ``aggfunc`` to
    ``col`` within each group, and returns a DataFrame indexed by
    ("id", "type"). An aggregated "terms" column is renamed to "terms_list";
    any other column keeps its name.
    '''
    grouped = df.groupby(["tid", "type"])[col]
    return (
        grouped.aggregate(aggfunc)
        .reset_index()
        .rename(columns={"tid": "id", "terms": "terms_list"})
        .set_index(["id", "type"])
    )

#   aggregation function specific to terms, used with grouping above
def join_terms(col):
    '''Flatten an iterable of term lists and join all terms with semicolons.'''
    return ';'.join(chain.from_iterable(col))

#   ...apply the grouping with the term-joining aggregator
dfTerms = agg_col(df, col="terms", aggfunc=join_terms)

dfTerms

# spot check against the first post:
# dfTerms.loc[(0, "Symptom"), "terms_list"] == 'agony;pains'

Unnamed: 0_level_0,Unnamed: 1_level_0,terms_list
id,type,Unnamed: 2_level_1
0,Disease,arthritis
0,Drug,Arthrotec
0,Symptom,agony;pains
1,Symptom,hip pain;walk up & down steps sideways
2,Symptom,pain
...,...,...
1248,Drug,mobic
1248,Symptom,pain
1249,Disease,Lupus;herniated discs
1249,Drug,Zipsor;Vicodin;Zipsor;Zipsor


In [83]:
# attach the aggregated terms to the post text
#   one row of text per post id (its first occurrence); the text is merged in
#   here only for incremental readability and is dropped and re-added later
is_first_row = ~df.index.duplicated(keep='first')
dfText = df.loc[is_first_row, ["post"]]
dfAgg = dfText.join(dfTerms)

dfAgg

Unnamed: 0_level_0,Unnamed: 1_level_0,post,terms_list
id,type,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Disease,I feel a bit drowsy & have a little blurred vi...,arthritis
0,Drug,I feel a bit drowsy & have a little blurred vi...,Arthrotec
0,Symptom,I feel a bit drowsy & have a little blurred vi...,agony;pains
1,Symptom,"Hunger pangs.\nBrilliant, I have a new lease o...",hip pain;walk up & down steps sideways
2,Symptom,no side effects for the first two months .\nth...,pain
...,...,...,...
1248,Drug,"Gave pretty good pain relief, with no side eff...",mobic
1248,Symptom,"Gave pretty good pain relief, with no side eff...",pain
1249,Disease,I was prescribed Zipsor for bone and joint pai...,Lupus;herniated discs
1249,Drug,I was prescribed Zipsor for bone and joint pai...,Zipsor;Vicodin;Zipsor;Zipsor


In [84]:
# column names used in the output file, keyed by term type
name_map = {
    "Drug":"Answer.drugs",
    "Symptom":"Answer.symptoms",
    "Disease":"Answer.diseases"
}

# pivot to match MedRed_AMT_labels.csv formatting: one row per post id, one
# column per term type.
#   Each (id, type) pair occurs exactly once after the aggregation above, so
#   this is a pure reshape; DataFrame.pivot does it directly (and raises on
#   duplicate pairs) instead of relying on pivot_table with an identity
#   "aggregation" function.
dfOut = (
    dfAgg.reset_index()
    .pivot(index="id", columns="type", values="terms_list")
    .join(dfText)              # add full post text
    .rename(columns=name_map)  # set column names
)

dfOut

# row with all three types of terms (diseases, drugs, symptoms) as sanity check
# dfOut[dfOut.index == 1249]

Unnamed: 0_level_0,Answer.diseases,Answer.drugs,Answer.symptoms,post
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,arthritis,Arthrotec,agony;pains,I feel a bit drowsy & have a little blurred vi...
1,,,hip pain;walk up & down steps sideways,"Hunger pangs.\nBrilliant, I have a new lease o..."
2,,,pain,no side effects for the first two months .\nth...
4,,Arthrotec,I can't stand or walk for any lengths of time,I have had no side effects been taking Arthrot...
5,,Arthotec,pain and stiffness,It seems as if bleeding forever is a side effe...
...,...,...,...,...
1244,,Voltaren,pain,Mild sedation.\nThis is a GREAT drug for me.\n...
1245,,,pain,nausea.\nsome pain relief.\n
1246,,,pain;sore throat,Haven't really experienced any side effects th...
1248,,mobic,pain,"Gave pretty good pain relief, with no side eff..."


Save for further preprocessing.

In [85]:
# write the wide-format labels to NER_OUT; the "id" index is written as the first column
dfOut.to_csv(NER_OUT)