In [41]:
import pandas as pd
import re
import numpy as np
from random import random
from random import randint
from random import seed
import os

In [38]:
#Change directory to data download folder
#os.chdir(/path to data download folder)

# Load data from Google Cloud Platform (obtained from SQL code)

In [5]:
# Merge the 3 datasets downloaded from Google Cloud Platform by rows.
text_a1 = pd.read_csv("first160k.csv")
text_a2 = pd.read_csv("second160k.csv")
text_a3 = pd.read_csv("third160k.csv")

# Replace newlines, underscores and "**" markers in every note with a space.
for part in (text_a1, text_a2, text_a3):
    part["TEXT"] = [re.sub(r"\n|_|\*\*", " ", note) for note in part["TEXT"]]

# Shift the row labels of the later files so that every record keeps a unique
# label after stacking (this gives offsets of 160000 and 320000 when the first
# two files hold 160000 rows each).
text_a2.index = text_a2.index + len(text_a1)
text_a3.index = text_a3.index + len(text_a1) + len(text_a2)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and keeps the labels assigned above.
text_a = pd.concat([text_a1, text_a2, text_a3])
# text_a.shape for the original download: (472464, 3)

# Generate synthetic text data with smoking status.

Find records without original smoke info.

In [7]:
# Factory for the whole-word keyword searchers used below.
def findWholeWord(w):
    '''
    Build a case-insensitive whole-word searcher for the patterns in w.

    w: regular-expression alternation of the patterns to search for,
       e.g. 'smoke|tobacco'.
    Returns the search method of the compiled pattern. Calling it with a
    string gives None when no pattern occurs as a whole word; otherwise it
    gives a match object whose first group is the pattern that was found
    and whose second group is a directly following period (or '').
    '''
    whole_word = re.compile(r'\b({0})(\.|\b)'.format(w), flags=re.IGNORECASE)
    return whole_word.search

# For every record, find the first whole-word smoking keyword (None when the
# record has no smoking keyword). The searcher is built once, not per record.
find_smoke_word = findWholeWord(
    'smoke|smoking|smoked|smoker|tobacco|tobaccos|tob|nicotine|cigar|cigars|cig|cigs|cigarette|cigarettes')
indx = [find_smoke_word(note) for note in text_a['TEXT']]
a = text_a.index
smk = list()
# Extract the piece of text (up to about 4 words on each side) that surrounds
# the smoking keyword.
for note, hit in zip(text_a['TEXT'], indx):
    if hit is None:
        smk.append('')
        continue
    # Anchor the keyword on word boundaries so that the context is taken
    # around the whole-word hit and not around an earlier occurrence inside
    # another word (e.g. "tob" inside "October").
    keyword = r"\b" + re.escape(hit.group(1)) + r"\b"
    context = re.search(r"((\S+\s+){0,4}(\S+\W?)?)" + keyword
                        + r"(\W?(\s+\S+){0,4})", note)
    smk.append(context.group())

In [10]:
# Attach the extracted smoking context to every record, then keep only the
# notes whose context is empty, i.e. records without any smoking information.
text_a["smoke"] = smk
has_no_smoke_info = text_a["smoke"] == ""
no_smoking_text = text_a.loc[has_no_smoke_info, "TEXT"]

Select records to add smoking information.

In [11]:
# Flag the records without smoking information that include admission
# information or chief complaint information ("yes"), since these are the
# records that normally contain smoking information; all others get "no".
nsmk_adm_text = [
    "yes" if ("Admission Date:" in note or "Chief Complaint:" in note) else "no"
    for note in no_smoking_text
]


In [13]:
# Positions of the flagged ("yes") records, used to select them by position.
idx = [position for position, flag in enumerate(nsmk_adm_text) if flag == "yes"]
no_smoking_text_adm = no_smoking_text.iloc[idx]

Generate a smoker indicator (1 = smoker, 0 = otherwise) for each selected record so that about 45% of the records are labeled as smokers.

In [14]:
# Fixed seed so the same records are drawn as smokers on every run.
seed(123)
# One draw per selected record: 1 (smoker) with probability 0.45, else 0.
smk_ind = [1 if random() <= 0.45 else 0
           for _ in range(no_smoking_text_adm.size)]

Insert a random smoking/non-smoking string into a random location of the original record.

In [15]:
# Strings to insert that contain smoking information.
smk_text_content = [
    # Positions 0-9: phrases inserted into records drawn as smokers.
    "10 pack-year smoking.",
    "100 pack-year tobacco.",
    "Tobacco: Quit 30'.",
    "occasional cigar.",
    "10 cigarettes daily.",
    "Tobacco: current smoker.",
    "Heavy smoking.",
    "Tobacco: patient quit.",
    "cigars daily.",
    "former smoker.",
    # Positions 10-12: phrases stating that the patient does not smoke.
    "non-smoker",
    "Tobacco: denies",
    "Tobacco: none",
]

In [16]:
# Smoking strings are inserted into the records drawn as smokers (45% of the
# population) unless the record already contains "nonsmoker".
# Non-smoking strings are inserted into about half of the remaining records.
seed(123)
# Work on a copy: the selected source records stay untouched, so re-running
# this cell does not insert a second phrase into already modified text.
pt_smk_data = no_smoking_text_adm.copy()
for i in range(pt_smk_data.size):
    note = pt_smk_data.iloc[i]
    if smk_ind[i] == 1 and 'nonsmoker' not in note.lower():
        # Smoking phrases sit at positions 0-9 of smk_text_content.
        first_phrase, last_phrase = 0, 9
    elif random() < 0.5:
        # Non-smoking phrases sit at positions 10-12 of smk_text_content.
        first_phrase, last_phrase = 10, 12
    else:
        continue
    # The draws happen in a fixed order (location first, then phrase) so that
    # the seeded output is reproducible.
    words = note.split(' ')
    insert_loc = randint(0, len(words))
    phrase = smk_text_content[randint(first_phrase, last_phrase)]
    pt_smk_data.iloc[i] = ' '.join(words[:insert_loc] + [phrase]
                                   + words[insert_loc:])

In [17]:
# Build the synthetic data frame: one row per record, holding the true
# smoking status and the (possibly modified) text.
syn_smk_data = pd.DataFrame(
    {'true_smk_status': smk_ind, 'text': list(pt_smk_data)},
    columns=['true_smk_status', 'text'],
)

Search for weight info

In [20]:
# For every synthetic record, find the first whole-word weight keyword (None
# when there is none). The searcher is built once, not per record.
find_weight_word = findWholeWord('weight|wt|lb|kg')
indxwt = [find_weight_word(text) for text in syn_smk_data['text']]
a = syn_smk_data.index
wt = list()
# Collect every snippet around the keyword that was found: two tokens before
# it (or the start of the text) and three tokens after it (or the end of the
# text). Records without a keyword get ''.
for text, hit in zip(syn_smk_data['text'], indxwt):
    if hit is None:
        wt.append('')
        continue
    # Raw strings keep the regex escapes (\S, \s) intact without relying on
    # Python passing unknown string escapes through.
    context_pattern = (r'(?:^|\S+\s+\S+\s+)' + re.escape(hit.group(1))
                       + r'(?:\s*\S+\s+\S+\s+\S+|$)')
    wt.append(re.findall(context_pattern, text))

In [21]:
# Find target strings that contain weight information: a weight label
# followed by a number, optionally ending in "k".
weight_pattern = re.compile(
    r'((weight|wt|\(current\))\s?(\(lb\))?\:?\s?\d{2,3}\.?\d{1,2}\s?k?)')
wt1 = list()
for contexts in wt:
    # Records without any weight context are skipped and get ''.
    if len(contexts) != 0:
        found = weight_pattern.findall(str(contexts).lower())
    else:
        found = []
    # Keep the full text of the first target string only.
    wt1.append(found[0][0] if found else '')

In [22]:
# Search for numerical values in the target strings.
number_pattern = re.compile(r'(\d{2,3}\.?\d{1,2})', re.MULTILINE)
wt2 = list()
for target in wt1:
    numbers = number_pattern.findall(target)
    # First number found, or '' when the target string holds none.
    value = float(numbers[0]) if numbers else ''
    # A trailing "k" triggers the multiplication by 2.2
    # (presumably a kilogram-to-pound conversion, matching the "weightlb" column name).
    if re.search(r'k$', target) is not None:
        value = value * 2.2
    wt2.append(value)

In [25]:
# Add the extracted weights as the "weightlb" column.
syn_smk_data = syn_smk_data.assign(weightlb=wt2)

In [26]:
# 0 when the text contains "nonsmoker" (case-insensitive), otherwise 1.
smk = [0 if "nonsmoker" in note.lower() else 1
       for note in syn_smk_data["text"]]

In [27]:
# Records whose text contains "nonsmoker" are labeled as non-smokers, whatever
# status was drawn for them.
is_nonsmoker = [flag == 0 for flag in smk]
syn_smk_data.loc[is_nonsmoker, "true_smk_status"] = 0

Check how many "smokers" are created in the simulation data.

In [28]:
# Fraction of the synthetic records labeled as smokers.
sum(syn_smk_data["true_smk_status"].tolist()) / len(syn_smk_data.index)

0.4446730186227804

In [26]:
# Save sim data locally
# 
#syn_smk_data.to_csv ('syn_text_w_smoke.csv', index = False, header=True)

# Find Smoking Information from Simulated Clinical Text

In [33]:
# Drop an outlier (row label 17161) that has both smoker and nonsmoker tags
# in the original text.
sim_data = syn_smk_data.drop(index=17161)

In [34]:
# Search for smoking keywords: for every simulated record, find the first
# whole-word keyword (None when there is none). The searcher is built once.
find_smoke_word = findWholeWord(
    'smoke|smoking|smoked|smoker|tobacco|tobaccos|tob|nicotine|cigar|cigars|cig|cigs|cigarette|cigarettes')
indx = [find_smoke_word(text) for text in sim_data['text']]
a = sim_data.index
smk = list()
# Extract the piece of text (up to about 4 words on each side) that surrounds
# the smoking keyword; records without a keyword get ''.
for text, hit in zip(sim_data['text'], indx):
    if hit is None:
        smk.append('')
        continue
    # Anchor the keyword on word boundaries so that the context is taken
    # around the whole-word hit and not around an earlier occurrence inside
    # another word (e.g. "tob" inside "October").
    keyword = r"\b" + re.escape(hit.group(1)) + r"\b"
    context = re.search(r"((\S+\s+){0,4}(\S+\W?)?)" + keyword
                        + r"(\W?(\s+\S+){0,4})", text)
    smk.append(context.group())

In [35]:
# Search for negation in the extracted smoking contexts.
smk1 = smk  # keep the contexts with their original casing
smk = [''.join(context).lower() for context in smk]
negation = re.compile(
    '(does not smoke)|(denies smoking)|(no smoking)|(denies tobacco)|(tobacco: denies)|(tobacco: none)|(nonsmoker)|(non-smoker)|(nonsmoking)|(non-smoking)',
    re.MULTILINE)
# Positions of the records whose context contains a negation phrase.
nosmk = [position for position, context in enumerate(smk)
         if negation.search(context) is not None]
# Set for constant-time membership tests; testing against the list inside the
# loop below would make the loop quadratic in the number of records.
negated_positions = set(nosmk)
# A record is extracted as smoker (1) when it has a smoking context without
# negation; all other records stay 0.
smk_extract = np.zeros(sim_data.shape[0])
for position in range(len(smk_extract)):
    if position not in negated_positions and smk[position] != '':
        smk_extract[position] = 1

In [36]:
# Compare the extracted status with the true status, record by record.
true_status = sim_data["true_smk_status"].tolist()
smk_extract_result = [int(extracted) == truth
                      for extracted, truth in zip(smk_extract, true_status)]

In [37]:
# Share of records whose extracted status agrees with the true smoking status
# (100% agreement in the recorded run).
sum(1 for agrees in smk_extract_result if agrees) / len(smk_extract_result)

1.0

In [None]:
# Save data locally: add the extracted status as a column and write the
# simulated data (with header, without the row labels) to a CSV file.
sim_data["smk_extract"] = smk_extract
output_file = 'sim_data.csv'
sim_data.to_csv(output_file, header=True, index=False)