# Extract height/weight/smoke from notes.csv

In [1]:
import pandas as pd
import re
import numpy as np

In [5]:
# Load in data
text_a1 = pd.read_csv("notes.csv")

In [6]:
text = text_a1[["SUBJECT_ID","CATEGORY","TEXT"]]
for i in range(text.shape[0]):
    text.loc[i,"TEXT"] = re.sub(r"\n|_|\*\*"," ",text["TEXT"][i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


# Smoking

In [7]:
def findWholeWord(w):
    return re.compile(r'\b({0})(\.|\b)'.format(w),
                      flags=re.IGNORECASE).search


indx = \
    [findWholeWord('smoke|smoking|smoked|smoker|tobacco|tobaccos|tob|nicotine|cigar|cigars|cig|cigs|cigarette|cigarettes'
     )(i) for i in text['TEXT']]
a = text.index
sbp = list()
for (j, i) in enumerate(indx):
    if i is not None:
        matches = re.search(r"((\S+\s+){0,4}(\S+\W?)?)" + i.groups()[0]
                            + "(\W?(\s+\S+){0,4})", text['TEXT'
                            ][a[j]]).group()
        sbp.append(matches)
    else:
        sbp.append('')


In [8]:
text["smoke"] = sbp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text["smoke"] = sbp


In [9]:
sbp1 = sbp
sbp = [''.join(sbp[i]).lower() for i in range(len(sbp))]
nosmk = [i for i in range(len(sbp))
         if re.findall('(does not smoke)|(denies smoking)|(no smoking)|(denies tobacco)|(no tobacco)|(tobacco: denies)|(tobacco: none)|(nonsmoker)|(non-smoker)|(nonsmoking)|(non-smoking)'
         , sbp[i], re.MULTILINE) != []]
nosmkind = np.zeros(text.shape[0])
for i in range(len(nosmkind)):
    if i in nosmk or sbp[i] == '':
        nosmkind[i] = 1

In [10]:
smk = int(1) - nosmkind

In [11]:
print("percent of patients who smoke = ",str(np.mean(smk)*100),"%")

percent of patients who smoke =  28.722680913064597 %


In [12]:
text["smoke"]=smk

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text["smoke"]=smk


# Weight

In [13]:
indxwt = [findWholeWord('weight|wt')(i) for i in text['TEXT']]
a = text.index
wt = list()
for (j, i) in enumerate(indxwt):
    if i is not None:
        matches = re.search(r"((\S+\s+){0,4}(\S+\W?)?)" + i.groups()[0]
                            + "(?:\s*\S+\s+\S+\s+\S+|$)", text['TEXT'
                            ][a[j]]).group()
        wt.append(matches)
    else:
        wt.append('')

In [14]:
wt1 = list()
for j in range(len(wt)):
    wtn = ''
    if len(wt[j]) != 0:
        wtn = \
            re.findall(r'((weight|wt|wgt|\(current\))\s?(\(lb\))?\:?\s?\d{2,3}\.?\d{1,2}\s?k?)'
                       , str.lower(str(wt[j])))
        if len(wtn) > 0:
            wtn = wtn[0][0]
        else:
            wtn = ''
    wt1.append(wtn)

In [15]:
wt2 = list()
for i in range(len(wt1)):
    temp = re.findall(r'(\d{2,3}\.?\d{1,2})', wt1[i], re.MULTILINE)
    if len(temp) > 0:
        temp = float(temp[0])
    else:
        temp = ''
    if bool(re.search(r'k$', wt1[i])) == True:
        temp = temp * 2.2
    wt2.append(temp)

In [16]:
d = {"wt1":wt1,"wt2":wt2}
df = pd.DataFrame(data = d)

In [17]:
text['weight'] = wt2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['weight'] = wt2


# Height

In [18]:
indxht = [findWholeWord('height|ht|hgt')(i) for i in text['TEXT']]
a = text.index
ht = list()
for (j, i) in enumerate(indxht):
    if i is not None:
        matches = re.search(r"((\S+\s+){0,4}(\S+\W?)?)" + i.groups()[0]
                            + "(?:\s*\S+\s+\S+\s+\S+|$)", text['TEXT'
                            ][a[j]]).group()
        ht.append(matches)
    else:
        ht.append('')


In [19]:
ht1 = list()
for j in range(len(ht)):
    htn = ''
    if len(ht[j]) != 0:
        tempstr = str.lower(''.join(ht[j]).replace('"', '').replace("'"
                            , ';').replace(r"\\", '').replace('[', ''))
        htn = \
            re.findall(r'(height\s*\:(\s*\(in\)\s*)*\d{1,2}\;*\s*\d{1,3})'
                       , tempstr)
        if len(htn) > 0:
            htn = htn[0][0]
        else:
            htn = ''
    ht1.append(htn)

In [20]:
ht2 = list()
for i in range(len(ht1)):
    temp = re.findall(r'(\d\s*\;\s*\d{1,2})', ht1[i], re.MULTILINE)
    if len(temp) > 0:

        # Change foot to inch

        temp = float(temp[0].split(';')[0]) * 12 \
            + float(temp[0].split(';')[1])
    else:
        temp = re.findall(r'(\d{2,3})', ht1[i], re.MULTILINE)
        if len(temp) > 0:
            temp = float(temp[0])
        else:
            temp = ''
    ht2.append(temp) 

In [21]:
text["height"] = ht2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text["height"] = ht2


In [29]:
text.to_csv('hyper_text_ie.csv', index = False, header = True)