In [None]:
import ast
import os
import re
import warnings

import boto3
import botocore
import numpy as np
import pandas as pd
# Suppress warnings whose message starts with this text (presumably the pandas
# str.contains "has match groups" warning, since the search patterns below use
# capture groups -- TODO confirm).
warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression')
# Name of the S3 bucket that holds the inputs and receives the outputs.
# Must be filled in before running: every s3:// path below is built from it.
bucket = ""
# Passed to gen_table_text as its `disallow` argument.
filter_out = []
#Name should be without '.csv'
# Key prefix for exported files; "_subset_<concept>.csv" is appended to it.
export_name = "data/regex_"

In [None]:
# Load the three input tables from S3.
#assertion file
n3 = pd.read_csv(f"s3://{bucket}/data/processed_notes_assertions.csv")
#token file
expd = pd.read_csv(f"s3://{bucket}/data/processed_notes_tokens.csv")
notes = pd.read_csv(f"s3://{bucket}/data/note_set_cleaned.csv")
notes = notes.drop_duplicates(subset=["notes_id"])

# Use one dtype for the join key in all three tables. Previously only expd and
# n3 were cast, so numeric ids in `notes` (int64) never matched the string ids
# in `expd` and the isin() filter below silently emptied `notes`.
expd.notes_id = expd.notes_id.astype(str)
n3.notes_id = n3.notes_id.astype(str)
notes.notes_id = notes.notes_id.astype(str)

expd = expd.drop_duplicates(subset=["notes_id", "token_start", "token_end"])
expd = expd.dropna(subset=["tokens"])
# The helpers below mix index labels (`.name`) with positional `expd.iloc[...]`
# lookups, which is only valid while expd has a fresh 0..n-1 index.
expd = expd.reset_index(drop=True)

# Keep only notes that have at least one token row.
notes = notes[notes.notes_id.isin(expd.notes_id)]

In [None]:
# Other helpers in this notebook assign columns on filtered frames; keep the
# chained-assignment warning disabled for them.
pd.set_option("mode.chained_assignment", None)
def gen_table_text(search, disallow, chars=75):
    """Find regex hits in the notes and build a table of matching tokens.

    Parameters
    ----------
    search : dict
        Maps a concept name to a regex (string or compiled pattern) that is
        searched for in ``notes.cleaned_notes``.
    disallow : list
        Currently unused; kept so existing callers keep working.
    chars : int, default 75
        Number of context characters taken on each side of a hit for the
        ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The rows collected by ``get_tokens``, de-duplicated on
        (token_start, notes_id), with a ``text`` column of surrounding note
        text. An empty DataFrame when nothing is found.
    """
    per_concept = []
    for concept, pattern in search.items():
        # na=False: a note with missing text simply does not match, instead of
        # putting NaN in the mask and breaking the boolean indexing.
        mask = notes.cleaned_notes.str.contains(pattern, na=False)
        # Work on an explicit copy so the column assignments below never
        # target a view of `notes`.
        searched = notes[mask].copy()
        searched["concept"] = concept

        # get_regex_output returns (starts, ends); store each list in its own column.
        spans = searched["cleaned_notes"].apply(get_regex_output, args=(pattern,))
        searched["start"] = spans.apply(lambda pair: pair[0])
        searched["end"] = spans.apply(lambda pair: pair[1])
        per_concept.append(searched)

    # pd.concat raises on an empty list, so bail out early.
    if not per_concept:
        return pd.DataFrame()
    found = pd.concat(per_concept).reset_index(drop=True)
    if found.empty:
        return pd.DataFrame()

    #knit together matching tokens and assertion chunks
    toks = []
    found.apply(get_tokens, axis=1, args=(toks,))
    if not toks:
        return pd.DataFrame()

    toks = pd.concat(toks).reset_index(drop=True)
    #drop duplicates
    toks = toks.drop_duplicates(subset=["token_start", "notes_id"]).reset_index(drop=True)
    #lastly, get context text
    toks["text"] = toks.apply(get_text, axis=1, args=(chars,))
    return toks

#helper for gen_table_text: locate every regex hit inside one note
def get_regex_output(row, term):
    """Return ``(starts, ends)`` for all non-overlapping matches of ``term``.

    ``row`` is the text to scan (despite the name, a string, not a DataFrame
    row); ``term`` is a regex string or compiled pattern. Both returned lists
    hold character offsets and are in match order.
    """
    spans = [hit.span() for hit in re.finditer(term, row)]
    starts = [begin for begin, _ in spans]
    ends = [finish for _, finish in spans]
    return (starts, ends)
    
#helper function for gen_table_text
#takes a row from pd with extracted start/stops, new df
#adds to df list of tokens within [start, stop]
def get_tokens(row, df):
    """Map each regex hit in one note onto rows of the token table.

    row: one row of the `found` frame built in gen_table_text; reads
         `start`/`end` (parallel lists of character offsets), `notes_id`,
         `cleaned_notes` and `concept`.
    df:  a plain Python list used as an output accumulator; DataFrames are
         appended to it in place. Nothing is returned.

    Reads the module-level `expd` token table and relies on its index being
    0..n-1, because index labels (`.name`) are reused as `expd.iloc` positions.
    """
    starts = row.start
    ends = row.end
    tok_locs = list(zip(starts, ends))
    # All token rows belonging to this note.
    note = expd.loc[expd.notes_id.isin([row.notes_id])]
    # Tokens that are never accepted as the first (or, below, last) token of a hit.
    forbid_start = ["and", "to", "of", "or", "is", "a", "for", "s", "an", "if", "be", "as",
                   "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
                    ",", ":", ";", "."]
    # Parallel accumulators: one entry per hit that could be aligned.
    tok_list = []
    notes_id_list = []
    start_list = []
    end_list = []
    clinical_list = []
    wips_list = []
    bert_list = []
    concept_list = []
    for tok in tok_locs:
        start = tok[0]
        end = tok[1]
        new_toks = []
        new_clinical = []
        new_wips = []
        new_bert = []
        #get row of start tokens
        #sometimes numbers are off
        # Attempt 1: a token that starts exactly at the hit's start offset.
        start_row = note[note.token_start.isin([start])]
        start_row = start_row[~start_row.tokens.isin(forbid_start)]
        if len(start_row) < 1:
            # Attempt 2: any allowed token starting within +-5 characters.
            start_row = note[(note.token_start >= start-5) & (note.token_start <= start+5)]
            start_row = start_row[~start_row.tokens.isin(forbid_start)]
            if len(start_row) < 1:
                #attempt to find target text in rows
                # Attempt 3: a token in a wider window whose text contains the hit
                # (or a slice of its tail).
                # NOTE(review): `target` is literal note text but str.contains treats
                # it as a regex, and target[-5:-1] is "" for one-character hits (which
                # matches every token) -- confirm this is intended.
                target = row.cleaned_notes[start:end]
                start_row = note[(note.token_start >= start-25) & (note.token_start <= start+15)]
                start_row = start_row[start_row.tokens.str.contains(target) | 
                                     start_row.tokens.str.contains(target[-5:-1])]
                if len(start_row) < 1:
                    # NOTE(review): this assignment is never read; the hit is skipped.
                    start_row = note[(note.token_start >= start-25) & (note.token_start <= start+15)]
                    continue
        start_row = start_row.iloc[0]
        # Only look for a separate end token when the hit extends past the start token.
        if start_row.token_end+1 < end:
            # NOTE(review): the second condition is implied by the first.
            end_row = note[(note.token_end == end) & (note.token_end <= end)]
            end_row = end_row[~end_row.tokens.isin(forbid_start)]
            if len(end_row) < 1:
                # Fallback: any token ending within +-5 characters (forbid list not applied here).
                end_row = note[(note.token_end >= end-5) & 
                                        (note.token_end <= end+5)]
                if len(end_row) < 1:
                    #attempt to find end of target text in rows
                    # Searches the whole note, not a window around the hit.
                    target = row.cleaned_notes[start:end]
                    end_row = note[(note.tokens.str.contains(target))|
                                           (note.tokens.str.contains(target[-4:]))]
                    if len(end_row) < 1:
                        continue
            end_row = end_row.iloc[0].name
        else:
            end_row = start_row.name
        # Sanity limits: the end may not precede the start, and a span of more
        # than 20 token rows is cut down to two tokens.
        if end_row < start_row.name:
            end_row = start_row.name
        elif end_row > start_row.name + 20:
            end_row = start_row.name + 1
        start_row = start_row.name
        end_row += 1  # make the range below inclusive of the end token
        for r in range(start_row, end_row):
            new_toks.append(expd.iloc[r].tokens)
            new_clinical.append(expd.iloc[r].clinical_ner)
            new_wips.append(expd.iloc[r].wip_ner)
            new_bert.append(expd.iloc[r].BERT_ner)
        tok_list.append(new_toks)
        clinical_list.append(new_clinical)
        wips_list.append(new_wips)
        bert_list.append(new_bert)
        notes_id_list.append(row.notes_id)
        # Note: the stored offsets are the regex hit's offsets, not the tokens'.
        start_list.append(start)
        end_list.append(end)
        concept_list.append(row.concept)
    if len(tok_list) > 0:
        # One row per aligned hit; list-valued cells for tokens and NER tags.
        tok_list = pd.DataFrame([tok_list, start_list, end_list, concept_list, clinical_list,
                                 wips_list, bert_list, notes_id_list]).transpose()
        tok_list.columns = ["tokens", "token_start", "token_end", "concept",
                            "clinical_ner", "wip_ner", "BERT_ner", "notes_id"]
        # NOTE(review): the raw table is appended here AND the refined table is
        # appended at the end of this function. gen_table_text later keeps the first
        # row per (token_start, notes_id), so a raw row (no chunk columns) wins
        # whenever check_chunks leaves token_start unchanged -- confirm this is intended.
        df.append(tok_list)
        #now let's get chunks...
        tab = []
        tok_list.apply(find_chunks, args=(tab,), axis=1)
        tab = pd.concat(tab).reset_index(drop=True)
        # Side-by-side join: hit columns plus chunks/entities/assertion columns.
        toks = pd.concat([tok_list, tab], axis=1)
        tab = []
        toks.apply(check_chunks, args=(tab, note), axis=1)
        # check_chunks appends one single-column frame per row; stitch them back into rows.
        toks = pd.concat(tab, axis=1).transpose().reset_index(drop=True)
        
        df.append(toks)
    
    
#helper function for gen_table_text
#collects the assertion chunks that begin inside one token span
def find_chunks(row, df):
    """Append a one-row summary of assertion chunks overlapping ``row``.

    ``row`` supplies ``token_start``, ``token_end`` and ``notes_id``. Chunks
    are taken from the module-level ``n3`` table when they belong to the same
    note and their ``chunk_begin`` lies in ``[token_start, token_end)``.
    ``df`` is a list; a one-row DataFrame with list-valued columns ``chunks``,
    ``entities`` and ``assertion`` is appended to it. Nothing is returned.
    """
    lo = int(row.token_start)
    hi = int(row.token_end)
    same_note = n3.loc[n3.notes_id.isin([row.notes_id])]
    begins_inside = (same_note.chunk_begin >= lo) & (same_note.chunk_begin < hi)
    overlapping = same_note[begins_inside]
    summary_row = [
        list(overlapping["chunks"]),
        list(overlapping["entities"]),
        list(overlapping["assertion"]),
    ]
    df.append(pd.DataFrame([summary_row], columns=["chunks", "entities", "assertion"]))
    
#takes a row from our regex'd table + blank list
#checks to see if we need to go backwards to get beginning of chunks
def check_chunks(row, tab, note):
    """Widen or trim one hit so it lines up with NER entity boundaries.

    row:  one row of the hit table (tokens, token_start, token_end and the
          three *_ner lists, plus notes_id).
    tab:  list accumulator; the (possibly modified) row is appended to it as a
          single-column DataFrame. Nothing is returned.
    note: token rows of the note the hit came from.

    Uses get_ner() to reduce the three NER tags of a token to "b", "i" or "o",
    and reads the module-level `expd` table by position; index labels (`.name`)
    are treated as positions, which assumes expd has a 0..n-1 index.
    """
    # note = expd[expd.notes_id.isin([row.notes_id])]
    # Only consider tokens starting within 20 characters of the hit.
    note = note[(note.token_start < row.token_start + 20) &
               (note.token_start > row.token_start - 20)]
    start = row.token_start
    end = row.token_end
    
    # Locate the hit's first token in the token table by its text.
    start_row = note[note.tokens.isin([row.tokens[0]])]
    #sometimes you get two of the same word close together!
    #make sure we get the one we actually wanted!
    if len(start_row) > 1:
        start_row['dist'] = abs(start_row.token_start - start)
        start_row = start_row[start_row.dist.isin([start_row.dist.min()])]
        
    if len(start_row) > 0:
        start_row = start_row.iloc[0]
        # Position of the hit's last token, assuming its tokens are consecutive rows.
        end_row = start_row.name + (len(row.tokens) - 1)
        clin_row = [row.clinical_ner[0], row.wip_ner[0], row.BERT_ner[0]]
        ners = get_ner(clin_row)
        new_start = start_row.name
        if ners == "i":
            #then we need to go back until we find a 'b'
            new_start = start_row.name - 1
            cur_start = expd.iloc[new_start]
            start_ners = get_ner([cur_start.clinical_ner, cur_start.wip_ner, cur_start.BERT_ner])
            if start_ners != "b":
                # NOTE(review): this walk has no lower bound and no note-boundary
                # check; a negative position wraps around to the end of expd.
                while start_ners != "b":
                    new_start = start_row.name - 1
                    cur_start = expd.iloc[new_start]
                    start_row = cur_start
                    start_ners = get_ner([cur_start.clinical_ner, cur_start.wip_ner, cur_start.BERT_ner])
        elif ners == "o":
            #check if we have an 'i' or 'b' ahead till end row
            set_new = False
            while new_start < expd.iloc[end_row].name:
                new_start += 1
                cur_start = expd.iloc[new_start]
                start_ners = get_ner([cur_start.clinical_ner, cur_start.wip_ner, cur_start.BERT_ner])
                if start_ners != "o":
                    set_new = True
                    break
            # No tagged token inside the hit: fall back to the original start.
            if set_new != True:
                new_start = start_row.name
        #and check after as well
        cur_end = expd.iloc[end_row]
        end_ners = get_ner([cur_end.clinical_ner, cur_end.wip_ner, cur_end.BERT_ner])
        new_end = end_row + 1
        # NOTE(review): expd.iloc[new_end] raises IndexError when the hit ends on
        # the last row of expd.
        next_ners = get_ner([expd.iloc[new_end].clinical_ner,
                             expd.iloc[new_end].wip_ner,
                             expd.iloc[new_end].BERT_ner])
        if end_ners == "i" or next_ners == "i":
            if end_ners != "i":
                end_ners = next_ners
                end_row = new_end
            # Extend forward while tokens are tagged "i", stopping at the note boundary.
            while end_ners == "i":
                new_end = end_row + 1
                cur_end = expd.iloc[new_end]
                if cur_end.notes_id != row.notes_id:
                    new_end = end_row - 1
                    break
                else:
                    end_row = cur_end
                    end_ners = get_ner([cur_end.clinical_ner, cur_end.wip_ner, cur_end.BERT_ner])
                    end_row = end_row.name
        #now to yank off those ending 'o's that snuck in
        #first see if we're already too long
        # new_end is an exclusive slice bound; spans are capped at 5 tokens.
        if new_end - new_start > 5:
            new_end = new_start + 5
        if end_ners == "o" and ners != "o":
            # Walk backwards from end_row until a tagged token is found.
            while end_ners == "o":
                new_end = end_row - 1
                cur_end = expd.iloc[new_end]
                end_row = cur_end
                end_ners = get_ner([cur_end.clinical_ner, cur_end.wip_ner, cur_end.BERT_ner])
                end_row = end_row.name
            new_end = new_end + 1
        #TODO probable refinement here on how to chop down excessively long NER strings
        if new_end - new_start > 5:
            new_end = new_start + 5
        if new_start >= new_end:
            new_end += 1
        whole_df = expd.iloc[new_start:new_end]
        if len(whole_df) == 0:
            # Still empty: step the start back until a one-token slice exists.
            new_end = new_start + 1
            while len(expd.iloc[new_start:new_end]) == 0:
                new_start -= 1
                new_end = new_start + 1
            whole_df = expd.iloc[new_start:new_end]
        # Overwrite the hit with the adjusted token span.
        row.token_start = whole_df.iloc[0].token_start
        row.token_end = whole_df.iloc[-1].token_end + 1
        row.tokens = list(whole_df.tokens)
        row.clinical_ner = list(whole_df.clinical_ner)
        row.wip_ner = list(whole_df.wip_ner)
        row.BERT_ner = list(whole_df.BERT_ner)
    # The row is recorded even when no start token was located (then unchanged).
    tab.append(pd.DataFrame(row))
        
#helper function for gen_table_text
#extracts the note text around a token span, padded by `chars` on each side
def get_text(row, chars):
    """Return the slice of the note surrounding ``row``'s token span.

    The note is looked up in the module-level ``notes`` table by
    ``row.notes_id``. The window runs from ``token_start - chars`` to
    ``token_end + chars`` and is clamped to the bounds of the note text.
    """
    note_text = notes[notes.notes_id == row.notes_id].cleaned_notes.values[0]

    lo = row.token_start - chars
    hi = row.token_end + chars

    #rare edge case where token_end didn't get recorded properly:
    #rebuild the end from the start plus the combined token lengths
    if hi < lo:
        hi = row.token_start + chars + sum(len(tok) for tok in row.tokens)

    lo = max(lo, 0)
    hi = min(hi, len(note_text))
    return note_text[lo:hi]

#helper function for check_chunks
#receives list (clinical, wip, bert)
#returns 'b' if any tag starts with 'B' (and none is B-Gender); else 'i' if any
#tag starts with 'I'; else 'o'
def get_ner(row):
    """Collapse the three NER tags of one token into "b", "i" or "o".

    Parameters
    ----------
    row : sequence
        ``[clinical_ner, wip_ner, BERT_ner]`` tags for a single token.

    Returns
    -------
    str
        "b" when at least one tag begins with "B" and none of the three tags
        is "B-Gender"; "o" when a "B" tag is present but one of them is
        "B-Gender"; otherwise "i" when any tag begins with "I"; otherwise "o".

    Notes
    -----
    The previous condition ``row[0] and row[1] and row[2] != "B-Gender"`` only
    compared the last tag, so a "B-Gender" tag in the first or second position
    was still reported as "b". Non-string tags (e.g. NaN from the CSV) are
    now treated as carrying no tag instead of raising on ``tag[0]``.
    """
    tags = [row[0], row[1], row[2]]
    prefixes = [tag[:1] if isinstance(tag, str) else "" for tag in tags]
    if "B" in prefixes:
        if all(tag != "B-Gender" for tag in tags):
            return "b"
        return "o"
    if "I" in prefixes:
        return "i"
    return "o"

#helper function for get_chunks
#packs a run of token rows into a single-row summary frame
def clean_tokens(whole_df, disallow):
    """Summarise consecutive token rows as one row with list-valued cells.

    ``whole_df`` must provide ``tokens``, ``token_start``, ``token_end``,
    ``clinical_ner``, ``wip_ner`` and ``BERT_ner``. The result has one row:
    the token and tag lists, the first row's ``token_start`` and the last
    row's ``token_end``.

    ``disallow`` is accepted but not used: removal of disallowed
    (punctuation) tokens has deliberately been switched off.
    """
    first_row = whole_df.iloc[0]
    last_row = whole_df.iloc[-1]
    record = [
        list(whole_df.tokens),
        first_row.token_start,
        last_row.token_end,
        list(whole_df.clinical_ner),
        list(whole_df.wip_ner),
        list(whole_df.BERT_ner),
    ]
    column_names = ["tokens", "token_start", "token_end", "clinical_ner", "wip_ner", "BERT_ner"]
    return pd.DataFrame([record], columns=column_names)

#Current version of our regex assertions
def newer_regex(row, start=60, end=7):
    """Classify a hit as "Present" or "Absent" from its surrounding text.

    Parameters
    ----------
    row : object with ``text`` (str) and ``tokens`` (list) attributes
        ``text`` is the context string produced by ``get_text``. All windows
        below are sliced around character offset 100, i.e. they assume the
        hit starts 100 characters into ``text`` (the caller uses
        ``chars=100``).
        NOTE(review): that does not hold when the hit lies within 100
        characters of the start of the note, and ``len(row.tokens)`` is a
        token count while the windows are in characters -- confirm both.
    start : int, default 60
        Width of the window before the hit searched for negation words.
    end : int, default 7
        How far past the hit the trailing-negation window extends.

    Returns
    -------
    str
        "Absent" when at least one negation / hypothetical / family-history /
        instruction cue matches and no positive cue matches next to the hit;
        "Present" otherwise.

    All patterns are raw strings: several used to contain invalid escape
    sequences such as '\\s' in ordinary string literals, which newer Python
    versions warn about. The regexes themselves are unchanged.
    """
    text = row.text
    n_tokens = len(row.tokens)

    negation_before = (r'(\bnot?\b|\bden(y(ing)?|ies|ied)|\bneg\w*|never|resolved|absent)'
                       r'\W+(?:\w+\W+){0,2}?')
    negation_after = (r'(\bnot?\b|\bden(y|ies|ied)|\bneg\w*|never|resolved|none|0|absent)'
                      r'\W+(?:\w+\W+){0,2}?')

    # Look for a leading negation close to the hit, unless that window holds
    # two or more commas (list-like text): then search everything before it.
    before_window = text[100 - start:100]
    if len(before_window.split(',')) > 2:
        before_window = text[0:100]

    # (pattern, window) pairs; a match in any of them marks the hit as negated.
    negation_checks = [
        (negation_before, before_window),
        (negation_after, text[100 + n_tokens - 3:100 + n_tokens + end]),
        # empty checkbox / minus markers right before the hit
        (r'\[\]|\(-|--', text[85:100]),
        # instructions and dosing language
        ('if you have|as needed|go to ED if|if other symptoms|(once|twice) (daily|a day)|'
         'tablets at|minimum of 24 hours|recurrence of', text[40:100]),
        # family or past history
        (r'in (his|her|their)*\s*(mother|father|grandmother|grandfather|brother|sister)|'
         r'maternal|paternal|family history|past medical history|PMH|prior history| h o |h/o',
         text[0:100]),
        (r'(disorder|disease(s)*|abuse|Dysfunction|migraines|anxiety)\s*'
         r'(mother|father|grandmother|grandfather|brother|sister|maternal|paternal|uncle|aunt|sibling|sibs)',
         text),
        # text addressed to the patient
        (r'\byou\b|your\b|\bme know|speak to our|how do|contact us', text[0:100]),
        # hypothetical / risk language
        ('(can|may|might|could) be|(can|that) cause|him or her|'
         'may (feel|indicate|experience|include)|risk (of|for)|or new|cautions discussed|help prevent|'
         '(further|additional|any other) symptoms|watch out for|reasons to return', text[0:100]),
        # questions
        (r'\?', text[40:100]),
        # template placeholders
        (r'\[include', text[60:100]),
        # questionnaire scores
        (r'\d \d \d \d \d|score\s*\d|PROMIS', text[60:100]),
        ('possible', text[80:100]),
        ('clinical protocol designed|side effects which|as safety|me or parents', text[0:100]),
        ('denies any residual', text[0:100]),
        ('without', text[80:100]),
    ]
    negated = any(re.search(pattern, window, flags=re.I) is not None
                  for pattern, window in negation_checks)
    if not negated:
        return "Present"

    # A positive cue next to the hit overrides the negation. It does not depend
    # on which negation matched, so it is evaluated once.
    positive_checks = [
        ('positive|affirm|yes|feels|has had', text[80:100]),
        ('positive|affirm|yes', text[100 + n_tokens:100 + n_tokens + 5]),
        (r'\+', text[95:105]),
    ]
    affirmed = any(re.search(pattern, window, flags=re.I) is not None
                   for pattern, window in positive_checks)
    return "Present" if affirmed else "Absent"
      
# Concept name -> regex searched (case-insensitively) in the cleaned notes.
# Each value is one logical string continued across lines with a trailing
# backslash, so continuation lines must start at column 0: any indentation
# would become part of the pattern.
# Every regex escape is written with a doubled backslash. In an ordinary string
# literal a single "\b" is a backspace character, not a word boundary, and
# "\s" is an invalid escape sequence.
terms = {
    "Anosmia/Ageusia": 'anosmia|paraosmia|dysgeusia|(olfactory dysfun)|\
(sense of smell altered)|(unusual smell)|\
(smell\\s*(impairment|loss|defici\\w|insensit\\w))|\
(absence|loss|diminished|disorder|inability|problem|altered|lack|reduced|lost|distorted|distortion|\
abnormal|sensit\\w*)\\s*(?:of|with|to)?\\s*(?:sense of)?\\s*(olfact\\w*|smell(?!ing)|taste)|\
((abnormal|change in)\\s*(smell|taste))|(smell|taste)\\s*alteration',
        "Anxiety": 'adjustment disorder|(anxiety|anxious)\\s?(attack|disorder|state|mood)?|excessive fear|(\
feel[a-z]*|felt)\\s(fearful|scared|frightened|jittery|nervous|tense|terrified|uneasy|worried|\
fidgety|panicked|panicky)|nervous(ness| breakdown)|neuro(tic|sis)|panic attack|\
(trouble|difficulty)\\s(with)?\\srelaxing',
         "Appetite Loss": "anorexia(?!\\s*nervosa)|early satiety|(no|small|little|limited|poor)\\s*appetite|\
(lost|losing|loss|lack[a-z]?)\\s*(of)?\\sappetite|\
(doesn't want|not wanting) to eat|(little|no|lack of) interest in (food|eating)",
         "Chest Pain": "chest pain|pain[s]? in [a-z]* chest",
         # NOTE(review): in the first alternative the suffix (s|ing|ed) is mandatory,
         # so a bare "cough" only matches next to "frequen..." -- confirm that
         # "(s|ing|ed)?" was not intended.
         "Cough": '(barking|choking|chronic|painful|non(-)?\\s{,3}stop|productive|persistent|paroxysmal|mild|moderate|mod|\
severe|frequent|hacking|irritating|morning|nocturnal|daytime|explosive|dry|spasmodic|wet)?\\s{,3}cough(?!assist)(?!response)(s|ing|ed)\\s{,3}\
(during|on|after|variant|varient|while|when)?|(frequen[a-z]*)\\s{,3}(of)?\\s{,3}cough(s|ing)?|\
cough(s|ing|ed)?\\s{,3}frequen(cy|tly|t|)?',
         "Dental/Gum Problems": "(problem[s]?|issue[s]?|difficult(y|ies)) (in|with) (teeth|gum)|\
(poor|bad) gum|(crack|chip)(s|ed)?\\s*(in)?\\s*teeth|mouth sores|dry mouth(?! liquid)|\
(jaw|tooth) pain|pain in (jaw|tooth|teeth)|(swollen|swelling|bleeding)\\s*(in)?\\s*gums|\
(tooth|teeth|gum[s]?) (problems|issues|difficulty|difficulties)",
         "Depression": '(unhappy|depress(ion|ive|ed)(?! right| left| systolic| biventricular| screen| thermodilution)|\
suicid(e|al))|\
((feel(s|ing)|felt)\\s*(sad|worthless|helpless|like a failure))|\
(no reason (for|to) liv(e|ing))|(nothing to look forward to)',
        # Fixed: "\bpuk" and "\bbarf" used a backspace character and could never match.
        "Digestive Issues": '(feel[a-z]*|felt)?\\s*nause[a-z]*|vomit[a-z]*|upset stomach|poor appetite|\
throwing up|\\bretch[a-z]*|\
(hyper)? emesis|sick to stomach|queasy|\\bpuk[a-z]*|\\bbarf[a-z]*|\
(?<!mitral )(?<!valve )(?<!trivial )(?<!aortic )(?<!flow )(?<!tricuspid )regurgit[a-z]*|loose stool|diarrhea|\
(trouble|difficulty|problems)\\s*(with)?\\s*(defecation|defecating|stooling|pooping)|constipat(ed|ion)|\
(?<!vesicoureteral)(?<!vesicoureteric)(?<!urine) reflux\\b',
         "Dizziness": "dizz(y|iness)|vertigo|room\\s*(is|was)? spinning",
         "Excessive Sweating": "(excessive|profuse) sweat(ing|s)?|sweat(s|ing) (too much|excessively|profusely)|\
night sweats|hyperhidrosis|diaphoresis",
         "Excessive Thirst": "excess(ive)? thirst|polydipsia",
         "Fatigue": '((feel\\w*|felt)\\s{,3}(exhaust\\w*|tired|weak|worn out))|\
((fatigu\\w*)|bone-tired|lethargic|listless|malaise|run-down|(general|overall|pervasive|intermittent) weakness|weary|\
sluggish(?! to light)|exhaustion)|((easily|mentally|physically|totally|too)\\s{,3}(tired|drained))|(\
(no|not|out of)\\s{,3}(?:enough )?energ\\w*)|(tire[sd]\\s{,3}(quickly|with exercise|all the time))',
        "Fever": '(?<!hay)fever[a-z]*|pyrexia|(raised|elevated|high)\\s*(body)?\\s*temp(erature)?|shiver[a-z]*|\
temp(erature)?\\s*(spike|raised|elevated|high)|\\bfebrile|(feel[a-z]*|felt)\\s*hot|rigors',
        "Hair Loss": "(loss|shedding) of hair|los(t|ing)\\s*(his|her|my|their)*\\s*hair|\
hair (loss|shedding)|(?<!scar )alopecia|telogen effluvium",
         "Headache": "head\\s*ache[s]?|migraine[s]?(?! team| program| center| psychologist| disability)|\
(?<!goodstart)(?<!extensive)(?<!gerber)(?<!pedsql)(?<!ha to)(?<!reviewed by)(?<!labs due to) \\bha\\b|\
migrainous",
        "Heart Problems": "(palpitations|tachycardia|heart beat(s|ing)\\s*[a-z]*?\\s*(fast|hard)|\
heart rac(ing|es)|heart pound(s|ing))|syncope|long qt",
        "Irritability": "(?<! while )holding\\s*(his|her|their)?\\s*breath|irritab(ility|le)(?!\\s*bowel)(?!\\s*spot)|\
fuss(ing|y|iness)|crabb(y|iness)|agressivity",
        "Myalgia": "myalgia|muscle (ache|pain)[s]?|muscles hurt|(pain|ache)[s]? in muscle[s]?",
        # Fixed: "(?! non-|non )" was a lookahead placed before "tender", which can
        # never fail; lookbehinds are needed to exclude "non-tender" / "non tender".
        "Pain": '\\bpain(?! clinic| center)|stomachache|headache (?! center)|\\bach(e|ing)|\
cramp(s|ing)?|sore\\b|(?<!non-)(?<!non )\\btender(ness)?',
        "Respiratory Signs & Symptoms": '(dyspnea)|((difficult[a-z]*|shallow|[^un]labored|rapid)\\sbreathing)|\
\\bsob\\b|(short[a-z]*|\\bout\\b)\\s*(of)?\\s*breath|\
(unable|struggl[a-z]*) to breath[e]?|wheezing|(gasping for|runs out of) air|\
winded|tight[a-z]* chest|chest\\s*(feels|felt)?\\s*tight[a-z]*|restricted airflow|panting|wheezes|\
asthma|\\brash_placeholder_never_used|\\brhonchi|work of breathing'.replace('\\brash_placeholder_never_used|', ''),
        # Fixed: "\brash" / "\bitch" used a backspace character, and the continuation
        # line was indented, which put 8 literal spaces in front of "skin". The
        # duplicated "\\brash|\\bitch(y|ing)" alternatives are now redundant and
        # were dropped.
        "Skin Signs/Symptoms": "\\brash|(?<!mouth )(?<!throat )pruritis|\\bitch(y|ing)|\
skin\\s*[a-z]*\\s*[a-z]*\\s*itch(y|es|ing)?|erythema|swelling",
         "Cognitive Impairments": "brain fog|(attention|concentration|memory|focus) (issues|problems)|\
(difficulty|trouble|problem[s]?|hard(er)? time|issues)\\s*(with)?\\s*(concentrat(ion|ing)|remembering|attention|focus)|\
(decreased|lower)\\s*(ability (with|to)?)?\\s*(attention|concentrat(e|ion)|remember|focus)\\s(?!urine)",
         "Physical Impairments": "(trouble|difficult(y|ies)|problem[s]?|hard time)\\s*[a-z]*\\s*\
((climbing|taking|using)?\\s*stair|walking|running|exercise|(playing )?sports|dressing|getting dressed)|\
functional limitations|postural control",
        "School Difficulty": '(miss[a-z]*|drop[a-z]*|fail[a-z]*|difficul[a-z]*)\\s*(in|from|at|with)?\\s*(school|class)|\
(below average|poor|bad|low|failing|falling|worse[a-z]*) grade(s)?(?! temp[a-z]*| fever| glioma| astrocytoma)|\
grade(s)?\\s*(are|have)?\\s*(drop[a-z]*|f[ae]ll[a-z]*|worse[a-z]*)|\
(academic|school|scholastic|education(al)?)\\s*(problem|issue|difficult(y|ie)|trouble|struggle)(s)?|\
(doing (poorly|bad(ly)?)|not doing well|problem|issue|difficult(y|ie)|trouble|struggle|struggling|hard time)\
(s)?\\s*(at|with|in|regarding)\\s*(school|grades|class(es)?)',
        "Sleep Problems": "insomnia|(isn't|not) sleeping\\s*((very )?well|enough|much|at all)|\
(trouble|difficult(y|ies)|problems|hard time) sleeping|parasomnia|nightmares|night terrors"
}

# Show full cell contents when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

def to_str(tokens, lower=True):
    """Turn the text form of a token list back into one readable string.

    Parameters
    ----------
    tokens : str
        Python-literal text of a list of strings, e.g. "['dry', 'cough']",
        as produced when a list-valued column is written to CSV and read back.
    lower : bool, default True
        Lower-case every token before joining.

    Returns
    -------
    str
        The stripped tokens joined by single spaces, with the space before
        ",", ":" and ";" removed.

    Raises
    ------
    ValueError, SyntaxError
        If ``tokens`` is not a valid Python literal.

    The text is parsed with ``ast.literal_eval`` rather than ``eval``: it comes
    from a file, and ``eval`` would execute any expression found in it.
    """
    tokens = ast.literal_eval(tokens)
    if lower:
        res = " ".join(tok.lower().strip() for tok in tokens)
    else:
        res = " ".join(tok.strip() for tok in tokens)
    # Re-attach punctuation that tokenisation split off.
    for mark in (",", ":", ";"):
        res = res.replace(" " + mark, mark)
    return res

# Set to False to run the search without writing anything to S3.
save = True

# Run every concept on its own: search, add the regex assertion, and (when
# saving) write one CSV per concept.
for concept, pattern_text in terms.items():
    compiled = {concept: re.compile(pattern_text, flags=re.I)}
    print(concept)
    res = gen_table_text(compiled, filter_out, chars=100)
    has_rows = len(res) > 0
    if has_rows:
        res["regex_assertion"] = res.apply(newer_regex, axis=1)
    print(len(res))
    if not (save and has_rows):
        continue

    out_path = f"s3://{bucket}/{export_name}_subset_" + concept + ".csv"
    # Write once and read back so the list-valued columns arrive as text,
    # which is the form to_str expects.
    res.to_csv(out_path, index=False)
    res = pd.read_csv(out_path)

    res.tokens = res.tokens.apply(to_str)
    # NER tags keep their original case.
    for ner_col in ("clinical_ner", "wip_ner", "BERT_ner"):
        res[ner_col] = res[ner_col].apply(to_str, args=(False,))
    res.to_csv(out_path, index=False)

In [None]:
# Concept names whose per-concept CSVs are merged into one file.
# NOTE(review): this rebinds `terms` from the pattern dict to a list of names,
# so the export cell above cannot be re-run after this cell without first
# re-running the cell that defines the dict.
terms = ["Anosmia/Ageusia", "Anxiety", "Appetite Loss", "Chest Pain", "Cough",
         "Dental/Gum Problems", "Depression", "Digestive Issues", "Dizziness", 
         "Excessive Sweating", "Excessive Thirst", "Fatigue", "Fever", "Hair Loss", "Headache",
         "Heart Problems",  "Irritability", "Myalgia", "Pain", "Respiratory Signs & Symptoms",
        "Skin Signs/Symptoms", "Cognitive Impairments", "Physical Impairments",
        "School Difficulty", "Sleep Problems"]

conc = []
s3 = boto3.client('s3')
for i in terms:
    key = export_name + "_subset_" + i + ".csv"
    try:
        # Cheap existence probe: concepts with no hits never produced a file.
        s3.head_object(Bucket=bucket, Key=key)
    except botocore.exceptions.ClientError as e:
        # Only a missing object is expected here. Anything else (access denied,
        # throttling, wrong bucket) must surface instead of silently dropping
        # a concept from the combined file.
        if e.response.get("Error", {}).get("Code") in ("404", "NoSuchKey", "NotFound"):
            continue
        raise
    df = pd.read_csv(f"s3://{bucket}/{key}")
    conc.append(df)

if conc:
    conc = pd.concat(conc)
    conc.to_csv((f"s3://{bucket}/{export_name}_subset_all.csv"), index=False)
else:
    # pd.concat raises on an empty list; report the situation instead.
    conc = pd.DataFrame()
    print("No per-concept subset files found; nothing to combine.")