### Import Packages

In [1]:
import ast
import os,sys,re

import pandas as pd
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

import Valx_core
import W_utility.file as ufile
from W_utility.log import ext_print

### Driver function

In [2]:
# Valx: A system for extracting and structuring numeric lab test comparison statements from text
# Created by Tony HAO, th2510@columbia.edu
# Please kindly cite the paper: Tianyong Hao, Hongfang Liu, Chunhua Weng. Valx: A system for extracting and structuring numeric lab test comparison statements from text. Methods of Information in Medicine. Vol. 55: Issue 3, pp. 266-275, 2016

# Set to True to print the intermediate text, expressions and variables
# produced for every candidate inside extract_variables.
debug=False

def extract_variables (trials,fdin, ffea, ffea2, var):
    """Extract numeric variable expressions from criteria text and save them as CSV.

    Parameters
    ----------
    trials : pandas.DataFrame
        Input rows, accessed positionally: column 0 is copied into the output
        as the record id and column 1 is the eligibility criteria text.
    fdin : str
        Path whose extension-stripped stem prefixes the output file name
        (``<stem>_exp_<var>_out.csv``).
    ffea : str
        Path of the domain-knowledge feature list CSV.
    ffea2 : str
        Path of the UMLS feature list CSV.
    var : str
        Name of the variable to keep, or ``"All"`` to keep every variable.

    Returns
    -------
    tuple or bool
        ``(output, trials.values)`` on success, where ``output`` is a list of
        ``(id, section, candidate text, tagged text, expressions as str)``
        tuples. ``False`` when an input is missing or empty.
    """
    # validate input data
    if trials is None or len(trials) <= 0:
        print(ext_print ('input data error, please check either no such file or no data --- interrupting'))
        return False
    if fdin is None:
        # Fail before the (long) extraction loop rather than at save time.
        print(ext_print ('no output path given --- interrupting'))
        return False
    print(ext_print ('found a total of %d data items' % len(trials)))

    # read feature list - domain knowledge
    if ffea is None or ffea =="": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print(ext_print ('no feature data available --- interrupting'))
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        # Drop the CSV header row. `features` deliberately aliases fea_dict_dk,
        # so the header is also excluded from the name mapping built below.
        fea_dict_dk.pop("Variable name", None)
        features = fea_dict_dk
    elif var in fea_dict_dk:
        features = {var:fea_dict_dk[var]}
    # map every lower-cased mention name to its variable key
    for key, value in fea_dict_dk.items():
        for name in value[0].lower().split('|'):
            name = name.strip()
            if name != '':
                feature_dict_dk[name] = key

    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 =="": return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print(ext_print ('no feature data available --- interrupting'))
        return False

    #load numeric feature list
    Valx_core.init_features()

    var_lower = var.lower()
    output = []
    for i in tqdm(range(len(trials))):
        if i%1000 == 0:
            print ('processing %d' % i)
        # pre-processing eligibility criteria text (column 1 of the input)
        text = Valx_core.preprocessing(trials.iloc[i,1])
        if debug: print(text)
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(text) # extract candidates containing numeric features
        for j, candidate in enumerate(candidates_num): # for each candidate
            if debug: print(f"Criteria {j} : {text}")
            exp_text = Valx_core.formalize_expressions(candidate) # identify and formalize values
            if debug: print(f"formalize_expressions 1 {j} : {exp_text}")
            (exp_text, key_ngrams) = Valx_core.identify_variable(exp_text, feature_dict_dk, fea_dict_umls) # identify variable mentions and map them to names
            if debug: print(f"formalize_expressions 2 {j} : {exp_text}")
            if debug: print(f"key_ngrams {j} : {key_ngrams}")
            (variables, vars_values) = Valx_core.associate_variable_values(exp_text)
            if debug: print(f"variables {j} : {variables}")
            if debug: print(f"vars_values {j} : {vars_values}")
            all_exps = []
            for curr_var, curr_exps in zip(variables, vars_values):
                if curr_var in features:
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(curr_exps, fea_list[1], fea_list[2])
                    curr_exps = Valx_core.normalization(fea_list[3], curr_exps) # unit conversion and value normalization
                    curr_exps = Valx_core.hr_validation (curr_exps, float(fea_list[4]), float(fea_list[5])) # heuristic rule-based validation
                # keep the expressions when every variable is requested or the
                # requested name occurs (case-insensitively) in the variable name
                if len(curr_exps) > 0 and (var == "All" or var_lower in curr_var.lower()):
                    all_exps += curr_exps
            if len(all_exps) > 0:
                # str() on Python 3 never emits a u'' prefix; the former
                # .replace("u'", "'") only corrupted strings ending in "u".
                output.append((trials.iloc[i,0], sections_num[j], candidate, exp_text, str(all_exps))) # output result
    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s_out.csv" % var
    pd.DataFrame(output).to_csv(fout,index=False)
    print(ext_print ('saved processed results into: %s' % fout))
    return output,trials.values


def _valx_int_form(value):
    """Return the truncated-integer rendering of `value` as a string (10.0 -> '10', '1.5' -> '1')."""
    try:
        return str(int(value))
    except (TypeError, ValueError, OverflowError):
        pass
    try:
        # Numeric strings with a fractional part ('1.5') need the float detour.
        return str(int(float(value)))
    except (TypeError, ValueError, OverflowError):
        return str(value)


def _valx_entity(text, entity_type, start_index, end_index):
    """Build one result record in the output schema used by process_valx_results."""
    return {'Entity': text,
            'EntityType': entity_type,
            'StartIndex': start_index,
            'EndIndex': end_index,
            'Confidence': 1}


def process_valx_results(original_text, valx_outputs) : 
    """Locate extracted values (and their units) inside the original text.

    Parameters
    ----------
    original_text : str
        Text that is split into word blocks by ``get_words_space_blocks``;
        each block is read through its ``'word'``, ``'start_index'`` and
        ``'end_index'`` keys.
    valx_outputs : iterable
        Rows as produced by ``extract_variables``; element 4 of every row is
        the expression list, either already a list or its ``str()`` form.
        Each expression is indexed as ``[label, _, value, unit]``.

    Returns
    -------
    list of dict
        One record per located value with the keys ``Entity``,
        ``EntityType``, ``StartIndex``, ``EndIndex`` and ``Confidence``.
    """
    word_blocks = get_words_space_blocks(original_text)
    all_words = [word_block['word'] for word_block in word_blocks]
    count_word_blocks = len(word_blocks)
    word_block_index = 0
    result = []

    for output in valx_outputs :
        value_exps = output[4]
        if isinstance(value_exps, str):
            # The serialized form is a plain literal; literal_eval parses it
            # without executing arbitrary code the way eval would.
            value_exps = ast.literal_eval(value_exps)

        for value_exp in value_exps :
            value_type, value, unit = value_exp[0], value_exp[2], value_exp[3]
            value_str = str(value)
            value_int_str = _valx_int_form(value)
            unit_len = len(unit.split(" "))

            float_count = all_words.count(value_str)
            int_count = all_words.count(value_int_str)
            value_count = float_count + int_count

            if debug:
                print(value_exp)
                print('value_count', value_count)
                print(word_block_index)

            # Skip a value that is already part of the previous entity of the
            # same type (e.g. the two bounds of one range expression).
            if result and result[-1]['EntityType'] == value_type:
                last_groups = get_alphanumeric_groups(result[-1]['Entity'])
                if value_int_str in last_groups or value_str in last_groups:
                    continue

            if word_block_index == count_word_blocks :
                break

            if value_count == 1 :
                # Exactly one matching word: jump straight to it.
                word_block_index = all_words.index(value_str if float_count == 1 else value_int_str)
                block = word_blocks[word_block_index]
                word = block['word']

                # NOTE(review): this bound is one stricter than the "- 1" used
                # in the scanning branch below, so a value whose unit ends the
                # text is reported without its unit - confirm this is intended.
                if word_block_index <= count_word_blocks - unit_len - 2 :
                    next_word_blocks = word_blocks[word_block_index+1:word_block_index+unit_len+1]
                    unit_word = " ".join(next_block['word'] for next_block in next_word_blocks)
                    if unit_word == unit :
                        end_index = next_word_blocks[-1]['end_index'] if next_word_blocks else block['end_index']
                        result.append(_valx_entity(" ".join([word, unit]), value_type,
                                                   block['start_index'], end_index))
                        word_block_index = word_block_index + unit_len + 1
                        continue

                # No unit match: report the bare value word.
                result.append(_valx_entity(word, value_type,
                                           block['start_index'], block['end_index']))
                word_block_index = word_block_index + 1

            else :
                # Zero or several candidates: scan forward from the current position.
                while word_block_index < count_word_blocks :
                    block = word_blocks[word_block_index]
                    word = block['word']
                    all_alphanumerics = get_alphanumeric_groups(word)

                    if value_str in all_alphanumerics or value_int_str in all_alphanumerics :
                        if word_block_index <= count_word_blocks - unit_len - 1 :
                            next_word_blocks = word_blocks[word_block_index+1:word_block_index+unit_len+1]
                            unit_word = " ".join(next_block['word'] for next_block in next_word_blocks)
                            if unit_word == unit :
                                end_index = next_word_blocks[-1]['end_index'] if next_word_blocks else block['end_index']
                                result.append(_valx_entity(" ".join([word, unit]), value_type,
                                                           block['start_index'], end_index))
                                word_block_index = word_block_index + unit_len + 1
                                break
                    elif value_str+unit in all_alphanumerics or value_int_str+unit in all_alphanumerics :
                        # Value and unit written as a single token, e.g. "10mg".
                        result.append(_valx_entity(word, value_type,
                                                   block['start_index'], block['end_index']))
                        word_block_index = word_block_index + 1
                        break
                    word_block_index = word_block_index + 1

    return result

import argparse
def _process_args():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', default=r"D:\_My_programs\_CUMC\Extract_Variables\_GitHub\data\example data diabetes_Type 1.csv", help='input: a specific disease')
    parser.add_argument('-f1', default=r"D:\_My_programs\_CUMC\Extract_Variables\_GitHub\data\variable_features_dk.csv", help='input: a feature list')
    parser.add_argument('-f2', default=r"D:\_My_programs\_CUMC\Extract_Variables\_GitHub\data\variable_features_umls.csv", help='input: a feature list')
    parser.add_argument('-v', default="HBA1C", help='Variable name: All, HBA1C, BMI, Glucose, Creatinine, BP-Systolic, BP-Diastolic') # 'All' means to detect all variables
    return parser.parse_args(sys.argv[1:])


### Function call

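`extract_variables` takes five arguments:

1. `trials` :- Input DataFrame. Column 0 is the trial id and column 1 is the eligibility criteria text
2. `fdin` :- Path prefix of the output file. Results are saved to `<fdin without extension>_exp_<var>_out.csv`
3. `ffea` :- Variable features (domain knowledge) CSV. Feel free to add rows according to the domain
4. `ffea2` :- UMLS terms CSV
5. `var` :- Name of the variable to extract, or `All` to extract every variable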

In [3]:
# Columns kept from the trial export; extract_variables reads the first two
# positionally (trial id, then eligibility criteria text).
ELIGIBILITY_COLUMNS = [
    'nct_id',
    'eligibility_criteria_textblock',
    'eligibility_criteria_minimum_age',
    'eligibility_criteria_maximum_age',
    'eligibility_criteria_gender',
]
# Load the trials and drop every row with a missing value in a kept column.
ct_gov_df = pd.read_csv("data/nsclc/trail_info.csv")[ELIGIBILITY_COLUMNS].dropna()
out,input_ = extract_variables(
    trials=ct_gov_df,
    fdin='data/nsclc/output',
    ffea='data/variable_features_dk.csv',
    ffea2='data/variable_features_umls.csv',
    var='All',
)

[2020-11-26 16:35:37.923722] found a total of 1350 data items


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1350.0), HTML(value='')))

processing 0
processing 1000

[2020-11-26 16:36:07.740097] saved processed results into: data/nsclc/output_exp_All_out.csv


In [4]:
# Turn the extraction results into a table; column 0 holds the trial id.
out = pd.DataFrame(out)
trial_id_column = out.iloc[:, 0]
# Trials with at least one extracted expression, and the trials without any.
extracted_ids = set(trial_id_column.unique().tolist())
missed_ids = set(ct_gov_df['nct_id']) - extracted_ids

In [5]:
# Trial ids present in ct_gov_df for which no expression was extracted.
missed_ids

{'NCT00175578',
 'NCT00191841',
 'NCT00280202',
 'NCT00365963',
 'NCT00434668',
 'NCT00464282',
 'NCT00471978',
 'NCT00484016',
 'NCT00530205',
 'NCT00563160',
 'NCT00608868',
 'NCT00684385',
 'NCT00797238',
 'NCT00828022',
 'NCT00897117',
 'NCT00898417',
 'NCT00900172',
 'NCT00904514',
 'NCT01024062',
 'NCT01123460',
 'NCT01124669',
 'NCT01159288',
 'NCT01255150',
 'NCT01332240',
 'NCT01386203',
 'NCT01516983',
 'NCT01605916',
 'NCT01620853',
 'NCT01719536',
 'NCT01744925',
 'NCT01885754',
 'NCT01926171',
 'NCT01947062',
 'NCT01947868',
 'NCT02223611',
 'NCT02416726',
 'NCT02420405',
 'NCT02445924',
 'NCT02502240',
 'NCT02515760',
 'NCT02595450',
 'NCT02758054',
 'NCT02799862',
 'NCT02951897',
 'NCT02954991',
 'NCT02975752',
 'NCT02991924',
 'NCT03090815',
 'NCT03125603',
 'NCT03134534',
 'NCT03141957',
 'NCT03188562',
 'NCT03219970',
 'NCT03240250',
 'NCT03340506',
 'NCT03392506',
 'NCT03454685',
 'NCT03504098',
 'NCT03509779',
 'NCT03546452',
 'NCT03598296',
 'NCT03647098',
 'NCT037

In [6]:
# Inspect the criteria text of one trial for which nothing was extracted.
ct_gov_df.loc[ct_gov_df['nct_id'] == 'NCT01744925', 'eligibility_criteria_textblock'].values

array(['\n        Inclusion Criteria:\r\n\r\n          -  Recurrent or progressive Non-Small Cell Lung Cancer stage IV or IIIB patients with\r\n             Histologic or cytologic confirmation.\r\n\r\n          -  Wild type epidermal growth factor receptor status.\r\n\r\n          -  Progressed after first-line chemotherapy.\r\n\r\n          -  No previous systemic anticancer therapy.\r\n\r\n          -  Measurable lesion according to response evaluation criteria in solid tumors with at\r\n             least one measurable lesion not previously irradiated.\r\n\r\n          -  Provision of written informed consent.\r\n\r\n        Exclusion Criteria:\r\n\r\n          -  Evidence of clinically active Interstitial Lung Diseases (Patients with chronic,\r\n             stable, radiographic changes who are asymptomatic need not be excluded).\r\n\r\n          -  Positive epidermal growth factor receptor mutation.\r\n\r\n          -  Known severe hypersensitivity to icotinib or any of the exci

In [None]:
# Show the table of extracted expressions.
out

In [7]:
# processed_ids = ["NCT00004984","NCT00005665","NCT00021788","NCT00021801","NCT00034255","NCT00042458","NCT00042471","NCT00042601","NCT00046150","NCT00063128","NCT00071448","NCT00095082","NCT00097292","NCT00100178","NCT00105352","NCT00107107","NCT00108004","NCT00109434","NCT00117026","NCT00117780","NCT00118937","NCT00118976","NCT00119041","NCT00129259","NCT00130481","NCT00131755","NCT00133809","NCT00135915","NCT00140543","NCT00141986","NCT00142922","NCT00143949","NCT00145353","NCT00145379","NCT00146484","NCT00147342","NCT00148538","NCT00160732","NCT00175253","NCT00175266","NCT00179777","NCT00184639","NCT00184665","NCT00187564","NCT00190502","NCT00191581","NCT00198146","NCT00206258","NCT00206297","NCT00206401","NCT00211510","NCT00211536","NCT00212329","NCT00214214","NCT00214253","NCT00223613","NCT00226902","NCT00229658","NCT00239148","NCT00252720","NCT00252733","NCT00254501","NCT00260234","NCT00265473","NCT00271284","NCT00272090","NCT00273286","NCT00276250","NCT00276393","NCT00278980","NCT00279305","NCT00279318","NCT00283218","NCT00284232","NCT00285194","NCT00285233","NCT00286624","NCT00286962","NCT00290979","NCT00291772","NCT00297401","NCT00297583","NCT00297635","NCT00298740","NCT00303134","NCT00304538","NCT00305344","NCT00306098"]

In [8]:
# text = ct_gov_df[~ct_gov_df['nct_id'].isin(processed_ids)]['eligibility_criteria_textblock'][6]
# Valx_core.preprocessing(text)

In [9]:
# Show the table of extracted expressions (one row per matched candidate).
out

Unnamed: 0,0,1,2,3,4
0,NCT00002520,Inclusion,must have smoked 1 or more cigarettes within t...,must have <VL Label=smoked Source=ngram>smoked...,"[['smoked', '>=', '1', 'cigarettes']]"
1,NCT00002520,Inclusion,ecog 0-1,<VL Label=ECOG Source=DK>ecog</VL> <VML Logic=...,"[['ECOG', '>=', 0.0, ''], ['ECOG', '<=', 1.0, ..."
2,NCT00002583,Inclusion,10 if complete mediastinal lymph node resectio...,<VML Logic=equal Unit=>10</VML> if complete me...,"[['Lymph node', '=', 10.0, '']]"
3,NCT00002583,Inclusion,lymph node which measured 1.5 cm or more on pr...,<VL Label=Lymph node Source=DK>lymph node</VL>...,"[['Lymph node', '>=', 1.5, 'cm']]"
4,NCT00002583,Inclusion,found to be free of metastatic involvement dis...,<VL Label=found to be free of metastatic invol...,[['found to be free of metastatic involvement ...
...,...,...,...,...,...
13684,NCT04606303,Inclusion,"bone marrow hematopoietic function is good, le...",<VL Label=bone marrow hematopoietic function i...,[['bone marrow hematopoietic function is good'...
13685,NCT04606303,Inclusion,hemoglobin> 10g/dl,<VL Label=hemoglobin Source=ngram>hemoglobin</...,"[['hemoglobin', '>', '10', 'g/dl']]"
13686,NCT04606303,Inclusion,"good renal function, glomerular filtration rat...","good renal function, <VL Label=glomerular filt...","[['glomerular filtration rate', '>', '60', 'ml..."
13687,NCT04606303,Inclusion,"good liver function, total bilirubin(tbil)<1.5...","good liver function, <VL Label=total bilirubin...","[['total bilirubin level', '<', '1.5', 'uln'],..."
