In [1]:
!git clone https://github.com/purunfer22/Valx.git

fatal: destination path 'Valx' already exists and is not an empty directory.


In [10]:
import Valx_core
from tqdm import tqdm_notebook
import pandas as pd

In [30]:
# Valx: A system for extracting and structuring numeric lab test comparison statements from text
# Created by Tony HAO, th2510@columbia.edu
# Please kindly cite the paper: Tianyong Hao, Hongfang Liu, Chunhua Weng. Valx: A system for extracting and structuring numeric lab test comparison statements from text. Methods of Information in Medicine. Vol. 55: Issue 3, pp. 266-275, 2016

import W_utility.file as ufile
from W_utility.log import ext_print
import os,sys,re
import Valx_core
debug=True

def extract_variables (fdin, ffea, ffea2, var):
    # read input data
    if fdin is None or fdin =="": return False
    trials = pd.read_csv(fdin,header=None)
    if trials is None or len(trials) <= 0:
        print(ext_print ('input data error, please check either no such file or no data --- interrupting'))
        return False
    print(ext_print ('found a total of %d data items' % len(trials)))
    
    # read feature list - domain knowledge
    if ffea is None or ffea =="": return False
    fea_dict_dk = ufile.read_csv_as_dict_with_multiple_items (ffea)
    if fea_dict_dk is None or len(fea_dict_dk) <= 0:
        print(ext_print ('no feature data available --- interrupting'))
        return False

    # get feature info
    features, feature_dict_dk = {}, {}
    if var == "All":
        features = fea_dict_dk
        del features["Variable name"]
    elif var in fea_dict_dk:
        features = {var:fea_dict_dk[var]}
    for key, value in fea_dict_dk.items():
        names = value[0].lower().split('|')
        for name in names:
            if name.strip() != '': feature_dict_dk[name.strip()] =key

    # read feature list - UMLS (can be replaced by full UMLS)
    if ffea2 is None or ffea2 =="": return False
    fea_dict_umls = ufile.read_csv_as_dict (ffea2)
    if fea_dict_umls is None or len(fea_dict_umls) <= 0:
        print(ext_print ('no feature data available --- interrupting'))
        return False

    #load numeric feature list
    Valx_core.init_features()

    output = []
    for i in tqdm_notebook(range(0,len(trials))):
        if i%1000 == 0:
            print ('processing %d' % i)
        # pre-processing eligibility criteria text
        text = Valx_core.preprocessing(trials.iloc[i,1]) # trials[i][1] is the eligibility criteria text
        if debug: print(text)
        (sections_num, candidates_num) = Valx_core.extract_candidates_numeric(text) # extract candidates containing numeric features
        for j in range(0,len(candidates_num)): # for each candidate
            if debug: print(f"Criteria {j} : {text}")
            exp_text = Valx_core.formalize_expressions(candidates_num[j]) # identify and formalize values
            if debug: print(f"formalize_expressions 1 {j} : {exp_text}")
            (exp_text, key_ngrams) = Valx_core.identify_variable(exp_text, feature_dict_dk, fea_dict_umls) # identify variable mentions and map them to names
            if debug: print(f"formalize_expressions 2 {j} : {exp_text}")
            if debug: print(f"key_ngrams {j} : {key_ngrams}")
            (variables, vars_values) = Valx_core.associate_variable_values(exp_text)
            if debug: print(f"variables {j} : {variables}")
            if debug: print(f"vars_values {j} : {vars_values}")
            print(variables,vars_values)
            all_exps = []
            for k in range(0,len(variables)):
                curr_var = variables[k]
                curr_exps = vars_values[k]
                if curr_var in features:
                    fea_list = features[curr_var]
                    curr_exps = Valx_core.context_validation(curr_exps, fea_list[1], fea_list[2])                           
                    curr_exps = Valx_core.normalization(fea_list[3], curr_exps) # unit conversion and value normalization
                    curr_exps = Valx_core.hr_validation (curr_exps, float(fea_list[4]), float(fea_list[5])) # heuristic rule-based validation
                if len(curr_exps) > 0:
                    if var == "All" or var.lower() == curr_var.lower() or var.lower() in curr_var.lower(): 
                        all_exps += curr_exps                     
#                 print(curr_var)
#                 print(curr_exps)
            if len(all_exps) > 0: 
                output.append((trials.iloc[i,0], sections_num[j], candidates_num[j], exp_text, str(all_exps).replace("u'", "'"))) # output result
        break
    # output result
    fout = os.path.splitext(fdin)[0] + "_exp_%s_out.csv" % var
    ufile.write_csv (fout, output)
    print(ext_print ('saved processed results into: %s' % fout))
    return output


# processing the command line options
import argparse
def _process_args():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', default=r"D:\_My_programs\_CUMC\Extract_Variables\_GitHub\data\example data diabetes_Type 1.csv", help='input: a specific disease')
    parser.add_argument('-f1', default=r"D:\_My_programs\_CUMC\Extract_Variables\_GitHub\data\variable_features_dk.csv", help='input: a feature list')
    parser.add_argument('-f2', default=r"D:\_My_programs\_CUMC\Extract_Variables\_GitHub\data\variable_features_umls.csv", help='input: a feature list')
    parser.add_argument('-v', default="HBA1C", help='Variable name: All, HBA1C, BMI, Glucose, Creatinine, BP-Systolic, BP-Diastolic') # 'All' means to detect all variables
    return parser.parse_args(sys.argv[1:])


# if __name__ == '__main__' :
#     print ''
#     args = _process_args()
#     extract_variables (args.i, args.f1, args.f2, args.v)
#     print ''


In [31]:
extract_variables('data/example data diabetes_Type 1.csv',
                 'data/variable_features_dk.csv',
                 'data/variable_features_umls.csv',
                 'All')

[2020-11-26 15:38:05.695943] found a total of 100 data items


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

processing 0
inclusion criteria:# - individuals 3-45 years old who have an immediate family member with type 1 diabetes (such as a child, parent or sibling)# - individuals 3-20 years old who have an extended family member with type 1 diabetes (such as a cousin, niece, nephew, aunt, uncle, grandparent or half-sibling)# exclusion criteria:# - to be eligible, a person must:# - not have diabetes already.# - have no previous history of being treated with insulin or oral diabetes medications.# - have not received any prior therapy for prevention of type 1 diabetes such as insulin, nicotinamide or immunosuppressive drugs (i.e. have not been involved in any previous clinical studies of these agents.)# - have no known serious diseases.# - if you are a woman, you must not be planning to become pregnant during the course of the study. you will not be excluded from participation but are not encouraged to volunteer in the first place if you plan to have a baby during the trial period).
Criteria 0 :

[]

In [21]:
pd.read_csv("data/example data diabetes_Type 1.csv",header=None).iloc[2,1]

'Inclusion Criteria:#          -  Candidates must be between the ages of 18 and 65#          -  Candidates must have had IDDM for at least 5 years and been under physician care for             at least 6 months prior to enrollment in trial.#          -  Eligible candidates will have poorly controlled insulin-dependent diabetes mellitus             (IDDM) and manifest signs and symptoms severe enough to be incapacitating. These             symptoms can include episodes of hypoglycemic unawareness (failure to recognize blood             glucose levels < 54 mg/dl) or episodes requiring the assistance of others.#          -  Candidates may have poor diabetes control despite intensive insulin therapy (HbA1c >             8.0%).#          -  Creatinine clearance should be > 60 ml/min)#          -  Body Mass Index should be less than 26#          -  Women of child-bearing age must have a negative pregnancy test and agree to follow             effective contraceptive measures for the duration 

In [None]:
ie_text = Valx_core.preprocessing(text)
inclusion_text,exclusion_text = Valx_core.split_text_inclusion_exclusion(ie_text)
(sections_num, candidates_num) = Valx_core.extract_candidates_numeric(ie_text)
exp_text = Valx_core.formalize_expressions(candidates_num[1])
(exp_text, key_ngrams) = Valx_core.identify_variable(exp_text, feature_dict_dk, fea_dict_umls) # identify variable mentions and map them to names
(variables, vars_values) = Valx_core.associate_variable_values(exp_text)