This script appends an `_rf` suffix to every ground-truth risk-factor code found in the dataset.

In [1]:
import pandas as pd
import re
import torch

In [6]:
# Which extract to process and which variant of it.
FNAME = '20110101'  # either a month identifier or "all"
DATA_TYPE = 'original'  # either "downsampled" or "original"

SEQ_LEN = 1000

# Shared path prefixes so every input/output location is spelled only once.
_DATA_ROOT = "../../../data/AE_CDiff_d00845"
_PREPROCESSED_DIR = f"{_DATA_ROOT}/output/data/1000/{DATA_TYPE}/preprocessed"
_SPLITS_DIR = f"{_PREPROCESSED_DIR}/splits/{FNAME}"

# Inputs: the full preprocessed file and its train/val/test splits.
ALL_DATA_PATH = f"{_PREPROCESSED_DIR}/{FNAME}.csv"
TRAIN_DATA_PATH = f"{_SPLITS_DIR}/train.csv"
VALID_DATA_PATH = f"{_SPLITS_DIR}/val.csv"
TEST_DATA_PATH = f"{_SPLITS_DIR}/test.csv"

# Outputs: written next to the inputs with a "2" marker in the file name.
OUT_ALL_DATA_PATH = f"{_PREPROCESSED_DIR}/{FNAME}_2.csv"
OUT_TRAIN_DATA_PATH = f"{_SPLITS_DIR}/train2.csv"
OUT_VALID_DATA_PATH = f"{_SPLITS_DIR}/val2.csv"
OUT_TEST_DATA_PATH = f"{_SPLITS_DIR}/test2.csv"

# Ground-truth risk-factor codes file.
# NOTE: the output path equals the input path, so writing to it replaces
# the original codes file in place.
GT_CODES_PATH = f"{_DATA_ROOT}/cdiff_risk_factors_codes.csv"
OUT_GT_CODES_PATH = f"{_DATA_ROOT}/cdiff_risk_factors_codes.csv"

In [3]:
# Load the ground-truth risk-factor code table and preview its size and
# first rows.
df_codes = pd.read_csv(GT_CODES_PATH)
print(df_codes.shape)
df_codes.head()

(135, 5)


Unnamed: 0,Code,Code_System,Internal_Code,Group,Description
0,555.0,ICD-9 Diagnosis,d_5550,Inflammatory Bowel Disease,"Crohn’s disease of small intestine, including ..."
1,555.1,ICD-9 Diagnosis,d_5551,Inflammatory Bowel Disease,Crohn’s disease of large intestine (regional c...
2,555.2,ICD-9 Diagnosis,d_5552,Inflammatory Bowel Disease,Crohn’s disease of small intestine with large ...
3,555.9,ICD-9 Diagnosis,d_5559,Inflammatory Bowel Disease,Crohn’s disease of unspecified site (regional ...
4,713.1,ICD-9 Diagnosis,d_7131,Inflammatory Bowel Disease,Arthropathy associated with gastrointestinal c...


In [4]:
# Preview the last rows of the code table (the wildcard "*" procedure codes
# appear here in the recorded output).
df_codes.tail()

Unnamed: 0,Code,Code_System,Internal_Code,Group,Description
130,50.*,ICD-9 Procedure,p_50*,Surgery,Operations On Liver
131,51.*,ICD-9 Procedure,p_51*,Surgery,Operations On Gallbladder And Biliary Tract
132,52.*,ICD-9 Procedure,p_52*,Surgery,Operations On Pancreas
133,53.*,ICD-9 Procedure,p_53*,Surgery,Repair Of Hernia
134,54.*,ICD-9 Procedure,p_54*,Surgery,Other Operations On Abdominal Region


In [7]:
def add_rf_suffix0(row):
    """Return the row's ``Internal_Code`` value with ``_rf`` appended.

    Args:
        row: A label-indexable record (e.g. one row of the codes table)
            that contains an ``'Internal_Code'`` entry.

    Returns:
        The internal code followed by the ``_rf`` marker.
    """
    suffix = '_rf'
    return row['Internal_Code'] + suffix
# Derive the suffixed code for every row of the code table and store it in a
# new column next to the original Internal_Code.
df_codes['Internal_Code2'] = df_codes.apply(add_rf_suffix0, axis=1)

# NOTE(review): OUT_GT_CODES_PATH is configured to the same path as
# GT_CODES_PATH, so this overwrites the input codes file in place -- confirm
# that is intended.
df_codes.to_csv(OUT_GT_CODES_PATH, index=False)

In [8]:
# Preview the first rows again to check the new Internal_Code2 column.
df_codes.head()

Unnamed: 0,Code,Code_System,Internal_Code,Group,Description,Internal_Code2
0,555.0,ICD-9 Diagnosis,d_5550,Inflammatory Bowel Disease,"Crohn’s disease of small intestine, including ...",d_5550_rf
1,555.1,ICD-9 Diagnosis,d_5551,Inflammatory Bowel Disease,Crohn’s disease of large intestine (regional c...,d_5551_rf
2,555.2,ICD-9 Diagnosis,d_5552,Inflammatory Bowel Disease,Crohn’s disease of small intestine with large ...,d_5552_rf
3,555.9,ICD-9 Diagnosis,d_5559,Inflammatory Bowel Disease,Crohn’s disease of unspecified site (regional ...,d_5559_rf
4,713.1,ICD-9 Diagnosis,d_7131,Inflammatory Bowel Disease,Arthropathy associated with gastrointestinal c...,d_7131_rf


In [None]:
def add_rf_suffix_gt(row0):
    """Return the ``Internal_Code`` of a ground-truth codes row with ``_rf`` appended.

    Args:
        row0: A label-indexable row of the codes table (e.g. a pandas Series
            or a dict) that contains an ``'Internal_Code'`` entry.

    Returns:
        The internal code followed by the ``_rf`` marker.

    Raises:
        KeyError: If the row has no ``'Internal_Code'`` entry.
    """
    # Index the row by label directly. Converting it with ``tolist()`` first
    # discards the labels, so the string lookup could only raise TypeError.
    return row0['Internal_Code'] + '_rf'


In [55]:
def add_rf_suffix(row0, gt_codes_no_star, gt_codes_star):
    """Tag the ground-truth risk-factor codes in one dataset row with ``_rf``.

    Args:
        row0: One row of tokens; any object exposing ``tolist()`` (the
            notebook passes a pandas Series via ``DataFrame.apply(axis=1)``).
        gt_codes_no_star: Codes that a token must equal exactly to be tagged.
        gt_codes_star: Code prefixes (wildcard codes with the ``*`` removed);
            a token is tagged when it starts with any of them.

    Returns:
        A list holding the row's tokens, with matching tokens suffixed by
        ``_rf``, followed by two extra entries: the number of tokens ending
        in ``_rf`` and a 0/1 flag telling whether that number is positive.
        Tokens that already end in ``_rf`` on input are counted as well.
    """
    # Build the lookup structures once per row: a set gives O(1) exact
    # matches, and str.startswith accepts a tuple of prefixes in one call.
    exact_codes = set(gt_codes_no_star)
    prefixes = tuple(gt_codes_star)

    tagged = []
    num_gt_codes = 0
    for token in row0.tolist():
        # Non-string cells (NaN, None, numbers) cannot be codes; pass them
        # through unchanged instead of failing on a missing str method.
        if isinstance(token, str):
            if token in exact_codes or token.startswith(prefixes):
                token += '_rf'
            if token.endswith('_rf'):
                num_gt_codes += 1
        tagged.append(token)

    tagged.append(num_gt_codes)
    tagged.append(1 if num_gt_codes > 0 else 0)
    return tagged

In [56]:
# All internal ground-truth codes, in table order.
gt_codes = df_codes['Internal_Code'].tolist()

# Codes ending in "*" are wildcards: keep them as bare prefixes (star removed).
# Every other code has to match a token exactly.
gt_with_stars = [wild.replace('*', '') for wild in gt_codes if wild.endswith('*')]
gt_no_stars = [exact for exact in gt_codes if not exact.endswith('*')]

# Sanity check: the two groups together should account for every code.
(
    len(gt_codes),
    len(gt_with_stars),
    len(gt_no_stars),
    len(gt_with_stars) + len(gt_no_stars),
)

(135, 13, 122, 135)

In [57]:
# Show the wildcard code prefixes (trailing "*" already removed).
print(gt_with_stars)

['p_42', 'p_43', 'p_44', 'p_45', 'p_46', 'p_47', 'p_48', 'p_49', 'p_50', 'p_51', 'p_52', 'p_53', 'p_54']


In [58]:
# Show the codes that must match a token exactly.
print(gt_no_stars)

['d_5550', 'd_5551', 'd_5552', 'd_5559', 'd_7131', 'd_566', 'd_56981', 'd_56089', 'd_556', 'd_5561', 'd_5562', 'd_5563', 'd_5564', 'd_5565', 'd_5566', 'd_5568', 'd_5569', 'd_5589', 'd_5641', 'd_56210', 'd_56112', 'd_56211', 'd_56213', 'd_1530', 'd_1531', 'd_1532', 'd_1533', 'd_1534', 'd_1535', 'd_1536', 'd_1537', 'd_1538', 'd_1539', 'd_1540', 'd_1541', 'd_1542', 'd_1543', 'd_1548', 'd_1975', 'd_2303', 'd_2304', 'p_4523', 'p_4525', 'p_4542', 'p_4592', 'p_4593', 'p_4594', 'p_485', 'p_4862', 'p_V42', 'p_V420', 'p_V421', 'p_V422', 'p_V423', 'p_V424', 'p_V425', 'p_V426', 'p_V427', 'p_V428', 'p_V4281', 'p_V4282', 'p_V4283', 'p_V4284', 'p_V4289', 'p_V429', 'h_45378', 'h_G0104', 'h_G0105', 'h_G0121', 'h_58558', 'h_44620', 'h_44625', 'h_44626', 'h_44139', 'h_44140', 'h_44141', 'h_44143', 'h_44144', 'h_44145', 'h_44146', 'h_44147', 'h_44150', 'h_44151', 'h_44155', 'h_44156', 'h_44157', 'h_44158', 'h_44160', 'h_44320', 'h_44322', 'h_44799', 'h_45110', 'h_45111', 'h_45112', 'h_45113', 'h_45114', '

In [59]:
# Labels of the token columns: the strings "SEQ_LEN-1" down to "0".
columns = [str(i) for i in range(SEQ_LEN-1, -1, -1)]
# Token columns plus the two summary values add_rf_suffix appends per row.
columns2 = columns + ['num_gt_codes', 'has_gt_codes']
# Row limit passed to read_csv; None reads the whole file.
nrows = None

#All Data
df = pd.read_csv(ALL_DATA_PATH, nrows=nrows)
# Tag the risk-factor tokens row by row; each result is a list with the
# rewritten tokens followed by num_gt_codes and has_gt_codes.
results = df[columns].apply(add_rf_suffix, args=(gt_no_stars, gt_with_stars), axis=1)
# NOTE(review): only has_gt_codes is pre-created here (not num_gt_codes), and
# the split cells have no such guard -- confirm whether it is still needed.
if 'has_gt_codes' not in df.columns:
    df['has_gt_codes'] = 0
# Write the rewritten tokens and the two summary values back into the frame.
df[columns2] = results.tolist()
df.to_csv(OUT_ALL_DATA_PATH, index=False)
print(df.shape)
df.head()

KeyboardInterrupt: 

In [None]:
#Train Data
# Same tagging pass as for the full file, applied to the training split.
df = pd.read_csv(TRAIN_DATA_PATH, nrows=nrows)
# Each result is the rewritten token list followed by num_gt_codes and
# has_gt_codes.
results = df[columns].apply(add_rf_suffix, args=(gt_no_stars, gt_with_stars), axis=1)
# Write the rewritten tokens and the two summary values back into the frame.
df[columns2] = results.tolist()
df.to_csv(OUT_TRAIN_DATA_PATH, index=False)
print(df.shape)
df.head()

In [None]:
#Valid Data
# Same tagging pass as for the full file, applied to the validation split.
df = pd.read_csv(VALID_DATA_PATH, nrows=nrows)
# Each result is the rewritten token list followed by num_gt_codes and
# has_gt_codes.
results = df[columns].apply(add_rf_suffix, args=(gt_no_stars, gt_with_stars), axis=1)
# Write the rewritten tokens and the two summary values back into the frame.
df[columns2] = results.tolist()
df.to_csv(OUT_VALID_DATA_PATH, index=False)
print(df.shape)
df.head()

In [None]:
#Test Data
# Same tagging pass as for the full file, applied to the test split.
df = pd.read_csv(TEST_DATA_PATH, nrows=nrows)
# Each result is the rewritten token list followed by num_gt_codes and
# has_gt_codes.
results = df[columns].apply(add_rf_suffix, args=(gt_no_stars, gt_with_stars), axis=1)
# Write the rewritten tokens and the two summary values back into the frame.
df[columns2] = results.tolist()
df.to_csv(OUT_TEST_DATA_PATH, index=False)
print(df.shape)
df.head()