In [1]:
# path to user functions
import sys  
sys.path.append('../Src/')

from platform import python_version

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import importlib 

# import user functions
import UserUtilityFunctions as uf
import UserStatisticalFunctions as usf
import UserVisualization as uv

# set seaborn theme
sns.set_theme()

# print versions
print("Numpy Version: " + np.__version__)
print("Pandas Version: " + pd.__version__)
print("Seaborn Version: " + sns.__version__)
print("Matplotlib Version: " + plt.matplotlib.__version__)
print("Python Version: " + python_version())

# adjust pandas display options to max
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# adjust pandas display options to ensure full display of content
pd.set_option('display.max_colwidth', None)

# initializing variables
UNKNOWN = '** UNKNOWN **'
REMOVE = '** REMOVE **'
DROP = '** DROP **'
LABEL = '** LABEL **'

# initialize DataFrame
df_label = pd.DataFrame(columns=['column'])
df_can = pd.DataFrame(columns=['column'])
df_don = pd.DataFrame(columns=['column'])
df_drop = pd.DataFrame(columns=['column'])
df_both = pd.DataFrame(columns=['column'])
df_nominal = pd.DataFrame(columns=['column'])
df_ordinal = pd.DataFrame(columns=['column'])
df_numeric = pd.DataFrame(columns=['column'])
df_object = pd.DataFrame(columns=['column'])
df_unknown = pd.DataFrame(columns=['column'])
df_date = pd.DataFrame(columns=['column'])

Numpy Version: 1.26.4
Pandas Version: 2.2.3
Seaborn Version: 0.13.2
Matplotlib Version: 3.9.2
Python Version: 3.9.20


## Import Datasets

### Excel Data Dictionary

In [2]:
# open the Excel file
xls_star = pd.ExcelFile('../Docs/optn-star-files-data-dictionary.xlsx')
# get the list of sheet names
print(xls_star.sheet_names)

['Document map', 'DECEASED_DONOR_DATA', 'DECEASED_DONOR_DCD_MEASURES', 'DECEASED_DONOR_INOTROPIC_MEDS', 'INTESTINE_ADDTL_HLA', 'INTESTINE_DATA', 'INTESTINE_FOLLOWUP_DATA', 'INTESTINE_IMMUNO_DISCHARGE_DATA', 'INTESTINE_IMMUNO_FOLLOWUP_DATA', 'INTESTINE_MALIG_FOLLOWUP_DATA', 'INTESTINE_PRA_CROSSMATCH_DATA', 'INTESTINE_WLHISTORY_DATA', 'KIDNEY_FOLLOWUP_DATA', 'KIDNEY_MALIG_FOLLOWUP_DATA', 'KIDPAN_ADDTL_HLA', 'KIDPAN_DATA', 'KIDPAN_FOLLOWUP_DATA', 'KIDPAN_IMMUNO_DISCHARGE_DATA', 'KIDPAN_IMMUNO_FOLLOWUP_DATA', 'KIDPAN_MALIG_FOLLOWUP_DATA', 'KIDPAN_PRA_CROSSMATCH_DATA', 'KIDPAN_WLHISTORY_DATA', 'LIVER_ADDTL_HLA', 'LIVER_DATA', 'LIVER_EXCEPTION_DATA', 'LIVER_EXPLANT_DATA', 'LIVER_FOLLOWUP_DATA', 'LIVER_IMMUNO_DISCHARGE_DATA', 'LIVER_IMMUNO_FOLLOWUP_DATA', 'LIVER_MALIG_FOLLOWUP_DATA', 'LIVER_PRA_CROSSMATCH_DATA', 'LIVER_WLHISTORY_DATA', 'LIVING_DONOR_DATA', 'LIVING_DONOR_FOLLOWUP_DATA', 'PANCREAS_FOLLOWUP_DATA', 'PANCREAS_MALIG_FOLLOWUP_DATA', 'THORACIC_ADDTL_HLA', 'THORACIC_DATA (2)', 'THORAC

In [3]:
# Open XLS workbook - data dictionary
df_dict = pd.read_excel('../Docs/optn-star-files-data-dictionary.xlsx',  sheet_name='THORACIC_DATA', header=1)

In [4]:
df_dict.dtypes

VARIABLE NAME                  object
DESCRIPTION                    object
FORM                           object
VAR START DATE         datetime64[ns]
VAR END DATE           datetime64[ns]
FORM SECTION                   object
DATA TYPE                      object
SAS ANALYSIS FORMAT            object
COMMENT                        object
dtype: object

In [5]:
df_dict.head()

Unnamed: 0,VARIABLE NAME,DESCRIPTION,FORM,VAR START DATE,VAR END DATE,FORM SECTION,DATA TYPE,SAS ANALYSIS FORMAT,COMMENT
0,ABN_CONGEN_DON,DDR:Structural Abnormalities //Congenital:,DDR,2004-06-30,NaT,ORGAN RECOVERY,CHAR(1),,
1,ABN_LVH_DON,DDR:Structural Abnormalities //LVH:,DDR,2004-06-30,NaT,ORGAN RECOVERY,CHAR(1),,
2,ABN_VALVES_DON,DDR:Structural Abnormalities //Valves:,DDR,2004-06-30,NaT,ORGAN RECOVERY,CHAR(1),,
3,ABO,RECIPIENT BLOOD GROUP @ REGISTRATION,TCR,1987-10-01,NaT,CLINICAL INFORMATION,CHAR(3),ABO,
4,ABO_DON,DONOR BLOOD TYPE,DDR/LDR,1987-10-01,NaT,DONOR INFORMATION,CHAR(3),ABO,


In [6]:
# rename column name dictionary
renameCols = {"VARIABLE NAME": "Feature", "DESCRIPTION": "Description", "FORM": "Form", \
                  "VAR START DATE": "FeatureStartDate" ,"VAR END DATE": "FeatureEndDate", "FORM SECTION": "FormSection", \
                  "DATA TYPE": "DataType", "SAS ANALYSIS FORMAT": "SASAnalysisFormat", "COMMENT": "Comment"}

# rename columns
df_dict = df_dict.rename(columns=renameCols)

# create additional feature information
df_dict['OrginalFeature'] = df_dict['Feature']
df_dict['FeatureType'] = 'Unknown'
df_dict['Information'] = 'Unknown'
df_dict[['FormSection', 'SASAnalysisFormat', 'Comment']] = df_dict[['FormSection', 'SASAnalysisFormat', 'Comment']].fillna('')

# display
df_dict.head()

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
0,ABN_CONGEN_DON,DDR:Structural Abnormalities //Congenital:,DDR,2004-06-30,NaT,ORGAN RECOVERY,CHAR(1),,,ABN_CONGEN_DON,Unknown,Unknown
1,ABN_LVH_DON,DDR:Structural Abnormalities //LVH:,DDR,2004-06-30,NaT,ORGAN RECOVERY,CHAR(1),,,ABN_LVH_DON,Unknown,Unknown
2,ABN_VALVES_DON,DDR:Structural Abnormalities //Valves:,DDR,2004-06-30,NaT,ORGAN RECOVERY,CHAR(1),,,ABN_VALVES_DON,Unknown,Unknown
3,ABO,RECIPIENT BLOOD GROUP @ REGISTRATION,TCR,1987-10-01,NaT,CLINICAL INFORMATION,CHAR(3),ABO,,ABO,Unknown,Unknown
4,ABO_DON,DONOR BLOOD TYPE,DDR/LDR,1987-10-01,NaT,DONOR INFORMATION,CHAR(3),ABO,,ABO_DON,Unknown,Unknown


### Flat File - Mapping Category Features

In [7]:
# flat file for category codes
df_flat = pd.read_csv('../Data/New_FlatFile.csv')

In [8]:
df_flat.head()

Unnamed: 0,LABEL,FMTNAME,TYPE,CODE
0,Not Reported,ABNBRONC,N,Null or Missing
1,No Bronchoscopy,ABNBRONC,N,1
2,Bronchoscopy Results normal,ABNBRONC,N,2
3,"Bronchoscopy Results, Abnormal-purulent secretions",ABNBRONC,N,3
4,"Bronchoscopy Results, Abnormal-aspiration of foreign body",ABNBRONC,N,4


In [9]:
# open the flat file from the workbook
df_flat_excel = pd.read_excel('../Docs/optn-star-files-data-dictionary.xlsx',  sheet_name='Flatfile Formats', header=1)

In [10]:
df_flat_excel.head()

Unnamed: 0,SASAnalysis Format,Data Field Value,Data Field Formatted Value,Data Type
0,ABNBRONC,Null or Missing,Not Reported,N
1,ABNBRONC,1,No Bronchoscopy,N
2,ABNBRONC,2,Bronchoscopy Results normal,N
3,ABNBRONC,3,"Bronchoscopy Results, Abnormal-purulent secretions",N
4,ABNBRONC,4,"Bronchoscopy Results, Abnormal-aspiration of foreign body",N


In [11]:
# rename column name dictionary
renameCols = {'SASAnalysis Format': 'SASAnalysisFormat', 'Data Field Value':'DataFieldValue', \
               'Data Field Formatted Value':'DataFieldFormattedValue','Data Type':'DataType'}

# rename columns
df_flat_excel.rename(columns=renameCols, inplace=True)

# display
df_flat_excel.head()

Unnamed: 0,SASAnalysisFormat,DataFieldValue,DataFieldFormattedValue,DataType
0,ABNBRONC,Null or Missing,Not Reported,N
1,ABNBRONC,1,No Bronchoscopy,N
2,ABNBRONC,2,Bronchoscopy Results normal,N
3,ABNBRONC,3,"Bronchoscopy Results, Abnormal-purulent secretions",N
4,ABNBRONC,4,"Bronchoscopy Results, Abnormal-aspiration of foreign body",N


### Heart Dataset

In [12]:
# load heart transplant dataset & display first five rows
df_heart = pd.read_csv('../Data/New_HR_main.csv',encoding='latin', low_memory=False)
df_heart.head()

Unnamed: 0,WL_ORG,COD_WL,COD_OSTXT_WL,TRANSPLANT_COUNTRY,NUM_PREV_TX,THORACIC_DGN,GROUPING,TAH,VAS,ONVENT,ICU,INOTROPIC,INUTERO,GENDER,ABO,WGT_KG_TCR,HGT_CM_TCR,BMI_TCR,TCR_CDC_GROWTH_BMI,TCR_CDC_GROWTH_HGT,TCR_CDC_GROWTH_WGT,CITIZENSHIP,CITIZEN_COUNTRY,PERM_STATE,EDUCATION,ECMO_TCR,IABP_TCR,PROS_INFUS_TCR,PROSTACYCLIN_TCR,INHALED_NO,INOTROPES_TCR,PGE_TCR,OTH_LIFE_SUP_TCR,OTH_LIFE_SUP_OSTXT_TCR,VAD_DEVICE_TY_TCR,VAD_BRAND1_TCR,VAD_BRAND1_OSTXT_TCR,VAD_BRAND2_TCR,VAD_BRAND2_OSTXT_TCR,VAD_TAH_TCR,VAD_TAH_OSTXT_TCR,FUNC_STAT_TCR,PRI_PAYMENT_TCR,PRI_PAYMENT_CTRY_TCR,TCR_DGN,TCR_DGN_OSTXT,DIAB,DIAL_TY_TCR,CEREB_VASC,MALIG_TCR,MALIG_TY_TCR,MALIG_TY_OSTXT_TCR,MOST_RCNT_CREAT,TOT_SERUM_ALBUM,SUD_DEATH,IMPL_DEFIBRIL,RESIST_INF,HEMO_SYS_TCR,HEMO_PA_DIA_TCR,HEMO_PA_MN_TCR,HEMO_PCW_TCR,HEMO_CO_TCR,INOTROP_VASO_SYS_TCR,INOTROP_VASO_DIA_TCR,INOTROP_VASO_MN_TCR,INOTROP_VASO_PCW_TCR,INOTROP_VASO_CO_TCR,CIG_USE,TCR_DUR_ABSTAIN,PRIOR_CARD_SURG_TCR,PRIOR_CARD_SURG_TYPE_TCR,PRIOR_CARD_SURG_TYPE_OSTXT_TCR,HISTRY_CIG_OLD,CONTIN_CIG_OLD,CIG_GRT_10_OLD,STERNOTOMY_TCR,THORACOT_LT_OLD,THORACOT_RT_OLD,PNEUMOTHORAX_OLD,PNEUMORED_OLD,LEFT_VENT_REMODEL_OLD,SSDMF_DEATH_DATE,DAYS_STAT1,DAYS_STAT1A,DAYS_STAT2,DAYS_STAT1B,DAYS_STATA4,DAYS_STATA5,DAYS_STATA2,DAYS_STATA3,DAYS_STATA1,DAYS_STATA6,LAST_INACT_REASON,INIT_STAT,INIT_O2,END_O2,INIT_CREAT,END_CREAT,INIT_CALC_LAS,INIT_MATCH_LAS,END_CALC_LAS,END_MATCH_LAS,CALC_LAS_LISTDATE,INIT_PRIORITY,END_PRIORITY,REM_CD,TXED,DAYSWAIT_CHRON,END_STAT,INIT_AGE,LIFE_SUP_TCR,ACTIVATE_DATE,DEATH_DATE,END_DATE,INIT_DATE,ETHNICITY,ETHCAT,PT_CODE,INIT_HGT_CM_CALC,INIT_WGT_KG_CALC,INIT_BMI_CALC,END_HGT_CM_CALC,END_WGT_KG_CALC,END_BMI_CALC,COMPOSITE_DEATH_DATE,WLHR,WLHL,WLIN,WLKI,WLKP,WLLI,WLLU,WLPA,WLPI,WLVC,VENTILATOR_TCR,REGION,LVAD_AT_LISTING,LVAD_WHILE_LISTED,RVAD_AT_LISTING,RVAD_WHILE_LISTED,VAD_AT_LISTING,VAD_WHILE_LISTED,WL_ID_CODE,INIT_LLU_FLG,INIT_RLU_FLG,INIT_BLU_FLG,END_LLU_FLG,END_RLU_FLG,END_BLU_FLG,VAL_DT_TCR,YR_ENTRY_US_TCR,WORK_INCOME_TCR,ACADEMIC_PRG_TCR,ACADEMIC_LEVEL_TCR,EXERCISE_O2,INACT_REASON_CD,BW4,BW6,C1,C2,DR51,DR51_2,DR52,DR52_2,DR53,DR53_2,DQ1,DQ2,ACADEMIC_LEVEL_TRR,ACADEMIC_PRG_TRR,FUNC_STAT_TRR,MED_COND_TRR,STATUS_TRR,ADMISSION_DATE,PRI_PAYMENT_TRR,PRI_PAYMENT_CTRY_TRR,ECMO_TRR,PERM_STATE_TRR,WORK_INCOME_TRR,COGNITIVE_DEV_TRR,MOTOR_DEV_TRR,MEASUREMENT_DATE_TRR,PGE_TRR,CREAT_TRR,DIAL_AFTER_LIST,FEV1_TRR,FVC_TRR,HEMO_CO_TRR,HEMO_PA_DIA_TRR,HEMO_PA_MN_TRR,HEMO_PCW_TRR,HEMO_SYS_TRR,IABP_TRR,INFECT_IV_DRUG_TRR,INOTROPES_TRR,INOTROP_VASO_CO_TRR,INOTROP_VASO_DIA_TRR,INOTROP_VASO_MN_TRR,INOTROP_VASO_PCW_TRR,INOTROP_VASO_SYS_TRR,OTH_LIFE_SUP_OSTXT_TRR,OTH_LIFE_SUP_TRR,PCO2_TRR,PRIOR_LUNG_SURG_TRR,PRIOR_LUNG_SURG_TYPE_TRR,PRIOR_LUNG_SURG_TYPE_OSTXT_TRR,PST_AIRWAY,ACUTE_REJ_EPI,PST_STROKE,PST_DIAL,PST_PACEMAKER,STEROID,TBILI,TRANSFUSIONS,VAD_DEVICE_TY_TRR,VAD_BRAND1_TRR,VAD_BRAND1_OSTXT_TRR,VAD_BRAND2_TRR,VAD_BRAND2_OSTXT_TRR,VENT_SUPPORT_TRR,VENT_TIMEFRAME_TRR,VENTILATOR_TRR,INHALED_NO_TRR,PRIOR_CARD_SURG_TYPE_TRR,PRIOR_CARD_SURG_TYPE_OSTXT_TRR,PROS_INFUS_TRR,PROSTACYCLIN_TRR,TRACHEOSTOMY_TRR,ECMO_72HOURS,FIO2_72HOURS,INHALEDNO_72HOURS,INTUBATED_72HOURS,PAO2_72HOURS,POST_TX_VENT_SUPPORT,REINTUBATED,PERFUSED_PRIOR,PERFUSION_LOCATION,PERFUSED_BY,TOTAL_PERFUSION_TIME,LU_RECEIVED,LU2_RECEIVED,PRETITERA,PRETITERA_DATE,PRETITERB,PRETITERB_DATE,HBV_CORE,HBV_SUR_ANTIGEN,HBV_SURF_TOTAL,CMV_STATUS,HIV_SEROSTATUS,HCV_SEROSTATUS,EBV_SEROSTATUS,HIV_NAT,HCV_NAT,HBV_NAT,COD,COD_OSTXT,COD2,COD2_OSTXT,COD3,COD3_OSTXT,GSTATUS,GTIME,LASTFUNO,PSTATUS,PTIME,PX_STAT,FUNC_STAT_TRF,DANTIARR_OLD,CREAT2_OLD,TXHRT,TXINT,TXKID,TXLIV,TXLNG,TXPAN,TXVCA,TX_PROCEDUR_TY,STATUS_TCR,INHALED_NO_TCR,PRVTXDIF,RETXDATE,STERNOTOMY_TRR,DON_RETYP,CRSMATCH_DONE,CPRA,CPRA_PEAK,TRTREJ1Y,PREV_TX,PREV_TX_ANY,PREV_TX_ANY_N,DA1,DA2,DB1,DB2,DDR1,DDR2,RA1,RA2,RB1,RB2,RDR1,RDR2,AMIS,BMIS,DRMIS,HLAMIS,PRAMR,PRAPK,PRAMR_CL1,PRAMR_CL2,PRAPK_CL1,PRAPK_CL2,MALIG_TRR,MALIG_TY_TRR,CMV_IGG,CMV_IGM,CITIZENSHIP_DON,HIST_COCAINE_DON,AGE_DON,ETHCAT_DON,HBV_CORE_DON,HBV_SUR_ANTIGEN_DON,HBV_DNA_DON,HCV_RNA_DON,ABO_DON,ALCOHOL_HEAVY_DON,DON_TY,GENDER_DON,HOME_STATE_DON,HCV_RIBA_DON,HCV_ANTIBODY_DON,HEP_C_ANTI_DON,LIV_DON_TY,COD_OSTXT_DON,NON_HRT_DON,ANTIHYPE_DON,BLOOD_INF_DON,BLOOD_INF_CONF_DON,BUN_DON,CREAT_DON,DOBUT_DON_OLD,DOPAMINE_DON_OLD,HTLV1_OLD_DON,HTLV2_OLD_DON,OTH_DON_MED1_OSTXT_DON_OLD,OTH_DON_MED2_OSTXT_DON_OLD,OTH_DON_MED3_OSTXT_DON_OLD,OTHER_INF_DON,OTHER_INF_CONF_DON,OTHER_INF_OSTXT_DON,PRETREAT_MED_DON_OLD,PT_DIURETICS_DON,PT_STEROIDS_DON,PT_T3_DON,PT_T4_DON,PT_OTH2_OSTXT_DON,PT_OTH3_OSTXT_DON,PT_OTH4_OSTXT_DON,PT_OTH1_OSTXT_DON,PULM_INF_DON,PULM_INF_CONF_DON,SGOT_DON,SGPT_DON,TBILI_DON,URINE_INF_DON,URINE_INF_CONF_DON,VASODIL_DON,VDRL_DON,CLIN_INFECT_DON,HYPERTENS_DUR_DON,CANCER_FREE_INT_DON,CANCER_OTH_OSTXT_DON,CONTIN_ALCOHOL_OLD_DON,CONTIN_CIG_DON,CONTIN_IV_DRUG_OLD_DON,CONTIN_COCAINE_DON,CONTIN_OTH_DRUG_DON,DIET_DON,DIURETICS_DON,EXTRACRANIAL_CANCER_DON,HIST_ALCOHOL_OLD_DON,CANCER_SITE_DON,HIST_CIG_DON,DIABDUR_DON,HIST_HYPERTENS_DON,HIST_IV_DRUG_OLD_DON,INTRACRANIAL_CANCER_DON,OTHER_HYPERTENS_MED_DON,HIST_CANCER_DON,HIST_INSULIN_DEP_DON,INSULIN_DUR_DON,HIST_DIABETES_DON,DIABETES_DON,HIST_OTH_DRUG_DON,SKIN_CANCER_DON,CMV_IGG_DON,CMV_IGM_DON,CMV_OLD_LIV_DON,CMV_DON,CMV_TEST_DON,EBV_TEST_DON,HBV_TEST_DON,HCV_TEST_DON,COD_CAD_DON,COD_LIV_DON,EBV_DNA_DON,EBV_IGG_DON,EBV_IGM_DON,DDAVP_DON,CMV_NUCLEIC_DON,DEATH_CIRCUM_DON,DEATH_MECH_DON,RECOVERY_DATE_DON,HEPARIN_DON,ARGININE_DON,INSULIN_DON,HGT_CM_DON_CALC,WGT_KG_DON_CALC,BMI_DON_CALC,HBV_NAT_DON,HCV_NAT_DON,HIV_NAT_DON,PX_STAT_DATE,TX_DATE,DISCHARGE_DATE,TX_TYPE,MULTIORG,ABO_MAT,AGE,DIAG,DIAL_PRIOR_TX,ISCHTIME,GRF_FAIL_CAUSE,GRF_FAIL_CAUSE_OSTXT,GRF_FAIL_DATE,GRF_STAT,SHARE_TY,LOS,DIAG_OSTXT,AGE_GROUP,O2_REQ_CALC,LIFE_SUP_TRR,TITERA,TITERA_DATE,TITERB,TITERB_DATE,ORGAN,PRIOR_CARD_SURG_TRR,MALIG_TY,MALIG,MALIG_TY_OSTXT,MALIG_TY_OSTXT_TRR,HGT_CM_CALC,WGT_KG_CALC,BMI_CALC,DISTANCE,VENT_SUPPORT_AFTER_LIST,PROC_TY_HR,VAD_TAH_TRR,VAD_TAH_OSTXT_TRR,TRR_ID_CODE,VAL_DT_TRR,EDUCATION_DON,STATUS_LDR,VAL_DT_LDR,RECOV_OUT_US,RECOV_COUNTRY,ADMIT_DATE_DON,PROTEIN_URINE,CARDARREST_NEURO,RESUSCIT_DUR,PO2,HIST_MI,LV_EJECT_METH,LV_EJECT,CORONARY_ANGIO,VESSELS_50STEN,BIOPSY_DGN,OTH_DGN_OSTXT,TATTOOS,CONTROLLED,STATUS_DDR,VAL_DT_DDR,DONOR_ID,HBSAB_DON,EBV_IGG_CAD_DON,EBV_IGM_CAD_DON,CDC_RISK_HIV_DON,INO_PROCURE_AGENT_1,INO_PROCURE_AGENT_2,INO_PROCURE_AGENT_3,INO_PROCURE_OSTXT_1,INO_PROCURE_OSTXT_2,INO_PROCURE_OSTXT_3,ECD_DONOR,TX_YEAR,INOTROP_SUPPORT_DON,LT_ONE_WEEK_DON,REFERRAL_DATE,LISTYR,TRANSFUS_TERM_DON,TRANSFUS_INTRAOP_NUM_OLD_DON,TRANSFUS_PRIOR_NUM_OLD_DON,PO2_DONE_DON,PO2_FIO2_DON,PCO2_DON,PULM_CATH_DON,MAP_INIT_DON,MAP_POST_DON,CVP_CATH_INIT_DON,CVP_CATH_POST_DON,CVP_CATH_OLD_DON,PCWP_INIT_DON,PCWP_POST_DON,SVR_INIT_DON,SVR_POST_DON,SYST_PA_CATH_INIT_DON,SYST_PA_CATH_POST_DON,SYSTOLIC_PA_CATH_OLD_DON,DIAST_PA_CATH_INIT_DON,DIAST_PA_CATH_POST_DON,DIASTOLIC_PA_CATH_OLD_DON,CARDIAC_OUTPUT_CATH_INIT_DON,CARDIAC_OUTPUT_CATH_POST_DON,CARDIAC_OUTPUT_CATH_OLD_DON,CARD_IDX_INIT_DON,CARD_IDX_POST_DON,BRONCHO_LT_DON,BRONCHO_RT_DON,CHEST_XRAY_DON,PH_DON,HEMATOCRIT_DON,ABN_VALVES_DON,ABN_LVH_DON,ABN_CONGEN_DON,WALL_ABN_SEG_DON,WALL_ABN_GLOB_DON,DATA_TRANSPLANT,DATA_WAITLIST,CTR_CODE,OPO_CTR_CODE,INIT_OPO_CTR_CODE,END_OPO_CTR_CODE,LISTING_CTR_CODE
0,HR,,,,0,1999,,,,,,,N,M,B,71.0,182.0,21.4346,,,,1.0,,NC,2.0,0,0,0,0,0,1,0,0,,,,,,,,,996.0,1.0,,1007.0,,5.0,1.0,N,N,,,1.3,,N,N,,47.0,25.0,30.0,20.0,3.0,Y,Y,Y,Y,Y,,,U,,,,Y,Y,1.0,,,N,N,N,,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,4,1,27,,52,Y,09/08/1987,,10/05/1987,09/08/1987,0,1,355426,182.0,71.0,21.4,182.0,71.0,21.4,06/19/2015,,,,,,,,,,,0,11,,,,,,,567389,,,,0,0,0,30JUN1992:00:00:00.000,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,,,996.0,1.0,V,,,,0,,,,,,0,,,,,,,,,,0,,0,,,,,,,0,,U,,,,,,,,,,,,,,,,U,,0,0,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,998.0,,,,,,1.0,10119.0,999,1.0,10119.0,D,,,,Y,,,,,,,501,V,0,,,,Y,Y,,,,N,N,,3.0,28.0,53.0,58.0,2.0,5.0,1.0,2.0,8.0,18.0,3.0,5.0,2.0,2.0,1.0,5.0,0.0,,,,,,U,,,,1.0,U,24.0,2,,N,,,O,,C,M,NC,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,0.0,,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,N,,,,,3.0,,,,,U,,1.0,997.0,10/05/1987,,,,167.6,74.8,26.631409,,,,06/19/2015,10/05/1987,10/06/1987,O,,2.0,52,1200.0,,2.3,,,,Y,3,,,A,,N,,,,,HR,U,,N,,,182.0,71.0,21.4,78.0,U,2.0,,,A290609,14MAY1991:00:00:00.000,,,,N,,,U,U,,,N,,,1.0,,1.0,,U,,V,26OCT1987:00:00:00.000,218478,,,,,,,,,,,,1987,,N,,1987,,998.0,998.0,,,,N,,,,,,,,,,,,,,,,,,,,,998.0,998.0,,,,,,,,,Y,Y,4464,14911,14911,14911,04464
1,HR,,,,0,1999,,,,,,,N,F,A,70.0,157.0,28.3987,,,,1.0,,MS,,0,0,0,0,0,0,0,0,,,,,,,,,998.0,,,1200.0,,,,,,,,,,,,,,,,,,,,,N,,,,U,,,,,,998.0,,,U,U,U,11/12/1998,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,4,1,31,,48,,09/01/1987,,10/02/1987,09/01/1987,0,1,389631,157.0,70.0,28.4,157.0,70.0,28.4,11/12/1998,,,,,,,,,,,0,3,,,,,,,299504,,,,0,0,0,24JAN1994:00:00:00.000,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,,,998.0,3.0,V,,,,0,,,,,,0,,,,,,,,,,0,,0,,,,,,,0,,U,,,,,,,,,,,,,,,,U,,0,0,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,998.0,,,,,,1.0,4059.0,110,1.0,4059.0,D,996.0,,,Y,,,,,,,501,V,0,,,,Y,Y,,,,N,N,,11.0,28.0,7.0,53.0,2.0,9.0,,,,,,,,,,,5.0,5.0,,,,,U,,,,1.0,U,22.0,1,,N,,,A,,C,M,AL,,,,,N/A @ TIME OF D.E.,,,0.0,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,0.0,,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ND,,,,,999.0,,,,,U,,997.0,995.0,10/01/1987,,,,165.1,103.0,37.787058,,,,11/12/1998,10/02/1987,10/03/1987,O,,1.0,48,1200.0,,2.6,,,,,4,,,A,,N,,,,,HR,U,,U,,,157.0,70.0,28.4,111.0,U,2.0,,,A148156,14MAY1991:00:00:00.000,,,,N,,,U,U,,,N,,,1.0,,1.0,,U,,V,26OCT1987:00:00:00.000,204753,,,,,,,,,,,,1987,,N,,1987,,998.0,998.0,,,,N,,,,,,,,,,,,,,,,,,,,,998.0,998.0,999.0,,,,,,,,Y,Y,19034,11346,24149,24149,19034
2,HR,,,,0,1999,,,,,,,N,M,A,79.0,177.0,25.2163,,,,1.0,,MI,,0,0,0,0,0,0,0,0,,,,,,,,,998.0,,,1000.0,,,,,,,,,,,,,,,,,,,,,N,,,,U,,,,,,998.0,,,U,U,U,11/09/2004,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,3,1,34,,57,,09/03/1987,,10/07/1987,09/03/1987,0,1,467563,177.0,79.0,25.2,177.0,79.0,25.2,11/09/2004,,,,,,,,,,,0,5,,,,,,,41669,,,,0,0,0,07JUL1992:00:00:00.000,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,,,998.0,3.0,V,,,,0,,,,,,0,,,,,,,,,,0,,0,,,,,,,0,,U,,,,,,,,,,,,,,,,U,,0,0,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2704.0,,,,,,1.0,6243.0,999,1.0,6243.0,D,2.0,,,Y,,,,,,,501,V,0,,,,Y,Y,,,,N,N,,2.0,3.0,7.0,97.0,2.0,4.0,2.0,3.0,7.0,57.0,2.0,7.0,0.0,0.0,1.0,1.0,0.0,,,,,,U,,,,1.0,U,20.0,1,,N,,,O,,C,M,NM,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,0.0,,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,P,,,,,3.0,,,,,U,,1.0,997.0,10/07/1987,,,,,,,,,,11/09/2004,10/07/1987,10/08/1987,O,,2.0,57,1000.0,,3.5,,,,Y,4,,,A,,N,,,,,HR,U,,U,,,177.0,79.0,25.2,544.0,U,2.0,,,A12381,14MAY1991:00:00:00.000,,,,N,,,U,U,,,N,,,1.0,,1.0,,U,,V,26OCT1987:00:00:00.000,277758,,,,,,,,,,,,1987,,N,,1987,,998.0,998.0,,,,N,,,,,,,,,,,,,,,,,,,,,998.0,998.0,,,,,,,,,Y,Y,16616,8866,19809,19809,Unknown
3,HR,,,,0,1999,,,,,,,N,M,A,84.0,182.0,25.3593,,,,1.0,,WI,,0,0,0,0,0,0,0,0,,,,,,,,,996.0,,,1200.0,,,,,,,,,,,,,,,,,,,,,N,,,,U,,,,,,998.0,,,U,U,U,10/15/1987,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,4,1,3,,51,N,09/29/1987,,10/02/1987,09/29/1987,0,1,359589,182.0,84.0,25.4,182.0,84.0,25.4,10/04/1987,,,,,,,,,,,0,7,,,,,,,859348,,,,0,0,0,21MAR1995:00:00:00.000,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,,,996.0,1.0,V,,,,0,,,,,,0,,,,,,,,,,0,,0,,,,,,,0,,U,,,,,,,,,,,,,,,,U,,0,0,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2000.0,,,,,,1.0,2.0,1,1.0,2.0,D,996.0,,,Y,,,,,,,501,V,0,,,,Y,Y,,,,N,N,,2.0,11.0,7.0,41.0,2.0,,1.0,3.0,8.0,62.0,4.0,97.0,2.0,2.0,1.0,5.0,0.0,,,,,,U,,,,1.0,U,24.0,1,,N,,,A,,C,M,NC,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,0.0,,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,P,,,,,2.0,,,,,U,,997.0,11.0,10/02/1987,,,,,113.4,,,,,10/04/1987,10/02/1987,10/04/1987,O,,1.0,51,1200.0,,4.1,,,,,5,,,A,,N,,,,,HR,U,,U,,,182.0,84.0,25.4,595.0,U,2.0,,,A89139,14MAY1991:00:00:00.000,,,,N,,,U,U,,,N,,,1.0,,1.0,,U,,V,26OCT1987:00:00:00.000,239651,,,,,,,,,,,,1987,,N,,1987,,998.0,998.0,,,,N,,,,,,,,,,,,,,,,,,,,,998.0,998.0,,,,,,,,,Y,Y,7905,12772,25172,25172,07905
4,HR,,,,0,1999,,,,,,,N,F,B,61.0,172.0,20.6193,,,,1.0,,SC,,0,0,0,0,0,0,0,0,,,,,,,,,998.0,,,1200.0,,,,,,,,,,,,,,,,,,,,,N,,,,U,,,,,,998.0,,,U,U,U,01/07/1988,0,0,0,0,0,0,0,0,0,0,,2999.0,,,,,,,,,,,,18,1,0,2999.0,44,,12/17/1987,,10/11/1987,10/11/1987,0,1,100209,172.0,61.0,20.6,172.0,61.0,20.6,01/07/1988,,,,,,,,,,,0,3,,,,,,,232222,,,,0,0,0,04NOV1993:00:00:00.000,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,,,998.0,3.0,V,,,,0,,,,,,0,,,,,,,,,,0,,0,,,,,,,0,,U,,,,,,,,,,,,,,,,U,,0,0,,,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2119.0,UNKNOWN,,,,,1.0,88.0,1,1.0,88.0,D,998.0,,,Y,,,,,,,501,V,0,,,,Y,Y,,,,N,N,,28.0,33.0,35.0,58.0,5.0,6.0,1.0,24.0,7.0,14.0,,,2.0,2.0,,,0.0,,,,,,U,,,,1.0,U,25.0,2,,N,,,B,,C,F,AL,,,,,N/A @ TIME OF D.E.,,,0.0,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,0.0,,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ND,,,,,999.0,,,,,U,,997.0,995.0,10/11/1987,,,,162.6,61.7,23.344121,,,,01/07/1988,10/11/1987,01/07/1988,O,,1.0,44,1200.0,,2.7,,,,,4,,,A,,N,,,,,HR,U,,U,,,172.0,61.0,20.6,270.0,U,2.0,,,A232368,14MAY1991:00:00:00.000,,,,N,,,U,U,,,N,,,1.0,,1.0,,U,,V,26OCT1987:00:00:00.000,95449,,,,,,,,,,,,1987,,N,,1987,,998.0,998.0,,,,N,,,,,,,,,,,,,,,,,,,,,998.0,998.0,,,,,,,,,Y,Y,3627,11346,4743,4743,03627


In [13]:
# count heart rows & columns
print(f"Total number of rows: {df_heart.shape[0]:,} & Number of columns: {df_heart.shape[1]}")

Total number of rows: 72,411 & Number of columns: 546


### 10 Years of Data

In [14]:
# initialize variable
year = 2010
# last ten year worth of data & max year is 2021
print(f"The Max year for LISTYR is {np.max(df_heart.LISTYR)} & record count > 2010 is:  {df_heart.LISTYR[df_heart.LISTYR > year].count():,} rows.")

# last ten year worth of data & max year is 2021
print(f"The Max year for TX_YEAR is {np.max(df_heart.TX_YEAR)} & record count > 2010 is:  {df_heart.TX_YEAR[df_heart.TX_YEAR > year].count():,} rows.")

The Max year for LISTYR is 2021 & record count > 2010 is:  27,494 rows.
The Max year for TX_YEAR is 2021 & record count > 2010 is:  28,751 rows.


In [15]:
# display data dictionary
df_dict[df_dict.Feature.isin(['LISTYR','TX_YEAR'])]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
317,LISTYR,ACTUAL YEAR REGISTRANT LISTED (WITHOUT DATE OFFSET),CALCULATED,1987-10-01,NaT,,NUM,,,LISTYR,Unknown,Unknown
478,TX_YEAR,TRANSPLANT YEAR,CALCULATED,NaT,NaT,,NUM,,,TX_YEAR,Unknown,Unknown


#### Select Records Greater than 2010 (TX_YEAR)

In [16]:
# copy LISTYR greater than 2010 & reindex
df = df_heart[df_heart.TX_YEAR > year].reset_index(drop=True).copy()

# display
df.head()

Unnamed: 0,WL_ORG,COD_WL,COD_OSTXT_WL,TRANSPLANT_COUNTRY,NUM_PREV_TX,THORACIC_DGN,GROUPING,TAH,VAS,ONVENT,ICU,INOTROPIC,INUTERO,GENDER,ABO,WGT_KG_TCR,HGT_CM_TCR,BMI_TCR,TCR_CDC_GROWTH_BMI,TCR_CDC_GROWTH_HGT,TCR_CDC_GROWTH_WGT,CITIZENSHIP,CITIZEN_COUNTRY,PERM_STATE,EDUCATION,ECMO_TCR,IABP_TCR,PROS_INFUS_TCR,PROSTACYCLIN_TCR,INHALED_NO,INOTROPES_TCR,PGE_TCR,OTH_LIFE_SUP_TCR,OTH_LIFE_SUP_OSTXT_TCR,VAD_DEVICE_TY_TCR,VAD_BRAND1_TCR,VAD_BRAND1_OSTXT_TCR,VAD_BRAND2_TCR,VAD_BRAND2_OSTXT_TCR,VAD_TAH_TCR,VAD_TAH_OSTXT_TCR,FUNC_STAT_TCR,PRI_PAYMENT_TCR,PRI_PAYMENT_CTRY_TCR,TCR_DGN,TCR_DGN_OSTXT,DIAB,DIAL_TY_TCR,CEREB_VASC,MALIG_TCR,MALIG_TY_TCR,MALIG_TY_OSTXT_TCR,MOST_RCNT_CREAT,TOT_SERUM_ALBUM,SUD_DEATH,IMPL_DEFIBRIL,RESIST_INF,HEMO_SYS_TCR,HEMO_PA_DIA_TCR,HEMO_PA_MN_TCR,HEMO_PCW_TCR,HEMO_CO_TCR,INOTROP_VASO_SYS_TCR,INOTROP_VASO_DIA_TCR,INOTROP_VASO_MN_TCR,INOTROP_VASO_PCW_TCR,INOTROP_VASO_CO_TCR,CIG_USE,TCR_DUR_ABSTAIN,PRIOR_CARD_SURG_TCR,PRIOR_CARD_SURG_TYPE_TCR,PRIOR_CARD_SURG_TYPE_OSTXT_TCR,HISTRY_CIG_OLD,CONTIN_CIG_OLD,CIG_GRT_10_OLD,STERNOTOMY_TCR,THORACOT_LT_OLD,THORACOT_RT_OLD,PNEUMOTHORAX_OLD,PNEUMORED_OLD,LEFT_VENT_REMODEL_OLD,SSDMF_DEATH_DATE,DAYS_STAT1,DAYS_STAT1A,DAYS_STAT2,DAYS_STAT1B,DAYS_STATA4,DAYS_STATA5,DAYS_STATA2,DAYS_STATA3,DAYS_STATA1,DAYS_STATA6,LAST_INACT_REASON,INIT_STAT,INIT_O2,END_O2,INIT_CREAT,END_CREAT,INIT_CALC_LAS,INIT_MATCH_LAS,END_CALC_LAS,END_MATCH_LAS,CALC_LAS_LISTDATE,INIT_PRIORITY,END_PRIORITY,REM_CD,TXED,DAYSWAIT_CHRON,END_STAT,INIT_AGE,LIFE_SUP_TCR,ACTIVATE_DATE,DEATH_DATE,END_DATE,INIT_DATE,ETHNICITY,ETHCAT,PT_CODE,INIT_HGT_CM_CALC,INIT_WGT_KG_CALC,INIT_BMI_CALC,END_HGT_CM_CALC,END_WGT_KG_CALC,END_BMI_CALC,COMPOSITE_DEATH_DATE,WLHR,WLHL,WLIN,WLKI,WLKP,WLLI,WLLU,WLPA,WLPI,WLVC,VENTILATOR_TCR,REGION,LVAD_AT_LISTING,LVAD_WHILE_LISTED,RVAD_AT_LISTING,RVAD_WHILE_LISTED,VAD_AT_LISTING,VAD_WHILE_LISTED,WL_ID_CODE,INIT_LLU_FLG,INIT_RLU_FLG,INIT_BLU_FLG,END_LLU_FLG,END_RLU_FLG,END_BLU_FLG,VAL_DT_TCR,YR_ENTRY_US_TCR,WORK_INCOME_TCR,ACADEMIC_PRG_TCR,ACADEMIC_LEVEL_TCR,EXERCISE_O2,INACT_REASON_CD,BW4,BW6,C1,C2,DR51,DR51_2,DR52,DR52_2,DR53,DR53_2,DQ1,DQ2,ACADEMIC_LEVEL_TRR,ACADEMIC_PRG_TRR,FUNC_STAT_TRR,MED_COND_TRR,STATUS_TRR,ADMISSION_DATE,PRI_PAYMENT_TRR,PRI_PAYMENT_CTRY_TRR,ECMO_TRR,PERM_STATE_TRR,WORK_INCOME_TRR,COGNITIVE_DEV_TRR,MOTOR_DEV_TRR,MEASUREMENT_DATE_TRR,PGE_TRR,CREAT_TRR,DIAL_AFTER_LIST,FEV1_TRR,FVC_TRR,HEMO_CO_TRR,HEMO_PA_DIA_TRR,HEMO_PA_MN_TRR,HEMO_PCW_TRR,HEMO_SYS_TRR,IABP_TRR,INFECT_IV_DRUG_TRR,INOTROPES_TRR,INOTROP_VASO_CO_TRR,INOTROP_VASO_DIA_TRR,INOTROP_VASO_MN_TRR,INOTROP_VASO_PCW_TRR,INOTROP_VASO_SYS_TRR,OTH_LIFE_SUP_OSTXT_TRR,OTH_LIFE_SUP_TRR,PCO2_TRR,PRIOR_LUNG_SURG_TRR,PRIOR_LUNG_SURG_TYPE_TRR,PRIOR_LUNG_SURG_TYPE_OSTXT_TRR,PST_AIRWAY,ACUTE_REJ_EPI,PST_STROKE,PST_DIAL,PST_PACEMAKER,STEROID,TBILI,TRANSFUSIONS,VAD_DEVICE_TY_TRR,VAD_BRAND1_TRR,VAD_BRAND1_OSTXT_TRR,VAD_BRAND2_TRR,VAD_BRAND2_OSTXT_TRR,VENT_SUPPORT_TRR,VENT_TIMEFRAME_TRR,VENTILATOR_TRR,INHALED_NO_TRR,PRIOR_CARD_SURG_TYPE_TRR,PRIOR_CARD_SURG_TYPE_OSTXT_TRR,PROS_INFUS_TRR,PROSTACYCLIN_TRR,TRACHEOSTOMY_TRR,ECMO_72HOURS,FIO2_72HOURS,INHALEDNO_72HOURS,INTUBATED_72HOURS,PAO2_72HOURS,POST_TX_VENT_SUPPORT,REINTUBATED,PERFUSED_PRIOR,PERFUSION_LOCATION,PERFUSED_BY,TOTAL_PERFUSION_TIME,LU_RECEIVED,LU2_RECEIVED,PRETITERA,PRETITERA_DATE,PRETITERB,PRETITERB_DATE,HBV_CORE,HBV_SUR_ANTIGEN,HBV_SURF_TOTAL,CMV_STATUS,HIV_SEROSTATUS,HCV_SEROSTATUS,EBV_SEROSTATUS,HIV_NAT,HCV_NAT,HBV_NAT,COD,COD_OSTXT,COD2,COD2_OSTXT,COD3,COD3_OSTXT,GSTATUS,GTIME,LASTFUNO,PSTATUS,PTIME,PX_STAT,FUNC_STAT_TRF,DANTIARR_OLD,CREAT2_OLD,TXHRT,TXINT,TXKID,TXLIV,TXLNG,TXPAN,TXVCA,TX_PROCEDUR_TY,STATUS_TCR,INHALED_NO_TCR,PRVTXDIF,RETXDATE,STERNOTOMY_TRR,DON_RETYP,CRSMATCH_DONE,CPRA,CPRA_PEAK,TRTREJ1Y,PREV_TX,PREV_TX_ANY,PREV_TX_ANY_N,DA1,DA2,DB1,DB2,DDR1,DDR2,RA1,RA2,RB1,RB2,RDR1,RDR2,AMIS,BMIS,DRMIS,HLAMIS,PRAMR,PRAPK,PRAMR_CL1,PRAMR_CL2,PRAPK_CL1,PRAPK_CL2,MALIG_TRR,MALIG_TY_TRR,CMV_IGG,CMV_IGM,CITIZENSHIP_DON,HIST_COCAINE_DON,AGE_DON,ETHCAT_DON,HBV_CORE_DON,HBV_SUR_ANTIGEN_DON,HBV_DNA_DON,HCV_RNA_DON,ABO_DON,ALCOHOL_HEAVY_DON,DON_TY,GENDER_DON,HOME_STATE_DON,HCV_RIBA_DON,HCV_ANTIBODY_DON,HEP_C_ANTI_DON,LIV_DON_TY,COD_OSTXT_DON,NON_HRT_DON,ANTIHYPE_DON,BLOOD_INF_DON,BLOOD_INF_CONF_DON,BUN_DON,CREAT_DON,DOBUT_DON_OLD,DOPAMINE_DON_OLD,HTLV1_OLD_DON,HTLV2_OLD_DON,OTH_DON_MED1_OSTXT_DON_OLD,OTH_DON_MED2_OSTXT_DON_OLD,OTH_DON_MED3_OSTXT_DON_OLD,OTHER_INF_DON,OTHER_INF_CONF_DON,OTHER_INF_OSTXT_DON,PRETREAT_MED_DON_OLD,PT_DIURETICS_DON,PT_STEROIDS_DON,PT_T3_DON,PT_T4_DON,PT_OTH2_OSTXT_DON,PT_OTH3_OSTXT_DON,PT_OTH4_OSTXT_DON,PT_OTH1_OSTXT_DON,PULM_INF_DON,PULM_INF_CONF_DON,SGOT_DON,SGPT_DON,TBILI_DON,URINE_INF_DON,URINE_INF_CONF_DON,VASODIL_DON,VDRL_DON,CLIN_INFECT_DON,HYPERTENS_DUR_DON,CANCER_FREE_INT_DON,CANCER_OTH_OSTXT_DON,CONTIN_ALCOHOL_OLD_DON,CONTIN_CIG_DON,CONTIN_IV_DRUG_OLD_DON,CONTIN_COCAINE_DON,CONTIN_OTH_DRUG_DON,DIET_DON,DIURETICS_DON,EXTRACRANIAL_CANCER_DON,HIST_ALCOHOL_OLD_DON,CANCER_SITE_DON,HIST_CIG_DON,DIABDUR_DON,HIST_HYPERTENS_DON,HIST_IV_DRUG_OLD_DON,INTRACRANIAL_CANCER_DON,OTHER_HYPERTENS_MED_DON,HIST_CANCER_DON,HIST_INSULIN_DEP_DON,INSULIN_DUR_DON,HIST_DIABETES_DON,DIABETES_DON,HIST_OTH_DRUG_DON,SKIN_CANCER_DON,CMV_IGG_DON,CMV_IGM_DON,CMV_OLD_LIV_DON,CMV_DON,CMV_TEST_DON,EBV_TEST_DON,HBV_TEST_DON,HCV_TEST_DON,COD_CAD_DON,COD_LIV_DON,EBV_DNA_DON,EBV_IGG_DON,EBV_IGM_DON,DDAVP_DON,CMV_NUCLEIC_DON,DEATH_CIRCUM_DON,DEATH_MECH_DON,RECOVERY_DATE_DON,HEPARIN_DON,ARGININE_DON,INSULIN_DON,HGT_CM_DON_CALC,WGT_KG_DON_CALC,BMI_DON_CALC,HBV_NAT_DON,HCV_NAT_DON,HIV_NAT_DON,PX_STAT_DATE,TX_DATE,DISCHARGE_DATE,TX_TYPE,MULTIORG,ABO_MAT,AGE,DIAG,DIAL_PRIOR_TX,ISCHTIME,GRF_FAIL_CAUSE,GRF_FAIL_CAUSE_OSTXT,GRF_FAIL_DATE,GRF_STAT,SHARE_TY,LOS,DIAG_OSTXT,AGE_GROUP,O2_REQ_CALC,LIFE_SUP_TRR,TITERA,TITERA_DATE,TITERB,TITERB_DATE,ORGAN,PRIOR_CARD_SURG_TRR,MALIG_TY,MALIG,MALIG_TY_OSTXT,MALIG_TY_OSTXT_TRR,HGT_CM_CALC,WGT_KG_CALC,BMI_CALC,DISTANCE,VENT_SUPPORT_AFTER_LIST,PROC_TY_HR,VAD_TAH_TRR,VAD_TAH_OSTXT_TRR,TRR_ID_CODE,VAL_DT_TRR,EDUCATION_DON,STATUS_LDR,VAL_DT_LDR,RECOV_OUT_US,RECOV_COUNTRY,ADMIT_DATE_DON,PROTEIN_URINE,CARDARREST_NEURO,RESUSCIT_DUR,PO2,HIST_MI,LV_EJECT_METH,LV_EJECT,CORONARY_ANGIO,VESSELS_50STEN,BIOPSY_DGN,OTH_DGN_OSTXT,TATTOOS,CONTROLLED,STATUS_DDR,VAL_DT_DDR,DONOR_ID,HBSAB_DON,EBV_IGG_CAD_DON,EBV_IGM_CAD_DON,CDC_RISK_HIV_DON,INO_PROCURE_AGENT_1,INO_PROCURE_AGENT_2,INO_PROCURE_AGENT_3,INO_PROCURE_OSTXT_1,INO_PROCURE_OSTXT_2,INO_PROCURE_OSTXT_3,ECD_DONOR,TX_YEAR,INOTROP_SUPPORT_DON,LT_ONE_WEEK_DON,REFERRAL_DATE,LISTYR,TRANSFUS_TERM_DON,TRANSFUS_INTRAOP_NUM_OLD_DON,TRANSFUS_PRIOR_NUM_OLD_DON,PO2_DONE_DON,PO2_FIO2_DON,PCO2_DON,PULM_CATH_DON,MAP_INIT_DON,MAP_POST_DON,CVP_CATH_INIT_DON,CVP_CATH_POST_DON,CVP_CATH_OLD_DON,PCWP_INIT_DON,PCWP_POST_DON,SVR_INIT_DON,SVR_POST_DON,SYST_PA_CATH_INIT_DON,SYST_PA_CATH_POST_DON,SYSTOLIC_PA_CATH_OLD_DON,DIAST_PA_CATH_INIT_DON,DIAST_PA_CATH_POST_DON,DIASTOLIC_PA_CATH_OLD_DON,CARDIAC_OUTPUT_CATH_INIT_DON,CARDIAC_OUTPUT_CATH_POST_DON,CARDIAC_OUTPUT_CATH_OLD_DON,CARD_IDX_INIT_DON,CARD_IDX_POST_DON,BRONCHO_LT_DON,BRONCHO_RT_DON,CHEST_XRAY_DON,PH_DON,HEMATOCRIT_DON,ABN_VALVES_DON,ABN_LVH_DON,ABN_CONGEN_DON,WALL_ABN_SEG_DON,WALL_ABN_GLOB_DON,DATA_TRANSPLANT,DATA_WAITLIST,CTR_CODE,OPO_CTR_CODE,INIT_OPO_CTR_CODE,END_OPO_CTR_CODE,LISTING_CTR_CODE
0,HR,,,,0,1000,,,,,,,,M,A,65.771,173.0,22.047,,,,1.0,,CA,4.0,0,0,0,0,0,0,0,1,BILATERAL CENTRIMAGS,5.0,227.0,,320.0,,,,2010.0,2.0,,1000.0,,1.0,1.0,N,N,,,1.3,4.8,,Y,,50.0,34.0,37.0,36.0,2.5,Y,Y,Y,Y,Y,N,,Y,2.0,,,,,,,,,,,,0,11,0,0,0,0,0,0,0,0,,2010.0,,,,,,,,,,,,4,1,11,2010.0,45,Y,01/18/2013,,01/29/2013,01/18/2013,1,4,963460,172.7,65.8,22.0,172.7,65.8,22.0,12/24/2013,,,,,,,,,,,0,5,1.0,,1.0,,,,1075943,0.0,0.0,0.0,0,0,0,28FEB1995:00:00:00.000,,N,,,,,0,0,0,0,0,0,0,0,0,0,0,0,,,2010.0,1.0,V,12/25/2012,2.0,,0,CA,N,,,,0,1.0,N,,,,,,,,0,N,0,,,,,,VAD,1,,N,,,N,3.0,N,Y,N,N,3.6,N,5.0,227.0,,320.0,,Y,2.0,0,0,16.0,VAD,0,0,,,,,,,,,,,,,,,,,,,N,N,,P,N,N,ND,,,,2204.0,,2201.0,,,,1.0,329.0,999,1.0,329.0,D,2100.0,,,Y,,,,,,,501,V,0,,,,N,N,,,N,N,N,,2.0,29.0,44.0,49.0,4.0,15.0,24.0,24.0,35.0,61.0,4.0,4.0,2.0,2.0,1.0,5.0,,,2.0,0.0,,,N,,P,N,1.0,N,31.0,2,N,N,,,A,N,C,M,CA,,,N,,,N,Y,0.0,,18.0,1.3,,,,,,,,0.0,,,,Y,N,N,N,"KCL,","DOPAMINE, VASOPRESSIN,",,"ZOSYN, VANCOMYCIN",1.0,Y,46.0,40.0,0.4,0.0,,N,N,Y,,,,,,,,,,,N,,1.0,N,,N,,N,,N,,,1.0,N,N,N,,,,P,,,,,3.0,,,,,Y,,3.0,7.0,01/29/2013,Y,N,Y,172.7,107.0,35.867286,,,,12/24/2013,01/29/2013,04/15/2013,O,,1.0,45,1000.0,N,4.9,999.0,TCAD,12/24/2013,N,3,76.0,,A,,Y,,,,,HR,Y,,N,,,172.7,65.3,21.9,83.0,Y,1.0,,,A602500,07MAR2013:19:06:35.000,,,,N,,01/27/2013,N,N,,182.0,N,1.0,60.0,1.0,,1.0,,Y,,V,27FEB2013:16:31:42.000,424207,ND,P,N,N,,,,,,,0.0,2013,N,N,01/27/2013,2013,1.0,,,Y,47.0,40.0,N,,,,,,,,,,,,,,,,,,,,,2.0,2.0,4.0,7.38,26.2,,,,,,Y,Y,23901,7657,7657,7657,23901
1,HR,,,,0,1049,,,,,,,N,M,O,72.575,185.0,21.1093,,,,1.0,,TX,3.0,0,0,0,0,0,0,0,0,,2.0,205.0,,,,,,2010.0,3.0,,1006.0,,1.0,1.0,N,N,,,2.3,2.9,,Y,,53.0,39.0,46.0,31.0,2.1,N,N,N,N,N,N,,N,,,,,,,,,,,,,0,3,0,263,0,0,0,0,0,0,11.0,2010.0,,,,,,,,,,,,4,1,348,2020.0,35,Y,02/15/2012,,01/28/2013,02/15/2012,1,4,917326,185.4,72.6,21.1,185.4,72.6,21.1,,,,,,,,,,,,0,4,,,,,,1.0,1021545,0.0,0.0,0.0,0,0,0,09MAY1994:00:00:00.000,,N,,,,,0,0,0,0,0,0,0,0,0,0,0,0,,,2090.0,3.0,V,01/28/2013,3.0,,0,TX,N,,,,0,0.7,N,,,4.6,8.0,18.0,4.0,28.0,0,N,0,N,N,N,N,N,,0,,N,,,N,3.0,N,N,N,N,0.9,N,2.0,205.0,,,,N,,0,0,,,0,0,,,,,,,,,,,,,,,,,,,N,N,,P,N,N,P,,,,,,,,,,0.0,2592.0,80,0.0,2592.0,A,,,,Y,,,,,,,501,V,0,,,,Y,Y,,,,N,N,,2.0,68.0,49.0,60.0,10.0,16.0,30.0,68.0,48.0,65.0,8.0,9.0,1.0,2.0,2.0,5.0,,,0.0,0.0,,,N,,P,N,4.0,N,30.0,4,N,N,,,O,N,C,M,TX,,,N,,,N,N,0.0,,29.0,0.7,,,,,,,,0.0,,,,Y,N,N,Y,,,,"ANCEF, FENTANYL, VERSED,LASIX",1.0,Y,38.0,171.0,0.8,0.0,,N,N,Y,,,,,,,,,,,N,,1.0,N,,N,,N,,N,,,1.0,N,N,N,,,,P,,,,,3.0,,,,,N,,5.0,9.0,01/28/2013,Y,Y,N,172.7,67.1,22.492476,,,,03/04/2020,01/28/2013,02/07/2013,O,,1.0,36,1000.0,N,3.1,,,,Y,3,10.0,,A,,Y,,,,,HR,N,,N,,,185.4,94.8,27.6,0.0,N,1.0,,,A602494,11FEB2013:16:24:00.000,,,,N,,01/12/2013,N,N,,323.0,N,1.0,55.0,1.0,,1.0,,N,,V,03MAR2013:21:46:46.000,424199,ND,P,N,N,1.0,,,,,,0.0,2013,Y,N,01/26/2013,2012,2.0,,,Y,100.0,28.0,N,,,,,,,,,,,,,,,,,,,,,2.0,2.0,3.0,7.5,27.8,,,,,,Y,Y,5487,11377,11377,11377,5487
2,HR,,,,0,1007,,,,,,,N,M,O,106.0,175.0,34.5096,,,,1.0,,NY,4.0,1,1,0,0,0,0,0,0,,1.0,,,,,,,2010.0,1.0,,1007.0,,1.0,1.0,N,N,,,1.4,2.2,,N,,52.0,23.0,32.0,,5.0,Y,Y,Y,,Y,N,,Y,17.0,VSD,,,,,,,,,,,0,53,0,2,0,0,0,0,0,0,,2010.0,,,,,,,,,,,,4,1,55,2010.0,58,Y,12/06/2012,,01/30/2013,12/06/2012,0,1,957421,175.3,106.0,34.5,175.3,106.0,34.5,,,,,,,,,,,,1,9,,,,,,,1069137,0.0,0.0,0.0,0,0,0,,,N,,,,,0,0,0,0,0,0,0,0,0,0,0,0,,,2020.0,2.0,V,11/21/2012,1.0,,0,NY,N,,,,0,2.19,N,,,4.4,21.0,22.0,13.0,28.0,0,N,1,Y,Y,Y,Y,Y,,0,,N,,,N,3.0,N,N,N,N,1.2,Y,1.0,,,,,Y,2.0,0,0,17.0,VSD,0,0,,,,,,,,,,,,,,,,,,,N,N,,N,N,N,P,,,,,,,,,,0.0,2913.0,80,0.0,2913.0,A,,,,Y,,,,,,,501,V,0,,,,N,Y,,,N,N,N,,2.0,3.0,7.0,44.0,11.0,15.0,2.0,3.0,35.0,44.0,4.0,13.0,0.0,1.0,2.0,3.0,,,0.0,0.0,,,N,,N,N,1.0,N,15.0,1,N,N,,,O,N,C,M,NY,,,N,,,N,N,0.0,,15.0,0.74,,,,,,,,0.0,,,,Y,Y,N,Y,"AMPHOTERICIN, BETADINE",ZANTAC,,"ZOSYN, VANCOMYCIN",1.0,Y,445.0,217.0,0.7,0.0,,N,N,Y,,,,,,,,Y,,,N,,1.0,N,,N,,N,,N,,,1.0,N,Y,N,,,,N,,,,,1.0,,,,,N,,997.0,3.0,01/30/2013,Y,Y,N,170.0,59.0,20.415225,,,,01/21/2021,01/30/2013,02/20/2013,O,,1.0,58,1007.0,N,1.4,,,,Y,3,21.0,,A,,Y,,,,,HR,Y,,N,,,175.3,90.7,29.5,0.0,Y,1.0,,,A602586,22APR2013:17:29:07.000,,,,N,,01/26/2013,N,N,,401.0,N,1.0,60.0,1.0,,1.0,,N,,V,15FEB2013:14:07:33.000,424109,ND,N,N,N,999.0,4.0,,MILRINONE,,,0.0,2013,Y,N,01/27/2013,2012,0.0,,,Y,100.0,41.0,N,,,,,,,,,,,,,,,,,,,,,2.0,,5.0,7.4,25.4,,,,,,Y,Y,12834,14012,14012,14012,12834
3,HR,,,,0,1007,,,,,,,N,M,A,111.0,178.0,35.0335,,,,1.0,,NY,3.0,0,0,0,0,0,1,0,0,,5.0,227.0,,320.0,,,,2020.0,4.0,,1007.0,,1.0,1.0,N,N,,,1.1,3.1,,N,,,,,,,,,,,,Y,1.0,Y,16.0,VAD PLACEMENT,,,,,,,,,,,0,96,0,296,0,0,0,0,0,0,,2010.0,,,,,,,,,,,,4,1,392,2010.0,65,Y,01/03/2012,,01/29/2013,01/03/2012,0,1,912325,178.0,111.0,35.0,178.0,111.6,35.2,06/07/2016,,,,,,,,,,,0,9,,1.0,,,,,1014528,0.0,0.0,0.0,0,0,0,08MAY1992:00:00:00.000,,N,,,,,0,0,0,0,0,0,0,0,0,0,0,0,,,2080.0,3.0,V,01/28/2013,1.0,,0,NY,N,,,,0,1.35,N,,,4.8,12.0,22.0,12.0,34.0,0,N,0,N,N,N,N,N,,0,,N,,,N,1.0,N,N,N,N,0.7,Y,2.0,205.0,,,,Y,2.0,0,0,16.0,BIVAD XPLANT/LVAD INSERTION,0,0,,,,,,,,,,,,,,,,,,,N,N,,N,N,N,P,,,,2003.0,,2299.0,VASODILATORY SHOCK,2304.0,,1.0,1225.0,999,1.0,1225.0,D,2030.0,,,Y,,,,,,,501,V,0,,,,Y,Y,,,Y,N,N,,33.0,68.0,39.0,51.0,1.0,8.0,24.0,26.0,35.0,38.0,4.0,11.0,2.0,2.0,2.0,6.0,,,0.0,0.0,,,N,,N,ND,1.0,N,47.0,1,N,N,,,A1,N,C,F,NY,,,N,,,N,Y,1.0,N,29.0,1.34,,,,,,,,0.0,,,,Y,Y,N,Y,"D50, DOPAMINE, DIFLUCAN",,,"KCL, VANCO, MAG, ZOSYN",1.0,Y,248.0,165.0,0.3,1.0,N,N,N,Y,2.0,0.5,,,Y,,,Y,N,N,N,,2.0,Y,,Y,,N,Y,Y,,,1.0,N,Y,N,,,,P,,,,,2.0,,,,,N,,6.0,11.0,01/29/2013,Y,Y,Y,163.0,115.2,43.358802,,,,06/07/2016,01/29/2013,03/02/2013,O,,1.0,66,1007.0,N,3.5,3.0,,06/07/2016,N,4,32.0,,A,,Y,,,,,HR,Y,,N,,,177.8,113.0,35.7,252.0,Y,1.0,,,A602368,07MAR2013:17:18:31.000,,,,N,,01/27/2013,Y,N,,267.0,N,1.0,60.0,2.0,,1.0,,Y,,V,27FEB2013:11:41:18.000,424065,ND,P,N,N,,,,,,,0.0,2013,N,N,01/27/2013,2012,0.0,,,Y,100.0,36.0,N,,,,,,,,,,,,,,,,,,,,,,,5.0,7.37,38.3,,,,,,Y,Y,124,17639,14012,14012,124
4,HR,,,,0,1000,,,,,,,N,M,A,72.0,172.0,24.3375,,,,1.0,,NC,5.0,0,0,0,0,0,0,0,0,,2.0,205.0,,,,,,2090.0,4.0,,1202.0,,1.0,1.0,N,N,,,0.8,,,Y,,30.0,15.0,20.0,15.0,4.75,N,N,N,N,N,N,,Y,16.0,LVAD IMPLANT,,,,,,,,,,,0,19,0,23,0,0,0,0,0,0,,2020.0,,,,,,,,,,,,4,1,42,2010.0,70,Y,12/19/2012,,01/30/2013,12/19/2012,0,1,959488,172.0,72.0,24.3,172.0,72.0,24.3,,,,,,,,,,,,0,11,1.0,,,,,,1071205,0.0,0.0,0.0,0,0,0,02FEB1995:08:20:06.000,,N,,,,,0,0,0,0,0,0,0,0,0,0,0,0,,,2090.0,3.0,V,01/29/2013,4.0,,0,NC,N,,,,0,0.8,N,,,4.75,15.0,20.0,15.0,30.0,0,N,0,N,N,N,N,N,,0,,N,,,N,3.0,N,N,N,N,0.8,N,2.0,205.0,,,,N,,0,0,,,0,0,,,,,,,,,,,,,,,,,,,N,N,,P,N,N,P,,,,,,,,,,0.0,2785.0,80,0.0,2785.0,A,,,,Y,,,,,,,501,V,0,,,,Y,Y,,,N,N,N,,3.0,24.0,61.0,64.0,7.0,15.0,3.0,24.0,7.0,38.0,13.0,14.0,0.0,2.0,2.0,4.0,,,0.0,0.0,,,N,,P,N,1.0,N,22.0,1,N,N,,,A2,N,C,F,NC,,,N,,,N,Y,0.0,,70.0,5.1,,,,,,,,0.0,,,,Y,Y,N,Y,"ZOSYN, CIPROFLOXACIN","BUMETIDE, ANCEF",,"ROCURONIUM, VECURONIUM",1.0,Y,137.0,174.0,0.5,0.0,,N,N,Y,,,,,,,,Y,,,N,,1.0,N,,N,,N,,N,,,1.0,N,Y,N,,,,P,,,,,1.0,,,,,N,,2.0,3.0,01/30/2013,Y,N,Y,182.9,113.0,33.786719,,,,09/15/2020,01/30/2013,02/25/2013,O,,1.0,70,1000.0,N,2.8,,,,Y,3,26.0,,A,,Y,,,,,HR,N,,N,,,173.0,72.0,24.1,60.0,N,1.0,,,A602501,13MAR2013:17:15:33.000,,,,N,,01/25/2013,N,Y,15.0,209.0,N,1.0,55.0,1.0,,1.0,,Y,,V,11FEB2013:12:44:02.000,424107,P,P,N,N,,,,,,,0.0,2013,N,N,01/28/2013,2012,0.0,,,Y,100.0,33.0,N,,,,,,,,,,,,,,,,,,,,,2.0,,5.0,7.51,30.7,,,,,,Y,Y,19499,14911,14911,14911,19499


#### Initial Data Wrangling

In [17]:
# get columns
dictList = df_dict.Feature.to_list()
dfList = df.columns.to_list()

# remove from dictiony
removeCols = list((set(dictList)) - (set(dfList)))

# remove not existing features in DatFrame
df_dict = uf.removeRowUsingMask(df_dict, removeCols, 'Feature').copy()

Remove 26 row(s) from Feature column in a DataFrame.


In [18]:
# remove columns where NaNs >= 80%
dataNaN = uf.percentageNull(df)
# get features
removeCols = dataNaN.index[dataNaN.percentage >= 80]

# remove features & rows
df = uf.removeColumn(df, removeCols).copy()
df_dict = uf.removeRowUsingMask(df_dict, removeCols, 'Feature').copy()


Removed Features: ['ABN_CONGEN_DON', 'ABN_LVH_DON', 'ABN_VALVES_DON', 'ACADEMIC_LEVEL_TCR', 'ACADEMIC_LEVEL_TRR', 'ACADEMIC_PRG_TCR', 'ACADEMIC_PRG_TRR', 'BLOOD_INF_CONF_DON', 'CALC_LAS_LISTDATE', 'CANCER_FREE_INT_DON', 'CANCER_OTH_OSTXT_DON', 'CARDIAC_OUTPUT_CATH_INIT_DON', 'CARDIAC_OUTPUT_CATH_OLD_DON', 'CARDIAC_OUTPUT_CATH_POST_DON', 'CARD_IDX_INIT_DON', 'CARD_IDX_POST_DON', 'CIG_GRT_10_OLD', 'CITIZEN_COUNTRY', 'CMV_IGG_DON', 'CMV_IGM_DON', 'CMV_NUCLEIC_DON', 'CMV_OLD_LIV_DON', 'CMV_TEST_DON', 'COD', 'COD2', 'COD2_OSTXT', 'COD3', 'COD3_OSTXT', 'COD_LIV_DON', 'COD_OSTXT', 'COD_OSTXT_DON', 'COD_OSTXT_WL', 'COD_WL', 'COGNITIVE_DEV_TRR', 'COMPOSITE_DEATH_DATE', 'CONTIN_ALCOHOL_OLD_DON', 'CONTIN_CIG_DON', 'CONTIN_CIG_OLD', 'CONTIN_IV_DRUG_OLD_DON', 'CONTROLLED', 'CREAT2_OLD', 'CVP_CATH_INIT_DON', 'CVP_CATH_OLD_DON', 'CVP_CATH_POST_DON', 'DANTIARR_OLD', 'DEATH_DATE', 'DIABDUR_DON', 'DIAG_OSTXT', 'DIASTOLIC_PA_CATH_OLD_DON', 'DIAST_PA_CATH_INIT_DON', 'DIAST_PA_CATH_POST_DON', 'DIET_DON', 

##### REMOVE ENCRYPTED Features
- **WL_ORG**: ORGAN LISTED FOR
- **WL_ID_CODE**: ENCRYPTED REGISTRATION IDENTIFIER
- **PT_CODE**: ENCRYPTED RECIPIENT IDENTIFIER
- **DONOR_ID**: ENCRYPTED DONOR IDENTIFIER
- **CTR_CODE**: ENCRYPTED REMOVAL/CURRENT OPO MAPPED FROM LISTING CENTER AND ENDING DATE
- **TRR_ID_CODE**: ENCRYPTED TRANSPLANT IDENTIFIER
- **OPO_CTR_CODE**: ENCRYPTED INITIAL OPO MAPPED FROM LISTING CENTER AND BEGINNING DATE
- **INIT_OPO_CTR_CODE**: ENCRYPTED INITIAL OPO MAPPED FROM LISTING CENTER AND BEGINNING DATE
- **LISTING_CTR_CODE**: ENCRYPTED WL LISTING CENTER
- **END_OPO_CTR_CODE**: ENCRYPTED REMOVAL/CURRENT OPO MAPPED FROM LISTING CENTER AND ENDING DATE
- **AGE_GROUP**: RECIPIENT AGE GROUP A=ADULT P=PEDS

In [19]:
df.AGE_GROUP.value_counts()

AGE_GROUP
A    28751
Name: count, dtype: int64

In [20]:
# remove fetures
removeCols = ['WL_ORG', 'WL_ID_CODE', 'PT_CODE', 'DONOR_ID', 'CTR_CODE', 'OPO_CTR_CODE', 'INIT_OPO_CTR_CODE', 
               'LISTING_CTR_CODE','END_OPO_CTR_CODE', 'AGE_GROUP', 'TRR_ID_CODE']

# dipaly data dictionary
uf.dataDictSearch(df_dict, removeCols)

               Feature                                                                   Description FormSection  DataType SASAnalysisFormat Comment Information
9            AGE_GROUP                                            RECIPIENT AGE GROUP A=ADULT P=PEDS               CHAR(1)                               Unknown
49            CTR_CODE                                              ENCRYPTED TRANSPLANT CENTER CODE               CHAR(7)                               Unknown
82            DONOR_ID                                                    ENCRYPTED DONOR IDENTIFIER                   NUM                               Unknown
104   END_OPO_CTR_CODE      ENCRYPTED REMOVAL/CURRENT OPO MAPPED FROM LISTING CENTER AND ENDING DATE               CHAR(7)                               Unknown
172  INIT_OPO_CTR_CODE           ENCRYPTED INITIAL OPO MAPPED FROM LISTING CENTER AND BEGINNING DATE               CHAR(7)                               Unknown
197   LISTING_CTR_CODE            

In [21]:
# remove features & rows
df = uf.removeColumn(df, removeCols).copy()
df_dict = uf.removeRowUsingMask(df_dict, removeCols, 'Feature').copy()


Removed Features: ['AGE_GROUP', 'CTR_CODE', 'DONOR_ID', 'END_OPO_CTR_CODE', 'INIT_OPO_CTR_CODE', 'LISTING_CTR_CODE', 'OPO_CTR_CODE', 'PT_CODE', 'TRR_ID_CODE', 'WL_ID_CODE', 'WL_ORG']

Total Row(s) & Column(s) Before Removing Column(s): 28,751 & columns: 317
Total Row(s) & Column(s) After Removing Column(s): 28,751 & columns: 306
Remove 11 row(s) from Feature column in a DataFrame.


#### Display DataFrame

In [22]:
df.head()

Unnamed: 0,NUM_PREV_TX,THORACIC_DGN,GENDER,ABO,WGT_KG_TCR,HGT_CM_TCR,BMI_TCR,CITIZENSHIP,PERM_STATE,EDUCATION,ECMO_TCR,IABP_TCR,PROS_INFUS_TCR,PROSTACYCLIN_TCR,INHALED_NO,INOTROPES_TCR,PGE_TCR,OTH_LIFE_SUP_TCR,VAD_DEVICE_TY_TCR,VAD_BRAND1_TCR,FUNC_STAT_TCR,PRI_PAYMENT_TCR,TCR_DGN,DIAB,DIAL_TY_TCR,CEREB_VASC,MALIG_TCR,MOST_RCNT_CREAT,TOT_SERUM_ALBUM,IMPL_DEFIBRIL,HEMO_SYS_TCR,HEMO_PA_DIA_TCR,HEMO_PA_MN_TCR,HEMO_PCW_TCR,HEMO_CO_TCR,INOTROP_VASO_SYS_TCR,INOTROP_VASO_DIA_TCR,INOTROP_VASO_MN_TCR,INOTROP_VASO_PCW_TCR,INOTROP_VASO_CO_TCR,CIG_USE,TCR_DUR_ABSTAIN,PRIOR_CARD_SURG_TCR,PRIOR_CARD_SURG_TYPE_TCR,PRIOR_CARD_SURG_TYPE_OSTXT_TCR,DAYS_STAT1,DAYS_STAT1A,DAYS_STAT2,DAYS_STAT1B,DAYS_STATA4,DAYS_STATA5,DAYS_STATA2,DAYS_STATA3,DAYS_STATA1,DAYS_STATA6,LAST_INACT_REASON,INIT_STAT,REM_CD,TXED,DAYSWAIT_CHRON,END_STAT,INIT_AGE,LIFE_SUP_TCR,ACTIVATE_DATE,END_DATE,INIT_DATE,ETHNICITY,ETHCAT,INIT_HGT_CM_CALC,INIT_WGT_KG_CALC,INIT_BMI_CALC,END_HGT_CM_CALC,END_WGT_KG_CALC,END_BMI_CALC,VENTILATOR_TCR,REGION,INIT_LLU_FLG,INIT_RLU_FLG,INIT_BLU_FLG,END_LLU_FLG,END_RLU_FLG,END_BLU_FLG,VAL_DT_TCR,WORK_INCOME_TCR,BW4,BW6,C1,C2,DR51,DR51_2,DR52,DR52_2,DR53,DR53_2,DQ1,DQ2,FUNC_STAT_TRR,MED_COND_TRR,STATUS_TRR,ADMISSION_DATE,PRI_PAYMENT_TRR,ECMO_TRR,PERM_STATE_TRR,WORK_INCOME_TRR,PGE_TRR,CREAT_TRR,DIAL_AFTER_LIST,HEMO_CO_TRR,HEMO_PA_DIA_TRR,HEMO_PA_MN_TRR,HEMO_PCW_TRR,HEMO_SYS_TRR,IABP_TRR,INFECT_IV_DRUG_TRR,INOTROPES_TRR,INOTROP_VASO_CO_TRR,INOTROP_VASO_DIA_TRR,INOTROP_VASO_MN_TRR,INOTROP_VASO_PCW_TRR,INOTROP_VASO_SYS_TRR,OTH_LIFE_SUP_TRR,PRIOR_LUNG_SURG_TRR,PST_AIRWAY,ACUTE_REJ_EPI,PST_STROKE,PST_DIAL,PST_PACEMAKER,STEROID,TBILI,TRANSFUSIONS,VAD_DEVICE_TY_TRR,VAD_BRAND1_TRR,VENT_SUPPORT_TRR,VENTILATOR_TRR,INHALED_NO_TRR,PRIOR_CARD_SURG_TYPE_TRR,PROS_INFUS_TRR,PROSTACYCLIN_TRR,HBV_CORE,HBV_SUR_ANTIGEN,HBV_SURF_TOTAL,CMV_STATUS,HIV_SEROSTATUS,HCV_SEROSTATUS,EBV_SEROSTATUS,HIV_NAT,HCV_NAT,HBV_NAT,GSTATUS,GTIME,LASTFUNO,PSTATUS,PTIME,PX_STAT,FUNC_STAT_TRF,TXHRT,TX_PROCEDUR_TY,STATUS_TCR,INHALED_NO_TCR,DON_RETYP,CRSMATCH_DONE,CPRA,CPRA_PEAK,TRTREJ1Y,PREV_TX,PREV_TX_ANY,DA1,DA2,DB1,DB2,DDR1,DDR2,RA1,RA2,RB1,RB2,RDR1,RDR2,AMIS,BMIS,DRMIS,HLAMIS,PRAMR_CL1,PRAMR_CL2,MALIG_TRR,CMV_IGG,CMV_IGM,CITIZENSHIP_DON,HIST_COCAINE_DON,AGE_DON,ETHCAT_DON,HBV_CORE_DON,HBV_SUR_ANTIGEN_DON,ABO_DON,ALCOHOL_HEAVY_DON,DON_TY,GENDER_DON,HOME_STATE_DON,HEP_C_ANTI_DON,NON_HRT_DON,ANTIHYPE_DON,BLOOD_INF_DON,BUN_DON,CREAT_DON,OTHER_INF_DON,PT_DIURETICS_DON,PT_STEROIDS_DON,PT_T3_DON,PT_T4_DON,PT_OTH2_OSTXT_DON,PT_OTH3_OSTXT_DON,PT_OTH1_OSTXT_DON,PULM_INF_DON,SGOT_DON,SGPT_DON,TBILI_DON,URINE_INF_DON,VASODIL_DON,VDRL_DON,CLIN_INFECT_DON,CONTIN_COCAINE_DON,CONTIN_OTH_DRUG_DON,EXTRACRANIAL_CANCER_DON,CANCER_SITE_DON,HIST_CIG_DON,HIST_HYPERTENS_DON,INTRACRANIAL_CANCER_DON,HIST_CANCER_DON,HIST_DIABETES_DON,DIABETES_DON,HIST_OTH_DRUG_DON,SKIN_CANCER_DON,CMV_DON,COD_CAD_DON,DDAVP_DON,DEATH_CIRCUM_DON,DEATH_MECH_DON,RECOVERY_DATE_DON,HEPARIN_DON,ARGININE_DON,INSULIN_DON,HGT_CM_DON_CALC,WGT_KG_DON_CALC,BMI_DON_CALC,HBV_NAT_DON,HCV_NAT_DON,HIV_NAT_DON,PX_STAT_DATE,TX_DATE,DISCHARGE_DATE,TX_TYPE,ABO_MAT,AGE,DIAG,DIAL_PRIOR_TX,ISCHTIME,GRF_STAT,SHARE_TY,LOS,LIFE_SUP_TRR,ORGAN,PRIOR_CARD_SURG_TRR,MALIG,HGT_CM_CALC,WGT_KG_CALC,BMI_CALC,DISTANCE,VENT_SUPPORT_AFTER_LIST,PROC_TY_HR,VAL_DT_TRR,RECOV_OUT_US,ADMIT_DATE_DON,PROTEIN_URINE,CARDARREST_NEURO,PO2,HIST_MI,LV_EJECT_METH,LV_EJECT,CORONARY_ANGIO,BIOPSY_DGN,TATTOOS,STATUS_DDR,VAL_DT_DDR,HBSAB_DON,EBV_IGG_CAD_DON,EBV_IGM_CAD_DON,CDC_RISK_HIV_DON,INO_PROCURE_AGENT_1,ECD_DONOR,TX_YEAR,INOTROP_SUPPORT_DON,LT_ONE_WEEK_DON,REFERRAL_DATE,LISTYR,TRANSFUS_TERM_DON,PO2_DONE_DON,PO2_FIO2_DON,PCO2_DON,PULM_CATH_DON,BRONCHO_LT_DON,BRONCHO_RT_DON,CHEST_XRAY_DON,PH_DON,HEMATOCRIT_DON,DATA_TRANSPLANT,DATA_WAITLIST
0,0,1000,M,A,65.771,173.0,22.047,1.0,CA,4.0,0,0,0,0,0,0,0,1,5.0,227.0,2010.0,2.0,1000.0,1.0,1.0,N,N,1.3,4.8,Y,50.0,34.0,37.0,36.0,2.5,Y,Y,Y,Y,Y,N,,Y,2.0,,0,11,0,0,0,0,0,0,0,0,,2010.0,4,1,11,2010.0,45,Y,01/18/2013,01/29/2013,01/18/2013,1,4,172.7,65.8,22.0,172.7,65.8,22.0,0,5,0.0,0.0,0.0,0,0,0,28FEB1995:00:00:00.000,N,0,0,0,0,0,0,0,0,0,0,0,0,2010.0,1.0,V,12/25/2012,2.0,0,CA,N,0,1.0,N,,,,,,0,N,0,,,,,,1,N,N,3.0,N,Y,N,N,3.6,N,5.0,227.0,Y,0,0,16.0,0,0,N,N,,P,N,N,ND,,,,1.0,329.0,999,1.0,329.0,D,2100.0,Y,501,V,0,N,N,,,N,N,N,2.0,29.0,44.0,49.0,4.0,15.0,24.0,24.0,35.0,61.0,4.0,4.0,2.0,2.0,1.0,5.0,2.0,0.0,N,P,N,1.0,N,31.0,2,N,N,A,N,C,M,CA,N,N,Y,0.0,18.0,1.3,0.0,Y,N,N,N,"KCL,","DOPAMINE, VASOPRESSIN,","ZOSYN, VANCOMYCIN",1.0,46.0,40.0,0.4,0.0,N,N,Y,,,N,1.0,N,N,N,N,1.0,N,N,N,P,3.0,Y,3.0,7.0,01/29/2013,Y,N,Y,172.7,107.0,35.867286,,,,12/24/2013,01/29/2013,04/15/2013,O,1.0,45,1000.0,N,4.9,N,3,76.0,Y,HR,Y,N,172.7,65.3,21.9,83.0,Y,1.0,07MAR2013:19:06:35.000,N,01/27/2013,N,N,182.0,N,1.0,60.0,1.0,1.0,Y,V,27FEB2013:16:31:42.000,ND,P,N,N,,0.0,2013,N,N,01/27/2013,2013,1.0,Y,47.0,40.0,N,2.0,2.0,4.0,7.38,26.2,Y,Y
1,0,1049,M,O,72.575,185.0,21.1093,1.0,TX,3.0,0,0,0,0,0,0,0,0,2.0,205.0,2010.0,3.0,1006.0,1.0,1.0,N,N,2.3,2.9,Y,53.0,39.0,46.0,31.0,2.1,N,N,N,N,N,N,,N,,,0,3,0,263,0,0,0,0,0,0,11.0,2010.0,4,1,348,2020.0,35,Y,02/15/2012,01/28/2013,02/15/2012,1,4,185.4,72.6,21.1,185.4,72.6,21.1,0,4,0.0,0.0,0.0,0,0,0,09MAY1994:00:00:00.000,N,0,0,0,0,0,0,0,0,0,0,0,0,2090.0,3.0,V,01/28/2013,3.0,0,TX,N,0,0.7,N,4.6,8.0,18.0,4.0,28.0,0,N,0,N,N,N,N,N,0,N,N,3.0,N,N,N,N,0.9,N,2.0,205.0,N,0,0,,0,0,N,N,,P,N,N,P,,,,0.0,2592.0,80,0.0,2592.0,A,,Y,501,V,0,Y,Y,,,,N,N,2.0,68.0,49.0,60.0,10.0,16.0,30.0,68.0,48.0,65.0,8.0,9.0,1.0,2.0,2.0,5.0,0.0,0.0,N,P,N,4.0,N,30.0,4,N,N,O,N,C,M,TX,N,N,N,0.0,29.0,0.7,0.0,Y,N,N,Y,,,"ANCEF, FENTANYL, VERSED,LASIX",1.0,38.0,171.0,0.8,0.0,N,N,Y,,,N,1.0,N,N,N,N,1.0,N,N,N,P,3.0,N,5.0,9.0,01/28/2013,Y,Y,N,172.7,67.1,22.492476,,,,03/04/2020,01/28/2013,02/07/2013,O,1.0,36,1000.0,N,3.1,Y,3,10.0,Y,HR,N,N,185.4,94.8,27.6,0.0,N,1.0,11FEB2013:16:24:00.000,N,01/12/2013,N,N,323.0,N,1.0,55.0,1.0,1.0,N,V,03MAR2013:21:46:46.000,ND,P,N,N,1.0,0.0,2013,Y,N,01/26/2013,2012,2.0,Y,100.0,28.0,N,2.0,2.0,3.0,7.5,27.8,Y,Y
2,0,1007,M,O,106.0,175.0,34.5096,1.0,NY,4.0,1,1,0,0,0,0,0,0,1.0,,2010.0,1.0,1007.0,1.0,1.0,N,N,1.4,2.2,N,52.0,23.0,32.0,,5.0,Y,Y,Y,,Y,N,,Y,17.0,VSD,0,53,0,2,0,0,0,0,0,0,,2010.0,4,1,55,2010.0,58,Y,12/06/2012,01/30/2013,12/06/2012,0,1,175.3,106.0,34.5,175.3,106.0,34.5,1,9,0.0,0.0,0.0,0,0,0,,N,0,0,0,0,0,0,0,0,0,0,0,0,2020.0,2.0,V,11/21/2012,1.0,0,NY,N,0,2.19,N,4.4,21.0,22.0,13.0,28.0,0,N,1,Y,Y,Y,Y,Y,0,N,N,3.0,N,N,N,N,1.2,Y,1.0,,Y,0,0,17.0,0,0,N,N,,N,N,N,P,,,,0.0,2913.0,80,0.0,2913.0,A,,Y,501,V,0,N,Y,,,N,N,N,2.0,3.0,7.0,44.0,11.0,15.0,2.0,3.0,35.0,44.0,4.0,13.0,0.0,1.0,2.0,3.0,0.0,0.0,N,N,N,1.0,N,15.0,1,N,N,O,N,C,M,NY,N,N,N,0.0,15.0,0.74,0.0,Y,Y,N,Y,"AMPHOTERICIN, BETADINE",ZANTAC,"ZOSYN, VANCOMYCIN",1.0,445.0,217.0,0.7,0.0,N,N,Y,,Y,N,1.0,N,N,N,N,1.0,N,Y,N,N,1.0,N,997.0,3.0,01/30/2013,Y,Y,N,170.0,59.0,20.415225,,,,01/21/2021,01/30/2013,02/20/2013,O,1.0,58,1007.0,N,1.4,Y,3,21.0,Y,HR,Y,N,175.3,90.7,29.5,0.0,Y,1.0,22APR2013:17:29:07.000,N,01/26/2013,N,N,401.0,N,1.0,60.0,1.0,1.0,N,V,15FEB2013:14:07:33.000,ND,N,N,N,999.0,0.0,2013,Y,N,01/27/2013,2012,0.0,Y,100.0,41.0,N,2.0,,5.0,7.4,25.4,Y,Y
3,0,1007,M,A,111.0,178.0,35.0335,1.0,NY,3.0,0,0,0,0,0,1,0,0,5.0,227.0,2020.0,4.0,1007.0,1.0,1.0,N,N,1.1,3.1,N,,,,,,,,,,,Y,1.0,Y,16.0,VAD PLACEMENT,0,96,0,296,0,0,0,0,0,0,,2010.0,4,1,392,2010.0,65,Y,01/03/2012,01/29/2013,01/03/2012,0,1,178.0,111.0,35.0,178.0,111.6,35.2,0,9,0.0,0.0,0.0,0,0,0,08MAY1992:00:00:00.000,N,0,0,0,0,0,0,0,0,0,0,0,0,2080.0,3.0,V,01/28/2013,1.0,0,NY,N,0,1.35,N,4.8,12.0,22.0,12.0,34.0,0,N,0,N,N,N,N,N,0,N,N,1.0,N,N,N,N,0.7,Y,2.0,205.0,Y,0,0,16.0,0,0,N,N,,N,N,N,P,,,,1.0,1225.0,999,1.0,1225.0,D,2030.0,Y,501,V,0,Y,Y,,,Y,N,N,33.0,68.0,39.0,51.0,1.0,8.0,24.0,26.0,35.0,38.0,4.0,11.0,2.0,2.0,2.0,6.0,0.0,0.0,N,N,ND,1.0,N,47.0,1,N,N,A1,N,C,F,NY,N,N,Y,1.0,29.0,1.34,0.0,Y,Y,N,Y,"D50, DOPAMINE, DIFLUCAN",,"KCL, VANCO, MAG, ZOSYN",1.0,248.0,165.0,0.3,1.0,N,N,Y,,Y,N,2.0,Y,Y,N,Y,1.0,N,Y,N,P,2.0,N,6.0,11.0,01/29/2013,Y,Y,Y,163.0,115.2,43.358802,,,,06/07/2016,01/29/2013,03/02/2013,O,1.0,66,1007.0,N,3.5,N,4,32.0,Y,HR,Y,N,177.8,113.0,35.7,252.0,Y,1.0,07MAR2013:17:18:31.000,N,01/27/2013,Y,N,267.0,N,1.0,60.0,2.0,1.0,Y,V,27FEB2013:11:41:18.000,ND,P,N,N,,0.0,2013,N,N,01/27/2013,2012,0.0,Y,100.0,36.0,N,,,5.0,7.37,38.3,Y,Y
4,0,1000,M,A,72.0,172.0,24.3375,1.0,NC,5.0,0,0,0,0,0,0,0,0,2.0,205.0,2090.0,4.0,1202.0,1.0,1.0,N,N,0.8,,Y,30.0,15.0,20.0,15.0,4.75,N,N,N,N,N,N,,Y,16.0,LVAD IMPLANT,0,19,0,23,0,0,0,0,0,0,,2020.0,4,1,42,2010.0,70,Y,12/19/2012,01/30/2013,12/19/2012,0,1,172.0,72.0,24.3,172.0,72.0,24.3,0,11,0.0,0.0,0.0,0,0,0,02FEB1995:08:20:06.000,N,0,0,0,0,0,0,0,0,0,0,0,0,2090.0,3.0,V,01/29/2013,4.0,0,NC,N,0,0.8,N,4.75,15.0,20.0,15.0,30.0,0,N,0,N,N,N,N,N,0,N,N,3.0,N,N,N,N,0.8,N,2.0,205.0,N,0,0,,0,0,N,N,,P,N,N,P,,,,0.0,2785.0,80,0.0,2785.0,A,,Y,501,V,0,Y,Y,,,N,N,N,3.0,24.0,61.0,64.0,7.0,15.0,3.0,24.0,7.0,38.0,13.0,14.0,0.0,2.0,2.0,4.0,0.0,0.0,N,P,N,1.0,N,22.0,1,N,N,A2,N,C,F,NC,N,N,Y,0.0,70.0,5.1,0.0,Y,Y,N,Y,"ZOSYN, CIPROFLOXACIN","BUMETIDE, ANCEF","ROCURONIUM, VECURONIUM",1.0,137.0,174.0,0.5,0.0,N,N,Y,,Y,N,1.0,N,N,N,N,1.0,N,Y,N,P,1.0,N,2.0,3.0,01/30/2013,Y,N,Y,182.9,113.0,33.786719,,,,09/15/2020,01/30/2013,02/25/2013,O,1.0,70,1000.0,N,2.8,Y,3,26.0,Y,HR,N,N,173.0,72.0,24.1,60.0,N,1.0,13MAR2013:17:15:33.000,N,01/25/2013,N,Y,209.0,N,1.0,55.0,1.0,1.0,Y,V,11FEB2013:12:44:02.000,P,P,N,N,,0.0,2013,N,N,01/28/2013,2012,0.0,Y,100.0,33.0,N,2.0,,5.0,7.51,30.7,Y,Y


In [23]:
print(sorted(df.columns.to_list()))

['ABO', 'ABO_DON', 'ABO_MAT', 'ACTIVATE_DATE', 'ACUTE_REJ_EPI', 'ADMISSION_DATE', 'ADMIT_DATE_DON', 'AGE', 'AGE_DON', 'ALCOHOL_HEAVY_DON', 'AMIS', 'ANTIHYPE_DON', 'ARGININE_DON', 'BIOPSY_DGN', 'BLOOD_INF_DON', 'BMIS', 'BMI_CALC', 'BMI_DON_CALC', 'BMI_TCR', 'BRONCHO_LT_DON', 'BRONCHO_RT_DON', 'BUN_DON', 'BW4', 'BW6', 'C1', 'C2', 'CANCER_SITE_DON', 'CARDARREST_NEURO', 'CDC_RISK_HIV_DON', 'CEREB_VASC', 'CHEST_XRAY_DON', 'CIG_USE', 'CITIZENSHIP', 'CITIZENSHIP_DON', 'CLIN_INFECT_DON', 'CMV_DON', 'CMV_IGG', 'CMV_IGM', 'CMV_STATUS', 'COD_CAD_DON', 'CONTIN_COCAINE_DON', 'CONTIN_OTH_DRUG_DON', 'CORONARY_ANGIO', 'CPRA', 'CPRA_PEAK', 'CREAT_DON', 'CREAT_TRR', 'CRSMATCH_DONE', 'DA1', 'DA2', 'DATA_TRANSPLANT', 'DATA_WAITLIST', 'DAYSWAIT_CHRON', 'DAYS_STAT1', 'DAYS_STAT1A', 'DAYS_STAT1B', 'DAYS_STAT2', 'DAYS_STATA1', 'DAYS_STATA2', 'DAYS_STATA3', 'DAYS_STATA4', 'DAYS_STATA5', 'DAYS_STATA6', 'DB1', 'DB2', 'DDAVP_DON', 'DDR1', 'DDR2', 'DEATH_CIRCUM_DON', 'DEATH_MECH_DON', 'DIAB', 'DIABETES_DON', 'DIAG

## User Function(s)

In [24]:
def EqualBinningNumeric(data, StrCol, binn, StrNewCol):
    # perform qcut
    binned = pd.qcut(data[StrCol], q=binn)
    
    # extract bin ranges and convert to string labels
    bin_ranges = binned.cat.categories
    labels = [f"({int(interval.left)}-{int(interval.right)})" for interval in bin_ranges]
    
    # apply qcut with the bin ranges as labels
    data[StrNewCol] = pd.qcut(data[StrCol], q=binn, labels=labels)

    return data
    

def findMappingDfFlat(dataSeries, dfFlat, formatStr, NaN):
    # fill NaNs
    dataSeries = dataSeries.fillna(NaN)
    # check datatype of NaN
    if not isinstance(NaN, str):
        # convert to integer
        dataSeries = dataSeries.astype(int)
    # initialize variables
    codeList = list(dataSeries.unique().astype(str))
    # code from f;atfile
    flatList = dfFlat['CODE'][dfFlat.FMTNAME == formatStr].to_list()
    # intersection
    intersectionList = list(set(codeList).intersection(set(flatList)))
    # compare length
    print(f"Compare Length: {len(codeList)} & {len(intersectionList)}\n")
    # print
    print(dfFlat[['CODE','LABEL']][(dfFlat.FMTNAME == formatStr) & dfFlat.CODE.isin(intersectionList)].to_string(index=False, index_names=False))

## Wrangling

### LABELS

#### ACUTE_REJ_EPI
- AcuteRejectionEpisode

In [25]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'ACUTE', True)

                 count      mean      std  min  25%  50%  75%  max
ACUTE_REJ_EPI  28279.0  2.704657  0.64727  1.0  3.0  3.0  3.0  3.0

NaNs:
ACUTE_REJ_EPI    472
dtype: int64

Datatypes:
ACUTE_REJ_EPI    float64
dtype: object


         Feature                                                     Description                           FormSection DataType SASAnalysisFormat Comment Information
4  ACUTE_REJ_EPI  DID RECIPIENT HAVE ANY ACUTE REJECTION EPISODES PRE DISCHARGE?  POST TRANSPLANT CLINICAL INFORMATION      NUM          REJEPIKI             Unknown


ACUTE_REJ_EPI: [ 3.  1.  2. nan]


In [26]:
# fill NaN with 999: Missing
df[features] = df[features].fillna(999).astype(int)

# df_flat FMTNAME: REJEPIKI
mapping = {
    1: "Yes, at least one episode treated with anti-rejection agent",
    2: "Yes, none treated with additional anti-rejection agent",
    3: "No",
    999: "Missing"
}

# map
df = uf.mappingCol(df, 'ACUTE_REJ_EPI', mapping, False)

# mapping
colMap = {'ACUTE_REJ_EPI': 'AcuteRejectionEpisode'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"{LABEL} FMTNAME: REJEPIKI")

# update dataframe
df_label  = uf.insertIntoDataFrame(df_label, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
4,AcuteRejectionEpisode,DID RECIPIENT HAVE ANY ACUTE REJECTION EPISODES PRE DISCHARGE?,TRR,2004-06-30,NaT,POST TRANSPLANT CLINICAL INFORMATION,NUM,REJEPIKI,,ACUTE_REJ_EPI,Category,** LABEL ** FMTNAME: REJEPIKI


#### PST_
- PST_AIRWAY & PST_STROKE & PST_DIAL & PST_PACEMAKER
    - Airway Dehiscence Post Transplant
    - Stroke Post Transplant
    - Dialysis Post Discharge
    - Pacemaker Post Transplant

In [27]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PST_', True)

               count unique top   freq
PST_AIRWAY     28287      3   N  28069
PST_STROKE     28288      3   N  27200
PST_DIAL       28285      3   N  24270
PST_PACEMAKER  28283      3   N  27447

NaNs:
PST_AIRWAY       464
PST_STROKE       463
PST_DIAL         466
PST_PACEMAKER    468
dtype: int64

Datatypes:
PST_AIRWAY       object
PST_STROKE       object
PST_DIAL         object
PST_PACEMAKER    object
dtype: object


           Feature                                     Description                           FormSection DataType SASAnalysisFormat Comment Information
234     PST_AIRWAY    EVENTS PRIOR TO DISCHARGE: AIRWAY DEHISCENCE  POST TRANSPLANT CLINICAL INFORMATION  CHAR(1)                               Unknown
235       PST_DIAL             EVENTS PRIOR TO DISCHARGE: DIALYSIS  POST TRANSPLANT CLINICAL INFORMATION  CHAR(1)                               Unknown
236  PST_PACEMAKER  EVENTS PRIOR TO DISCHARGE: PERMANENT PACEMAKER  POST TRANSPLANT CLINICAL INFORMATION  CHAR(1)        

In [28]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)

# mapping
colMap = {'PST_AIRWAY': 'AirwayDehiscencePostTransplant', 'PST_STROKE': 'StrokePostTransplant', 
          'PST_PACEMAKER': 'PacemakerPostTransplant','PST_DIAL':'DialysisPostDischarge'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"{LABEL} N/Y/U/X to No/Yes/Unknow/Missing")

# update dataframe
df_label  = uf.insertIntoDataFrame(df_label, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column PST_AIRWAY Unique Vaue(s) ['No', 'Unknown', 'Yes', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column PST_STROKE Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column PST_DIAL Unique Vaue(s) ['Yes', 'No', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column PST_PACEMAKER Unique Vaue(s) ['No', 'Unknown', 'Yes', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
234,AirwayDehiscencePostTransplant,EVENTS PRIOR TO DISCHARGE: AIRWAY DEHISCENCE,TRR,1994-04-01,NaT,POST TRANSPLANT CLINICAL INFORMATION,CHAR(1),,,PST_AIRWAY,Category,** LABEL ** N/Y/U/X to No/Yes/Unknow/Missing
235,DialysisPostDischarge,EVENTS PRIOR TO DISCHARGE: DIALYSIS,TRR,1994-04-01,NaT,POST TRANSPLANT CLINICAL INFORMATION,CHAR(1),,,PST_DIAL,Category,** LABEL ** N/Y/U/X to No/Yes/Unknow/Missing
236,PacemakerPostTransplant,EVENTS PRIOR TO DISCHARGE: PERMANENT PACEMAKER,TRR,1994-04-01,NaT,POST TRANSPLANT CLINICAL INFORMATION,CHAR(1),,,PST_PACEMAKER,Category,** LABEL ** N/Y/U/X to No/Yes/Unknow/Missing
237,StrokePostTransplant,EVENTS PRIOR TO DISCHARGE: STROKE,TRR,1994-04-01,NaT,POST TRANSPLANT CLINICAL INFORMATION,CHAR(1),,,PST_STROKE,Category,** LABEL ** N/Y/U/X to No/Yes/Unknow/Missing


#### GSTATUS
- GraftFailStatus

In [29]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'GSTATUS', True)

           count     mean       std  min  25%  50%  75%  max
GSTATUS  28325.0  0.20286  0.402136  0.0  0.0  0.0  0.0  1.0

NaNs:
GSTATUS    426
dtype: int64

Datatypes:
GSTATUS    float64
dtype: object


     Feature           Description FormSection DataType SASAnalysisFormat Comment Information
114  GSTATUS  GRAFT FAILED (1=YES)                  NUM                               Unknown


GSTATUS: [ 1.  0. nan]


In [30]:
# fill NaN with 9: Missing
df[features] = df[features].fillna(9).astype(int)

# feature value mapping
mapping = {0: 'Success', 1: 'Failure', 9: 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'GSTATUS': 'GraftFailStatus'}


# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"{LABEL}")

# update dataframe
df_label  = uf.insertIntoDataFrame(df_label, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column GSTATUS Unique Vaue(s) ['Failure', 'Success', 'Missing']
Categories (3, object): ['Failure', 'Missing', 'Success']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
114,GraftFailStatus,GRAFT FAILED (1=YES),CALCULATED,NaT,NaT,,NUM,,,GSTATUS,Category,** LABEL **


#### GTIME
- GraftLifeSpanDay

In [31]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'GTIME', True)

         count         mean          std  min    25%     50%     75%     max
GTIME  28325.0  1328.348491  1064.303625  0.0  365.0  1098.0  2168.0  3956.0

NaNs:
GTIME    426
dtype: int64

Datatypes:
GTIME    float64
dtype: object


    Feature                                                          Description FormSection DataType SASAnalysisFormat Comment Information
115   GTIME  GRAFT LIFESPAN-Days From Transplant to Failure/Death/Last Follow-Up                  NUM                               Unknown


GTIME: [ 329. 2592. 2913. ... 3096. 2626. 2012.]


In [32]:
# mapping
colMap = {'GTIME': 'GraftLifeSpanDay'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"{LABEL}")

# update dataframe
df_label  = uf.insertIntoDataFrame(df_label, list(colMap.values()))
df_numeric  = uf.insertIntoDataFrame(df_numeric, list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
115,GraftLifeSpanDay,GRAFT LIFESPAN-Days From Transplant to Failure/Death/Last Follow-Up,CALCULATED,NaT,NaT,,NUM,,,GTIME,Numeric,** LABEL **


#### LASTFUNO
- LastFollowupNumber

In [33]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'LASTFUNO', True)

            count       mean         std  min   25%   50%   75%    max
LASTFUNO  28751.0  186.89416  348.595181  1.0  10.0  40.0  80.0  999.0

NaNs:
LASTFUNO    0
dtype: int64

Datatypes:
LASTFUNO    int64
dtype: object


      Feature           Description FormSection DataType SASAnalysisFormat Comment Information
189  LASTFUNO  LAST FOLLOWUP NUMBER                  NUM                               Unknown


LASTFUNO: [999  80  20 800   1  30  70 998  60  50  10  40   6 100  90]


In [34]:
# mapping
colMap = {'LASTFUNO': 'LastFollowupNumber'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"{LABEL}")

# update dataframe
df_label  = uf.insertIntoDataFrame(df_label, list(colMap.values()))
df_numeric  = uf.insertIntoDataFrame(df_numeric, list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
189,LastFollowupNumber,LAST FOLLOWUP NUMBER,CALCULATED,NaT,NaT,,NUM,,,LASTFUNO,Numeric,** LABEL **


#### GRF_STAT
- GraftStatus

In [35]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'GRF_STAT', True)

          count unique top   freq
GRF_STAT  28099      2   Y  26576

NaNs:
GRF_STAT    652
dtype: int64

Datatypes:
GRF_STAT    object
dtype: object


      Feature   Description           FormSection DataType SASAnalysisFormat Comment Information
113  GRF_STAT  GRAFT STATUS  CLINICAL INFORMATION  CHAR(1)           GRFSTAT             Unknown


GRF_STAT: ['N' 'Y' nan]


In [36]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)

# mapping
colMap = {'GRF_STAT':'GraftStatus'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"{LABEL} N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_label  = uf.insertIntoDataFrame(df_label, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column GRF_STAT Unique Vaue(s) ['No', 'Yes', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
113,GraftStatus,GRAFT STATUS,TRR/TRF,2004-06-30,NaT,CLINICAL INFORMATION,CHAR(1),GRFSTAT,,GRF_STAT,Category,** LABEL ** N/Y/U/X to No/Yes/Unknown/Missing


#### PSTATUS
- TransplantStatus

In [37]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PSTATUS', True)

           count      mean       std  min  25%  50%  75%  max
PSTATUS  28326.0  0.196321  0.397221  0.0  0.0  0.0  0.0  1.0

NaNs:
PSTATUS    425
dtype: int64

Datatypes:
PSTATUS    float64
dtype: object


     Feature                                                                           Description     FormSection DataType SASAnalysisFormat Comment Information
238  PSTATUS  Boolean Most Recent Patient Status (based on composite death date) (1=Dead, 0=Alive)  PATIENT STATUS      NUM                               Unknown


PSTATUS: [ 1.  0. nan]


In [38]:
# fill NaN with 9: Missing
df[features] = df[features].fillna(9).astype(int)

# feature value mapping
mapping = {0: 'Alive', 1: 'Dead', 9: 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'PSTATUS': 'TransplantStatus'}


# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"{LABEL}")

# update dataframe
df_label  = uf.insertIntoDataFrame(df_label, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column PSTATUS Unique Vaue(s) ['Dead', 'Alive', 'Missing']
Categories (3, object): ['Alive', 'Dead', 'Missing']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
238,TransplantStatus,"Boolean Most Recent Patient Status (based on composite death date) (1=Dead, 0=Alive)",TRR/TRF-CALCULATED,1987-10-01,NaT,PATIENT STATUS,NUM,,,PSTATUS,Category,** LABEL **


#### PTIME
- TransplantSurvivalDay

In [39]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PTIME', False)

         count         mean          std  min    25%     50%     75%     max
PTIME  28326.0  1330.118442  1063.836426  0.0  365.0  1098.5  2168.0  3956.0

NaNs:
PTIME    425
dtype: int64

Datatypes:
PTIME    float64
dtype: object


    Feature                                                    Description FormSection DataType SASAnalysisFormat Comment Information
243   PTIME  Patient Survival Time in days (based on composite death date)                  NUM                               Unknown




In [40]:
# mapping
colMap = {'PTIME': 'TransplantSurvivalDay'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"{LABEL}")

# update dataframe
df_label  = uf.insertIntoDataFrame(df_label, list(colMap.values()))
df_numeric  = uf.insertIntoDataFrame(df_numeric, list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
243,TransplantSurvivalDay,Patient Survival Time in days (based on composite death date),CALCULATED,NaT,NaT,,NUM,,,PTIME,Numeric,** LABEL **


#### PX_STAT
- RecipientStatus

In [41]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PX_STAT', True)

              count unique         top   freq
PX_STAT       28361      4           A  22412
PX_STAT_DATE  28325   2997  07/14/2021    114

NaNs:
PX_STAT         390
PX_STAT_DATE    426
dtype: int64

Datatypes:
PX_STAT         object
PX_STAT_DATE    object
dtype: object


          Feature                                Description     FormSection DataType SASAnalysisFormat Comment Information
246       PX_STAT  RECIPIENT STATUS(Died, ReTX, Lost, Alive)  PATIENT STATUS  CHAR(1)            PXSTAT             Unknown
247  PX_STAT_DATE                      RECIPIENT STATUS DATE  PATIENT STATUS      NUM                               Unknown


PX_STAT: ['D' 'A' 'R' 'L' nan]
PX_STAT_DATE: ['12/24/2013' '03/04/2020' '01/21/2021' ... '06/29/2018' '10/21/2014'
 '01/02/2013']


In [42]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# df_flat FMTNAME: PXSTAT
mapping = { 
    "A": "Living",
    "D": "Dead",
    "L": "Lost to Follow Up",
    "N": "Not Seen",
    "R": "Retransplanted",
    "X": "Missing"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'PX_STAT': 'RecipientStatus'}


# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"{LABEL} FMTNAME: PXSTAT")

# update dataframe
df_label  = uf.insertIntoDataFrame(df_label, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column PX_STAT Unique Vaue(s) ['Dead', 'Living', 'Retransplanted', 'Lost to Follow Up', 'Missing']
Categories (5, object): ['Dead', 'Living', 'Lost to Follow Up', 'Missing', 'Retransplanted']
Converted Column PX_STAT_DATE Unique Vaue(s) ['12/24/2013', '03/04/2020', '01/21/2021', '06/07/2016', '09/15/2020', ..., '02/26/2017', '06/21/2017', '06/29/2018', '10/21/2014', '01/02/2013']
Length: 2998
Categories (2998, object): ['01/01/2013', '01/01/2014', '01/01/2015', '01/01/2016', ..., '12/31/2019', '12/31/2020', '12/31/2021', 'Missing']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
246,RecipientStatus,"RECIPIENT STATUS(Died, ReTX, Lost, Alive)",TRR/TRF-CALCULATED,1987-10-01,NaT,PATIENT STATUS,CHAR(1),PXSTAT,,PX_STAT,Category,** LABEL ** FMTNAME: PXSTAT
247,PX_STAT_DATE,RECIPIENT STATUS DATE,TRR/TRF-CALCULATED,1987-10-02,NaT,PATIENT STATUS,NUM,,,PX_STAT_DATE,Category,** LABEL ** FMTNAME: PXSTAT


#### TRTREJ1Y

In [43]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'TRTREJ1Y', True)

          count unique top   freq
TRTREJ1Y  22091      2   N  18050

NaNs:
TRTREJ1Y    6660
dtype: int64

Datatypes:
TRTREJ1Y    object
dtype: object


      Feature                          Description FormSection DataType SASAnalysisFormat Comment Information
276  TRTREJ1Y  TREATED FOR REJECTION WITHIN 1 YEAR              CHAR(1)                               Unknown


TRTREJ1Y: ['N' nan 'Y']


In [44]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'TRTREJ1Y': 'RejectionTreatmentWithinOneYear'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"{LABEL} N/Y/X to No/Yes/Missing")

# update dataframe
df_label  = uf.insertIntoDataFrame(df_label, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column TRTREJ1Y Unique Vaue(s) ['No', 'Missing', 'Yes']
Categories (3, object): ['Missing', 'No', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
276,RejectionTreatmentWithinOneYear,TREATED FOR REJECTION WITHIN 1 YEAR,CALCULATED,NaT,NaT,,CHAR(1),,,TRTREJ1Y,Category,** LABEL ** N/Y/X to No/Yes/Missing


### ENTIRE DATE FEATURES

In [45]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DATE|VAL_DT|LISTYR', False)

                     count unique                     top freq         mean       std     min     25%     50%     75%     max
ACTIVATE_DATE        28751   3926              12/21/2012   32          NaN       NaN     NaN     NaN     NaN     NaN     NaN
END_DATE             28751   4007              02/12/2021   23          NaN       NaN     NaN     NaN     NaN     NaN     NaN
INIT_DATE            28751   3852              12/21/2012   32          NaN       NaN     NaN     NaN     NaN     NaN     NaN
VAL_DT_TCR           21058   5917  28FEB1995:00:00:00.000  943          NaN       NaN     NaN     NaN     NaN     NaN     NaN
ADMISSION_DATE       28370   4049              06/28/2018   26          NaN       NaN     NaN     NaN     NaN     NaN     NaN
RECOVERY_DATE_DON    28738   4013              02/12/2021   21          NaN       NaN     NaN     NaN     NaN     NaN     NaN
PX_STAT_DATE         28751   2998                 Missing  426          NaN       NaN     NaN     NaN     NaN     NaN 

#### Convert to Datetime

In [46]:
# deep copy
convertDate = features.copy()

# remove YYYY
convertDate.remove('LISTYR')
convertDate.remove('VAL_DT_TCR')
convertDate.remove('VAL_DT_TRR')
convertDate.remove('VAL_DT_DDR')

# converting date columns in the 'convertDate' list
for col in convertDate:
    df[col] = pd.to_datetime(df[col], format='%m/%d/%Y', errors='coerce')


# new list
convertDate = ['VAL_DT_TCR','VAL_DT_TRR','VAL_DT_DDR']
for col in convertDate:
    df[col] = pd.to_datetime(df[col], format='%d%b%Y:%H:%M:%S.%f', errors='coerce')

In [47]:
# mapping
colMap = {'ACTIVATE_DATE': 'AllocationBeginDate_CAN', 'ADMISSION_DATE':'AdmissionDate_CAN', 'ADMIT_DATE_DON':'AdmissionDate_DON',
          'VAL_DT_TCR':'ValidationDateTCR_CAN','VAL_DT_TRR':'ValidationDateTRR_CAN', 'VAL_DT_DDR':'ValidationDateTCR_DDR','LISTYR':'ListingYear',
          'DISCHARGE_DATE':'CenterDischargeDate_CAN', 'END_DATE':'RemovalWaitListDate_CAN', 'INIT_DATE':'InitialWaitListDate_CAN',
          'PX_STAT_DATE':'StatusDate_CAN', 'RECOVERY_DATE_DON':'OrganRecoveryDate_DON', 'REFERRAL_DATE':'ReferralDate_DON', 'TX_DATE':'TransplantDate_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='DateTime', txt='mm/dd/yyyy')
df_dict = uf.updateDictionaryInformation(df_dict, [192], txt=f"YYYY")
df_dict = uf.updateDictionaryInformation(df_dict, [288,289,290], txt=f"ddmonthyyyy:time").copy()

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can , ['AllocationBeginDate_CAN','AdmissionDate_CAN','CenterDischargeDate_CAN','RemovalWaitListDate_CAN','InitialWaitListDate_CAN','StatusDate_CAN',
               'TransplantDate_CAN', 'ValidationDate_CAN'])
df_don  = uf.insertIntoDataFrame(df_don , ['AdmissionDate_DON','OrganRecoveryDate_DON','ReferralDate_DON'])
df_date  = uf.insertIntoDataFrame(df_date , list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
3,AllocationBeginDate_CAN,ALLOCATION TIME BEGINNING DATE,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,,,ACTIVATE_DATE,DateTime,mm/dd/yyyy
5,AdmissionDate_CAN,RECIPIENT DATE OF ADMISSION TO TX CENTER,TRR,1999-10-25,NaT,PATIENT STATUS,NUM,,,ADMISSION_DATE,DateTime,mm/dd/yyyy
6,AdmissionDate_DON,DONOR ADMIT DATE,DDR,2006-04-26,NaT,DONOR INFORMATION,NUM,,,ADMIT_DATE_DON,DateTime,mm/dd/yyyy
76,CenterDischargeDate_CAN,RECIPIENT DISCHARGE DATE FROM TX CENTER,TRR,1994-04-01,NaT,PATIENT STATUS,NUM,,,DISCHARGE_DATE,DateTime,mm/dd/yyyy
98,RemovalWaitListDate_CAN,"EARLIEST OF DATES OF REMOVAL FROM WAITING LIST, TRANSPLANT, DEATH, OR TIME COPY OF DATA CREATED",WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,,"IF PATIENT TRANSPLANTED OR DIED, BUT WAS REMOVED AFTER THE EVENT, END_DATE IS BACKDATED TO GIVE THE DATE OF EVENT",END_DATE,DateTime,mm/dd/yyyy
165,InitialWaitListDate_CAN,BEGINNING DATE FOR REGISTRATION,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,,,INIT_DATE,DateTime,mm/dd/yyyy
192,ListingYear,ACTUAL YEAR REGISTRANT LISTED (WITHOUT DATE OFFSET),CALCULATED,1987-10-01,NaT,,NUM,,,LISTYR,DateTime,YYYY
247,StatusDate_CAN,RECIPIENT STATUS DATE,TRR/TRF-CALCULATED,1987-10-02,NaT,PATIENT STATUS,NUM,,,PX_STAT_DATE,DateTime,mm/dd/yyyy
255,OrganRecoveryDate_DON,ORGAN RECOVERY DATE,DDR / LDR,1987-10-01,NaT,ORGAN RECOVERY,NUM,,,RECOVERY_DATE_DON,DateTime,mm/dd/yyyy
256,ReferralDate_DON,DATE OF REFERRAL CALL,DDR,1994-04-01,NaT,PROVIDER INFORMATION,NUM,,,REFERRAL_DATE,DateTime,mm/dd/yyyy


In [48]:
df[list(colMap.values())].describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
AllocationBeginDate_CAN,28751.0,2016-06-02 19:32:20.676846080,1995-01-20 00:00:00,2013-09-18 00:00:00,2016-08-17 00:00:00,2019-04-15 00:00:00,2021-12-28 00:00:00,
AdmissionDate_CAN,28370.0,2016-12-12 04:32:37.222418176,2001-05-01 00:00:00,2014-06-01 00:00:00,2017-03-21 00:00:00,2019-08-16 00:00:00,2021-12-24 00:00:00,
AdmissionDate_DON,28715.0,2017-01-17 06:45:08.744558336,1960-11-24 00:00:00,2014-06-30 00:00:00,2017-04-28 00:00:00,2019-09-28 12:00:00,2021-12-28 00:00:00,
ValidationDateTCR_CAN,21058.0,1994-04-14 00:06:47.153821056,1987-10-20 00:00:00,1993-10-22 00:00:00,1995-02-28 00:00:00,1995-06-08 00:00:00,2015-04-07 19:24:14,
ValidationDateTRR_CAN,28249.0,2017-02-18 21:09:12.229783040,2011-01-13 15:37:00,2014-08-12 15:44:18,2017-05-30 11:17:45,2019-10-11 10:24:04,2021-12-30 14:14:46,
ValidationDateTCR_DDR,28522.0,2017-02-06 01:11:59.139260416,2011-01-06 12:43:10,2014-07-23 14:53:37.249999872,2017-05-17 12:15:05,2019-10-10 14:26:18,2021-12-31 15:58:11,
ListingYear,28751.0,2015.963758,1997.0,2013.0,2016.0,2019.0,2021.0,3.336659
CenterDischargeDate_CAN,27924.0,2017-01-11 17:37:00.025784320,2011-01-07 00:00:00,2014-07-09 00:00:00,2017-04-19 00:00:00,2019-09-11 00:00:00,2021-12-30 00:00:00,
RemovalWaitListDate_CAN,28751.0,2017-01-25 03:06:28.035198720,2011-01-01 00:00:00,2014-07-08 00:00:00,2017-05-06 00:00:00,2019-10-05 00:00:00,2021-12-31 00:00:00,
InitialWaitListDate_CAN,28751.0,2016-06-17 23:21:02.022190592,1997-04-29 00:00:00,2013-10-09 12:00:00,2016-08-30 00:00:00,2019-04-24 00:00:00,2021-12-28 00:00:00,


### ABO

In [49]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'ABO', True)

           count unique  top   freq      mean       std  min  25%  50%  75%  max
ABO        28751      8    O  11367       NaN       NaN  NaN  NaN  NaN  NaN  NaN
ABO_DON    28751      8    O  14640       NaN       NaN  NaN  NaN  NaN  NaN  NaN
ABO_MAT  28751.0    NaN  NaN    NaN  1.143752  0.350943  1.0  1.0  1.0  1.0  3.0

NaNs:
ABO        0
ABO_DON    0
ABO_MAT    0
dtype: int64

Datatypes:
ABO         object
ABO_DON     object
ABO_MAT    float64
dtype: object


   Feature                           Description           FormSection DataType SASAnalysisFormat Comment Information
0      ABO  RECIPIENT BLOOD GROUP @ REGISTRATION  CLINICAL INFORMATION  CHAR(3)               ABO             Unknown
1  ABO_DON                      DONOR BLOOD TYPE     DONOR INFORMATION  CHAR(3)               ABO             Unknown
2  ABO_MAT       DONOR-RECIPIENT ABO MATCH LEVEL                        CHAR(1)            ABOMAT             Unknown


ABO: ['A' 'O' 'AB' 'B' 'A1' 'A1B' 'A2B' 'A2']
ABO_DON: ['A

In [50]:
# update tointeger
df.ABO_MAT = df.ABO_MAT.astype(int) 

# df_flat FMTNAME: ABOMAT
mapping = {1: 'Identical', 2: 'Compatible', 3: 'Incompatible'}

# mapping feature
df = uf.mappingCol(df, 'ABO_MAT', mapping, display=True)

# mapping
colMap = {'ABO': 'BloodGroup_CAN', 'ABO_DON':'BloodGroup_DON', 'ABO_MAT':'BloodGroupMatchLevel'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='')
df_dict = uf.updateDictionaryInformation(df_dict, [2], txt='FMTNAME: ABOMAT').copy()

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['BloodGroup_CAN'])
df_don = uf.insertIntoDataFrame(df_don, ['BloodGroup_DON'])
df_both = uf.insertIntoDataFrame(df_both, ['BloodGroupMatchLevel'])
# df_ordinal = uf.insertIntoDataFrame(df_ordinal, ['BloodGroupMatchLevel'])
df_nominal = uf.insertIntoDataFrame(df_nominal,list(colMap.values()))
# convert to category
df = uf.toCategory(df, list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column ABO_MAT Unique Vaue(s) ['Identical', 'Compatible', 'Incompatible']
Categories (3, object): ['Compatible', 'Identical', 'Incompatible']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
0,BloodGroup_CAN,RECIPIENT BLOOD GROUP @ REGISTRATION,TCR,1987-10-01,NaT,CLINICAL INFORMATION,CHAR(3),ABO,,ABO,Category,
1,BloodGroup_DON,DONOR BLOOD TYPE,DDR/LDR,1987-10-01,NaT,DONOR INFORMATION,CHAR(3),ABO,,ABO_DON,Category,
2,BloodGroupMatchLevel,DONOR-RECIPIENT ABO MATCH LEVEL,CALCULATED,NaT,NaT,,CHAR(1),ABOMAT,,ABO_MAT,Category,FMTNAME: ABOMAT


### AGE

In [51]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'AGE', True)

                       count       mean         std   min   25%   50%   75%    max
INIT_AGE             28751.0  52.802998   12.906853  12.0  46.0  56.0  63.0   78.0
AGE_DON              28751.0  32.122883   10.963701   8.0  23.0  31.0  40.0   72.0
AGE                  28751.0  53.411429   12.860902  18.0  46.0  56.0  63.0   79.0
INO_PROCURE_AGENT_1  11193.0  60.856071  231.429180   1.0   4.0   4.0   5.0  999.0

NaNs:
INIT_AGE                   0
AGE_DON                    0
AGE                        0
INO_PROCURE_AGENT_1    17558
dtype: int64

Datatypes:
INIT_AGE                 int64
AGE_DON                float64
AGE                      int64
INO_PROCURE_AGENT_1    float64
dtype: object


                 Feature                                  Description            FormSection DataType SASAnalysisFormat Comment Information
7                    AGE                          RECIPIENT AGE (YRS)  RECIPIENT INFORMATION      NUM                               Unknown
8                

In [52]:
# fill NaN with 998: Unknown & convert to integer
df['INO_PROCURE_AGENT_1'] = df['INO_PROCURE_AGENT_1'].fillna(998).astype(int)

# SASAnalysisFormat: INOMED                                                                                       
mapping = {
    1: "Dopamine",                                                                                                
    2: "Dobutamine",                                                                                              
    3: "Epinephrine",                                                                                             
    4: "Levophed",                                                                                                
    5: "Neosynephrine",                                                                                           
    6: "Isoproterenol (Isuprel)",
    998: "Missing",
    999: "Other, specify"
}

# mapping feature
df = uf.mappingCol(df, 'INO_PROCURE_AGENT_1', mapping, display=True)

# mapping
colMap = {'AGE_DON':'Age_DON', 'AGE': 'Age_CAN', 'INIT_AGE':'Age_Listing_CAN', 'INO_PROCURE_AGENT_1':'InotropicAgent_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt='')
df_dict = uf.updateDictionaryInformation(df_dict, [171], txt=f"SASAnalysisFormat: INOMED" , FeatureType='Category').copy()

# update dataframe
df_can = uf.insertIntoDataFrame(df_can , ['Age_CAN','Age_Listing_CAN'])
df_don = uf.insertIntoDataFrame(df_don , ['Age_DON','InotropicAgent_DON'])
df_numeric = uf.insertIntoDataFrame(df_numeric , ['Age_DON','Age_CAN','Age_Listing_CAN'])
df_nominal = uf.insertIntoDataFrame(df_nominal , ['InotropicAgent_DON'])

# display
df_dict.iloc[idx]

Converted Column INO_PROCURE_AGENT_1 Unique Vaue(s) ['Missing', 'Dopamine', 'Other, specify', 'Levophed', 'Neosynephrine', 'Dobutamine', 'Epinephrine']
Categories (7, object): ['Dobutamine', 'Dopamine', 'Epinephrine', 'Levophed', 'Missing', 'Neosynephrine', 'Other, specify']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
7,Age_CAN,RECIPIENT AGE (YRS),TRR-CALCULATED,1987-10-01,NaT,RECIPIENT INFORMATION,NUM,,,AGE,Numeric,
8,Age_DON,DONOR AGE (YRS),DDR/LDR-CALCULATED,1987-10-01,NaT,DONOR INFORMATION,NUM,,,AGE_DON,Numeric,
162,Age_Listing_CAN,AGE IN YEARS AT TIME OF LISTING,CALCULATED,1987-10-01,NaT,,NUM,,,INIT_AGE,Numeric,
171,InotropicAgent_DON,DECEASED DONOR-INOTROPIC MEDICATION AGENT 1,DDR,2003-01-27,NaT,CLINICAL INFORMATION,NUM,INOMED,,INO_PROCURE_AGENT_1,Category,SASAnalysisFormat: INOMED


### ALCOHOL

In [53]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'ALCOHOL', True)

                   count unique top   freq
ALCOHOL_HEAVY_DON  28738      3   N  23139

NaNs:
ALCOHOL_HEAVY_DON    13
dtype: int64

Datatypes:
ALCOHOL_HEAVY_DON    object
dtype: object


             Feature                              Description        FormSection DataType SASAnalysisFormat Comment Information
9  ALCOHOL_HEAVY_DON  Heavy Alcohol Use (heavy=2+ drinks/day)  LIFESTYLE FACTORS  CHAR(1)                               Unknown


ALCOHOL_HEAVY_DON: ['N' 'Y' 'U' nan]


In [54]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'ALCOHOL_HEAVY_DON': 'HeavyAlcoholUse_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column ALCOHOL_HEAVY_DON Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
9,HeavyAlcoholUse_DON,Heavy Alcohol Use (heavy=2+ drinks/day),DDR,2004-06-30,NaT,LIFESTYLE FACTORS,CHAR(1),,,ALCOHOL_HEAVY_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing


### [LOCUS MISMATCH LEVEL](https://www.sciencedirect.com/science/article/pii/S1071916423002324?casa_token=Lq4LHsmUCLQAAAAA:TvHFPaDMW0rIPYF7E4SDPC20Cg1jEpCws6ztyM-jKLMrkxbuK0oDq4q5huBI0OmiSi3JSsTfd9Yu)
- AMIS & BMIS & DRMIS & HLAMIS

In [55]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'MIS', True)

          count      mean       std  min  25%  50%  75%  max
AMIS    26298.0  1.447829  0.618660  0.0  1.0  2.0  2.0  2.0
BMIS    26297.0  1.698901  0.502500  0.0  1.0  2.0  2.0  2.0
DRMIS   26294.0  1.498859  0.590286  0.0  1.0  2.0  2.0  2.0
HLAMIS  26291.0  4.645582  1.068757  0.0  4.0  5.0  5.0  6.0

NaNs:
AMIS      2453
BMIS      2454
DRMIS     2457
HLAMIS    2460
dtype: int64

Datatypes:
AMIS      float64
BMIS      float64
DRMIS     float64
HLAMIS    float64
dtype: object


    Feature              Description FormSection DataType SASAnalysisFormat Comment Information
10     AMIS   A LOCUS MISMATCH LEVEL                  NUM                               Unknown
18     BMIS   B LOCUS MISMATCH LEVEL                  NUM                               Unknown
88    DRMIS  DR Locus MISMATCH LEVEL                  NUM                               Unknown
153  HLAMIS       HLA MISMATCH LEVEL                  NUM                               Unknown


AMIS: [ 2.  1.  0. nan]
BMIS: [ 2

In [56]:
# fill NaN with 999: Missing
df[features] = df[features].fillna(999).astype(int)

# mapping
colMap = {'AMIS': 'MismatchLevel_AMIS', 'HLAMIS':'MismatchLevel_HLMIS', 'BMIS': 'MismatchLevel_BMIS', 'DRMIS': 'MismatchLevel_DRMIS'}


# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"")

# update dataframe
df_both  = uf.insertIntoDataFrame(df_both, list(colMap.values()))
# df_ordinal  = uf.insertIntoDataFrame(df_ordinal, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
10,MismatchLevel_AMIS,A LOCUS MISMATCH LEVEL,CALCULATED,NaT,NaT,,NUM,,,AMIS,Category,
18,MismatchLevel_BMIS,B LOCUS MISMATCH LEVEL,CALCULATED,NaT,NaT,,NUM,,,BMIS,Category,
88,MismatchLevel_DRMIS,DR Locus MISMATCH LEVEL,CALCULATED,NaT,NaT,,NUM,,,DRMIS,Category,
153,MismatchLevel_HLMIS,HLA MISMATCH LEVEL,CALCULATED,NaT,NaT,,NUM,,,HLAMIS,Category,


### ANTIHYPERTENSIVE

In [57]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'ANTIHYPE', True)

              count unique top   freq
ANTIHYPE_DON  28545      3   N  19094

NaNs:
ANTIHYPE_DON    206
dtype: int64

Datatypes:
ANTIHYPE_DON    object
dtype: object


         Feature                                                   Description           FormSection DataType SASAnalysisFormat Comment Information
11  ANTIHYPE_DON  DECEASED DONOR-ANTIHYPERTENSIVES W/IN 24 HRS PRE-CROSS CLAMP  CLINICAL INFORMATION  CHAR(1)                               Unknown


ANTIHYPE_DON: ['Y' 'N' nan 'U']


In [58]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'ANTIHYPE_DON': 'AntiHypertensive_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column ANTIHYPE_DON Unique Vaue(s) ['Yes', 'No', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
11,AntiHypertensive_DON,DECEASED DONOR-ANTIHYPERTENSIVES W/IN 24 HRS PRE-CROSS CLAMP,DDR,1994-04-01,NaT,CLINICAL INFORMATION,CHAR(1),,,ANTIHYPE_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing


### ARGININE

In [59]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'ARGININE', True)

              count unique top   freq
ARGININE_DON  28545      3   Y  19759

NaNs:
ARGININE_DON    206
dtype: int64

Datatypes:
ARGININE_DON    object
dtype: object


         Feature                                                                         Description           FormSection DataType SASAnalysisFormat Comment Information
12  ARGININE_DON  DECEASED DONOR-WAS DONOR GIVEN ARGININE VASOPRESSIN WITHIN 24 HRS PRE CROSS CLAMP?  CLINICAL INFORMATION  CHAR(1)                               Unknown


ARGININE_DON: ['N' 'Y' nan 'U']


In [60]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'ARGININE_DON':'ArginnieManagement_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column ARGININE_DON Unique Vaue(s) ['No', 'Yes', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
12,ArginnieManagement_DON,DECEASED DONOR-WAS DONOR GIVEN ARGININE VASOPRESSIN WITHIN 24 HRS PRE CROSS CLAMP?,DDR,2004-06-30,NaT,CLINICAL INFORMATION,CHAR(1),,,ARGININE_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing


### DIAGNOSIS

In [61]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DGN|DIAG', True)

                count         mean        std    min     25%     50%     75%     max
THORACIC_DGN  28751.0  1033.533825  62.284933  999.0  1000.0  1007.0  1049.0  1209.0
TCR_DGN       28678.0  1033.925483  63.490478  999.0  1000.0  1007.0  1049.0  1209.0
DIAG          28680.0  1034.390690  63.790522  999.0  1000.0  1007.0  1049.0  1209.0
BIOPSY_DGN    28545.0     1.002137   0.069750    1.0     1.0     1.0     1.0     4.0

NaNs:
THORACIC_DGN      0
TCR_DGN          73
DIAG             71
BIOPSY_DGN      206
dtype: int64

Datatypes:
THORACIC_DGN      int64
TCR_DGN         float64
DIAG            float64
BIOPSY_DGN      float64
dtype: object


          Feature                                                                              Description                          FormSection DataType SASAnalysisFormat                                                                 Comment Information
13     BIOPSY_DGN  BIOPSY PERFORMED: NO, YES MYOCARDITIS, YES NEG. BIOPSY RESULT, YES OTHER DIAG

In [62]:
# check for differences between two sets
uf.symmetricDifference(set(df.TCR_DGN.dropna().unique().astype(int)), set(df.THORACIC_DGN.dropna().unique().astype(int)))

Symmetric difference: []


In [63]:
# check for differences between two sets
uf.symmetricDifference(set(df.DIAG.dropna().unique().astype(int)), set(df.THORACIC_DGN.dropna().unique().astype(int)))

Symmetric difference: [1205]


In [64]:
# check for differences between two sets
uf.symmetricDifference(set(df.DIAG.dropna().unique().astype(int)), set(df.TCR_DGN.dropna().unique().astype(int)))

Symmetric difference: [1205]


In [65]:
findMappingDfFlat(df.BIOPSY_DGN, df_flat, 'BIOPCONF', NaN=998)

Compare Length: 5 & 3

CODE                        LABEL
   1              Biopsy not done
   2     Yes, rejection confirmed
   3 Yes, rejection not confirmed


In [66]:
# df_flat FMTNAME: TH_DGN
mapping = {
    998: 'Missing',
    999: 'OTHER - SPECIFY',
    1000: 'DILATED MYOPATHY: IDIOPATHIC',
    1001: 'DILATED MYOPATHY: ADRIAMYCIN',
    1002: 'DILATED MYOPATHY: POST PARTUM',
    1003: 'DILATED MYOPATHY: FAMILIAL',
    1004: 'DILATED MYOPATHY: MYOCARDITIS',
    1005: 'DILATED MYOPATHY: ALCOHOLIC',
    1006: 'DILATED MYOPATHY: VIRAL',
    1007: 'DILATED MYOPATHY: ISCHEMIC',
    1008: 'DILATED MYOPATHY: VIRAL (NOT COVID-19)',
    1009: 'COVID-19: DILATED MYOPATHY: ACTIVE MYOCARDITIS',
    1010: 'COVID-19: DILATED MYOPATHY: HISTORY OF MYOCARDITIS',
    1049: 'DILATED MYOPATHY: OTHER SPECIFY',
    1050: 'RESTRICTIVE MYOPATHY: IDIOPATHIC',
    1051: 'RESTRICTIVE MYOPATHY: AMYLOIDOSIS',
    1052: 'RESTRICTIVE MYOPATHY: ENDOCARDIAL FIBROS',
    1053: 'RESTRICTIVE MYOPATHY: SARCOIDOSIS',
    1054: 'RESTRICTIVE MYOPATHY: SEC TO RADIAT/CHEM',
    1099: 'RESTRICTIVE MYOPATHY: OTHER SPECIFY',
    1100: 'HEART RE-TX/GF: HYPERACUTE REJECTION',
    1101: 'HEART RE-TX/GF: ACUTE REJECTION',
    1102: 'HEART RE-TX/GF: CORONARY ARTERY DISEASE',
    1103: 'HEART RE-TX/GF: NON-SPECIFIC',
    1104: 'HEART RE-TX/GF: RESTRICTIVE/CONSTRICTIVE',
    1105: 'HEART RE-TX/GF: CHRONIC REJECTION',
    1106: 'HEART RE-TX/GF: PRIMARY FAILURE',
    1199: 'HEART RE-TX/GF: OTHER SPECIFY',
    1200: 'CORONARY ARTERY DISEASE',
    1201: 'HYPERTROPHIC CARDIOMYOPATHY',
    1202: 'VALVULAR HEART DISEASE',
    1203: 'CONGENITAL HEART DEFECT - PRIOR SURGERY UNKNOWN',
    1204: 'CANCER',
    1205: 'CONGENITAL HEART DEFECT - HYPOPLASTIC LEFT HEART SYNDROME - UNOPERATED',
    1206: 'CONGENITAL HEART DEFECT - WITHOUT SURGERY',
    1207: 'CONGENITAL HEART DEFECT - WITH SURGERY',
    1208: 'ARRHYTHMOGENIC RIGHT VENTRICULAR DYSPLASIA/CARDIOMYOPATHY',
    1209: 'MUSCULAR DYSTROPHY: OTHER SPECIFY'
}

# fill NaN with 998: Missing
df[features] = df[features].fillna(998)
# convert to integer
df[features] = df[features].astype(int)

# mapping
df = uf.mappingCol(df, 'THORACIC_DGN', mapping)
df = uf.mappingCol(df, 'TCR_DGN', mapping)
df = uf.mappingCol(df, 'DIAG', mapping)

# df_flat FMTNAME: BIOPCONF
mapping = {
    1: 'Biopsy not done',
    2: 'Yes, rejection confirmed',
    3: 'Yes, rejection not confirmed',
    4: 'Unknow',
    998: 'Missing'
}

# mapping
df = uf.mappingCol(df, 'BIOPSY_DGN', mapping)


# mapping
colMap = {'BIOPSY_DGN':'Biopsy_DON', 'THORACIC_DGN': 'WaitListDiagnosisCode_CAN', 'TCR_DGN': 'DiagnosisAtListing_CAN', 'DIAG':'PrimaryDiagnosisType_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='FMTNAME: TH_DGN')
df_dict = uf.updateDictionaryInformation(df_dict, [13], txt='FMTNAME: BIOPCONF').copy()

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['WaitListDiagnosisCode_CAN','DiagnosisAtListing_CAN', 'PrimaryDiagnosisType_CAN'])
df_don = uf.insertIntoDataFrame(df_don, ['Biopsy_DON'])

df_nominal = uf.insertIntoDataFrame(df_nominal,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column THORACIC_DGN Unique Vaue(s) ['DILATED MYOPATHY: IDIOPATHIC', 'DILATED MYOPATHY: OTHER SPECIFY', 'DILATED MYOPATHY: ISCHEMIC', 'HEART RE-TX/GF: CORONARY ARTERY DISEASE', 'HYPERTROPHIC CARDIOMYOPATHY', ..., 'CANCER', 'DILATED MYOPATHY: VIRAL (NOT COVID-19)', 'COVID-19: DILATED MYOPATHY: ACTIVE MYOCARDITIS', 'COVID-19: DILATED MYOPATHY: HISTORY OF MYOCARDITIS', 'CONGENITAL HEART DEFECT - HYPOPLASTIC LEFT HEART SYNDROME - UNOPERATED']
Length: 37
Categories (37, object): ['ARRHYTHMOGENIC RIGHT VENTRICULAR DYSPLASIA/CARDIOMYOPATHY', 'CANCER', 'CONGENITAL HEART DEFECT - HYPOPLASTIC LEFT HEART SYNDROME - UNOPERATED', 'CONGENITAL HEART DEFECT - PRIOR SURGERY UNKNOWN', ..., 'RESTRICTIVE MYOPATHY: OTHER SPECIFY', 'RESTRICTIVE MYOPATHY: SARCOIDOSIS', 'RESTRICTIVE MYOPATHY: SEC TO RADIAT/CHEM', 'VALVULAR HEART DISEASE']
Converted Column TCR_DGN Unique Vaue(s) ['DILATED MYOPATHY: IDIOPATHIC', 'DILATED MYOPATHY: VIRAL', 'DILATED MYOPATHY: ISCHEMIC', 'VALVULAR HEART DISEASE', 'HEART R

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
13,Biopsy_DON,"BIOPSY PERFORMED: NO, YES MYOCARDITIS, YES NEG. BIOPSY RESULT, YES OTHER DIAG. SPECIFY.",DDR,1999-10-25,NaT,HEART DONOR'S CARDIAC FUNCTION,NUM,,,BIOPSY_DGN,Category,FMTNAME: BIOPCONF
72,PrimaryDiagnosisType_CAN,RECIPIENT PRIMARY DIAGNOSIS,TRR>TCR,1987-10-01,NaT,PATIENT STATUS/CLINICAL INFORMATION,NUM,ALL_DGN,"THIS FIELD DRAWS FROM ""AT TRANSPLANT"" AND IF NOT THERE THEN FROM TCR.",DIAG,Category,FMTNAME: TH_DGN
270,DiagnosisAtListing_CAN,CANDIDATE DIAGNOSIS AT LISTING,TCR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,ALL_DGN,,TCR_DGN,Category,FMTNAME: TH_DGN
272,WaitListDiagnosisCode_CAN,Waitlist CANDIDATE DIAGNOSIS,WL DATA,NaT,NaT,,NUM,ALL_DGN,,THORACIC_DGN,Category,FMTNAME: TH_DGN


### DEATH

In [67]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'COD|DEATH', True)

                    count        mean         std  min  25%  50%  75%    max
COD_CAD_DON       28747.0   24.174766  146.633417  1.0  1.0  2.0  3.0  999.0
DEATH_CIRCUM_DON  28736.0  128.318868  329.281294  1.0  2.0  5.0  6.0  997.0
DEATH_MECH_DON    28743.0   26.019344  135.022720  1.0  5.0  7.0  9.0  997.0

NaNs:
COD_CAD_DON          4
DEATH_CIRCUM_DON    15
DEATH_MECH_DON       8
dtype: int64

Datatypes:
COD_CAD_DON         float64
DEATH_CIRCUM_DON    float64
DEATH_MECH_DON      float64
dtype: object


             Feature                           Description        FormSection DataType SASAnalysisFormat Comment Information
39       COD_CAD_DON         DECEASED DONOR-CAUSE OF DEATH  DONOR INFORMATION      NUM           DON_COD             Unknown
68  DEATH_CIRCUM_DON  DECEASED DONOR-CIRCUMSTANCE OF DEATH  DONOR INFORMATION      NUM           DTHCIRC             Unknown
69    DEATH_MECH_DON     DECEASED DONOR-MECHANISM OF DEATH  DONOR INFORMATION      NUM           DTHMECH            

In [68]:
# fill NaN with 1000: Unknown
df[features] = df[features].fillna(1000)

# df_flat FMTNAME: DON_COD
mapping = {
    1: 'ANOXIA',
    2: 'CEREBROVASCULAR/STROKE',
    3: 'HEAD TRAUMA',
    4: 'CNS TUMOR',
    999: 'OTHER SPECIFY',
    998: 'Unknown',
    1000: 'Missing'
}

# mapping feature
df = uf.mappingCol(df, 'COD_CAD_DON', mapping, display=True)


# df_flat FMTNAME: DTHCIRC
mapping = {
    1: "MVA",
    2: "SUICIDE",
    3: "HOMICIDE",
    4: "CHILD-ABUSE",
    5: "Accident, Non-MVA",
    6: "DEATH FROM NATURAL CAUSES",
    997: "NONE OF THE ABOVE",
    1000: "Missing"
}

# mapping feature
df = uf.mappingCol(df, 'DEATH_CIRCUM_DON', mapping, display=True)


# df_flat FMTNAME: DEATH_MECH_DON
mapping = {
    1: "DROWNING",
    2: "SEIZURE",
    3: "DRUG INTOXICATION",
    4: "ASPHYXIATION",
    5: "CARDIOVASCULAR",
    6: "ELECTRICAL",
    7: "GUNSHOT WOUND",
    8: "STAB",
    9: "BLUNT INJURY",
    10: "SIDS",
    11: "INTRACRANIAL HEMORRHAGE/STROKE",
    12: "DEATH FROM NATURAL CAUSES",
    995: "995-Gunshot/stab wound (Pre-OTIS)",
    997: "NONE OF THE ABOVE",
    1000: "Missing"
}
# mapping feature
df = uf.mappingCol(df, 'DEATH_MECH_DON', mapping, display=True)


# mapping
colMap = {'COD_CAD_DON': 'CauseOfDeath_DON','DEATH_CIRCUM_DON':'DeathCircumstance_DON', 'DEATH_MECH_DON':'DeathMechanism_DON'}


# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"")
df_dict = uf.updateDictionaryInformation(df_dict, [39], txt='FMTNAME: DON_COD')
df_dict = uf.updateDictionaryInformation(df_dict, [68], txt='FMTNAME: DTHCIRC')
df_dict = uf.updateDictionaryInformation(df_dict, [69], txt='FMTNAME: DEATH_MECH_DON')

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column COD_CAD_DON Unique Vaue(s) ['HEAD TRAUMA', 'ANOXIA', 'CEREBROVASCULAR/STROKE', 'OTHER SPECIFY', 'CNS TUMOR', 'Missing']
Categories (6, object): ['ANOXIA', 'CEREBROVASCULAR/STROKE', 'CNS TUMOR', 'HEAD TRAUMA', 'Missing', 'OTHER SPECIFY']
Converted Column DEATH_CIRCUM_DON Unique Vaue(s) ['HOMICIDE', 'Accident, Non-MVA', 'NONE OF THE ABOVE', 'DEATH FROM NATURAL CAUSES', 'SUICIDE', 'MVA', 'Missing', 'CHILD-ABUSE']
Categories (8, object): ['Accident, Non-MVA', 'CHILD-ABUSE', 'DEATH FROM NATURAL CAUSES', 'HOMICIDE', 'MVA', 'Missing', 'NONE OF THE ABOVE', 'SUICIDE']
Converted Column DEATH_MECH_DON Unique Vaue(s) ['GUNSHOT WOUND', 'BLUNT INJURY', 'DRUG INTOXICATION', 'INTRACRANIAL HEMORRHAGE/STROKE', 'CARDIOVASCULAR', ..., 'SEIZURE', 'STAB', 'Missing', 'DROWNING', 'ELECTRICAL']
Length: 13
Categories (13, object): ['ASPHYXIATION', 'BLUNT INJURY', 'CARDIOVASCULAR', 'DEATH FROM NATURAL CAUSES', ..., 'Missing', 'NONE OF THE ABOVE', 'SEIZURE', 'STAB']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
39,CauseOfDeath_DON,DECEASED DONOR-CAUSE OF DEATH,DDR,1987-10-01,NaT,DONOR INFORMATION,NUM,DON_COD,,COD_CAD_DON,Category,FMTNAME: DON_COD
68,DeathCircumstance_DON,DECEASED DONOR-CIRCUMSTANCE OF DEATH,DDR,1994-04-01,NaT,DONOR INFORMATION,NUM,DTHCIRC,,DEATH_CIRCUM_DON,Category,FMTNAME: DTHCIRC
69,DeathMechanism_DON,DECEASED DONOR-MECHANISM OF DEATH,DDR,1994-04-01,NaT,DONOR INFORMATION,NUM,DTHMECH,,DEATH_MECH_DON,Category,FMTNAME: DEATH_MECH_DON


### INFECTION

#### BLOOD_INF_DON & OTHER_INF_DON & PULM_INF_DON & URINE_INF_DON

###### BLOOD_INF_DON & OTHER_INF_DON & PULM_INF_DON & URINE_INF_DON

In [69]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, '_INF_', True)

                 count      mean       std  min  25%  50%  75%  max
BLOOD_INF_DON  28748.0  0.098372  0.297822  0.0  0.0  0.0  0.0  1.0
OTHER_INF_DON  21856.0  0.095351  0.293706  0.0  0.0  0.0  0.0  1.0
PULM_INF_DON   28748.0  0.695283  0.460295  0.0  0.0  1.0  1.0  1.0
URINE_INF_DON  28748.0  0.116182  0.320449  0.0  0.0  0.0  0.0  1.0

NaNs:
BLOOD_INF_DON       3
OTHER_INF_DON    6895
PULM_INF_DON        3
URINE_INF_DON       3
dtype: int64

Datatypes:
BLOOD_INF_DON    float64
OTHER_INF_DON    float64
PULM_INF_DON     float64
URINE_INF_DON    float64
dtype: object


           Feature                                Description           FormSection DataType SASAnalysisFormat Comment Information
14   BLOOD_INF_DON   DECEASED DONOR-BLOOD AS INFECTION SOURCE  CLINICAL INFORMATION      NUM                               Unknown
207  OTHER_INF_DON      DECEASED DONOR INFECTION OTHER SOURCE  CLINICAL INFORMATION      NUM                               Unknown
245   PULM_INF_DON  DECEASED DO

In [70]:
# fill NaN with 999: Missing
df[features] = df[features].fillna(999).astype(int)
# convert to category
df = uf.toCategory(df, features)

# mapping
colMap = {'BLOOD_INF_DON': 'BloodInfectionSource_DON', 'OTHER_INF_DON': 'OtherInfectionSource_DON',
         'PULM_INF_DON': 'PulmonaryInfection_DON','URINE_INF_DON': 'UrineInfection_DON'}


# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
14,BloodInfectionSource_DON,DECEASED DONOR-BLOOD AS INFECTION SOURCE,DDR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,,BLOOD_INF_DON,Category,
207,OtherInfectionSource_DON,DECEASED DONOR INFECTION OTHER SOURCE,DDR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,,OTHER_INF_DON,Category,
245,PulmonaryInfection_DON,DECEASED DONOR-INFECTION PULMONARY SOURCE,DDR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,,PULM_INF_DON,Category,
283,UrineInfection_DON,DECEASED DONOR-INFECTION URINE SOURCE,DDR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,,URINE_INF_DON,Category,


#### INFECT_IV_DRUG_TRR & CLIN_INFECT_DON

In [71]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'INFECT', True)

                    count unique top   freq
INFECT_IV_DRUG_TRR  28323      3   N  25055
CLIN_INFECT_DON     28546      3   Y  21856

NaNs:
INFECT_IV_DRUG_TRR    428
CLIN_INFECT_DON       205
dtype: int64

Datatypes:
INFECT_IV_DRUG_TRR    object
CLIN_INFECT_DON       object
dtype: object


                Feature                                                               Description                         FormSection DataType SASAnalysisFormat Comment Information
34      CLIN_INFECT_DON                                   DECEASED DONOR-CLINICAL INFECTION (Y,N)                CLINICAL INFORMATION  CHAR(1)                               Unknown
158  INFECT_IV_DRUG_TRR  INFECTION REQUIRING IV DRUG THERAPY (WITHIN 2 WEEKS PRIOR TO TRANSPLANT)  PRETRANSPLANT CLINICAL INFORMATION  CHAR(1)                               Unknown


INFECT_IV_DRUG_TRR: ['N' 'U' 'Y' nan]
CLIN_INFECT_DON: ['Y' 'N' nan 'U']


###### INFECT_IV_DRUG_TRR & CLIN_INFECT_DON

In [72]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'INFECT_IV_DRUG_TRR': 'InfectionTherapyIV_CAN', 'CLIN_INFECT_DON': 'InfectionClinical_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknow/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, ['InfectionClinical_DON'])
df_can  = uf.insertIntoDataFrame(df_can, ['InfectionTherapyIV_CAN'])
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column INFECT_IV_DRUG_TRR Unique Vaue(s) ['No', 'Unknown', 'Yes', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column CLIN_INFECT_DON Unique Vaue(s) ['Yes', 'No', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
34,InfectionClinical_DON,"DECEASED DONOR-CLINICAL INFECTION (Y,N)",DDR,1994-04-01,NaT,CLINICAL INFORMATION,CHAR(1),,,CLIN_INFECT_DON,Category,N/Y/U/X to No/Yes/Unknow/Missing
158,InfectionTherapyIV_CAN,INFECTION REQUIRING IV DRUG THERAPY (WITHIN 2 WEEKS PRIOR TO TRANSPLANT),TRR,1994-04-01,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,INFECT_IV_DRUG_TRR,Category,N/Y/U/X to No/Yes/Unknow/Missing


### BMI
BMI = $\frac{weight(kg)}{height(m)^2}$

In [73]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'BMI_', False)

                 count       mean          std        min        25%        50%        75%            max
BMI_TCR        28633.0  60.621048  2940.332879   2.983700  23.849400  27.258100  31.011200  318008.310200
INIT_BMI_CALC  28711.0  27.494497     4.928766  15.000000  23.800000  27.300000  31.000000      68.300000
END_BMI_CALC   28738.0  27.507774     4.872784  15.000000  23.900000  27.300000  31.000000      48.700000
BMI_DON_CALC   28741.0  27.603602     6.064300  13.121499  23.384354  26.555861  30.679328      74.360965
BMI_CALC       28738.0  27.538816     4.965651  15.100000  23.900000  27.300000  31.000000      52.800000

NaNs:
BMI_TCR          118
INIT_BMI_CALC     40
END_BMI_CALC      13
BMI_DON_CALC      10
BMI_CALC          13
dtype: int64

Datatypes:
BMI_TCR          float64
INIT_BMI_CALC    float64
END_BMI_CALC     float64
BMI_DON_CALC     float64
BMI_CALC         float64
dtype: object


           Feature                                       Description                  

In [74]:
# mapping
colMap = {'END_BMI_CALC': 'BMI_CAN', 'BMI_DON_CALC':'BMI_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt='')
df_dict = uf.updateDictionaryInformation(df_dict, [15,17,164], txt=DROP).copy()

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['BMI_CAN'])
df_don = uf.insertIntoDataFrame(df_don, ['BMI_DON'])
df_numeric = uf.insertIntoDataFrame(df_numeric,list(colMap.values()))
df_drop = uf.insertIntoDataFrame(df_drop, ['BMI_CALC', 'BMI_TCR', 'INIT_BMI_CALC'])

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
15,BMI_CALC,Calculated Recipient BMI,CALCULATED,NaT,NaT,,NUM,,,BMI_CALC,Numeric,** DROP **
16,BMI_DON,Donor BMI - Pre/At Donation Calculated,CALCULATED,NaT,NaT,,NUM,,,BMI_DON_CALC,Numeric,
17,BMI_TCR,BMI AT LISTING,TCR,2004-06-30,NaT,CLINICAL INFORMATION AT LISTING,NUM,,,BMI_TCR,Numeric,** DROP **
97,BMI_CAN,Calculated Candidate BMI at Removal/Current Time,CALCULATED,NaT,NaT,,NUM,,,END_BMI_CALC,Numeric,
164,INIT_BMI_CALC,Calculated Candidate BMI at Listing,CALCULATED,NaT,NaT,,NUM,,,INIT_BMI_CALC,Numeric,** DROP **


### BRONCHOSCOPY

In [75]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'BRONCHO', True)

                  count      mean        std  min  25%  50%  75%    max
BRONCHO_LT_DON  14437.0  3.353398  31.005462  1.0  2.0  2.0  2.0  998.0
BRONCHO_RT_DON  14232.0  3.213392  27.686979  1.0  2.0  2.0  3.0  998.0

NaNs:
BRONCHO_LT_DON    14314
BRONCHO_RT_DON    14519
dtype: int64

Datatypes:
BRONCHO_LT_DON    float64
BRONCHO_RT_DON    float64
dtype: object


           Feature                  Description     FormSection DataType SASAnalysisFormat Comment Information
19  BRONCHO_LT_DON   DDR LEFT LUNG BRONCHOSCOPY  ORGAN RECOVERY      NUM          ABNBRONC             Unknown
20  BRONCHO_RT_DON  DDR RIGHT LUNG BRONCHOSCOPY  ORGAN RECOVERY      NUM          ABNBRONC             Unknown


BRONCHO_LT_DON: [  2.  nan   5.   1.   3.   6.   7.   4. 998.]
BRONCHO_RT_DON: [  2.  nan   1.   3.   5.   6.   7.   4. 998.]


In [76]:
# fill NaN with 999: Unknown
df[features] = df[features].fillna(999).astype(int)

# df_flat FMTNAME: ABNBRONC
mapping = {
    1: "No Bronchoscopy",
    2: "Normal",
    3: "Abnormal-purulent secretions",
    4: "Abnormal-aspiration of foreign body",
    5: "Abnormal-blood",
    6: "Abnormal-anatomy/other lesion",
    7: "Unknown",
    998: "Unknown if bronchoscopy performed",
    999: "Missing"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'BRONCHO_LT_DON':'BronchoscopyLeft_DON', 'BRONCHO_RT_DON':'BronchoscopyRight_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: ABNBRONC")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column BRONCHO_LT_DON Unique Vaue(s) ['Normal', 'Missing', 'Abnormal-blood', 'No Bronchoscopy', 'Abnormal-purulent secretions', 'Abnormal-anatomy/other lesion', 'Unknown', 'Abnormal-aspiration of foreign body', 'Unknown if bronchoscopy performed']
Categories (9, object): ['Abnormal-anatomy/other lesion', 'Abnormal-aspiration of foreign body', 'Abnormal-blood', 'Abnormal-purulent secretions', ..., 'No Bronchoscopy', 'Normal', 'Unknown', 'Unknown if bronchoscopy performed']
Converted Column BRONCHO_RT_DON Unique Vaue(s) ['Normal', 'Missing', 'No Bronchoscopy', 'Abnormal-purulent secretions', 'Abnormal-blood', 'Abnormal-anatomy/other lesion', 'Unknown', 'Abnormal-aspiration of foreign body', 'Unknown if bronchoscopy performed']
Categories (9, object): ['Abnormal-anatomy/other lesion', 'Abnormal-aspiration of foreign body', 'Abnormal-blood', 'Abnormal-purulent secretions', ..., 'No Bronchoscopy', 'Normal', 'Unknown', 'Unknown if bronchoscopy performed']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
19,BronchoscopyLeft_DON,DDR LEFT LUNG BRONCHOSCOPY,DDR,1999-10-25,NaT,ORGAN RECOVERY,NUM,ABNBRONC,,BRONCHO_LT_DON,Category,FMTNAME: ABNBRONC
20,BronchoscopyRight_DON,DDR RIGHT LUNG BRONCHOSCOPY,DDR,1999-10-25,NaT,ORGAN RECOVERY,NUM,ABNBRONC,,BRONCHO_RT_DON,Category,FMTNAME: ABNBRONC


### BUN_DON
- Blood Urea Nitrogen Level

In [77]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'BUN', False)

           count       mean        std  min   25%   50%   75%    max
BUN_DON  28546.0  23.991873  19.930901  0.5  12.0  18.0  28.0  250.0

NaNs:
BUN_DON    205
dtype: int64

Datatypes:
BUN_DON    float64
dtype: object


    Feature                                  Description           FormSection DataType SASAnalysisFormat Comment Information
21  BUN_DON  DECEASED DONOR-TERMINAL BLOOD UREA NITROGEN  CLINICAL INFORMATION      NUM                               Unknown




In [78]:
# mapping
colMap = {'BUN_DON': 'BloodUreaNitrogenLevel_DON'}


# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_numeric  = uf.insertIntoDataFrame(df_numeric, list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
21,BloodUreaNitrogenLevel_DON,DECEASED DONOR-TERMINAL BLOOD UREA NITROGEN,DDR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,,BUN_DON,Numeric,


### BW4 & BW6
    - BW4 and BW6 are mutually exclusive epitopes associated with all HLA-B antigens.
    - BW4: Candidate Most Recent/at Removal BW4 Antigen From Waiting List
    - BW6: Candidate Most Recent/at Removal BW6 Antigen From Waiting List

**Note:**
    - [HLA Bw4 and Bw6 Epitopes Recognized by Antibodies and Natural Killer Cells](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5742561/)
    - The removal of BW4 antigen from a waiting list could signify the clearance of individuals or groups that have been screened for this antigen and deemed safe from its threat.
    - The `unit of measurement` for Bw4 and Bw6 epitopes would be more appropriately described in terms of their presence or absence than any quantitative measurement.  

In [79]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'BW', True)

       count       mean        std  min  25%  50%   75%   max
BW4  28751.0  30.290564  44.385290  0.0  0.0  0.0  95.0  99.0
BW6  28751.0  31.372891  44.724526  0.0  0.0  0.0  95.0  99.0

NaNs:
BW4    0
BW6    0
dtype: int64

Datatypes:
BW4    int64
BW6    int64
dtype: object


   Feature                                                     Description        FormSection DataType SASAnalysisFormat Comment Information
22     BW4  Candidate Most Recent/at Removal BW4 Antigen From Waiting List  WAITING LIST DATA      NUM          WKGRPHLA             Unknown
23     BW6  Candidate Most Recent/at Removal BW6 Antigen From Waiting List  WAITING LIST DATA      NUM          WKGRPHLA             Unknown


BW4: [ 0 96 95 99]
BW6: [ 0 95 96 99]


In [80]:
# df_flat FMTNAME: WKGRPHLA
mapping = {
    0: "0",
    95: "Positive",
    96: "Negative",
    98: "Confirmed Blank",
    99: "Not Tested",
    998: "Unknown"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'BW4': 'AntigenBW4_CAN', 'BW6':'AntigenBW6_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: WKGRPHLA")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column BW4 Unique Vaue(s) ['0', 'Negative', 'Positive', 'Not Tested']
Categories (4, object): ['0', 'Negative', 'Not Tested', 'Positive']
Converted Column BW6 Unique Vaue(s) ['0', 'Positive', 'Negative', 'Not Tested']
Categories (4, object): ['0', 'Negative', 'Not Tested', 'Positive']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
22,AntigenBW4_CAN,Candidate Most Recent/at Removal BW4 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,WKGRPHLA,,BW4,Category,FMTNAME: WKGRPHLA
23,AntigenBW6_CAN,Candidate Most Recent/at Removal BW6 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,WKGRPHLA,,BW6,Category,FMTNAME: WKGRPHLA


### [C1 & C2](https://pubmed.ncbi.nlm.nih.gov/30946220/)

In [81]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'C1|C2', True)

      count       mean         std  min  25%  50%  75%     max
C1  28751.0   8.289938   62.703228  0.0  0.0  0.0  4.0  1701.0
C2  28751.0  17.228305  127.935992  0.0  0.0  0.0  6.0  1802.0

NaNs:
C1    0
C2    0
dtype: int64

Datatypes:
C1    int64
C2    int64
dtype: object


   Feature                                                    Description        FormSection DataType SASAnalysisFormat Comment Information
24      C1  Candidate Most Recent/at Removal C1 Antigen From Waiting List  WAITING LIST DATA      NUM             CWHLA             Unknown
25      C2  Candidate Most Recent/at Removal C2 Antigen From Waiting List  WAITING LIST DATA      NUM             CWHLA             Unknown


C1: [   0    7    5    4   10    1    6    2   15    8   16    9   14   12
    3   17   18  701  702  401  802  303  202  804 1601  210  304  102
  704 1203  306  302 1505 1701  602  501  718]
C2: [   0   16   18   10    7   14    9   12   15    4   17    8    5    6
    1    2    3  702  701  704 12

In [82]:
# df_flat FMTNAME: CWHLA
mapping = {
    0: "0",
    1: "01",
    2: "02",
    3: "03",
    4: "04",
    5: "05",
    6: "06",
    7: "07",
    8: "08",
    9: "09",
    10: "10",
    11: "11",
    12: "12",
    13: "13",
    14: "14",
    15: "15",
    16: "16",
    17: "17",
    18: "18",
    97: "Unknown",
    98: "No second antigen detected",
    99: "Not Tested",
    100: "No antigen detected",
    102: "01:02",
    103: "01:03",
    202: "02:02",
    210: "02:10",
    302: "03:02",
    303: "03:03",
    304: "03:04",
    305: "03:05",
    306: "03:06",
    401: "04:01",
    403: "04:03",
    404: "04:04",
    407: "04:07",
    501: "05:01",
    602: "06:02",
    701: "07:01",
    702: "07:02",
    704: "07:04",
    706: "07:06",
    718: "07:18",
    801: "08:01",
    802: "08:02",
    803: "08:03",
    804: "08:04",
    1202: "12:02",
    1203: "12:03",
    1204: "12:04",
    1402: "14:02",
    1403: "14:03",
    1502: "15:02",
    1504: "15:04",
    1505: "15:05",
    1506: "15:06",
    1509: "15:09",
    1601: "16:01",
    1602: "16:02",
    1604: "16:04",
    1701: "17:01",
    1703: "17:03",
    1801: "18:01",
    1802: "18:02"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'C1': 'AntigenC1_CAN', 'C2':'AntigenC2_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: CWHLA")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column C1 Unique Vaue(s) ['0', '07', '05', '04', '10', ..., '15:05', '17:01', '06:02', '05:01', '07:18']
Length: 37
Categories (37, object): ['0', '01', '01:02', '02', ..., '16:01', '17', '17:01', '18']
Converted Column C2 Unique Vaue(s) ['0', '16', '18', '10', '07', ..., '07:18', '06:02', '08:03', '02:10', '16:04']
Length: 45
Categories (45, object): ['0', '01', '01:02', '02', ..., '17:03', '18', '18:01', '18:02']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
24,AntigenC1_CAN,Candidate Most Recent/at Removal C1 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,CWHLA,,C1,Category,FMTNAME: CWHLA
25,AntigenC2_CAN,Candidate Most Recent/at Removal C2 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,CWHLA,,C2,Category,FMTNAME: CWHLA


### [DDR](https://pmc.ncbi.nlm.nih.gov/articles/PMC5141243/)

In [83]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, '^DDR', False)

        count       mean         std  min   25%   50%   75%      max
DDR1  28746.0  18.421450  220.031914  1.0   4.0   7.0  13.0  10300.0
DDR2  28734.0  29.461892  155.716489  1.0  11.0  13.0  15.0  10300.0

NaNs:
DDR1     5
DDR2    17
dtype: int64

Datatypes:
DDR1    float64
DDR2    float64
dtype: object


   Feature        Description                             FormSection DataType SASAnalysisFormat Comment Information
66    DDR1  DONOR DR1 ANTIGEN  DONOR CENTER HISTOCOMPATIBILITY TYPING      NUM           DRLOCUS             Unknown
67    DDR2  DONOR DR2 ANTIGEN  DONOR CENTER HISTOCOMPATIBILITY TYPING      NUM           DRLOCUS             Unknown




In [84]:
# fill NaN with 999: Missing
df[features] = df[features].fillna(999).astype(int)

# df_flat FMTNAME: DRLOCUS
DRLOCUS = {
    999: "Missing",
    0: "0",
    1: "1",
    2: "2",
    3: "3",
    4: "4",
    5: "5",
    6: "6",
    7: "7",
    8: "8",
    9: "9",
    10: "10",
    11: "11",
    12: "12",
    13: "13",
    14: "14",
    15: "15",
    16: "16",
    17: "17",
    18: "18",
    97: "Unknown",
    98: "No second antigen detected",
    99: "Not Tested",
    101: "01:01",
    102: "01:02",
    103: "01:03",
    301: "03:01",
    302: "03:02",
    303: "03:03",
    401: "04:01",
    402: "04:02",
    403: "04:03",
    404: "04:04",
    405: "04:05",
    406: "04:06",
    407: "04:07",
    410: "04:10",
    411: "04:11",
    801: "08:01",
    802: "08:02",
    803: "08:03",
    807: "08:07",
    901: "09:01",
    902: "09:02",
    1101: "11:01",
    1103: "11:03",
    1104: "11:04",
    1201: "12:01",
    1202: "12:02",
    1301: "13:01",
    1302: "13:02",
    1303: "13:03",
    1305: "13:05",
    1401: "14:01",
    1402: "14:02",
    1403: "14:03",
    1404: "14:04",
    1405: "14:05",
    1406: "14:06",
    1454: "14:54",
    1501: "15:01",
    1502: "15:02",
    1503: "15:03",
    1601: "16:01",
    1602: "16:02",
    10300: "103"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, DRLOCUS, display=True)


# mapping
colMap = {'DDR1': 'AntigenDR1_DON', 'DDR2':'AntigenDR2_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: DRLOCUS")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column DDR1 Unique Vaue(s) ['4', '10', '11', '1', '7', ..., '08:02', '14:02', '04:11', '11:04', '103']
Length: 44
Categories (44, object): ['01:01', '01:02', '01:03', '03:01', ..., '7', '8', '9', 'Missing']
Converted Column DDR2 Unique Vaue(s) ['15', '16', '8', '10', '11', ..., '04:07', '14:01', '103', '12:02', '5']
Length: 48
Categories (48, object): ['01:01', '01:03', '03:01', '04:01', ..., '8', '9', 'Missing', 'No second antigen detected']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
66,AntigenDR1_DON,DONOR DR1 ANTIGEN,CALCULATED,1987-10-01,NaT,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,DRLOCUS,,DDR1,Category,FMTNAME: DRLOCUS
67,AntigenDR2_DON,DONOR DR2 ANTIGEN,CALCULATED,1987-10-01,NaT,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,DRLOCUS,,DDR2,Category,FMTNAME: DRLOCUS


### DR5
- Antigen

In [85]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DR5', True)

          count       mean        std  min  25%  50%   75%   max
DR51    28751.0  25.900977  42.519096  0.0  0.0  0.0  95.0  99.0
DR51_2  28751.0   5.169803  21.672154  0.0  0.0  0.0   0.0  99.0
DR52    28751.0  26.825432  42.828147  0.0  0.0  0.0  95.0  99.0
DR52_2  28751.0   5.085180  21.451180  0.0  0.0  0.0   0.0  99.0
DR53    28751.0  26.268999  42.641010  0.0  0.0  0.0  95.0  99.0
DR53_2  28751.0   5.074745  21.471003  0.0  0.0  0.0   0.0  99.0

NaNs:
DR51      0
DR51_2    0
DR52      0
DR52_2    0
DR53      0
DR53_2    0
dtype: int64

Datatypes:
DR51      int64
DR51_2    int64
DR52      int64
DR52_2    int64
DR53      int64
DR53_2    int64
dtype: object


   Feature                                                      Description        FormSection DataType SASAnalysisFormat Comment Information
82    DR51  Candidate Most Recent/at Removal DR51 Antigen From Waiting List  WAITING LIST DATA      NUM          WKGRPHLA             Unknown
83  DR51_2  Candidate Most Recent/at Removal 

In [86]:
# df_flat FMTNAME: WKGRPHLA
mapping = {
    0: "0",
    1: "1",
    2: "2",
    3: "3",
    4: "4",
    5: "5",
    7: "7",
    95: "Positive",
    96: "Negative",
    98: "Confirmed Blank",
    99: "Not Tested",
    998: "Unknown"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'DR51': 'AntigenDR51_CAN', 'DR51_2':'AntigenDR51_2_CAN', 'DR52':'AntigenDR52_CAN', 'DR52_2':'AntigenDR52_2_CAN', 
          'DR53':'AntigenDR53_CAN', 'DR53_2':'AntigenDR53_2_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: WKGRPHLA")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column DR51 Unique Vaue(s) ['0', 'Positive', 'Negative', 'Not Tested', '1', '3', '4', '5', '2']
Categories (9, object): ['0', '1', '2', '3', ..., '5', 'Negative', 'Not Tested', 'Positive']
Converted Column DR51_2 Unique Vaue(s) ['0', 'Negative', '3', 'Positive', '5', 'Not Tested', '1', '2']
Categories (8, object): ['0', '1', '2', '3', '5', 'Negative', 'Not Tested', 'Positive']
Converted Column DR52 Unique Vaue(s) ['0', 'Positive', 'Negative', 'Not Tested', '1', '4', '2', '5', '7', '3']
Categories (10, object): ['0', '1', '2', '3', ..., '7', 'Negative', 'Not Tested', 'Positive']
Converted Column DR52_2 Unique Vaue(s) ['0', 'Positive', '1', 'Negative', '5', '2', '3', 'Not Tested', '4', '7']
Categories (10, object): ['0', '1', '2', '3', ..., '7', 'Negative', 'Not Tested', 'Positive']
Converted Column DR53 Unique Vaue(s) ['0', 'Negative', 'Not Tested', 'Positive', '3', '2', '1']
Categories (7, object): ['0', '1', '2', '3', 'Negative', 'Not Tested', 'Positive']
Converted Column DR

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
82,AntigenDR51_CAN,Candidate Most Recent/at Removal DR51 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,WKGRPHLA,,DR51,Category,FMTNAME: WKGRPHLA
83,AntigenDR51_2_CAN,Candidate Most Recent/at Removal DR51 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,WKGRPHLA,,DR51_2,Category,FMTNAME: WKGRPHLA
84,AntigenDR52_CAN,Candidate Most Recent/at Removal DR52 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,WKGRPHLA,,DR52,Category,FMTNAME: WKGRPHLA
85,AntigenDR52_2_CAN,Candidate Most Recent/at Removal DR52 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,WKGRPHLA,,DR52_2,Category,FMTNAME: WKGRPHLA
86,AntigenDR53_CAN,Candidate Most Recent/at Removal DR53 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,WKGRPHLA,,DR53,Category,FMTNAME: WKGRPHLA
87,AntigenDR53_2_CAN,Candidate Most Recent/at Removal DR53 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,WKGRPHLA,,DR53_2,Category,FMTNAME: WKGRPHLA


### PREV_TX
- Previous Transplant Informatio

In [87]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PREV_TX', True)

               count unique  top   freq      mean       std  min  25%  50%  75%  max
NUM_PREV_TX  28751.0    NaN  NaN    NaN  0.034607  0.194048  0.0  0.0  0.0  0.0  3.0
PREV_TX        28751      2    N  27856       NaN       NaN  NaN  NaN  NaN  NaN  NaN
PREV_TX_ANY    28751      2    N  27782       NaN       NaN  NaN  NaN  NaN  NaN  NaN

NaNs:
NUM_PREV_TX    0
PREV_TX        0
PREV_TX_ANY    0
dtype: int64

Datatypes:
NUM_PREV_TX     int64
PREV_TX        object
PREV_TX_ANY    object
dtype: object


         Feature                                                                Description        FormSection DataType SASAnalysisFormat Comment Information
203  NUM_PREV_TX                                         THE NUMBER OF PREVIOUS TRANSPLANTS  WAITING LIST DATA      NUM                               Unknown
219      PREV_TX  HISTORY of a PREVIOUS TRANSPLANT INVOLVING EXACT SAME ORGAN AS CURRENT TX                     CHAR(1)                               Unknown
220  PREV_TX_ANY     

In [88]:
# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes'}

# mapping feature
df = uf.mappingCol(df, 'PREV_TX', mapping, display=True)
df = uf.mappingCol(df, 'PREV_TX_ANY', mapping, display=True)

# mapping feature 
colMap = {'NUM_PREV_TX': 'PreviousTransplantNumber_CAN', 'PREV_TX': 'PreviousTransplantSameOrgan_CAN', 'PREV_TX_ANY':'PreviousTransplantAnyOrgan_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='')
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, [244,245], Type='Category', txt='Y/N to Yes/No')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal = uf.insertIntoDataFrame(df_nominal,['PreviousTransplantSameOrgan_CAN', 'PreviousTransplantAnyOrgan_CAN'])
df_ordinal = uf.insertIntoDataFrame(df_ordinal,['PreviousTransplantNumber_CAN'])
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column PREV_TX Unique Vaue(s) ['No', 'Yes']
Categories (2, object): ['No', 'Yes']
Converted Column PREV_TX_ANY Unique Vaue(s) ['No', 'Yes']
Categories (2, object): ['No', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
203,PreviousTransplantNumber_CAN,THE NUMBER OF PREVIOUS TRANSPLANTS,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,,,NUM_PREV_TX,Category,
219,PreviousTransplantSameOrgan_CAN,HISTORY of a PREVIOUS TRANSPLANT INVOLVING EXACT SAME ORGAN AS CURRENT TX,CALCULATED,NaT,NaT,,CHAR(1),,,PREV_TX,Category,
220,PreviousTransplantAnyOrgan_CAN,CALCULATED Previous Transplant of Any Organ Type,CALCULATED,NaT,NaT,,CHAR(1),,,PREV_TX_ANY,Category,


### GENDER

In [89]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'GENDER', True)

            count unique top   freq
GENDER      28751      2   M  21055
GENDER_DON  28751      2   M  20345

NaNs:
GENDER        0
GENDER_DON    0
dtype: int64

Datatypes:
GENDER        object
GENDER_DON    object
dtype: object


        Feature       Description            FormSection DataType SASAnalysisFormat Comment Information
111      GENDER  RECIPIENT GENDER  CANDIDATE INFORMATION  CHAR(1)               SEX             Unknown
112  GENDER_DON      DONOR GENDER      DONOR INFORMATION  CHAR(1)               SEX             Unknown


GENDER: ['M' 'F']
GENDER_DON: ['M' 'F']


In [90]:
# mapping
colMap = {'GENDER': 'Gender_CAN', 'GENDER_DON':'Gender_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['Gender_CAN'])
df_don = uf.insertIntoDataFrame(df_don, ['Gender_DON'])
df_nominal = uf.insertIntoDataFrame(df_nominal,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
111,Gender_CAN,RECIPIENT GENDER,TCR,1987-10-01,NaT,CANDIDATE INFORMATION,CHAR(1),SEX,,GENDER,Category,
112,Gender_DON,DONOR GENDER,DDR/LDR,1987-10-01,NaT,DONOR INFORMATION,CHAR(1),SEX,,GENDER_DON,Category,


### WGT_KG

In [91]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'WGT_KG', False)

                    count       mean        std     min     25%     50%     75%    max
WGT_KG_TCR        28699.0  83.386873  18.406565   9.979  70.307  82.554  95.708  271.0
INIT_WGT_KG_CALC  28729.0  83.381256  18.267346  31.200  70.300  82.600  95.700  182.5
END_WGT_KG_CALC   28744.0  83.404634  18.115061  29.900  70.400  82.600  95.700  165.1
WGT_KG_DON_CALC   28747.0  83.561332  19.553094  23.200  70.000  80.800  94.100  198.0
WGT_KG_CALC       28744.0  83.459237  18.291032  31.900  70.300  82.600  95.700  162.1

NaNs:
WGT_KG_TCR          52
INIT_WGT_KG_CALC    22
END_WGT_KG_CALC      7
WGT_KG_DON_CALC      4
WGT_KG_CALC          7
dtype: int64

Datatypes:
WGT_KG_TCR          float64
INIT_WGT_KG_CALC    float64
END_WGT_KG_CALC     float64
WGT_KG_DON_CALC     float64
WGT_KG_CALC         float64
dtype: object


              Feature                                                Description           FormSection DataType SASAnalysisFormat Comment Information
103   END_WGT_KG_CALC  Ca

In [92]:
# mapping
colMap = {'WGT_KG_DON_CALC': 'WeightKg_DON', 'END_WGT_KG_CALC':'WeightKg_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt='')
df_dict = uf.updateDictionaryInformation(df_dict, [170,297,298], txt=DROP).copy()

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['WeightKg_CAN'])
df_don = uf.insertIntoDataFrame(df_don, ['WeightKg_DON'])
df_numeric = uf.insertIntoDataFrame(df_numeric,list(colMap.values()))
df_drop = uf.insertIntoDataFrame(df_drop, ['INIT_WGT_KG_CALC', 'WGT_KG_CALC', 'WGT_KG_TCR'])

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
103,WeightKg_CAN,Calculated Candidate Weight in KG at Removal/Current Time,CALCULATED,NaT,NaT,,NUM,,,END_WGT_KG_CALC,Numeric,
170,INIT_WGT_KG_CALC,Calculated Candidate Weight in KG at Listing,CALCULATED,NaT,NaT,,NUM,,,INIT_WGT_KG_CALC,Numeric,** DROP **
297,WGT_KG_CALC,CALCULATED RECIPIENT WEIGHT (kg),CALCULATED,NaT,NaT,,NUM,,,WGT_KG_CALC,Numeric,** DROP **
298,WeightKg_DON,CALCULATED DONOR WEIGHT (KG),CALCULATED,NaT,NaT,,NUM,,,WGT_KG_DON_CALC,Numeric,** DROP **
299,WGT_KG_TCR,RECIPIENT WEIGHT (kg) @ REGISTRATION,TCR,1987-10-01,NaT,CLINICAL INFORMATION,NUM,,,WGT_KG_TCR,Numeric,


### HGT_CM

In [93]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'HGT_CM', False)

                    count        mean        std    min    25%    50%    75%    max
HGT_CM_TCR        28635.0  173.700437  10.433253    2.0  168.0  175.0  180.0  218.0
INIT_HGT_CM_CALC  28751.0  173.736580  10.083580  124.0  167.6  175.2  180.3  213.4
END_HGT_CM_CALC   28751.0  173.697270  10.031508  124.0  167.6  175.0  180.3  213.4
HGT_CM_DON_CALC   28751.0  173.945661   9.576501  115.0  167.6  175.0  180.3  213.0
HGT_CM_CALC       28751.0  173.672902  10.067025  124.0  167.6  175.0  180.3  210.8

NaNs:
HGT_CM_TCR          116
INIT_HGT_CM_CALC      0
END_HGT_CM_CALC       0
HGT_CM_DON_CALC       0
HGT_CM_CALC           0
dtype: int64

Datatypes:
HGT_CM_TCR          float64
INIT_HGT_CM_CALC    float64
END_HGT_CM_CALC     float64
HGT_CM_DON_CALC     float64
HGT_CM_CALC         float64
dtype: object


              Feature                                                Description           FormSection DataType SASAnalysisFormat Comment Information
99    END_HGT_CM_CALC  Calculated Cand

In [94]:
# mapping
colMap = {'END_HGT_CM_CALC': 'HeightCm_CAN', 'HGT_CM_DON_CALC':'HeightCm_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt='')
df_dict = uf.updateDictionaryInformation(df_dict, [140,142,166], txt=DROP).copy()

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['HeightCm_CAN'])
df_don = uf.insertIntoDataFrame(df_don, ['HeightCm_DON'])
df_numeric = uf.insertIntoDataFrame(df_numeric,list(colMap.values()))
df_drop = uf.insertIntoDataFrame(df_drop, ['HGT_CM_CALC', 'HGT_CM_TCR', 'INIT_HGT_CM_CALC'])

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
99,HeightCm_CAN,Calculated Candidate Height in CM at Removal/Current Time,CALCULATED,NaT,NaT,,NUM,,,END_HGT_CM_CALC,Numeric,
140,HGT_CM_CALC,CALCULATED RECIPIENT HEIGHT(cm),CALCULATED,NaT,NaT,,NUM,,,HGT_CM_CALC,Numeric,** DROP **
141,HeightCm_DON,CALCULATED DONOR HEIGHT (CM),CALCULATED,NaT,NaT,,NUM,,,HGT_CM_DON_CALC,Numeric,
142,HGT_CM_TCR,RECIPIENT HEIGHT @ REGISTRATION,TCR,1987-10-01,NaT,CLINICAL INFORMATION,NUM,,,HGT_CM_TCR,Numeric,** DROP **
166,INIT_HGT_CM_CALC,Calculated Candidate Height in CM at Listing,CALCULATED,NaT,NaT,,NUM,,,INIT_HGT_CM_CALC,Numeric,** DROP **


### CITIZENSHIP

In [95]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CITIZENSHIP', True)

                   count       mean         std  min  25%  50%  75%    max
CITIZENSHIP      28679.0   1.111196    0.605735  1.0  1.0  1.0  1.0    6.0
CITIZENSHIP_DON  28549.0  57.090336  229.496722  1.0  1.0  1.0  1.0  998.0

NaNs:
CITIZENSHIP         72
CITIZENSHIP_DON    202
dtype: int64

Datatypes:
CITIZENSHIP        float64
CITIZENSHIP_DON    float64
dtype: object


            Feature                           Description            FormSection DataType SASAnalysisFormat                                                                                                                                                   Comment Information
32      CITIZENSHIP  RECIPIENT CITIZENSHIP @ REGISTRATION  CANDIDATE INFORMATION      NUM           CITIZEN                                                                                                                                                               Unknown
33  CITIZENSHIP_DON                     DONOR CITIZENSHIP      DONOR INFORMATIO

In [96]:
findMappingDfFlat(df.CITIZENSHIP_DON, df_flat, 'CITIZEN', 999)

Compare Length: 7 & 5

CODE                                                                           LABEL
   1                                                                      US Citizen
   2                                                                  RESIDENT ALIEN
   3                                                              NON-RESIDENT ALIEN
   4                                                      Non-US Citizen/US Resident
   5 Non-US Citizen/Non-US Resident, Traveled to US for Reason Other Than Transplant


In [97]:
# df_flat FMTNAME: CITIZEN
mapping = {
    1: 'US Citizen',
    2: 'RESIDENT ALIEN',
    3: 'NON-RESIDENT ALIEN',
    4: 'Non-US Citizen/US Resident',
    5: 'Non-US Citizen/Non-US Resident, Traveled to US for Reason Other Than Transplant',
    998: 'Unknown',
    999: 'Missing'
}

# fill NaN with 998: Missing & convert to integer
df[features] = df[features].fillna(999).astype(int)

# mapping feature
df = uf.mappingCol(df, 'CITIZENSHIP', mapping, display=True)
df = uf.mappingCol(df, 'CITIZENSHIP_DON', mapping, display=True)

# mapping
colMap = {'CITIZENSHIP': 'Citizenship_CAN', 'CITIZENSHIP_DON':'Citizenship_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='Flat file - FMTNAME: CITIZEN')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['Citizenship_CAN'])
df_don = uf.insertIntoDataFrame(df_don, ['Citizenship_DON'])
df_nominal = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column CITIZENSHIP Unique Vaue(s) ['US Citizen', 'Non-US Citizen/US Resident', 'RESIDENT ALIEN', 6, 'Non-US Citizen/Non-US Resident, Traveled to US for Reason Other Than Transplant', 'NON-RESIDENT ALIEN', 'Missing']
Categories (7, object): [6, 'Missing', 'NON-RESIDENT ALIEN', 'Non-US Citizen/Non-US Resident, Traveled to US for Reason Other Than Transplant', 'Non-US Citizen/US Resident', 'RESIDENT ALIEN', 'US Citizen']
Converted Column CITIZENSHIP_DON Unique Vaue(s) ['US Citizen', 'Non-US Citizen/US Resident', 'Non-US Citizen/Non-US Resident, Traveled to US for Reason Other Than Transplant', 'Missing', 'Unknown', 'NON-RESIDENT ALIEN', 'RESIDENT ALIEN']
Categories (7, object): ['Missing', 'NON-RESIDENT ALIEN', 'Non-US Citizen/Non-US Resident, Traveled to US for Reason Other Than Transplant', 'Non-US Citizen/US Resident', 'RESIDENT ALIEN', 'US Citizen', 'Unknown']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
32,Citizenship_CAN,RECIPIENT CITIZENSHIP @ REGISTRATION,TCR,1987-10-01,NaT,CANDIDATE INFORMATION,NUM,CITIZEN,,CITIZENSHIP,Category,Flat file - FMTNAME: CITIZEN
33,Citizenship_DON,DONOR CITIZENSHIP,DDR/LDR,1987-10-01,NaT,DONOR INFORMATION,NUM,,This field uses separate SAS Analysis Format types for donor type. For deceased donors (don_ty=C) use CITIZDDR. For living donors (don_ty=L) use CITIZEN,CITIZENSHIP_DON,Category,Flat file - FMTNAME: CITIZEN


### STATE

In [98]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'STATE', True)

                count unique top  freq
PERM_STATE      28659     55  CA  3395
PERM_STATE_TRR  28277     54  CA  3372
HOME_STATE_DON  28430     54  CA  2966

NaNs:
PERM_STATE         92
PERM_STATE_TRR    474
HOME_STATE_DON    321
dtype: int64

Datatypes:
PERM_STATE        object
PERM_STATE_TRR    object
HOME_STATE_DON    object
dtype: object


            Feature                                   Description            FormSection DataType SASAnalysisFormat Comment Information
154  HOME_STATE_DON                                 DR HOME STATE      DONOR INFORMATION  CHAR(2)             STATE             Unknown
209      PERM_STATE  RECIPIENT STATE OF RESIDENCY  @ REGISTRATION  CANDIDATE INFORMATION  CHAR(2)             STATE             Unknown
210  PERM_STATE_TRR    RECIPIENT STATE OF RESIDENCY  @ TRANSPLANT  CANDIDATE INFORMATION  CHAR(2)             STATE             Unknown


PERM_STATE: ['CA' 'TX' 'NY' 'NC' 'KY' 'MA' 'MI' 'IL' 'MN' 'FL' 'MD' 'CT' 'GA' 'PA'
 'WI' 'VA' 'OK' 'UT' 'OH' 

In [99]:
findMappingDfFlat(df.PERM_STATE, df_flat, 'STATE', 'XX')

Compare Length: 56 & 55

CODE             LABEL
  AK            ALASKA
  AL           ALABAMA
  AR          ARKANSAS
  AZ           ARIZONA
  CA        CALIFORNIA
  CO          COLORADO
  CT       CONNECTICUT
  DC DIST. OF COLUMBIA
  DE          DELAWARE
  FL           FLORIDA
  GA           GEORGIA
  GU              GUAM
  HI            HAWAII
  IA              IOWA
  ID             IDAHO
  IL          ILLINOIS
  IN           INDIANA
  KS            KANSAS
  KY          KENTUCKY
  LA         LOUISIANA
  MA     MASSACHUSETTS
  MD          MARYLAND
  ME             MAINE
  MI          MICHIGAN
  MN         MINNESOTA
  MO          MISSOURI
  MS       MISSISSIPPI
  MT           MONTANA
  NC    NORTH CAROLINA
  ND      NORTH DAKOTA
  NE          NEBRASKA
  NH     NEW HAMPSHIRE
  NJ        NEW JERSEY
  NM        NEW MEXICO
  NV            NEVADA
  NY          NEW YORK
  OH              OHIO
  OK          OKLAHOMA
  OR            OREGON
  PA      PENNSYLVANIA
  PR       PUERTO RICO
  RI     

In [100]:
# fill NaN with XX: Missing & convert to category data type
df[features] = df[features].fillna('XX')

# mapping
colMap = {'HOME_STATE_DON':'ResidencyState_DON', 'PERM_STATE': 'ResidencyStateRegistration_CAN', 'PERM_STATE_TRR':'ResidencyStateTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['ResidencyStateRegistration_CAN', 'ResidencyStateTransplant_CAN'])
df_don = uf.insertIntoDataFrame(df_don, ['ResidencyState_DON'])
df_nominal = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
154,ResidencyState_DON,DR HOME STATE,DDR/LDR,1987-10-01,NaT,DONOR INFORMATION,CHAR(2),STATE,,HOME_STATE_DON,Category,
209,ResidencyStateRegistration_CAN,RECIPIENT STATE OF RESIDENCY @ REGISTRATION,TCR,1997-10-01,NaT,CANDIDATE INFORMATION,CHAR(2),STATE,,PERM_STATE,Category,
210,ResidencyStateTransplant_CAN,RECIPIENT STATE OF RESIDENCY @ TRANSPLANT,TRR,2004-06-30,NaT,CANDIDATE INFORMATION,CHAR(2),STATE,,PERM_STATE_TRR,Category,


### EDUCATION

In [101]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'EDUCATION', True)

             count       mean         std  min  25%  50%  75%    max
EDUCATION  28677.0  39.130558  183.681394  1.0  3.0  4.0  5.0  998.0

NaNs:
EDUCATION    74
dtype: int64

Datatypes:
EDUCATION    float64
dtype: object


      Feature                                         Description            FormSection DataType SASAnalysisFormat Comment Information
95  EDUCATION  RECIPIENT HIGHEST EDUCATIONAL LEVEL @ REGISTRATION  CANDIDATE INFORMATION      NUM           EDLEVEL             Unknown


EDUCATION: [  4.   3.   5.   6.   2. 998.   1.  nan]


In [102]:
findMappingDfFlat(df.EDUCATION, df_flat, 'EDLEVEL', 999)

Compare Length: 8 & 7

CODE                             LABEL
   1                              NONE
   2                GRADE SCHOOL (0-8)
   3         HIGH SCHOOL (9-12) or GED
   4 ATTENDED COLLEGE/TECHNICAL SCHOOL
   5         ASSOCIATE/BACHELOR DEGREE
   6      POST-COLLEGE GRADUATE DEGREE
 998                           UNKNOWN


In [103]:
# df_flat FMTNAME: EDLEVEL
mapping = {
    1: 'NONE',
    2: 'GRADE SCHOOL (0-8)',
    3: 'HIGH SCHOOL (9-12) or GED',
    4: 'ATTENDED COLLEGE/TECHNICAL SCHOOL',
    5: 'ASSOCIATE/BACHELOR DEGREE',
    6: 'POST-COLLEGE GRADUATE DEGREE',
    998: 'UNKNOWN',
    999: 'Missing'
}

# fill NaN with 998: Missing & covert to integer
df[features] = df[features].fillna(999).astype(int)

# mapping feature
df = uf.mappingCol(df, 'EDUCATION', mapping, display=True)

# mapping
colMap = {'EDUCATION': 'EducationLevel_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='FMTNAME: EDLEVEL')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_ordinal = uf.insertIntoDataFrame(df_ordinal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values())).copy()

# display
df_dict.loc[idx]

Converted Column EDUCATION Unique Vaue(s) ['ATTENDED COLLEGE/TECHNICAL SCHOOL', 'HIGH SCHOOL (9-12) or GED', 'ASSOCIATE/BACHELOR DEGREE', 'POST-COLLEGE GRADUATE DEGREE', 'GRADE SCHOOL (0-8)', 'UNKNOWN', 'NONE', 'Missing']
Categories (8, object): ['ASSOCIATE/BACHELOR DEGREE', 'ATTENDED COLLEGE/TECHNICAL SCHOOL', 'GRADE SCHOOL (0-8)', 'HIGH SCHOOL (9-12) or GED', 'Missing', 'NONE', 'POST-COLLEGE GRADUATE DEGREE', 'UNKNOWN']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
95,EducationLevel_CAN,RECIPIENT HIGHEST EDUCATIONAL LEVEL @ REGISTRATION,TCR,1994-04-01,NaT,CANDIDATE INFORMATION,NUM,EDLEVEL,,EDUCATION,Category,FMTNAME: EDLEVEL


### Life Support

#### ECMO & IABP & INHALED & OTH_LIFE & PGE

In [104]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'ECMO|IABP|INHALED|OTH_LIFE|PGE', True)

                    count      mean       std  min  25%  50%  75%  max
ECMO_TCR          28751.0  0.018678  0.135386  0.0  0.0  0.0  0.0  1.0
IABP_TCR          28751.0  0.084101  0.277545  0.0  0.0  0.0  0.0  1.0
INHALED_NO        28751.0  0.002052  0.045254  0.0  0.0  0.0  0.0  1.0
PGE_TCR           28751.0  0.000591  0.024310  0.0  0.0  0.0  0.0  1.0
OTH_LIFE_SUP_TCR  28751.0  0.048729  0.215304  0.0  0.0  0.0  0.0  1.0
ECMO_TRR          28751.0  0.024243  0.153804  0.0  0.0  0.0  0.0  1.0
PGE_TRR           28751.0  0.002922  0.053974  0.0  0.0  0.0  0.0  1.0
IABP_TRR          28751.0  0.138395  0.345320  0.0  0.0  0.0  0.0  1.0
OTH_LIFE_SUP_TRR  28751.0  0.070085  0.255294  0.0  0.0  0.0  0.0  1.0
INHALED_NO_TRR    28751.0  0.001843  0.042896  0.0  0.0  0.0  0.0  1.0
INHALED_NO_TCR    28751.0  0.002052  0.045254  0.0  0.0  0.0  0.0  1.0

NaNs:
ECMO_TCR            0
IABP_TCR            0
INHALED_NO          0
PGE_TCR             0
OTH_LIFE_SUP_TCR    0
ECMO_TRR            0
PGE_TRR  

In [105]:
# fill NaN with 999: Missing & covert to integer
df[features] = df[features].fillna(999).astype(int)

# mapping
colMap = {'ECMO_TCR': 'LifeSupportRegistration_ECMO_CAN', 'ECMO_TRR':'LifeSupportTransplant_ECMO_CAN',
          'IABP_TCR': 'LifeSupportRegistration_IABP_CAN', 'IABP_TRR':'LifeSupportTransplant_IABP_CAN',
          'INHALED_NO_TCR': 'LifeSupportInhaledRegistration_CAN','INHALED_NO_TRR':'LifeSupportInhaledTransplant_CAN', 'INHALED_NO': 'LifeSupportInhaled_CAN',
          'OTH_LIFE_SUP_TCR': 'LifeSupportMechanismRegistration_OTHER_CAN', 'OTH_LIFE_SUP_TRR':'LifeSupportMechanismTransplant_OTHER_CAN',
          'PGE_TCR': 'LifeSupportRegistration_PGE_CAN', 'PGE_TRR':'LifeSupportTransplant_PGE_CAN'
         }

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal = uf.insertIntoDataFrame(df_nominal,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
93,LifeSupportRegistration_ECMO_CAN,PATIENT ON LIFE SUPPORT - ECMO @ REGISTRATION,TCR,1995-04-01,NaT,CANDIDATE INFORMATION,NUM,,,ECMO_TCR,Category,
94,LifeSupportTransplant_ECMO_CAN,PATIENT ON LIFE SUPPORT - ECMO @ TRANSPLANT,TRR,1995-04-01,NaT,PATIENT STATUS,NUM,,,ECMO_TRR,Category,
155,LifeSupportRegistration_IABP_CAN,PATIENT ON LIFE SUPPORT - IABP @ REGISTRATION,TCR,1994-04-01,NaT,CANDIDATE INFORMATION,NUM,,,IABP_TCR,Category,
156,LifeSupportTransplant_IABP_CAN,PATIENT ON LIFE SUPPORT - IABP @ TRANSPLANT,TRR,1987-10-01,NaT,PATIENT STATUS,NUM,,,IABP_TRR,Category,
159,LifeSupportInhaled_CAN,CANDIDATE INHALED NO,WL,NaT,NaT,WAITING LIST DATA,NUM,,,INHALED_NO,Category,
160,LifeSupportInhaledRegistration_CAN,TCR Patient on Life Support://Inhaled NO,TCR,2004-06-30,NaT,CANDIDATE INFORMATION,NUM,,,INHALED_NO_TCR,Category,
161,LifeSupportInhaledTransplant_CAN,TRR Patient on Life Support://Inhaled NO,TRR,NaT,NaT,PATIENT STATUS,NUM,,,INHALED_NO_TRR,Category,
205,LifeSupportMechanismRegistration_OTHER_CAN,"OTHER MECHANISM OF LIFE Y/N, 1=Y @ REGISTRATION",TCR,1994-04-01,NaT,CANDIDATE INFORMATION,NUM,,,OTH_LIFE_SUP_TCR,Category,
206,LifeSupportMechanismTransplant_OTHER_CAN,"OTHER MECHANISM OF LIFE Y/N, 1=Y @ TRANSPLANT",TRR,1990-10-01,NaT,PATIENT STATUS,NUM,,,OTH_LIFE_SUP_TRR,Category,
211,LifeSupportRegistration_PGE_CAN,PATIENT ON LIFE SUPPORT: PGE @ REGISTRATION,TCR,1995-04-01,2004-06-30,CANDIDATE INFORMATION,NUM,,,PGE_TCR,Category,


### DROP FEATURES

#### PROS_INFUS_TCR & PROSTACYCLIN_TCR & PROS_INFUS_TRR & PROSTACYCLIN_TRR & LT_ONE_WEEK_DON & ORGAN & RECOV_OUT_US & DATA_TRANSPLANT & DATA_WAITLIST

In [106]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PROS|PROSTACYCLIN|LT_ONE_WEEK_DON|ORGAN|RECOV_OUT_US|DATA', True)

                    count unique  top   freq mean  std  min  25%  50%  75%  max
PROS_INFUS_TCR    28751.0    NaN  NaN    NaN  0.0  0.0  0.0  0.0  0.0  0.0  0.0
PROSTACYCLIN_TCR  28751.0    NaN  NaN    NaN  0.0  0.0  0.0  0.0  0.0  0.0  0.0
PROS_INFUS_TRR    28751.0    NaN  NaN    NaN  0.0  0.0  0.0  0.0  0.0  0.0  0.0
PROSTACYCLIN_TRR  28751.0    NaN  NaN    NaN  0.0  0.0  0.0  0.0  0.0  0.0  0.0
ORGAN               28751      1   HR  28751  NaN  NaN  NaN  NaN  NaN  NaN  NaN
RECOV_OUT_US        28748      2    N  28721  NaN  NaN  NaN  NaN  NaN  NaN  NaN
LT_ONE_WEEK_DON     28751      1    N  28751  NaN  NaN  NaN  NaN  NaN  NaN  NaN
DATA_TRANSPLANT     28751      1    Y  28751  NaN  NaN  NaN  NaN  NaN  NaN  NaN
DATA_WAITLIST       28751      1    Y  28751  NaN  NaN  NaN  NaN  NaN  NaN  NaN

NaNs:
PROS_INFUS_TCR      0
PROSTACYCLIN_TCR    0
PROS_INFUS_TRR      0
PROSTACYCLIN_TRR    0
ORGAN               0
RECOV_OUT_US        3
LT_ONE_WEEK_DON     0
DATA_TRANSPLANT     0
DATA_WAITLIST    

In [107]:
# mapping
colMap = {}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Unknown', txt=f"{DROP} No Value ADDED.")

# update dataframe
df_drop = uf.insertIntoDataFrame(df_drop, features)

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
50,DATA_TRANSPLANT,Recipient TRR Data Reported,CALCULATED,NaT,NaT,,CHAR(1),,,DATA_TRANSPLANT,Unknown,** DROP ** No Value ADDED.
51,DATA_WAITLIST,Candidate WL Data Reported,CALCULATED,NaT,NaT,,CHAR(1),,,DATA_WAITLIST,Unknown,** DROP ** No Value ADDED.
194,LT_ONE_WEEK_DON,Donor Less Than 7 Days Old at Time of Donation,CALCULATED,2016-09-13,NaT,,CHAR(1),,,LT_ONE_WEEK_DON,Unknown,** DROP ** No Value ADDED.
204,ORGAN,ORGAN TYPE TRANSPLANTED,CALCULATED,NaT,NaT,,CHAR(2),,,ORGAN,Unknown,** DROP ** No Value ADDED.
229,PROS_INFUS_TCR,TCR CANDIDATE PROSTACYCLIN INFUSION,TCR,NaT,NaT,,NUM,,,PROS_INFUS_TCR,Unknown,** DROP ** No Value ADDED.
230,PROS_INFUS_TRR,TRR RECIPIENT PROSTACYCLIN INFUSION,TRR,NaT,NaT,PATIENT STATUS,NUM,,,PROS_INFUS_TRR,Unknown,** DROP ** No Value ADDED.
231,PROSTACYCLIN_TCR,TCR CANDIDATE PROSTACYCLIN INHALATION,TCR,NaT,NaT,,NUM,,,PROSTACYCLIN_TCR,Unknown,** DROP ** No Value ADDED.
232,PROSTACYCLIN_TRR,TRR RECIPIENT PROSTACYCLIN INHALATION,TRR,2004-06-30,NaT,PATIENT STATUS,NUM,,,PROSTACYCLIN_TRR,Unknown,** DROP ** No Value ADDED.
254,RECOV_OUT_US,ORGAN RECOVERED OUTSIDE U.S.,LDR,1999-10-25,NaT,ORGAN RECOVERY,CHAR(1),,,RECOV_OUT_US,Unknown,** DROP ** No Value ADDED.


#### _FLG
- INIT_LLU_FLG & INIT_RLU_FLG & INIT_BLU_FLG & END_LLU_FLG & END_RLU_FLG & END_BLU_FLG

In [108]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, '_FLG', True)

                count  mean  std  min  25%  50%  75%  max
INIT_LLU_FLG  28751.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0
INIT_RLU_FLG  28751.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0
INIT_BLU_FLG  28751.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0
END_LLU_FLG   28751.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0
END_RLU_FLG   28751.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0
END_BLU_FLG   28751.0   0.0  0.0  0.0  0.0  0.0  0.0  0.0

NaNs:
INIT_LLU_FLG    0
INIT_RLU_FLG    0
INIT_BLU_FLG    0
END_LLU_FLG     0
END_RLU_FLG     0
END_BLU_FLG     0
dtype: int64

Datatypes:
INIT_LLU_FLG    float64
INIT_RLU_FLG    float64
INIT_BLU_FLG    float64
END_LLU_FLG       int64
END_RLU_FLG       int64
END_BLU_FLG       int64
dtype: object


          Feature                                                 Description        FormSection DataType SASAnalysisFormat Comment Information
96    END_BLU_FLG   LUNG PREFERENCE AT REMOVAL/CURRENT TIME/ TCR - BOTH (1=Y)  WAITING LIST DATA      NUM                               Unknown
100   

In [109]:
# mapping
colMap = {}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Unknown', txt=f"{DROP} No Value ADDED.")

# update dataframe
df_drop  = uf.insertIntoDataFrame(df_drop , features)

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
96,END_BLU_FLG,LUNG PREFERENCE AT REMOVAL/CURRENT TIME/ TCR - BOTH (1=Y),WAITING LIST DATA,1995-03-01,NaT,WAITING LIST DATA,NUM,,,END_BLU_FLG,Unknown,** DROP ** No Value ADDED.
100,END_LLU_FLG,LUNG PREFERENCE AT REMOVAL/CURRENT TIME/ TCR - LEFT (1=Y),WAITING LIST DATA,1995-03-01,NaT,WAITING LIST DATA,NUM,,,END_LLU_FLG,Unknown,** DROP ** No Value ADDED.
101,END_RLU_FLG,LUNG PREFERENCE AT REMOVAL/CURRENT TIME/ TCR - RIGHT (1=Y),WAITING LIST DATA,1995-03-01,NaT,WAITING LIST DATA,NUM,,,END_RLU_FLG,Unknown,** DROP ** No Value ADDED.
163,INIT_BLU_FLG,LUNG PREFERENCE AT LISTING - BOTH (1=Y),WAITING LIST DATA,1995-03-01,NaT,WAITING LIST DATA,NUM,,,INIT_BLU_FLG,Unknown,** DROP ** No Value ADDED.
167,INIT_LLU_FLG,LUNG PREFERENCE AT LISTING - LEFT (1=Y),WAITING LIST DATA,1995-03-01,NaT,WAITING LIST DATA,NUM,,,INIT_LLU_FLG,Unknown,** DROP ** No Value ADDED.
168,INIT_RLU_FLG,LUNG PREFERENCE AT LISTING - RIGHT (1=Y),WAITING LIST DATA,1995-03-01,NaT,WAITING LIST DATA,NUM,,,INIT_RLU_FLG,Unknown,** DROP ** No Value ADDED.


### INOTROPES

In [110]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'INOTROPES', True)

                 count      mean       std  min  25%  50%  75%  max
INOTROPES_TCR  28751.0  0.328823  0.469794  0.0  0.0  0.0  1.0  1.0
INOTROPES_TRR  28751.0  0.370491  0.482945  0.0  0.0  0.0  1.0  1.0

NaNs:
INOTROPES_TCR    0
INOTROPES_TRR    0
dtype: int64

Datatypes:
INOTROPES_TCR    int64
INOTROPES_TRR    int64
dtype: object


           Feature                  Description            FormSection DataType SASAnalysisFormat Comment Information
183  INOTROPES_TCR  IV INOTROPES @ REGISTRATION  CANDIDATE INFORMATION      NUM                               Unknown
184  INOTROPES_TRR    IV INOTROPES @ TRANSPLANT         PATIENT STATUS      NUM                               Unknown


INOTROPES_TCR: [0 1]
INOTROPES_TRR: [0 1]


In [111]:
# mapping
colMap = {'INOTROPES_TCR': 'IntropesIVRegistration_CAN', 'INOTROPES_TRR':'IntropesIVTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal = uf.insertIntoDataFrame(df_nominal,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
183,IntropesIVRegistration_CAN,IV INOTROPES @ REGISTRATION,TCR,1994-04-01,NaT,CANDIDATE INFORMATION,NUM,,,INOTROPES_TCR,Category,
184,IntropesIVTransplant_CAN,IV INOTROPES @ TRANSPLANT,TRR,1994-04-01,NaT,PATIENT STATUS,NUM,,,INOTROPES_TRR,Category,


### VAD_DEVICE

In [112]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'VAD_DEVICE', True)

                     count      mean       std  min  25%  50%  75%  max
VAD_DEVICE_TY_TCR  28679.0  1.336204  0.645582  1.0  1.0  1.0  2.0  5.0
VAD_DEVICE_TY_TRR  28351.0  1.508871  0.743829  1.0  1.0  1.0  2.0  5.0

NaNs:
VAD_DEVICE_TY_TCR     72
VAD_DEVICE_TY_TRR    400
dtype: int64

Datatypes:
VAD_DEVICE_TY_TCR    float64
VAD_DEVICE_TY_TRR    float64
dtype: object


               Feature                              Description            FormSection DataType SASAnalysisFormat Comment Information
286  VAD_DEVICE_TY_TCR  CANDIDATE TYPE OF VAD DEVICE AT LISTING  CANDIDATE INFORMATION      NUM          VADDEVTY             Unknown
287  VAD_DEVICE_TY_TRR                      TRR VAD DEVICE TYPE         PATIENT STATUS      NUM          VADDEVTY             Unknown


VAD_DEVICE_TY_TCR: [ 5.  2.  1.  4.  3. nan]
VAD_DEVICE_TY_TRR: [ 5.  2.  1.  4.  3. nan]


In [113]:
findMappingDfFlat(df.VAD_DEVICE_TY_TCR, df_flat, 'VADDEVTY', NaN=998)

Compare Length: 6 & 5

CODE     LABEL
   1      NONE
   2      LVAD
   3      RVAD
   4       TAH
   5 LVAD+RVAD


In [114]:
# fill NaN with 998: Missing & convert to integer
df[features] = df[features].fillna(998).astype(int)

# df_flat FMTNAME:  VADDEVTY
mapping = {
    1: 'None',
    2: 'Lvad',
    3: 'Rvad',
    4: 'Tah',
    5: 'Lvad+Rvad',
    6: 'Lvad/Rvad/Tah unspecified',
    998: 'Missing'
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'VAD_DEVICE_TY_TCR': 'VentricularDeviceTypeRegistration_CAN', 'VAD_DEVICE_TY_TRR':'VentricularDeviceTypeTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Object', txt='FMTNAME: VADDEVTY - Type of Ventricular Device used.')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_object = uf.insertIntoDataFrame(df_object,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column VAD_DEVICE_TY_TCR Unique Vaue(s) ['Lvad+Rvad', 'Lvad', 'None', 'Tah', 'Rvad', 'Missing']
Categories (6, object): ['Lvad', 'Lvad+Rvad', 'Missing', 'None', 'Rvad', 'Tah']
Converted Column VAD_DEVICE_TY_TRR Unique Vaue(s) ['Lvad+Rvad', 'Lvad', 'None', 'Tah', 'Rvad', 'Missing']
Categories (6, object): ['Lvad', 'Lvad+Rvad', 'Missing', 'None', 'Rvad', 'Tah']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
286,VentricularDeviceTypeRegistration_CAN,CANDIDATE TYPE OF VAD DEVICE AT LISTING,TCR,2004-06-30,NaT,CANDIDATE INFORMATION,NUM,VADDEVTY,,VAD_DEVICE_TY_TCR,Object,FMTNAME: VADDEVTY - Type of Ventricular Device used.
287,VentricularDeviceTypeTransplant_CAN,TRR VAD DEVICE TYPE,TRR,2004-06-30,NaT,PATIENT STATUS,NUM,VADDEVTY,,VAD_DEVICE_TY_TRR,Object,FMTNAME: VADDEVTY - Type of Ventricular Device used.


In [115]:
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
286,VentricularDeviceTypeRegistration_CAN,CANDIDATE TYPE OF VAD DEVICE AT LISTING,TCR,2004-06-30,NaT,CANDIDATE INFORMATION,NUM,VADDEVTY,,VAD_DEVICE_TY_TCR,Object,FMTNAME: VADDEVTY - Type of Ventricular Device used.
287,VentricularDeviceTypeTransplant_CAN,TRR VAD DEVICE TYPE,TRR,2004-06-30,NaT,PATIENT STATUS,NUM,VADDEVTY,,VAD_DEVICE_TY_TRR,Object,FMTNAME: VADDEVTY - Type of Ventricular Device used.


### VAD_BRAND

In [116]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'VAD_BRAND', True)

                  count        mean         std    min    25%    50%    75%    max
VAD_BRAND1_TCR   8163.0  277.820777  207.744281  201.0  205.0  205.0  226.0  999.0
VAD_BRAND1_TRR  12074.0  268.689167  188.006343  201.0  205.0  224.0  226.0  999.0

NaNs:
VAD_BRAND1_TCR    20588
VAD_BRAND1_TRR    16677
dtype: int64

Datatypes:
VAD_BRAND1_TCR    float64
VAD_BRAND1_TRR    float64
dtype: object


            Feature                       Description            FormSection DataType SASAnalysisFormat Comment Information
284  VAD_BRAND1_TCR  CANDIDATE VAD BRAND 1 AT LISTING  CANDIDATE INFORMATION      NUM                               Unknown
285  VAD_BRAND1_TRR      TRR LIFE SUPPORT VAD BRAND 1         PATIENT STATUS      NUM          VADBRAND             Unknown


VAD_BRAND1_TCR: [227. 205.  nan 402. 221. 231. 224. 215. 999. 325. 210. 226. 208. 320.
 228. 225. 209. 217. 313. 230. 316. 236. 318. 223. 237. 311. 235. 321.
 238. 332. 329. 330. 218. 319. 309. 201. 204. 216. 222.]
VAD_BRAND1_TRR

In [117]:
findMappingDfFlat(df.VAD_BRAND1_TCR, df_flat, 'VADBRAND', NaN=998)

Compare Length: 39 & 38

CODE                           LABEL
 201                Abiomed BVS 5000
 204                      Biomedicus
 205                    Heartmate II
 208                   Heartmate XVE
 209                  Heartsaver VAD
 210                     Jarvik 2000
 215     Cardiac Assist Tandem Heart
 216                        Thoratec
 217                   Thoratec IVAD
 218                          Toyobo
 221                  Abiomed AB5000
 222              Berlin Heart EXCOR
 223                        Evaheart
 224                  Heartware HVAD
 225             Impella Recover 2.5
 226             Impella Recover 5.0
 227 CentriMag (Thoratec/Levitronix)
 228          Maquet Jostra Rotaflow
 230                Terumo DuraHeart
 231                   Thoratec PVAD
 235       Cardiac Assist Protek Duo
 236                   HeartMate III
 237                      Impella CP
 238                      Impella RP
 309                  Abiomed AB5000
 311     Card

In [118]:
findMappingDfFlat(df.VAD_BRAND1_TRR, df_flat, 'VADBRAND', NaN=998)

Compare Length: 41 & 40

CODE                           LABEL
 201                Abiomed BVS 5000
 204                      Biomedicus
 205                    Heartmate II
 208                   Heartmate XVE
 209                  Heartsaver VAD
 210                     Jarvik 2000
 215     Cardiac Assist Tandem Heart
 217                   Thoratec IVAD
 218                          Toyobo
 221                  Abiomed AB5000
 222              Berlin Heart EXCOR
 223                        Evaheart
 224                  Heartware HVAD
 225             Impella Recover 2.5
 226             Impella Recover 5.0
 227 CentriMag (Thoratec/Levitronix)
 228          Maquet Jostra Rotaflow
 230                Terumo DuraHeart
 231                   Thoratec PVAD
 232           Ventracor VentrAssist
 233              Worldheart Levacor
 235       Cardiac Assist Protek Duo
 236                   HeartMate III
 237                      Impella CP
 309                  Abiomed AB5000
 311     Card

In [119]:
# check for differences between two sets
uf.symmetricDifference(set(df.VAD_BRAND1_TCR.dropna().unique().astype(int)), set(df.VAD_BRAND1_TRR.dropna().unique().astype(int)))

Symmetric difference: [216, 232, 233, 238, 317, 330, 331, 401]


In [120]:
# fill NaN with 998: Missing & convert to integer
df[features] = df[features].fillna(998).astype(int)

# df_flat FMTNAME: VADBRAND
mapping = {
    201: "Abiomed BVS 5000",
    204: "Biomedicus",
    205: "Heartmate II",
    208: "Heartmate XVE",
    209: "Heartsaver VAD",
    210: "Jarvik 2000",
    215: "Cardiac Assist Tandem Heart",
    216: "Thoratec",
    217: "Thoratec IVAD",
    218: "Toyobo",
    221: "Abiomed AB5000",
    222: "Berlin Heart EXCOR",
    223: "Evaheart",
    224: "Heartware HVAD",
    225: "Impella Recover 2.5",
    226: "Impella Recover 5.0",
    227: "CentriMag (Thoratec/Levitronix)",
    228: "Maquet Jostra Rotaflow",
    230: "Terumo DuraHeart",
    231: "Thoratec PVAD",
    235: "Cardiac Assist Protek Duo",
    236: "HeartMate III",
    237: "Impella CP",
    238: "Impella RP",
    309: "Abiomed AB5000",
    311: "Cardiac Assist Tandem Heart",
    313: "Heartmate II",
    316: "Heartware HVAD",
    317: "Impella Recover 2.5",
    318: "Impella Recover 5.0",
    319: "Jarvik 2000",
    320: "CentriMag (Thoratec/Levitronix)",
    321: "Maquet Jostra Rotaflow",
    325: "Thoratec PVAD",
    329: "Cardiac Assist Protek Duo",
    330: "HeartMate III",
    331: "Impella CP",
    332: "Impella RP",
    402: "SynCardia CardioWest",
    999: "Other, Specify",
    998: "Missing"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=False)


# mapping
colMap = {'VAD_BRAND1_TCR': 'VentricularDeviceBrandRegistration_CAN', 'VAD_BRAND1_TRR':'VentricularDeviceBrandTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Object', txt='FMTNAME: VADBRAND - Type of Ventricular Device Brand used.')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_object = uf.insertIntoDataFrame(df_object,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
284,VentricularDeviceBrandRegistration_CAN,CANDIDATE VAD BRAND 1 AT LISTING,TCR,2004-06-30,NaT,CANDIDATE INFORMATION,NUM,,,VAD_BRAND1_TCR,Object,FMTNAME: VADBRAND - Type of Ventricular Device Brand used.
285,VentricularDeviceBrandTransplant_CAN,TRR LIFE SUPPORT VAD BRAND 1,TRR,2004-06-30,NaT,PATIENT STATUS,NUM,VADBRAND,,VAD_BRAND1_TRR,Object,FMTNAME: VADBRAND - Type of Ventricular Device Brand used.


### FUNC_STAT

In [121]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'FUNC', True)

                 count         mean         std    min     25%     50%     75%     max
FUNC_STAT_TCR  28676.0  2020.646917  233.893168    1.0  2020.0  2040.0  2070.0  4100.0
FUNC_STAT_TRR  28341.0  1995.294661  220.062257  998.0  2020.0  2040.0  2060.0  2100.0
FUNC_STAT_TRF  18282.0  1926.284542  370.737551  998.0  2020.0  2080.0  2090.0  2100.0

NaNs:
FUNC_STAT_TCR       75
FUNC_STAT_TRR      410
FUNC_STAT_TRF    10469
dtype: int64

Datatypes:
FUNC_STAT_TCR    float64
FUNC_STAT_TRR    float64
FUNC_STAT_TRF    float64
dtype: object


           Feature                                 Description                          FormSection DataType SASAnalysisFormat Comment Information
108  FUNC_STAT_TCR  RECIPIENT FUNCTIONAL STATUS @ REGISTRATION                CANDIDATE INFORMATION      NUM          FUNCSTAT             Unknown
109  FUNC_STAT_TRF             TRF FUNCTIONAL STATUS @ TRR/FOL  PATIENT STATUS AT TIME OF FOLLOW-UP      NUM          FUNCSTAT             Unknown
110  FUNC_STAT_TRR 

In [122]:
findMappingDfFlat(df.FUNC_STAT_TCR, df_flat, 'FUNCSTAT', NaN=999)

Compare Length: 24 & 23

CODE                                                                                               LABEL
   1                                             Performs activities of daily living with NO assistance.
   2                                           Performs activities of daily living with SOME assistance.
 998                                                                                             Unknown
2010                                                 10% - Moribund, fatal processes progressing rapidly
2020                              20% - Very sick, hospitalization necessary: active treatment necessary
2030                           30% - Severely disabled: hospitalization is indicated, death not imminent
2040                                                40% - Disabled: requires special care and assistance
2050                                    50% - Requires considerable assistance and frequent medical care
2060                          

In [123]:
findMappingDfFlat(df.FUNC_STAT_TRR, df_flat, 'FUNCSTAT', NaN=999)

Compare Length: 12 & 11

CODE                                                                     LABEL
 998                                                                   Unknown
2010                       10% - Moribund, fatal processes progressing rapidly
2020    20% - Very sick, hospitalization necessary: active treatment necessary
2030 30% - Severely disabled: hospitalization is indicated, death not imminent
2040                      40% - Disabled: requires special care and assistance
2050          50% - Requires considerable assistance and frequent medical care
2060        60% - Requires occasional assistance but is able to care for needs
2070   70% - Cares for self: unable to carry on normal activity or active work
2080               80% - Normal activity with effort: some symptoms of disease
2090         90% - Able to carry on normal activity: minor symptoms of disease
2100                      100% - Normal, no complaints, no evidence of disease


In [124]:
findMappingDfFlat(df.FUNC_STAT_TRF, df_flat, 'FUNCSTAT', NaN=999)

Compare Length: 12 & 11

CODE                                                                     LABEL
 998                                                                   Unknown
2010                       10% - Moribund, fatal processes progressing rapidly
2020    20% - Very sick, hospitalization necessary: active treatment necessary
2030 30% - Severely disabled: hospitalization is indicated, death not imminent
2040                      40% - Disabled: requires special care and assistance
2050          50% - Requires considerable assistance and frequent medical care
2060        60% - Requires occasional assistance but is able to care for needs
2070   70% - Cares for self: unable to carry on normal activity or active work
2080               80% - Normal activity with effort: some symptoms of disease
2090         90% - Able to carry on normal activity: minor symptoms of disease
2100                      100% - Normal, no complaints, no evidence of disease


In [125]:
# fill NaN with 998: Missing & convert to integer
df[features] = df[features].fillna(998).astype(int)

# df_flat FMTNAME: FUNCSTAT
mapping = {
    999: "Missing",
    998: "Unknown",
    2010: "10% - Moribund, fatal processes progressing rapidly",
    2020: "20% - Very sick, hospitalization necessary: active treatment necessary",
    2030: "30% - Severely disabled: hospitalization is indicated, death not imminent",
    2040: "40% - Disabled: requires special care and assistance",
    2050: "50% - Requires considerable assistance and frequent medical care",
    2060: "60% - Requires occasional assistance but is able to care for needs",
    2070: "70% - Cares for self: unable to carry on normal activity or active work",
    2080: "80% - Normal activity with effort: some symptoms of disease",
    2090: "90% - Able to carry on normal activity: minor symptoms of disease",
    2100: "100% - Normal, no complaints, no evidence of disease",
    4010: "10% - No play; does not get out of bed",
    4020: "20% - Often sleeping; play entirely limited to very passive activities",
    4030: "30% - In bed; needs assistance even for quiet play",
    4040: "40% - Mostly in bed; participates in quiet activities",
    4050: "50% - Can dress but lies around much of day; no active play; can take part in quiet play/activities",
    4060: "60% - Up and around, but minimal active play; keeps busy with quieter activities",
    4070: "70% - Both greater restriction of and less time spent in play activity",
    4080: "80% - Active, but tires more quickly",
    4090: "90% - Minor restrictions in physically strenuous activity",
    4100: "100% - Fully active, normal"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=False)
    

# mapping
colMap = {'FUNC_STAT_TCR': 'FunctionalStatusRegistration_CAN', 'FUNC_STAT_TRF':'FunctionalStatusFollowUp', 'FUNC_STAT_TRR':'FunctionalStatusTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='FMTNAME: FUNCSTAT')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['FunctionalStatusRegistration_CAN','FunctionalStatusTransplant_CAN'])
df_label = uf.insertIntoDataFrame(df_label, ['FunctionalStatusFollowUp'])
df_ordinal = uf.insertIntoDataFrame(df_ordinal,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
108,FunctionalStatusRegistration_CAN,RECIPIENT FUNCTIONAL STATUS @ REGISTRATION,TCR,1994-04-01,NaT,CANDIDATE INFORMATION,NUM,FUNCSTAT,,FUNC_STAT_TCR,Category,FMTNAME: FUNCSTAT
109,FunctionalStatusFollowUp,TRF FUNCTIONAL STATUS @ TRR/FOL,TRR/TRF,1997-04-01,NaT,PATIENT STATUS AT TIME OF FOLLOW-UP,NUM,FUNCSTAT,,FUNC_STAT_TRF,Category,FMTNAME: FUNCSTAT
110,FunctionalStatusTransplant_CAN,RECIPIENT FUNCTIONAL STATUS @TRANSPLANT,TRR,1994-04-01,NaT,PATIENT STATUS,NUM,FUNCSTAT,,FUNC_STAT_TRR,Category,FMTNAME: FUNCSTAT


### PRI_PAYMENT

In [126]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PRI_PAYMENT', True)

                   count      mean       std  min  25%  50%  75%   max
PRI_PAYMENT_TCR  28679.0  2.162732  1.578231  1.0  1.0  1.0  3.0  13.0
PRI_PAYMENT_TRR  28344.0  2.241885  1.525560  1.0  1.0  2.0  3.0  12.0

NaNs:
PRI_PAYMENT_TCR     72
PRI_PAYMENT_TRR    407
dtype: int64

Datatypes:
PRI_PAYMENT_TCR    float64
PRI_PAYMENT_TRR    float64
dtype: object


             Feature                                              Description            FormSection DataType SASAnalysisFormat Comment Information
221  PRI_PAYMENT_TCR  RECIPIENT PRIMARY PROJECTED PAYMENT TYPE @ REGISTRATION  CANDIDATE INFORMATION      NUM           PRIMPAY             Unknown
222  PRI_PAYMENT_TRR            RECIPIENT PRIMARY PAYMENT SOURCE @ TRANSPLANT         PATIENT STATUS      NUM           PRIMPAY             Unknown


PRI_PAYMENT_TCR: [ 2.  3.  1.  4.  6.  7. 10. 11. 12.  8.  5.  9. nan 13.]
PRI_PAYMENT_TRR: [ 2.  3.  1.  4.  6.  7. 10. 12.  8.  5.  9. nan]


In [127]:
findMappingDfFlat(df.PRI_PAYMENT_TCR, df_flat, 'PRIMPAY', NaN=999)

Compare Length: 14 & 13

CODE                                                         LABEL
   1                                             Private insurance
   2                                   Public insurance - Medicaid
   3             Public insurance - Medicare FFS (Fee for Service)
   4                          Public insurance - Medicare & Choice
   5 Public insurance - CHIP (Children's Health Insurance Program)
   6                           Public insurance - Department of VA
   7                           Public insurance - Other government
   8                                                          Self
   9                                                      Donation
  10                                                     Free Care
  11                                                       Pending
  12                                    Foreign Government Specify
  13    Public insurance - Medicare (further detail not collected)


In [128]:
findMappingDfFlat(df.PRI_PAYMENT_TRR, df_flat, 'PRIMPAY', NaN=999)

Compare Length: 12 & 11

CODE                                                         LABEL
   1                                             Private insurance
   2                                   Public insurance - Medicaid
   3             Public insurance - Medicare FFS (Fee for Service)
   4                          Public insurance - Medicare & Choice
   5 Public insurance - CHIP (Children's Health Insurance Program)
   6                           Public insurance - Department of VA
   7                           Public insurance - Other government
   8                                                          Self
   9                                                      Donation
  10                                                     Free Care
  12                                    Foreign Government Specify


In [129]:
# fill NaN with 999: Missing & convert to integer
df[features] = df[features].fillna(999).astype(int)

# df_flat FMTNAME: PRIMPAY
mapping = {
    1: "Private insurance",
    2: "Public insurance - Medicaid",
    3: "Public insurance - Medicare FFS (Fee for Service)",
    4: "Public insurance - Medicare & Choice",
    5: "Public insurance - CHIP (Children's Health Insurance Program)",
    6: "Public insurance - Department of VA",
    7: "Public insurance - Other government",
    8: "Self",
    9: "Donation",
    10: "Free Care",
    11: "Pending",
    12: "Foreign Government Specify",
    999: "Missing"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)
    

# mapping
colMap = {'PRI_PAYMENT_TCR': 'PrimaryPaymentRegistration_CAN', 'PRI_PAYMENT_TRR':'PrimaryPaymentTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Object', txt='FMTNAME: PRIMPAY')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_object = uf.insertIntoDataFrame(df_object,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column PRI_PAYMENT_TCR Unique Vaue(s) ['Public insurance - Medicaid', 'Public insurance - Medicare FFS (Fee for Service)', 'Private insurance', 'Public insurance - Medicare & Choice', 'Public insurance - Department of VA', ..., 'Self', 'Public insurance - CHIP (Children's Health Insurance Program)', 'Donation', 'Missing', 13]
Length: 14
Categories (14, object): [13, 'Donation', 'Foreign Government Specify', 'Free Care', ..., 'Public insurance - Medicare & Choice', 'Public insurance - Medicare FFS (Fee for Service)', 'Public insurance - Other government', 'Self']
Converted Column PRI_PAYMENT_TRR Unique Vaue(s) ['Public insurance - Medicaid', 'Public insurance - Medicare FFS (Fee for Service)', 'Private insurance', 'Public insurance - Medicare & Choice', 'Public insurance - Department of VA', ..., 'Foreign Government Specify', 'Self', 'Public insurance - CHIP (Children's Health Insurance Program)', 'Donation', 'Missing']
Length: 12
Categories (12, object): ['Donation', 'Foreign

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
221,PrimaryPaymentRegistration_CAN,RECIPIENT PRIMARY PROJECTED PAYMENT TYPE @ REGISTRATION,TCR,1994-04-01,NaT,CANDIDATE INFORMATION,NUM,PRIMPAY,,PRI_PAYMENT_TCR,Object,FMTNAME: PRIMPAY
222,PrimaryPaymentTransplant_CAN,RECIPIENT PRIMARY PAYMENT SOURCE @ TRANSPLANT,TRR,1994-04-01,NaT,PATIENT STATUS,NUM,PRIMPAY,,PRI_PAYMENT_TRR,Object,FMTNAME: PRIMPAY


### DIAB

In [130]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DIAB', True)

                     count unique  top   freq      mean        std  min  25%  50%  75%    max
DIAB               28678.0    NaN  NaN    NaN   2.34985  27.604111  1.0  1.0  1.0  3.0  998.0
HIST_DIABETES_DON  28747.0    NaN  NaN    NaN  7.626744  80.571116  1.0  1.0  1.0  1.0  998.0
DIABETES_DON         28747      3    N  27502       NaN        NaN  NaN  NaN  NaN  NaN    NaN

NaNs:
DIAB                 73
HIST_DIABETES_DON     4
DIABETES_DON          4
dtype: int64

Datatypes:
DIAB                 float64
HIST_DIABETES_DON    float64
DIABETES_DON          object
dtype: object


               Feature                                                    Description           FormSection DataType SASAnalysisFormat                                    Comment Information
70                DIAB                              RECIPIENT DIABETES @ REGISTRATION  CLINICAL INFORMATION      NUM            DIABTY                                                Unknown
71        DIABETES_DON               

In [131]:
findMappingDfFlat(df.DIAB, df_flat, 'DIABTY', NaN=999)

Compare Length: 7 & 6

CODE                   LABEL
   1                      No
   2                  Type I
   3                 Type II
   4              Type Other
   5            Type Unknown
 998 Diabetes Status Unknown


In [132]:
findMappingDfFlat(df.DIAB, df_flat, 'HISTDIAB', NaN=999)

Compare Length: 7 & 6

CODE                 LABEL
   1                    NO
   2        YES, 0-5 YEARS
   3       YES, 6-10 YEARS
   4        YES, >10 YEARS
   5 YES, DURATION UNKNOWN
 998               UNKNOWN


In [133]:
# update NaNs
df.DIAB = df.DIAB.fillna(999).astype(int) # 999 Missing
df.DIABETES_DON = df.DIABETES_DON.fillna('X') # X Missing
df.HIST_DIABETES_DON = df.HIST_DIABETES_DON.fillna(999).astype(int) # 999 Missing

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# mapping feature
df = uf.mappingCol(df, 'DIABETES_DON', mapping)


# df_flat FMTNAME: DIABTY
mapping = {
    1: 'No',
    2: 'Type I',
    3: 'Type II',
    4: 'Type Other',
    5: 'Type Unknown',
    998: 'Diabetes Status Unknown',
    999: 'Missing'
}

# map
df = uf.mappingCol(df, 'DIAB', mapping)


# df_flat FMTNAME: HISTDIAB
mapping = {
    1: 'No',
    2: 'Yes, 0-5 Years',
    3: 'Yes, 6-10 Years',
    4: 'Yes, >10 Years',
    5: 'Yes, Duration Unknown',
    998: 'Unknown',
    999: 'Missing'
}

# map
df = uf.mappingCol(df, 'HIST_DIABETES_DON', mapping)

# mapping
colMap = {'DIAB': 'DiabetesType_CAN', 'DIABETES_DON':'Diabetes_DON', 'HIST_DIABETES_DON':'DiabetesHistory_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='')
df_dict = uf.updateDictionaryInformation(df_dict, [70], txt='FMTNAME: DIABTY')
df_dict = uf.updateDictionaryInformation(df_dict, [146], txt='FMTNAME: HISTDIAB').copy()

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['DiabetesType_CAN'])
df_don = uf.insertIntoDataFrame(df_don, ['Diabetes_DON', 'DiabetesType_CAN'])
df_nominal = uf.insertIntoDataFrame(df_nominal,['Diabetes_DON', 'DiabetesType_CAN'])
df_ordinal = uf.insertIntoDataFrame(df_ordinal,['DiabetesHistory_DON'])
# convert to category
df = uf.toCategory(df,  list(colMap.values())).copy()

# display
df_dict.loc[idx]

Converted Column DIABETES_DON Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column DIAB Unique Vaue(s) ['No', 'Type II', 'Type I', 'Type Unknown', 'Type Other', 'Diabetes Status Unknown', 'Missing']
Categories (7, object): ['Diabetes Status Unknown', 'Missing', 'No', 'Type I', 'Type II', 'Type Other', 'Type Unknown']
Converted Column HIST_DIABETES_DON Unique Vaue(s) ['No', 'Yes, 0-5 Years', 'Yes, >10 Years', 'Yes, 6-10 Years', 'Unknown', 'Yes, Duration Unknown', 'Missing']
Categories (7, object): ['Missing', 'No', 'Unknown', 'Yes, 0-5 Years', 'Yes, 6-10 Years', 'Yes, >10 Years', 'Yes, Duration Unknown']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
70,DiabetesType_CAN,RECIPIENT DIABETES @ REGISTRATION,TCR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,DIABTY,,DIAB,Category,FMTNAME: DIABTY
71,Diabetes_DON,"DECEASED DONOR-HISTORY OF DIABETES (Y,N)",DDR,1994-04-01,NaT,DONOR HISTORY,CHAR(1),,UNROLLED Y/N FIELD FROM HIST_DIABETES_DON,DIABETES_DON,Category,
146,DiabetesHistory_DON,"DECEASED DONOR-HISTORY OF DIABETES, INCL. DURATION OF DISEASE",DDR,1994-04-01,NaT,DONOR HISTORY,NUM,HISTDIAB,,HIST_DIABETES_DON,Category,FMTNAME: HISTDIAB


### DIAL

In [134]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DIAL_', True)

                   count unique  top   freq     mean        std  min  25%  50%  75%    max
DIAL_TY_TCR      28676.0    NaN  NaN    NaN  2.69947  40.787537  1.0  1.0  1.0  1.0  999.0
DIAL_AFTER_LIST    28328      3    N  27012      NaN        NaN  NaN  NaN  NaN  NaN    NaN
DIAL_PRIOR_TX      28311      2    N  26772      NaN        NaN  NaN  NaN  NaN  NaN    NaN

NaNs:
DIAL_TY_TCR         75
DIAL_AFTER_LIST    423
DIAL_PRIOR_TX      440
dtype: int64

Datatypes:
DIAL_TY_TCR        float64
DIAL_AFTER_LIST     object
DIAL_PRIOR_TX       object
dtype: object


            Feature                                        Description                         FormSection DataType SASAnalysisFormat                                                   Comment Information
73  DIAL_AFTER_LIST  DIALYSIS OCCURRING BETWEEN LISTING AND TRANSPLANT  PRETRANSPLANT CLINICAL INFORMATION  CHAR(1)                                                                                 Unknown
74    DIAL_PRIOR_TX           

In [135]:
findMappingDfFlat(df.DIAL_TY_TCR, df_flat, 'DIAL_TY', NaN=997)

Compare Length: 6 & 5

CODE                               LABEL
   1                         No dialysis
   2                        Hemodialysis
   3                 Peritoneal Dialysis
 998             Dialysis Status Unknown
 999 Dialysis-Unknown Type was performed


In [136]:
# update NaNs
df[['DIAL_AFTER_LIST','DIAL_PRIOR_TX']] = df[['DIAL_AFTER_LIST','DIAL_PRIOR_TX']].fillna('X') # X Missing
df.DIAL_TY_TCR = df.DIAL_TY_TCR.fillna(997).astype(int) # 997 Missing

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# mapping feature
df = uf.mappingCol(df, 'DIAL_AFTER_LIST', mapping, display=True)
df = uf.mappingCol(df, 'DIAL_PRIOR_TX', mapping, display=True)


# df_flat FMTNAME: DIAL_TY
mapping = {
    1: 'No dialysis',
    2: 'Hemodialysis',
    3: 'Peritoneal Dialysis',
    998: 'Dialysis Status Unknown',
    999: 'Dialysis - Unknown Type was performed',
    997: 'Missing'
}

# map
df = uf.mappingCol(df, 'DIAL_TY_TCR', mapping, True).copy()


# mapping
colMap = {'DIAL_AFTER_LIST': 'DialysisBetweenRegistrationTransplant_CAN', 'DIAL_PRIOR_TX':'DialysisPriorRegistration_CAN', 
          'DIAL_TY_TCR':'DialysisTypeRegistration_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='N/Y/U/X to No/Yes/Unknown/Missing')
df_dict = uf.updateDictionaryInformation(df_dict, [76], txt='FMTNAME: DIAL_TY - Type of Dialysis @ Registration.', FeatureType='Object').copy()

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['DialysisBetweenRegistrationTransplant_CAN','DialysisPriorRegistration_CAN'])
df_nominal = uf.insertIntoDataFrame(df_nominal, ['DialysisPriorRegistration_CAN','DialysisPriorRegistration_CAN', 'DialysisBetweenRegistrationTransplant_CAN'])
df_object = uf.insertIntoDataFrame(df_object,['DialysisTypeRegistration_CAN'])
# convert to category
df = uf.toCategory(df, list(colMap.values())).copy()

# display
df_dict.loc[idx]

Converted Column DIAL_AFTER_LIST Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column DIAL_PRIOR_TX Unique Vaue(s) ['No', 'Yes', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']
Converted Column DIAL_TY_TCR Unique Vaue(s) ['No dialysis', 'Hemodialysis', 'Dialysis - Unknown Type was performed', 'Dialysis Status Unknown', 'Peritoneal Dialysis', 'Missing']
Categories (6, object): ['Dialysis - Unknown Type was performed', 'Dialysis Status Unknown', 'Hemodialysis', 'Missing', 'No dialysis', 'Peritoneal Dialysis']
Already Exists: (DialysisPriorRegistration_CAN)


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
73,DialysisBetweenRegistrationTransplant_CAN,DIALYSIS OCCURRING BETWEEN LISTING AND TRANSPLANT,TRR,1994-04-01,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,DIAL_AFTER_LIST,Category,N/Y/U/X to No/Yes/Unknown/Missing
74,DialysisPriorRegistration_CAN,Calculated: Ever Dialysis Prior Tx?,CALCULATED,NaT,NaT,,CHAR(1),,,DIAL_PRIOR_TX,Category,N/Y/U/X to No/Yes/Unknown/Missing
75,DialysisTypeRegistration_CAN,PATIENT TYPE OF DIALYSIS @ REGISTRATION,TCR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,DIAL_TY,Collected for HR . But removed for HL and LU on 3/31/15.,DIAL_TY_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing


### CEREB

In [137]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CEREB', True)

            count unique top   freq
CEREB_VASC  28674      3   N  26646

NaNs:
CEREB_VASC    77
dtype: int64

Datatypes:
CEREB_VASC    object
dtype: object


       Feature                                                 Description           FormSection DataType SASAnalysisFormat Comment Information
29  CEREB_VASC  PATIENT SYMPTOMATIC CEREBROVASCULAR DISEASE @ REGISTRATION  CLINICAL INFORMATION  CHAR(1)                               Unknown


CEREB_VASC: ['N' 'Y' 'U' nan]


In [138]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)
    

# mapping
colMap = {'CEREB_VASC': 'CerebroVascularDisease_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='N/Y/U/X to No/Yes/Unknown/Missing')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal = uf.insertIntoDataFrame(df_nominal,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values())).copy()

# display
df_dict.loc[idx]

Converted Column CEREB_VASC Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
29,CerebroVascularDisease_CAN,PATIENT SYMPTOMATIC CEREBROVASCULAR DISEASE @ REGISTRATION,TCR,1994-04-01,2007-01-01,CLINICAL INFORMATION,CHAR(1),,,CEREB_VASC,Category,N/Y/U/X to No/Yes/Unknown/Missing


### MALIG

In [139]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'MALIG', True)

           count unique top   freq
MALIG_TCR  28678      3   N  26133
MALIG_TRR   8559      3   N   8475
MALIG      28678      3   N  26137

NaNs:
MALIG_TCR       73
MALIG_TRR    20192
MALIG           73
dtype: int64

Datatypes:
MALIG_TCR    object
MALIG_TRR    object
MALIG        object
dtype: object


       Feature                                                                Description                         FormSection DataType SASAnalysisFormat Comment Information
197      MALIG                                                   ANY PREVIOUS MALIGNANCY?                                      CHAR(1)                               Unknown
198  MALIG_TCR  ANY PREVIOUS MALIGNANCY (EXCLUDE NON-MELANOMA SKIN CANCER) @ REGISTRATION                CLINICAL INFORMATION  CHAR(1)                               Unknown
199  MALIG_TRR               RECIPIENT ANY KNOWN MALIGNANCIES SINCE LISTING  @ TRANSPLANT  PRETRANSPLANT CLINICAL INFORMATION  CHAR(1)                               Unknown


M

In [140]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)
    

# mapping
colMap = {'MALIG': 'Malignancy_CAN', 'MALIG_TCR': 'PreviousMalignancy_CAN', 'MALIG_TRR': 'MalignancyBetweenRegistrationTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='N/Y/U/X to No/Yes/Unknown/Missing')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display  
df_dict.loc[idx]

Converted Column MALIG_TCR Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column MALIG_TRR Unique Vaue(s) ['No', 'Unknown', 'Yes', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column MALIG Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
197,Malignancy_CAN,ANY PREVIOUS MALIGNANCY?,CALCULATED,NaT,NaT,,CHAR(1),,,MALIG,Category,N/Y/U/X to No/Yes/Unknown/Missing
198,PreviousMalignancy_CAN,ANY PREVIOUS MALIGNANCY (EXCLUDE NON-MELANOMA SKIN CANCER) @ REGISTRATION,TCR,1994-04-01,NaT,CLINICAL INFORMATION,CHAR(1),,,MALIG_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing
199,MalignancyBetweenRegistrationTransplant_CAN,RECIPIENT ANY KNOWN MALIGNANCIES SINCE LISTING @ TRANSPLANT,TRR,1999-10-25,2015-03-31,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,MALIG_TRR,Category,N/Y/U/X to No/Yes/Unknown/Missing


### CANCER

#### Cancer Type

In [141]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CANCER', True)

                           count unique  top   freq      mean       std  min  25%  50%  75%    max
EXTRACRANIAL_CANCER_DON    28544      3    N  28326       NaN       NaN  NaN  NaN  NaN  NaN    NaN
CANCER_SITE_DON          28738.0    NaN  NaN    NaN  9.960714  93.52717  1.0  1.0  1.0  1.0  999.0
INTRACRANIAL_CANCER_DON    28544      3    N  28302       NaN       NaN  NaN  NaN  NaN  NaN    NaN
HIST_CANCER_DON            28738      3    N  28182       NaN       NaN  NaN  NaN  NaN  NaN    NaN
SKIN_CANCER_DON            28544      3    N  28352       NaN       NaN  NaN  NaN  NaN  NaN    NaN

NaNs:
EXTRACRANIAL_CANCER_DON    207
CANCER_SITE_DON             13
INTRACRANIAL_CANCER_DON    207
HIST_CANCER_DON             13
SKIN_CANCER_DON            207
dtype: int64

Datatypes:
EXTRACRANIAL_CANCER_DON     object
CANCER_SITE_DON            float64
INTRACRANIAL_CANCER_DON     object
HIST_CANCER_DON             object
SKIN_CANCER_DON             object
dtype: object


                     Feature

In [142]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)
    

# mapping
colMap = {'EXTRACRANIAL_CANCER_DON': 'CancerExtraCranial_DON', 'HIST_CANCER_DON':'CancerHistory_DON',
          'INTRACRANIAL_CANCER_DON':'CancerIntraCranial_DON', 'SKIN_CANCER_DON':'CancerSkin_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='N/Y/U/X to No/Yes/Unknown/Missing')

# update dataframe
df_don = uf.insertIntoDataFrame(df_don,list(colMap.values()))
df_nominal = uf.insertIntoDataFrame(df_nominal,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values())).copy()

# display
df_dict.loc[idx]

Converted Column EXTRACRANIAL_CANCER_DON Unique Vaue(s) ['No', 'Missing', 'Unknown', 'Yes']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column CANCER_SITE_DON Unique Vaue(s) [1.0, 2.0, 14.0, 19.0, 998.0, ..., 6.0, 26.0, 15.0, 16.0, 17.0]
Length: 27
Categories (27, object): [1.0, 2.0, 3.0, 4.0, ..., 35.0, 998.0, 999.0, 'Missing']
Converted Column INTRACRANIAL_CANCER_DON Unique Vaue(s) ['No', 'Unknown', 'Yes', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column HIST_CANCER_DON Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column SKIN_CANCER_DON Unique Vaue(s) ['No', 'Yes', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
26,CANCER_SITE_DON,DECEASED DONOR-CANCER SITE,DDR,1994-04-01,NaT,DONOR HISTORY,NUM,HISTCAN,,CANCER_SITE_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
107,CancerExtraCranial_DON,DECEASED DONOR-EXTRACANIAL CANCER AT PROCUREMENT,DDR,1994-04-01,NaT,DONOR HISTORY,CHAR(1),,,EXTRACRANIAL_CANCER_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
143,CancerHistory_DON,DECEASED DONOR-HISTORY OF CANCER (Y/N),DDR,1994-04-01,NaT,DONOR HISTORY,CHAR(1),,,HIST_CANCER_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
186,CancerIntraCranial_DON,DECEASED DONOR-INTRACANIAL CANCER AT PROCUREMENT,DDR,1994-04-01,NaT,DONOR HISTORY,CHAR(1),,,INTRACRANIAL_CANCER_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
262,CancerSkin_DON,DECEASED DONOR-SKIN CANCER AT PROCUREMENT (Y/N),DDR,1994-04-01,NaT,DONOR HISTORY,CHAR(1),,,SKIN_CANCER_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing


#### Cancer Site

In [143]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CANCER_SITE_DON', True)

                   count  unique  top     freq
CANCER_SITE_DON  28751.0    27.0  1.0  28182.0

NaNs:
CANCER_SITE_DON    0
dtype: int64

Datatypes:
CANCER_SITE_DON    category
dtype: object


            Feature                 Description    FormSection DataType SASAnalysisFormat Comment                        Information
26  CANCER_SITE_DON  DECEASED DONOR-CANCER SITE  DONOR HISTORY      NUM           HISTCAN          N/Y/U/X to No/Yes/Unknown/Missing


CANCER_SITE_DON: [1.0, 2.0, 14.0, 19.0, 998.0, ..., 6.0, 26.0, 15.0, 16.0, 17.0]
Length: 27
Categories (27, object): [1.0, 2.0, 3.0, 4.0, ..., 35.0, 998.0, 999.0, 'Missing']


In [144]:
# df_flat FMTNAME: HISTCAN
mapping = {
    1: "NO",
    2: "SKIN - SQUAMOUS, BASAL CELL",
    3: "SKIN - MELANOMA",
    4: "CNS TUMOR - ASTROCYTOMA",
    5: "CNS TUMOR - GLIOBLASTOMA MULTIFORME",
    6: "CNS TUMOR - MEDULLOBLASTOMA",
    7: "CNS TUMOR - NEUROBLASTOMA",
    8: "CNS TUMOR - ANGIOBLASTOMA",
    9: "CNS TUMOR - MENINGIOMA",
    12: "CNS TUMOR - OTHER",
    13: "GENITOURINARY - BLADDER",
    14: "GENITOURINARY - UTERINE CERVIX",
    15: "GENITOURINARY - UTERINE BODY ENDOMETRIAL",
    16: "GENITOURINARY - UTERINE BODY CHORIOCARCINOMA",
    17: "GENITOURINARY - VULVA",
    18: "GENITOURINARY - OVARIAN",
    19: "GENITOURINARY - PENIS, TESTICULAR",
    20: "GENITOURINARY - PROSTATE",
    21: "GENITOURINARY - KIDNEY",
    22: "GENITOURINARY - UNKNOWN",
    23: "GASTROINTESTINAL - ESOPHAGEAL",
    24: "GASTROINTESTINAL - STOMACH",
    25: "GASTROINTESTINAL - SMALL INTESTINE",
    26: "GASTROINTESTINAL - COLO-RECTAL",
    27: "GASTROINTESTINAL - LIVER & BILIARY TRACT",
    28: "GASTROINTESTINAL - PANCREAS",
    29: "BREAST",
    30: "THYROID",
    32: "TONGUE/THROAT",
    33: "LARYNX",
    34: "LUNG (include bronchial)",
    35: "LEUKEMIA/LYMPHOMA",
    998: "UNKNOWN",
    999: "OTHER, SPECIFY",
    1000: "Missing"
}


# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'CANCER_SITE_DON':'CancerSite_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Object', txt=f"FMTNAME: HISTCAN")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_object  = uf.insertIntoDataFrame(df_object, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values())).copy()

# display
df_dict.loc[idx]

Converted Column CANCER_SITE_DON Unique Vaue(s) ['NO', 'SKIN - SQUAMOUS, BASAL CELL', 'GENITOURINARY - UTERINE CERVIX', 'GENITOURINARY - PENIS, TESTICULAR', 'UNKNOWN', ..., 'CNS TUMOR - MEDULLOBLASTOMA', 'GASTROINTESTINAL - COLO-RECTAL', 'GENITOURINARY - UTERINE BODY ENDOMETRIAL', 'GENITOURINARY - UTERINE BODY CHORIOCARCINOMA', 'GENITOURINARY - VULVA']
Length: 27
Categories (27, object): ['BREAST', 'CNS TUMOR - ASTROCYTOMA', 'CNS TUMOR - GLIOBLASTOMA MULTIFORME', 'CNS TUMOR - MEDULLOBLASTOMA', ..., 'SKIN - MELANOMA', 'SKIN - SQUAMOUS, BASAL CELL', 'THYROID', 'UNKNOWN']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
26,CancerSite_DON,DECEASED DONOR-CANCER SITE,DDR,1994-04-01,NaT,DONOR HISTORY,NUM,HISTCAN,,CANCER_SITE_DON,Object,FMTNAME: HISTCAN


### CREATININE

In [145]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CREAT', False)

                   count      mean       std   min   25%  50%   75%   max
MOST_RCNT_CREAT  28604.0  1.363390  0.959501  0.08  0.94  1.2  1.50  24.0
CREAT_TRR        28330.0  1.372100  1.001949  0.06  0.94  1.2  1.50  37.0
CREAT_DON        28546.0  1.496195  1.543989  0.04  0.76  1.0  1.48  23.0

NaNs:
MOST_RCNT_CREAT    147
CREAT_TRR          421
CREAT_DON          205
dtype: int64

Datatypes:
MOST_RCNT_CREAT    float64
CREAT_TRR          float64
CREAT_DON          float64
dtype: object


             Feature                                         Description                         FormSection DataType SASAnalysisFormat                                                                 Comment Information
45         CREAT_DON              DECEASED DONOR-TERMINAL LAB CREATININE                CLINICAL INFORMATION      NUM                                                                                               Unknown
46         CREAT_TRR            RECIPIENT SERUM CREATININE AT TIME

In [146]:
# mapping
colMap = {'CREAT_DON': 'Creatinine_DON', 'CREAT_TRR':'CreatinineTransplant_CAN', 'MOST_RCNT_CREAT':'CreatinineRegistration_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt='')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['CreatinineTransplant_CAN','CreatinineRegistration_CAN'])
df_don = uf.insertIntoDataFrame(df_don, ['Creatinine_DON'])
df_numeric = uf.insertIntoDataFrame(df_numeric,list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
45,Creatinine_DON,DECEASED DONOR-TERMINAL LAB CREATININE,DDR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,,CREAT_DON,Numeric,
46,CreatinineTransplant_CAN,RECIPIENT SERUM CREATININE AT TIME OF TX,TRR,1994-04-01,NaT,PRETRANSPLANT CLINICAL INFORMATION,NUM,,,CREAT_TRR,Numeric,
201,CreatinineRegistration_CAN,PATIENT MOST RECENT ABSOLUTE CREATININE AT LISTING,TCR,1999-10-25,2007-01-01,CLINICAL INFORMATION,NUM,,Collection ended 1/1/07 for Lung (see INIT_CREAT & END_CREAT instead),MOST_RCNT_CREAT,Numeric,


### SERUM

In [147]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'SERUM', False)

                  count      mean       std  min  25%  50%  75%  max
TOT_SERUM_ALBUM  8413.0  3.719684  0.659506  0.5  3.3  3.8  4.2  8.1

NaNs:
TOT_SERUM_ALBUM    20338
dtype: int64

Datatypes:
TOT_SERUM_ALBUM    float64
dtype: object


             Feature                                                           Description           FormSection DataType SASAnalysisFormat Comment Information
273  TOT_SERUM_ALBUM  PATIENT TOTAL SERUM ALBUMIN  @ REGISTRATION (pre 1/1/2007 for adult)  CLINICAL INFORMATION      NUM                               Unknown




In [148]:
# mapping
colMap = {'TOT_SERUM_ALBUM': 'TotalSerumAlbuminRegistration_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt='')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_numeric = uf.insertIntoDataFrame(df_numeric,list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
273,TotalSerumAlbuminRegistration_CAN,PATIENT TOTAL SERUM ALBUMIN @ REGISTRATION (pre 1/1/2007 for adult),TCR,1999-10-25,NaT,CLINICAL INFORMATION,NUM,,,TOT_SERUM_ALBUM,Numeric,


### DEFIBRILLATOR

In [149]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DEFIBRIL', True)

               count unique top   freq
IMPL_DEFIBRIL  28676      3   Y  21153

NaNs:
IMPL_DEFIBRIL    75
dtype: int64

Datatypes:
IMPL_DEFIBRIL    object
dtype: object


           Feature                                     Description           FormSection DataType SASAnalysisFormat Comment Information
157  IMPL_DEFIBRIL  IMPLANTABLE DEFIBRILLATOR Y/N/U @ REGISTRATION  CLINICAL INFORMATION  CHAR(1)                               Unknown


IMPL_DEFIBRIL: ['Y' 'N' 'U' nan]


In [150]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'IMPL_DEFIBRIL': 'DefibrillatorImplantRegistration_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='N/Y/U/X to No/Yes/Unknown/Missing')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal = uf.insertIntoDataFrame(df_nominal,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column IMPL_DEFIBRIL Unique Vaue(s) ['Yes', 'No', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
157,DefibrillatorImplantRegistration_CAN,IMPLANTABLE DEFIBRILLATOR Y/N/U @ REGISTRATION,TCR,1994-04-01,NaT,CLINICAL INFORMATION,CHAR(1),,,IMPL_DEFIBRIL,Category,N/Y/U/X to No/Yes/Unknown/Missing


### HEMODYNAMICS

In [151]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'HEMO', False)

                   count       mean        std   min    25%    50%    75%    max
HEMO_SYS_TCR     27713.0  42.056400  14.254475  0.00  31.00  41.00  52.00  130.0
HEMO_PA_DIA_TCR  27692.0  20.529113   8.688313  0.00  14.00  20.00  26.00   91.0
HEMO_PA_MN_TCR   27426.0  28.657685  10.213014  0.00  21.00  28.00  36.00   96.0
HEMO_PCW_TCR     26023.0  19.142739   8.802624  0.00  12.00  19.00  25.00   50.0
HEMO_CO_TCR      27092.0   4.266872   1.345663  0.39   3.33   4.11   5.00   15.0
HEMO_CO_TRR      27142.0   4.537917   1.457784  0.20   3.51   4.40   5.36   15.0
HEMO_PA_DIA_TRR  27582.0  19.144776   8.476800  0.00  13.00  18.00  25.00  110.0
HEMO_PA_MN_TRR   27320.0  27.004952   9.922125  0.00  20.00  26.00  34.00  110.0
HEMO_PCW_TRR     26326.0  17.764157   8.676076  0.00  11.00  17.00  24.00   50.0
HEMO_SYS_TRR     27602.0  39.733708  13.817825  0.00  30.00  38.00  49.00  159.0

NaNs:
HEMO_SYS_TCR       1038
HEMO_PA_DIA_TCR    1059
HEMO_PA_MN_TCR     1325
HEMO_PCW_TCR       2728
HEMO_C

In [152]:
# mapping
colMap = {'HEMO_CO_TCR': 'HemodynamicsRegistration_CO_CAN', 'HEMO_CO_TRR':'HemodynamicsTransplant_CO_CAN', 
          'HEMO_PA_DIA_TCR':'HemodynamicsRegistration_PA_DIA_CAN', 'HEMO_PA_DIA_TRR':'HemodynamicsTransplant_PA_DIA_CAN',
          'HEMO_PA_MN_TCR':'HemodynamicsRegistration_PA_MN_CAN','HEMO_PA_MN_TRR':'HemodynamicsTransplant_PA_MN_CAN',
          'HEMO_PCW_TCR':'HemodynamicsRegistration_PCW_CAN','HEMO_PCW_TRR':'HemodynamicsTransplant_PCW_CAN',
          'HEMO_SYS_TCR':'HemodynamicsRegistration_SYS_CAN','HEMO_SYS_TRR':'HemodynamicsTransplant_SYS_CAN' 
         }

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt='')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_numeric = uf.insertIntoDataFrame(df_numeric,list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
128,HemodynamicsRegistration_CO_CAN,MOST RECENT HEMODYNAMICS CO L/MIN @ REGISTRATION,TCR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,BOTH BEST AND BASELINE COLLECTED BETWEEN 04/01/1994 AND 10/25/1999. AFTER 10/25/1999 ONE VALUE COLLECTED.,HEMO_CO_TCR,Numeric,
129,HemodynamicsTransplant_CO_CAN,MOST RECENT HEMODYNAMICS CO L/MIN @ TRANSPLANT,TRR,1994-04-01,NaT,PRETRANSPLANT CLINICAL INFORMATION,NUM,,BOTH BEST AND BASELINE COLLECTED BETWEEN 04/01/1994 AND 10/25/1999. AFTER 10/25/1999 ONE VALUE COLLECTED.,HEMO_CO_TRR,Numeric,
130,HemodynamicsRegistration_PA_DIA_CAN,MOST RECENT HEMODYNAMICS PA (DIA) MM/HG @ REGISTRATION,TCR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,BOTH BEST AND BASELINE COLLECTED BETWEEN 04/01/1994 AND 10/25/1999. AFTER 10/25/1999 ONE VALUE COLLECTED.,HEMO_PA_DIA_TCR,Numeric,
131,HemodynamicsTransplant_PA_DIA_CAN,MOST RECENT HEMODYNAMICS PA (DIA) MM/HG @ TRANSPLANT,TRR,1994-04-01,NaT,PRETRANSPLANT CLINICAL INFORMATION,NUM,,BOTH BEST AND BASELINE COLLECTED BETWEEN 04/01/1994 AND 10/25/1999. AFTER 10/25/1999 ONE VALUE COLLECTED.,HEMO_PA_DIA_TRR,Numeric,
132,HemodynamicsRegistration_PA_MN_CAN,MOST RECENT HEMODYNAMICS PA (MEAN) MM/HG @ REGISTRATION,TCR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,BOTH BEST AND BASELINE COLLECTED BETWEEN 04/01/1994 AND 10/25/1999. AFTER 10/25/1999 ONE VALUE COLLECTED.,HEMO_PA_MN_TCR,Numeric,
133,HemodynamicsTransplant_PA_MN_CAN,MOST RECENT HEMODYNAMICS PA (MEAN) MM/HG @ TRANSPLANT,TRR,1994-04-01,NaT,PRETRANSPLANT CLINICAL INFORMATION,NUM,,BOTH BEST AND BASELINE COLLECTED BETWEEN 04/01/1994 AND 10/25/1999. AFTER 10/25/1999 ONE VALUE COLLECTED.,HEMO_PA_MN_TRR,Numeric,
134,HemodynamicsRegistration_PCW_CAN,MOST RECENT HEMODYNAMICS PCW (MEAN) MM/HG @ REGISTRATION,TCR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,BOTH BEST AND BASELINE COLLECTED BETWEEN 04/01/1994 AND 10/25/1999. AFTER 10/25/1999 ONE VALUE COLLECTED.,HEMO_PCW_TCR,Numeric,
135,HemodynamicsTransplant_PCW_CAN,MOST RECENT HEMODYNAMICS PCW (MEAN) MM/HG @ TRANSPLANT,TRR,1994-04-01,NaT,PRETRANSPLANT CLINICAL INFORMATION,NUM,,BOTH BEST AND BASELINE COLLECTED BETWEEN 04/01/1994 AND 10/25/1999. AFTER 10/25/1999 ONE VALUE COLLECTED.,HEMO_PCW_TRR,Numeric,
136,HemodynamicsRegistration_SYS_CAN,MOST RECENT HEMODYNAMICS PA (SYS) MM/HG @ REGISTRATION,TCR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,BOTH BEST AND BASELINE COLLECTED BETWEEN 04/01/1994 AND 10/25/1999. AFTER 10/25/1999 ONE VALUE COLLECTED.,HEMO_SYS_TCR,Numeric,
137,HemodynamicsTransplant_SYS_CAN,MOST RECENT HEMODYNAMICS PA (SYS) MM/HG @ TRANSPLANT,TRR,1994-04-01,NaT,PRETRANSPLANT CLINICAL INFORMATION,NUM,,BOTH BEST AND BASELINE COLLECTED BETWEEN 04/01/1994 AND 10/25/1999. AFTER 10/25/1999 ONE VALUE COLLECTED.,HEMO_SYS_TRR,Numeric,


### INOTROP

In [153]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'INOTROP', True)

                      count unique top   freq
INOTROP_VASO_SYS_TCR  27711      2   N  16259
INOTROP_VASO_DIA_TCR  27690      2   N  16250
INOTROP_VASO_MN_TCR   27424      2   N  16128
INOTROP_VASO_PCW_TCR  26020      2   N  16005
INOTROP_VASO_CO_TCR   27089      2   N  15998
INOTROP_VASO_CO_TRR   27135      2   N  15537
INOTROP_VASO_DIA_TRR  27577      2   N  15700
INOTROP_VASO_MN_TRR   27314      2   N  15572
INOTROP_VASO_PCW_TRR  26321      2   N  15525
INOTROP_VASO_SYS_TRR  27596      2   N  15714
INOTROP_SUPPORT_DON   28546      3   N  17310

NaNs:
INOTROP_VASO_SYS_TCR    1040
INOTROP_VASO_DIA_TCR    1061
INOTROP_VASO_MN_TCR     1327
INOTROP_VASO_PCW_TCR    2731
INOTROP_VASO_CO_TCR     1662
INOTROP_VASO_CO_TRR     1616
INOTROP_VASO_DIA_TRR    1174
INOTROP_VASO_MN_TRR     1437
INOTROP_VASO_PCW_TRR    2430
INOTROP_VASO_SYS_TRR    1155
INOTROP_SUPPORT_DON      205
dtype: int64

Datatypes:
INOTROP_VASO_SYS_TCR    object
INOTROP_VASO_DIA_TCR    object
INOTROP_VASO_MN_TCR     object
INOT

In [154]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'INOTROP_VASO_CO_TCR': 'IntropesVasodilatorsRegistration_CO_CAN', 'INOTROP_VASO_CO_TRR':'IntropesVasodilatorsTransplant_CO_CAN', 
          'INOTROP_VASO_DIA_TCR':'IntropesVasodilatorsRegistration_DIA_CAN', 'INOTROP_VASO_DIA_TRR':'IntropesVasodilatorsTransplant_DIA_CAN',
          'INOTROP_VASO_MN_TCR':'IntropesVasodilatorsRegistration_MN_CAN','INOTROP_VASO_MN_TRR':'IntropesVasodilatorsTransplant_MN_CAN',
          'INOTROP_VASO_PCW_TCR':'IntropesVasodilatorsRegistration_PCW_CAN','INOTROP_VASO_PCW_TRR':'IntropesVasodilatorsTransplant_PCW_CAN',
          'INOTROP_VASO_SYS_TCR':'IntropesVasodilatorsRegistration_SYS_CAN','INOTROP_VASO_SYS_TRR':'IntropesVasodilatorsTransplant_SYS_CAN' 
         }


# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='N/Y/U/X to No/Yes/Unknown/Missing')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal = uf.insertIntoDataFrame(df_nominal,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# donor
colMap = {'INOTROP_SUPPORT_DON':'IntropicMedicationProcurement_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='N/Y/U/X to No/Yes/Unknown/Missing')

# update dataframe
df_don = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal = uf.insertIntoDataFrame(df_nominal,list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column INOTROP_VASO_SYS_TCR Unique Vaue(s) ['Yes', 'No', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']
Converted Column INOTROP_VASO_DIA_TCR Unique Vaue(s) ['Yes', 'No', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']
Converted Column INOTROP_VASO_MN_TCR Unique Vaue(s) ['Yes', 'No', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']
Converted Column INOTROP_VASO_PCW_TCR Unique Vaue(s) ['Yes', 'No', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']
Converted Column INOTROP_VASO_CO_TCR Unique Vaue(s) ['Yes', 'No', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']
Converted Column INOTROP_VASO_CO_TRR Unique Vaue(s) ['Missing', 'No', 'Yes']
Categories (3, object): ['Missing', 'No', 'Yes']
Converted Column INOTROP_VASO_DIA_TRR Unique Vaue(s) ['Missing', 'No', 'Yes']
Categories (3, object): ['Missing', 'No', 'Yes']
Converted Column INOTROP_VASO_MN_TRR Unique Vaue(s) ['Missing', 'No', 'Yes']
Categories (3, object): ['Missing', '

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
172,IntropicMedicationProcurement_DON,DECEASED DONOR INOTROPIC MEDICATION AT PROCUREMENT (Y/N),DDR,2003-01-27,NaT,CLINICAL INFORMATION,CHAR(1),,"For Heart, this field was collected since 10/25/1999.",INOTROP_SUPPORT_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
173,IntropesVasodilatorsRegistration_CO_CAN,MOST RECENT CO L/MIN INOTROPES/VASODILATORS YES/NO AT LISTING,TCR,2004-06-30,NaT,HEART/LUNG MEDICAL FACTORS,CHAR(1),,,INOTROP_VASO_CO_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing
174,IntropesVasodilatorsTransplant_CO_CAN,TRR CARDIAC OUTPUT MEASUREMENT OBTAINED WHILE ON INOTROPES OR VASODILATERS Y/N,TRR,1999-10-25,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,INOTROP_VASO_CO_TRR,Category,N/Y/U/X to No/Yes/Unknown/Missing
175,IntropesVasodilatorsRegistration_DIA_CAN,MOST RECENT PA (DIA) MM/HG INOTROPES/VAOSDILATORS YES/NO AT LISTING,TCR,2004-06-30,NaT,HEART/LUNG MEDICAL FACTORS,CHAR(1),,,INOTROP_VASO_DIA_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing
176,IntropesVasodilatorsTransplant_DIA_CAN,TRR DIASTOLIC MEASUREMENT OBTAINED WHILE ON INOTROPES OR VASODILATERS Y/N,TRR,1999-10-26,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,INOTROP_VASO_DIA_TRR,Category,N/Y/U/X to No/Yes/Unknown/Missing
177,IntropesVasodilatorsRegistration_MN_CAN,MOST RECENT PA (MEAN) MM/HG INOTROPES/VASODILATORS YES/NO AT LISTING,TCR,2004-06-30,NaT,HEART/LUNG MEDICAL FACTORS,CHAR(1),,,INOTROP_VASO_MN_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing
178,IntropesVasodilatorsTransplant_MN_CAN,TRR MEAN PULMONARY ARTERY MEASUREMENT OBTAINED WHILE ON INOTROPES OR VASODILATERS Y/N,TRR,1999-10-27,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,INOTROP_VASO_MN_TRR,Category,N/Y/U/X to No/Yes/Unknown/Missing
179,IntropesVasodilatorsRegistration_PCW_CAN,MOST RECENT PCW (MEAN) MM/HG INOTROPES/VASODILATORS YES/NO AT LISTING,TCR,2004-06-30,NaT,HEART/LUNG MEDICAL FACTORS,CHAR(1),,,INOTROP_VASO_PCW_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing
180,IntropesVasodilatorsTransplant_PCW_CAN,TRR MEAN PULMONARY CAPILLARY WEDGE MEASUREMENT OBTAINED WHILE ON INOTROPES OR VASODILATERS Y/N,TRR,1999-10-28,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,INOTROP_VASO_PCW_TRR,Category,N/Y/U/X to No/Yes/Unknown/Missing
181,IntropesVasodilatorsRegistration_SYS_CAN,Most Recent PA (sys) mm/Hg Inotropes/Vasodilators YES/NO AT LISTING,TCR,1999-10-25,NaT,HEART/LUNG MEDICAL FACTORS,CHAR(1),,,INOTROP_VASO_SYS_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing


In [155]:
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
172,IntropicMedicationProcurement_DON,DECEASED DONOR INOTROPIC MEDICATION AT PROCUREMENT (Y/N),DDR,2003-01-27,NaT,CLINICAL INFORMATION,CHAR(1),,"For Heart, this field was collected since 10/25/1999.",INOTROP_SUPPORT_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
173,IntropesVasodilatorsRegistration_CO_CAN,MOST RECENT CO L/MIN INOTROPES/VASODILATORS YES/NO AT LISTING,TCR,2004-06-30,NaT,HEART/LUNG MEDICAL FACTORS,CHAR(1),,,INOTROP_VASO_CO_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing
174,IntropesVasodilatorsTransplant_CO_CAN,TRR CARDIAC OUTPUT MEASUREMENT OBTAINED WHILE ON INOTROPES OR VASODILATERS Y/N,TRR,1999-10-25,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,INOTROP_VASO_CO_TRR,Category,N/Y/U/X to No/Yes/Unknown/Missing
175,IntropesVasodilatorsRegistration_DIA_CAN,MOST RECENT PA (DIA) MM/HG INOTROPES/VAOSDILATORS YES/NO AT LISTING,TCR,2004-06-30,NaT,HEART/LUNG MEDICAL FACTORS,CHAR(1),,,INOTROP_VASO_DIA_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing
176,IntropesVasodilatorsTransplant_DIA_CAN,TRR DIASTOLIC MEASUREMENT OBTAINED WHILE ON INOTROPES OR VASODILATERS Y/N,TRR,1999-10-26,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,INOTROP_VASO_DIA_TRR,Category,N/Y/U/X to No/Yes/Unknown/Missing
177,IntropesVasodilatorsRegistration_MN_CAN,MOST RECENT PA (MEAN) MM/HG INOTROPES/VASODILATORS YES/NO AT LISTING,TCR,2004-06-30,NaT,HEART/LUNG MEDICAL FACTORS,CHAR(1),,,INOTROP_VASO_MN_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing
178,IntropesVasodilatorsTransplant_MN_CAN,TRR MEAN PULMONARY ARTERY MEASUREMENT OBTAINED WHILE ON INOTROPES OR VASODILATERS Y/N,TRR,1999-10-27,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,INOTROP_VASO_MN_TRR,Category,N/Y/U/X to No/Yes/Unknown/Missing
179,IntropesVasodilatorsRegistration_PCW_CAN,MOST RECENT PCW (MEAN) MM/HG INOTROPES/VASODILATORS YES/NO AT LISTING,TCR,2004-06-30,NaT,HEART/LUNG MEDICAL FACTORS,CHAR(1),,,INOTROP_VASO_PCW_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing
180,IntropesVasodilatorsTransplant_PCW_CAN,TRR MEAN PULMONARY CAPILLARY WEDGE MEASUREMENT OBTAINED WHILE ON INOTROPES OR VASODILATERS Y/N,TRR,1999-10-28,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,INOTROP_VASO_PCW_TRR,Category,N/Y/U/X to No/Yes/Unknown/Missing
181,IntropesVasodilatorsRegistration_SYS_CAN,Most Recent PA (sys) mm/Hg Inotropes/Vasodilators YES/NO AT LISTING,TCR,1999-10-25,NaT,HEART/LUNG MEDICAL FACTORS,CHAR(1),,,INOTROP_VASO_SYS_TCR,Category,N/Y/U/X to No/Yes/Unknown/Missing


### CIGARETTES

In [156]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CIG|ABSTAIN', True)

                   count unique  top   freq       mean         std  min  25%  50%  75%    max
CIG_USE            28675      2    N  16093        NaN         NaN  NaN  NaN  NaN  NaN    NaN
TCR_DUR_ABSTAIN  12582.0    NaN  NaN    NaN  41.125735  184.942702  1.0  3.0  7.0  7.0  998.0
HIST_CIG_DON       28746      3    N  24978        NaN         NaN  NaN  NaN  NaN  NaN    NaN

NaNs:
CIG_USE               76
TCR_DUR_ABSTAIN    16169
HIST_CIG_DON           5
dtype: int64

Datatypes:
CIG_USE             object
TCR_DUR_ABSTAIN    float64
HIST_CIG_DON        object
dtype: object


             Feature                                                 Description            FormSection DataType SASAnalysisFormat Comment Information
31           CIG_USE                                    HISTORY OF CIGARETTE USE   CLINICAL INFORMATION  CHAR(1)                               Unknown
144     HIST_CIG_DON  DECEASED DONOR-HISTORY OF CIGARETTES IN PAST @ >20PACK YRS          DONOR HISTORY  CHAR(1)      

In [157]:
findMappingDfFlat(df.TCR_DUR_ABSTAIN, df_flat, 'CIGDUR', NaN=999)

Compare Length: 10 & 8

CODE            LABEL
   1       0-2 Months
   2      3-12 Months
   3     13-24 Months
   4     25-36 Months
   5     37-48 Months
   6     49-60 Months
   7       >60 Months
 998 Unknown Duration


In [158]:
# fill NaN with X or 999: Missing
df[['CIG_USE','HIST_CIG_DON']] = df[['CIG_USE','HIST_CIG_DON']].fillna('X')
df['TCR_DUR_ABSTAIN'] = df['TCR_DUR_ABSTAIN'].fillna(999).astype(int)

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# mapping feature
df = uf.mappingCol(df, 'CIG_USE', mapping, display=True)
df = uf.mappingCol(df, 'HIST_CIG_DON', mapping, display=True)


# df_flat FMTNAME: CIGDURAB
mapping = {
    1: '0-2 months',
    2: '3-12 months',
    3: '13-24 months',
    4: '25-36 months',
    5: '37-48 months',
    6: '49-60 months',
    7: '>60 months',
    8: 'Continues to smoke',
    998: 'Unknown duration',
    999: "Missing"
}

# map
df = uf.mappingCol(df, 'TCR_DUR_ABSTAIN', mapping, False)


# mapping
colMap = {'CIG_USE': 'CigaretteUse_CAN', 'HIST_CIG_DON':'CigaretteHistory_DON', 'TCR_DUR_ABSTAIN': 'CigaretteAbstinence_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='N/Y/U/X to No/Yes/Unknown/Missing')
df_dict = uf.updateDictionaryInformation(df_dict, [297], txt='FMTNAME: CIGDURAB')

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['CigaretteUse_CAN', 'CigaretteAbstinence_CAN'])
df_don = uf.insertIntoDataFrame(df_don, ['CigaretteHistory_DON'])
df_nominal = uf.insertIntoDataFrame(df_nominal, ['CigaretteUse_CAN','CigaretteHistory_DON'])
df_ordinal = uf.insertIntoDataFrame(df_ordinal, ['CigaretteAbstinence_CAN'])
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column CIG_USE Unique Vaue(s) ['No', 'Yes', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']
Converted Column HIST_CIG_DON Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
31,CigaretteUse_CAN,HISTORY OF CIGARETTE USE,TCR,2004-06-30,NaT,CLINICAL INFORMATION,CHAR(1),,,CIG_USE,Category,N/Y/U/X to No/Yes/Unknown/Missing
144,CigaretteHistory_DON,DECEASED DONOR-HISTORY OF CIGARETTES IN PAST @ >20PACK YRS,DDR,1994-04-01,NaT,DONOR HISTORY,CHAR(1),,,HIST_CIG_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
271,CigaretteAbstinence_CAN,DURATION OF ABSTINENCE FOR CIGARETTE USE,TCR,2004-06-30,NaT,CANDIDATE INFORMATION,NUM,CIGDURAB,,TCR_DUR_ABSTAIN,Category,N/Y/U/X to No/Yes/Unknown/Missing


### PRIOR_CARD

In [159]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PRIOR_CARD', True)

                                  count unique   top   freq       mean       std  min  25%   50%   75%   max
PRIOR_CARD_SURG_TCR               28550      3     N  16845        NaN       NaN  NaN  NaN   NaN   NaN   NaN
PRIOR_CARD_SURG_TYPE_TCR        11261.0    NaN   NaN    NaN   9.620904  7.260373  1.0  2.0  16.0  16.0  26.0
PRIOR_CARD_SURG_TYPE_OSTXT_TCR     5999   1697  LVAD   1894        NaN       NaN  NaN  NaN   NaN   NaN   NaN
PRIOR_CARD_SURG_TYPE_TRR         6566.0    NaN   NaN    NaN  12.427201  6.332771  1.0  4.0  16.0  16.0  28.0
PRIOR_CARD_SURG_TRR               28328      3     N  21663        NaN       NaN  NaN  NaN   NaN   NaN   NaN

NaNs:
PRIOR_CARD_SURG_TCR                 201
PRIOR_CARD_SURG_TYPE_TCR          17490
PRIOR_CARD_SURG_TYPE_OSTXT_TCR    22752
PRIOR_CARD_SURG_TYPE_TRR          22185
PRIOR_CARD_SURG_TRR                 423
dtype: int64

Datatypes:
PRIOR_CARD_SURG_TCR                object
PRIOR_CARD_SURG_TYPE_TCR          float64
PRIOR_CARD_SURG_TYPE_OSTXT_TCR

In [160]:
# get unique values
print(uf.combineGetUnique(df, 'PRIOR_CARD_SURG_TYPE_TCR', 'PRIOR_CARD_SURG_TYPE_TRR', 302, False))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 28, 302]


In [161]:
# check for differences between two sets
uf.symmetricDifference(set(df.PRIOR_CARD_SURG_TYPE_TCR.dropna().unique().astype(int)), set(df.PRIOR_CARD_SURG_TYPE_TRR.dropna().unique().astype(int)))

Symmetric difference: [11, 12, 14, 25, 28]


In [162]:
# fill NaN with X or 302: Missing
df[['PRIOR_CARD_SURG_TCR','PRIOR_CARD_SURG_TRR']] = df[['PRIOR_CARD_SURG_TCR','PRIOR_CARD_SURG_TRR']].fillna('X')
df[['PRIOR_CARD_SURG_TYPE_TCR', 'PRIOR_CARD_SURG_TYPE_TRR']] = df[['PRIOR_CARD_SURG_TYPE_TCR', 'PRIOR_CARD_SURG_TYPE_TRR']].fillna(302).astype(int)

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# mapping feature
df = uf.mappingCol(df, 'PRIOR_CARD_SURG_TCR', mapping, display=True)
df = uf.mappingCol(df, 'PRIOR_CARD_SURG_TRR', mapping, display=True)


# df_flat FMTNAME: CARDSURG
mapping = {
    302: "Missing",
    303: "Not Reported",
    1: "CABG",
    2: "Valve Replace/Repair",
    3: "CABG; Valve Replace/Repair",
    4: "Congenital",
    5: "CABG; Congenital",
    6: "Valve Replace/Repair; Congenital",
    7: "CABG; Valve Replace/Repair; Congenital",
    8: "Left Vent. Remodeling",
    9: "CABG; Left Vent. Remodeling",
    10: "Valve Replace/Repair; Left Vent. Remodeling",
    11: "CABG; Valve Replace/Repair; Left Vent. Remodeling",
    12: "Congenital; Left Vent. Remodeling",
    13: "CABG; Congenital; Left Vent. Remodeling",
    14: "Valve Replace/Repair; Congenital; Left Vent. Remodeling",
    15: "CABG; Valve Replace/Repair; Congenital; Left Vent. Remodeling",
    16: "Other, specify",
    17: "CABG; Other, specify",
    18: "Valve Replace/Repair; Other, specify",
    19: "CABG; Valve Replace/Repair; Other, specify",
    20: "Congenital; Other, specify",
    21: "CABG; Congenital; Other, specify",
    22: "Valve Replace/Repair; Congenital; Other, specify",
    23: "CABG; Valve Replace/Repair; Congenital; Other, specify",
    24: "Left Vent. Remodeling; Other, specify",
    25: "CABG; Left Vent. Remodeling; Other, specify",
    26: "Valve Replace/Repair; Left Vent. Remodeling; Other, specify",
    27: "CABG; Valve Replace/Repair; Left Vent. Remodeling; Other, specify",
    28: "Congenital; Left Vent. Remodeling; Other, specify",
    29: "CABG; Congenital; Left Vent. Remodeling; Other, specify",
    30: "Valve Replace/Repair; Congenital; Left Vent. Remodeling; Other, specify",
    31: "CABG; Valve Replace/Repair; Congenital; Left Vent. Remodeling; Other, specify",
    334: "Unknown"
}

# mapping feature
df = uf.mappingCol(df, 'PRIOR_CARD_SURG_TYPE_TCR', mapping, display=True)
df = uf.mappingCol(df, 'PRIOR_CARD_SURG_TYPE_TRR', mapping, display=True)


# mapping
colMap = {'PRIOR_CARD_SURG_TCR': 'PriorCardiacSurgery_CAN', 'PRIOR_CARD_SURG_TRR':'PriorCardiacSurgeryListAndTransplant_CAN',
          'PRIOR_CARD_SURG_TYPE_OSTXT_TCR': 'PriorCardiacSurgeryTypeText_CAN',
          'PRIOR_CARD_SURG_TYPE_TCR': 'PriorCardiacSurgeryType_CAN', 'PRIOR_CARD_SURG_TYPE_TRR': 'PriorCardiacSurgeryTypeListAndTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='')
df_dict = uf.updateDictionaryInformation(df_dict, [248,249], txt='N/Y/U/X to No/Yes/Unknown/Missing')
df_dict = uf.updateDictionaryInformation(df_dict, [225,226], txt='FMTNAME: CARDSURG').copy()

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column PRIOR_CARD_SURG_TCR Unique Vaue(s) ['Yes', 'No', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column PRIOR_CARD_SURG_TRR Unique Vaue(s) ['Yes', 'No', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column PRIOR_CARD_SURG_TYPE_TCR Unique Vaue(s) ['Valve Replace/Repair', 'Missing', 'CABG; Other, specify', 'Other, specify', 'CABG', ..., 'CABG; Left Vent. Remodeling', 'CABG; Left Vent. Remodeling; Other, specify', 'Valve Replace/Repair; Left Vent. Remodeling; Other, specify', 'Valve Replace/Repair; Congenital; Left Vent. Remodeling', 'CABG; Valve Replace/Repair; Congenital']
Length: 24
Categories (24, object): ['CABG', 'CABG; Congenital', 'CABG; Congenital; Other, specify', 'CABG; Left Vent. Remodeling', ..., 'Valve Replace/Repair; Congenital; Other, specify', 'Valve Replace/Repair; Left Vent. Remodeling', 'Valve Replace/Repair; Left Vent. Remodeling; Other, specify', 'Valve Replace/

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
223,PriorCardiacSurgery_CAN,TCR PRIOR CARDIAC SURGERY AT LISTING (NON-TRANSPLANT),TCR,2004-06-30,NaT,CLINICAL INFORMATION,CHAR(1),,,PRIOR_CARD_SURG_TCR,Category,
224,PriorCardiacSurgeryListAndTransplant_CAN,TRR CARDIAC SURGERY BETWEEN LISTING AND TRANSPLANT (NON-TRANSPLANT),TRR,2004-06-30,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,PRIOR_CARD_SURG_TRR,Category,
225,PriorCardiacSurgeryType_CAN,TRR PRIOR CARDIAC SURGERY TYPE AT LISTING (NON-TRANSPLANT),TCR,2004-06-30,NaT,PRETRANSPLANT CLINICAL INFORMATION,NUM,CARDSURG,,PRIOR_CARD_SURG_TYPE_TCR,Category,FMTNAME: CARDSURG
226,PriorCardiacSurgeryTypeListAndTransplant_CAN,TRR CARDIAC SURGERY TYPE BETWEEN LISTING AND TRANSPLANT (NON-TRANSPLANT),TRR,2004-06-30,NaT,PRETRANSPLANT CLINICAL INFORMATION,NUM,CARDSURG,,PRIOR_CARD_SURG_TYPE_TRR,Category,FMTNAME: CARDSURG


### DAYS

In [163]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DAYS_', False)

               count       mean         std  min  25%  50%   75%     max
DAYS_STAT1   28751.0   0.002365    0.401035  0.0  0.0  0.0   0.0    68.0
DAYS_STAT1A  28751.0  26.568154   59.251866  0.0  0.0  2.0  31.0  1569.0
DAYS_STAT2   28751.0  41.314285  185.705396  0.0  0.0  0.0   0.0  4425.0
DAYS_STAT1B  28751.0  87.512226  205.531295  0.0  0.0  1.0  75.0  3507.0
DAYS_STATA4  28751.0  22.439915   92.446189  0.0  0.0  0.0   0.0  1107.0
DAYS_STATA5  28751.0   0.745678   14.756890  0.0  0.0  0.0   0.0   754.0
DAYS_STATA2  28751.0   2.750026   11.928445  0.0  0.0  0.0   0.0   877.0
DAYS_STATA3  28751.0   4.793642   29.134372  0.0  0.0  0.0   0.0   961.0
DAYS_STATA1  28751.0   0.230740    2.303821  0.0  0.0  0.0   0.0   159.0
DAYS_STATA6  28751.0   5.819241   46.213313  0.0  0.0  0.0   0.0  1106.0

NaNs:
DAYS_STAT1     0
DAYS_STAT1A    0
DAYS_STAT2     0
DAYS_STAT1B    0
DAYS_STATA4    0
DAYS_STATA5    0
DAYS_STATA2    0
DAYS_STATA3    0
DAYS_STATA1    0
DAYS_STATA6    0
dtype: int64

Dataty

In [164]:
# mapping
colMap = {'DAYS_STAT1': 'DAYS_STAT1','DAYS_STAT1A':'StatusDays_1A', 'DAYS_STAT1B':'StatusDays_1B', 'DAYS_STAT2':'StatusDays_2',
          'DAYS_STATA1':'StatusDays_1', 'DAYS_STATA2':'StatusDays_A2','DAYS_STATA3':'StatusDays_A3',
          'DAYS_STATA4':'StatusDays_A4', 'DAYS_STATA5':'StatusDays_A5','DAYS_STATA6':'StatusDays_A6'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Nemeric', txt=f"{UNKNOWN}")

# update dataframe
df_unknown  = uf.insertIntoDataFrame(df_unknown , list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
52,DAYS_STAT1,DAYS IN STATUS 1,CALCULATED,NaT,NaT,,NUM,,,DAYS_STAT1,Nemeric,** UNKNOWN **
53,StatusDays_1A,DAYS IN STATUS 1A,CALCULATED,NaT,NaT,,NUM,,,DAYS_STAT1A,Nemeric,** UNKNOWN **
54,StatusDays_1B,DAYS IN STATUS 1B,CALCULATED,NaT,NaT,,NUM,,,DAYS_STAT1B,Nemeric,** UNKNOWN **
55,StatusDays_2,DAYS IN STATUS 2,CALCULATED,NaT,NaT,,NUM,,,DAYS_STAT2,Nemeric,** UNKNOWN **
56,StatusDays_1,DAYS IN ADULT STATUS 1,CALCULATED,2018-10-18,NaT,,NUM,,,DAYS_STATA1,Nemeric,** UNKNOWN **
57,StatusDays_A2,DAYS IN ADULT STATUS 2,CALCULATED,2018-10-18,NaT,,NUM,,,DAYS_STATA2,Nemeric,** UNKNOWN **
58,StatusDays_A3,DAYS IN ADULT STATUS 3,CALCULATED,2018-10-18,NaT,,NUM,,,DAYS_STATA3,Nemeric,** UNKNOWN **
59,StatusDays_A4,DAYS IN ADULT STATUS 4,CALCULATED,2018-10-18,NaT,,NUM,,,DAYS_STATA4,Nemeric,** UNKNOWN **
60,StatusDays_A5,DAYS IN ADULT STATUS 5,CALCULATED,2018-10-18,NaT,,NUM,,,DAYS_STATA5,Nemeric,** UNKNOWN **
61,StatusDays_A6,DAYS IN ADULT STATUS 6,CALCULATED,2018-10-18,NaT,,NUM,,,DAYS_STATA6,Nemeric,** UNKNOWN **


### INACTACTIVE STATUS REASON

In [165]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'INACT', True)

                    count      mean       std  min  25%  50%   75%   max
LAST_INACT_REASON  7470.0  7.491165  3.099089  1.0  7.0  7.0  11.0  16.0

NaNs:
LAST_INACT_REASON    21281
dtype: int64

Datatypes:
LAST_INACT_REASON    float64
dtype: object


               Feature                                Description FormSection DataType SASAnalysisFormat Comment Information
188  LAST_INACT_REASON  Candidate Reason for Last Inactive Status                  NUM                               Unknown


LAST_INACT_REASON: [nan 11.  7. 13.  4.  8.  2. 10.  5.  3. 12.  1.  9.  6. 15. 16.]


In [166]:
# mapping
colMap = {'LAST_INACT_REASON': 'LastInactiveStatusReason'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"{UNKNOWN} No Mapping Information.")

# update dataframe
df_unknown  = uf.insertIntoDataFrame(df_unknown , list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
188,LastInactiveStatusReason,Candidate Reason for Last Inactive Status,,NaT,NaT,,NUM,,,LAST_INACT_REASON,Category,** UNKNOWN ** No Mapping Information.


### INIT_STAT

In [167]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'INIT_STAT', True)

             count         mean         std     min     25%     50%     75%     max
INIT_STAT  28751.0  2068.586971  125.863869  2010.0  2020.0  2020.0  2120.0  2999.0

NaNs:
INIT_STAT    0
dtype: int64

Datatypes:
INIT_STAT    float64
dtype: object


       Feature                       Description        FormSection DataType SASAnalysisFormat Comment Information
169  INIT_STAT  INITIAL WAITING LIST STATUS CODE  WAITING LIST DATA      NUM              STAT             Unknown


INIT_STAT: [2010. 2020. 2030. 2999. 2090. 2120. 2160. 2140. 2110. 2130. 2150.]


In [168]:
# change datatype to integer
df[features] = df[features].astype(int)

# df_flat FMTNAME: STAT
mapping = {
  2010: 'HR: Status 1A',
  2020: 'HR: Status 1B',
  2030: 'HR: Status 2',
  2110: 'HR: Adult Status 1',
  2120: 'HR: Adult Status 2',
  2130: 'HR: Adult Status 3',
  2140: 'HR: Adult Status 4',
  2150: 'HR: Adult Status 5',
  2160: 'HR: Adult Status 6',
  2999: 'HR: Temporarily inactive'
}

# map
df = uf.mappingCol(df, 'INIT_STAT', mapping, False)

# mapping
colMap = {'INIT_STAT': 'InitialWaitingListStatusCode_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"UNKNOWN Unable to Determine the Meaning.")

# update dataframe
df_unknown  = uf.insertIntoDataFrame(df_unknown , list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
169,InitialWaitingListStatusCode_CAN,INITIAL WAITING LIST STATUS CODE,WAITING LIST DATA,1990-01-01,NaT,WAITING LIST DATA,NUM,STAT,,INIT_STAT,Numeric,UNKNOWN Unable to Determine the Meaning.


### REM_CD

In [169]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'REM_CD', True)

          count      mean       std  min  25%  50%  75%   max
REM_CD  28751.0  4.030712  0.717141  4.0  4.0  4.0  4.0  21.0

NaNs:
REM_CD    0
dtype: int64

Datatypes:
REM_CD    int64
dtype: object


    Feature                               Description        FormSection DataType SASAnalysisFormat                                                              Comment Information
258  REM_CD  REASON FOR REMOVAL FROM THE WAITING LIST  WAITING LIST DATA      NUM             REMCD  THIS IS MISSING IF PATIENT IS STILL WAITING AT TIME DATASET CREATED     Unknown


REM_CD: [ 4 21 15]


In [170]:
# df_flat FMTNAME: REMCD
mapping = {
  4: 'Deceased Donor tx, removed by tx center',
  15: 'Living Donor tx, removed by tx center',
  21: 'Patient died during TX procedure	'
}

# map
df = uf.mappingCol(df, 'REM_CD', mapping, False)

# mapping
colMap = {'REM_CD': 'ReasonRemovalWaitingList_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='FMTNAME: REMCD')

# update dataframe
df_unknown  = uf.insertIntoDataFrame(df_unknown , list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
258,ReasonRemovalWaitingList_CAN,REASON FOR REMOVAL FROM THE WAITING LIST,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,REMCD,THIS IS MISSING IF PATIENT IS STILL WAITING AT TIME DATASET CREATED,REM_CD,Category,FMTNAME: REMCD


### TXED

In [171]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'TXED', True)

        count      mean       std  min  25%  50%  75%  max
TXED  28751.0  0.999896  0.010215  0.0  1.0  1.0  1.0  1.0

NaNs:
TXED    0
dtype: int64

Datatypes:
TXED    int64
dtype: object


    Feature                                          Description        FormSection DataType SASAnalysisFormat Comment Information
281    TXED  CANDIDATE RECEIVED DECEASED DONOR TRANSPLANT? 1=YES  WAITING LIST DATA      NUM                               Unknown


TXED: [1 0]


In [172]:
# mapping
colMap = {'TXED': 'ReceivedDeceasedDonorTramsplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='')

# update dataframe
df_drop  = uf.insertIntoDataFrame(df_drop , list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
281,ReceivedDeceasedDonorTramsplant_CAN,CANDIDATE RECEIVED DECEASED DONOR TRANSPLANT? 1=YES,WL,NaT,NaT,WAITING LIST DATA,NUM,,,TXED,Category,


### DAYSWAIT

In [173]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DAYSWAIT', True)

                  count        mean       std  min   25%   50%    75%     max
DAYSWAIT_CHRON  28751.0  221.156551  376.9342  0.0  19.0  76.0  258.0  6412.0

NaNs:
DAYSWAIT_CHRON    0
dtype: int64

Datatypes:
DAYSWAIT_CHRON    int64
dtype: object


           Feature                 Description FormSection DataType SASAnalysisFormat Comment Information
62  DAYSWAIT_CHRON  TOTAL DAYS ON WAITING LIST                  NUM                               Unknown


DAYSWAIT_CHRON: [  11  348   55 ...  956 2297  940]


In [174]:
# mapping
colMap = {'DAYSWAIT_CHRON': 'TotalDayWaitList_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt='')

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can , list(colMap.values()))
df_numeric  = uf.insertIntoDataFrame(df_numeric , list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
62,TotalDayWaitList_CAN,TOTAL DAYS ON WAITING LIST,CALCULATED,NaT,NaT,,NUM,,,DAYSWAIT_CHRON,Numeric,


### END_STAT

In [175]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'END_STAT', True)

            count         mean        std     min     25%     50%     75%     max
END_STAT  28751.0  2053.460749  54.591497  2010.0  2010.0  2020.0  2120.0  2160.0

NaNs:
END_STAT    0
dtype: int64

Datatypes:
END_STAT    float64
dtype: object


      Feature                                               Description        FormSection DataType SASAnalysisFormat Comment Information
102  END_STAT  CANDIDATE STATUS AT TRANSPLANT OFFER/REMOVALCURRENT TIME  WAITING LIST DATA      NUM              STAT             Unknown


END_STAT: [2010. 2020. 2030. 2120. 2140. 2110. 2130. 2160. 2150.]


In [176]:
# df_label FMTNAME: CHDMULT
mapping = {
    2010: "Hypoplastic Left Heart Syndrome; Atrioventricular Septal Defect; Other left Heart Valvar/Structural",
    2020: "Transposition of the Great Arteries; Truncus Arteriosus; Congenitally Corrected Transposition (L-TGA)",
    2030: "Hypoplastic Left Heart Syndrome; Transposition of the Great Arteries; Atrioventricular Septal Defect",
    2110: "Hypoplastic Left Heart Syndrome; Transposition of the Great Arteries; Atrioventricular Septal Defect",
    2120: "Atrioventricular Septal Defect; Congenitally Corrected Transposition (L-TGA); Other",
    2130: "Hypoplastic Left Heart Syndrome; Other left Heart Valvar/Structural Hypoplasia; Congenitally Correct",
    2140: "Transposition of the Great Arteries; Atrioventricular Septal Defect; Other left Heart Valvar/Structural",
    2150: "Hypoplastic Left Heart Syndrome; Transposition of the Great Arteries; Truncus Arteriosus; Congenital",
    2160: "Other left Heart Valvar/Structural Hypoplasia; Truncus Arteriosus; Congenitally Corrected Transposition"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)
    

# mapping
colMap = {'END_STAT': 'StatusAtTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='FMTNAME: CHDMULT - This Feature could be Ordinal but using as Nominal')

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can , list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal , list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column END_STAT Unique Vaue(s) ['Hypoplastic Left Heart Syndrome; Atrioventricular Septal Defect; Other left Heart Valvar/Structural', 'Transposition of the Great Arteries; Truncus Arteriosus; Congenitally Corrected Transposition (L-TGA)', 'Hypoplastic Left Heart Syndrome; Transposition of the Great Arteries; Atrioventricular Septal Defect', 'Atrioventricular Septal Defect; Congenitally Corrected Transposition (L-TGA); Other', 'Transposition of the Great Arteries; Atrioventricular Septal Defect; Other left Heart Valvar/Structural', 'Hypoplastic Left Heart Syndrome; Other left Heart Valvar/Structural Hypoplasia; Congenitally Correct', 'Other left Heart Valvar/Structural Hypoplasia; Truncus Arteriosus; Congenitally Corrected Transposition', 'Hypoplastic Left Heart Syndrome; Transposition of the Great Arteries; Truncus Arteriosus; Congenital']
Categories (8, object): ['Atrioventricular Septal Defect; Congenitally Corrected Transposition (L-TGA); Other', 'Hypoplastic Left Heart S

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
102,StatusAtTransplant_CAN,CANDIDATE STATUS AT TRANSPLANT OFFER/REMOVALCURRENT TIME,TRR>TCR,1990-01-01,NaT,WAITING LIST DATA,NUM,STAT,,END_STAT,Category,FMTNAME: CHDMULT - This Feature could be Ordinal but using as Nominal


### ETHNICITY

In [177]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'ETHNICITY|ETHCAT', True)

              count      mean       std  min  25%  50%  75%  max
ETHNICITY   28751.0  0.089632  0.285658  0.0  0.0  0.0  0.0  1.0
ETHCAT      28751.0  1.722514  1.289937  1.0  1.0  1.0  2.0  9.0
ETHCAT_DON  28751.0  1.844945  1.411145  1.0  1.0  1.0  2.0  9.0

NaNs:
ETHNICITY     0
ETHCAT        0
ETHCAT_DON    0
dtype: int64

Datatypes:
ETHNICITY     int64
ETHCAT        int64
ETHCAT_DON    int64
dtype: object


        Feature                                      Description            FormSection DataType SASAnalysisFormat Comment Information
104      ETHCAT                     RECIPIENT ETHNICITY CATEGORY  CANDIDATE INFORMATION      NUM            ETHCAT             Unknown
105  ETHCAT_DON                         DONOR ETHNICITY CATEGORY      DONOR INFORMATION      NUM                               Unknown
106   ETHNICITY  RECIPIENT ETHNICITY (HISPANIC VS. NON-HISPANIC)  CANDIDATE INFORMATION      NUM              ETHN             Unknown


ETHNICITY: [1 0]
ETHCAT: [4 1 2 7 9 5 6]
E

In [178]:
# df_flat FMTNAME: ETHCAT
mapping = {
    1: 'White, Non-Hispanic',
    2: 'Black',
    4: 'Hispanic',
    5: 'Asian',
    6: 'Amer Ind/Alaska Native',
    7: 'Native Hawaiian/other Pacific Islander',
    9: 'Multiracial'
}

# map
df = uf.mappingCol(df, 'ETHCAT', mapping, False)
df = uf.mappingCol(df, 'ETHCAT_DON', mapping, False).copy()

# df_flat FMTNAME: ETHN (2 as Non-Hispanic/Non-Latino and Infer to 0)
mapping = {
    1: 'Hispanic/Latino',
    0: 'Non-Hispanic/Non-Latino'
}

# map
df = uf.mappingCol(df, 'ETHNICITY', mapping, False)


# mapping
colMap = {'ETHCAT': 'Ethnicity_CAN', 'ETHCAT_DON':'Ethnicity_DON', 'ETHNICITY':'Hispanic_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='FMTNAME: ETHCAT')
df_dict = uf.updateDictionaryInformation(df_dict, [119], txt=f"FMTNAME: ETHN (2 as Non-Hispanic/Non-Latino and Infer 0 as Non-Hispanic/Non-Latino)").copy()

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can , ['Ethnicity_CAN','Hispanic_CAN'])
df_don  = uf.insertIntoDataFrame(df_don , ['Ethnicity_DON'])
df_nominal  = uf.insertIntoDataFrame(df_nominal , list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
104,Ethnicity_CAN,RECIPIENT ETHNICITY CATEGORY,TCR-CALCULATED,NaT,NaT,CANDIDATE INFORMATION,NUM,ETHCAT,,ETHCAT,Category,FMTNAME: ETHCAT
105,Ethnicity_DON,DONOR ETHNICITY CATEGORY,DDR/LDR-CALCULATED,NaT,NaT,DONOR INFORMATION,NUM,,,ETHCAT_DON,Category,FMTNAME: ETHCAT
106,Hispanic_CAN,RECIPIENT ETHNICITY (HISPANIC VS. NON-HISPANIC),TCR,1994-04-01,NaT,CANDIDATE INFORMATION,NUM,ETHN,,ETHNICITY,Category,FMTNAME: ETHCAT


### VENTILATOR

In [179]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'VENTILATOR', True)

                  count      mean       std  min  25%  50%  75%  max
VENTILATOR_TCR  28751.0  0.014852  0.120961  0.0  0.0  0.0  0.0  1.0
VENTILATOR_TRR  28751.0  0.014469  0.119416  0.0  0.0  0.0  0.0  1.0

NaNs:
VENTILATOR_TCR    0
VENTILATOR_TRR    0
dtype: int64

Datatypes:
VENTILATOR_TCR    int64
VENTILATOR_TRR    int64
dtype: object


            Feature                                          Description            FormSection DataType SASAnalysisFormat Comment Information
295  VENTILATOR_TCR  PATIENT ON LIFE SUPPORT - VENTILATOR @ REGISTRATION  CANDIDATE INFORMATION      NUM                               Unknown
296  VENTILATOR_TRR    PATIENT ON LIFE SUPPORT - VENTILATOR @ TRANSPLANT         PATIENT STATUS      NUM                               Unknown


VENTILATOR_TCR: [0 1]
VENTILATOR_TRR: [0 1]


In [180]:
# mapping
colMap = {'VENTILATOR_TCR': 'VentilatorRegistration_CAN', 'VENTILATOR_TRR':'VentilatorTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt='')

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can , list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal , list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
295,VentilatorRegistration_CAN,PATIENT ON LIFE SUPPORT - VENTILATOR @ REGISTRATION,TCR,1994-04-01,NaT,CANDIDATE INFORMATION,NUM,,,VENTILATOR_TCR,Category,
296,VentilatorTransplant_CAN,PATIENT ON LIFE SUPPORT - VENTILATOR @ TRANSPLANT,TRR,1987-10-01,NaT,PATIENT STATUS,NUM,,,VENTILATOR_TRR,Category,


### PROC_TY_HR

In [181]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PROC_TY_HR', True)

              count      mean       std  min  25%  50%  75%  max
PROC_TY_HR  28328.0  1.210357  0.465473  1.0  1.0  1.0  1.0  4.0

NaNs:
PROC_TY_HR    423
dtype: int64

Datatypes:
PROC_TY_HR    float64
dtype: object


        Feature                    Description                      FormSection DataType SASAnalysisFormat Comment Information
228  PROC_TY_HR  PROCEDURE TYPE FOR HEART ONLY  TRANSPLANT CLINICAL INFORMATION      NUM           HR_PROC             Unknown


PROC_TY_HR: [ 1.  2.  3.  4. nan]


In [182]:
# fill NaN with X: Missing
df[features] = df[features].fillna(999).astype(int)

# df_flat FMTNAME: HR_PROC
mapping = {
    1: "Orthotopic Bicaval",
    2: "Orthotopic Traditional",
    3: "Orthotopic Total (Bicaval, PV)",
    4: "Heterotopic",
    999: "Missing"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'PROC_TY_HR':'HeartProcedureType_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column PROC_TY_HR Unique Vaue(s) ['Orthotopic Bicaval', 'Orthotopic Traditional', 'Orthotopic Total (Bicaval, PV)', 'Heterotopic', 'Missing']
Categories (5, object): ['Heterotopic', 'Missing', 'Orthotopic Bicaval', 'Orthotopic Total (Bicaval, PV)', 'Orthotopic Traditional']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
228,HeartProcedureType_CAN,PROCEDURE TYPE FOR HEART ONLY,TRR,1999-10-25,NaT,TRANSPLANT CLINICAL INFORMATION,NUM,HR_PROC,,PROC_TY_HR,Category,N/Y/U/X to No/Yes/Unknown/Missing


### REGION

In [183]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'REGION', True)

          count     mean       std  min  25%  50%  75%   max
REGION  28751.0  5.98727  3.148694  1.0  3.0  5.0  9.0  11.0

NaNs:
REGION    0
dtype: int64

Datatypes:
REGION    int64
dtype: object


    Feature                            Description FormSection DataType SASAnalysisFormat Comment Information
257  REGION  UNOS REGION WHERE TRANSPLANTED/LISTED                  NUM                               Unknown


REGION: [ 5  4  9 11  1 10  7  3  2  8  6]


In [184]:
# mapping
colMap = {'REGION': 'TransplantRegion'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"UNKNOWN No Mapping Information.")

# update dataframe
# df_unknown  = uf.insertIntoDataFrame(df_unknown , list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
257,TransplantRegion,UNOS REGION WHERE TRANSPLANTED/LISTED,CALCULATED,NaT,NaT,,NUM,,,REGION,Category,UNKNOWN No Mapping Information.


In [185]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'WORK', True)

                 count unique top   freq
WORK_INCOME_TCR  28497      3   N  22765
WORK_INCOME_TRR  28291      3   N  23973

NaNs:
WORK_INCOME_TCR    254
WORK_INCOME_TRR    460
dtype: int64

Datatypes:
WORK_INCOME_TCR    object
WORK_INCOME_TRR    object
dtype: object


             Feature                               Description            FormSection DataType SASAnalysisFormat Comment Information
300  WORK_INCOME_TCR          WORK FOR INCOME AT REGISTRATION?  CANDIDATE INFORMATION  CHAR(1)                               Unknown
301  WORK_INCOME_TRR  RECIPIENT WORK FOR INCOME AT TRANSPLANT?         PATIENT STATUS  CHAR(1)                               Unknown


WORK_INCOME_TCR: ['N' 'Y' nan 'U']
WORK_INCOME_TRR: ['N' 'U' 'Y' nan]


In [186]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'WORK_INCOME_TCR': 'WorkIncomeRegistration_CAN', 'WORK_INCOME_TRR':'WorkIncomeTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can , list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal , list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.loc[idx]

Converted Column WORK_INCOME_TCR Unique Vaue(s) ['No', 'Yes', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column WORK_INCOME_TRR Unique Vaue(s) ['No', 'Unknown', 'Yes', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
300,WorkIncomeRegistration_CAN,WORK FOR INCOME AT REGISTRATION?,TCR,2004-06-30,NaT,CANDIDATE INFORMATION,CHAR(1),,,WORK_INCOME_TCR,Category,
301,WorkIncomeTransplant_CAN,RECIPIENT WORK FOR INCOME AT TRANSPLANT?,TRR,2004-06-30,NaT,PATIENT STATUS,CHAR(1),,,WORK_INCOME_TRR,Category,


### DQ

In [187]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DQ', True)

       count       mean        std  min  25%  50%  75%    max
DQ1  28751.0   7.456436  45.437332  0.0  0.0  0.0  2.0  609.0
DQ2  28751.0  11.057215  68.555315  0.0  0.0  0.0  5.0  609.0

NaNs:
DQ1    0
DQ2    0
dtype: int64

Datatypes:
DQ1    int64
DQ2    int64
dtype: object


   Feature                                                      Description        FormSection DataType SASAnalysisFormat Comment Information
80     DQ1  Candidate Most Recent/at Removal DQB1 Antigen From Waiting List  WAITING LIST DATA      NUM             DQHLA             Unknown
81     DQ2  Candidate Most Recent/at Removal DQB2 Antigen From Waiting List  WAITING LIST DATA      NUM             DQHLA             Unknown


DQ1: [  0   6   2   4   5   8   7   3   9   1 303 302 201 202 319 301 501 602
 401 609 402 502 603 503 604]
DQ2: [  0   6   5   7   8   9   4   2   1   3 603 602 201 301 501 609 303 202
 502 402 604 302 503 319 601]


In [188]:
# df_flat FMTNAME: DQHLA
mapping = {
    0: '0',
    1: '1',
    2: '2',
    3: '3',
    4: '4',
    5: '5',
    6: '6',
    7: '7',
    8: '8',
    9: '9',
    97: 'Unknown',
    98: 'No second antigen detected',
    99: 'Not Tested',
    201: '02:01',
    202: '02:02',
    301: '03:01',
    302: '03:02',
    303: '03:03',
    319: '03:19',
    401: '04:01',
    402: '04:02',
    501: '05:01',
    502: '05:02',
    503: '05:03',
    601: '06:01',
    602: '06:02',
    603: '06:03',
    604: '06:04',
    609: '06:09'
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'DQ1': 'AntigenDQ1_CAN', 'DQ2':'AntigenDQ2_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: DQHLA")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column DQ1 Unique Vaue(s) ['0', '6', '2', '4', '5', ..., '04:02', '05:02', '06:03', '05:03', '06:04']
Length: 25
Categories (25, object): ['0', '02:01', '02:02', '03:01', ..., '6', '7', '8', '9']
Converted Column DQ2 Unique Vaue(s) ['0', '6', '5', '7', '8', ..., '06:04', '03:02', '05:03', '03:19', '06:01']
Length: 25
Categories (25, object): ['0', '02:01', '02:02', '03:01', ..., '6', '7', '8', '9']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
80,AntigenDQ1_CAN,Candidate Most Recent/at Removal DQB1 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,DQHLA,,DQ1,Category,FMTNAME: DQHLA
81,AntigenDQ2_CAN,Candidate Most Recent/at Removal DQB2 Antigen From Waiting List,WAITING LIST DATA,1987-10-01,NaT,WAITING LIST DATA,NUM,DQHLA,,DQ2,Category,FMTNAME: DQHLA


### MEDICAL CONDITION

In [189]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'MED_COND', True)

                count      mean       std  min  25%  50%  75%  max
MED_COND_TRR  28351.0  2.096152  0.916013  1.0  1.0  2.0  3.0  3.0

NaNs:
MED_COND_TRR    400
dtype: int64

Datatypes:
MED_COND_TRR    float64
dtype: object


          Feature                                                Description     FormSection DataType SASAnalysisFormat Comment Information
200  MED_COND_TRR  RECIPIENT MEDICAL CONDITION PRE-TRANSPLANT   @ TRANSPLANT  PATIENT STATUS      NUM           MEDCOND             Unknown


MED_COND_TRR: [ 1.  3.  2. nan]


In [190]:
# fill NaN with 999: Missing
df[features] = df[features].fillna(999).astype(int)

# df_flat FMTNAME: MEDCOND
mapping = {
    1: "In Intensive Care Unit",
    2: "Hospitalized Not in ICU",
    3: "Not Hospitalized",
    999: "Missing"
}

# map
df = uf.mappingCol(df, 'MED_COND_TRR', mapping, False).copy()


# mapping
colMap = {'MED_COND_TRR': 'MedicalConditionTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: MEDCOND")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
200,MedicalConditionTransplant_CAN,RECIPIENT MEDICAL CONDITION PRE-TRANSPLANT @ TRANSPLANT,TRR,1987-10-01,NaT,PATIENT STATUS,NUM,MEDCOND,,MED_COND_TRR,Category,FMTNAME: MEDCOND


### STATUS

In [191]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'STATUS_', True)

            count unique top   freq
STATUS_TRR  28751      2   V  28249
STATUS_TCR  28751      2   V  28670
STATUS_DDR  28748      3   V  28522

NaNs:
STATUS_TRR    0
STATUS_TCR    0
STATUS_DDR    3
dtype: int64

Datatypes:
STATUS_TRR    object
STATUS_TCR    object
STATUS_DDR    object
dtype: object


        Feature      Description FormSection DataType SASAnalysisFormat Comment Information
263  STATUS_DDR  DDR Form Status              CHAR(1)           FRMSTAT             Unknown
264  STATUS_TCR  TCR Form Status              CHAR(1)           FRMSTAT             Unknown
265  STATUS_TRR  TRR Form Status              CHAR(1)           FRMSTAT             Unknown


STATUS_TRR: ['V' 'E']
STATUS_TCR: ['V' 'E']
STATUS_DDR: ['V' 'S' nan 'E']


In [192]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# mapping
colMap = {}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"{UNKNOWN}")

# update dataframe
df_unknown  = uf.insertIntoDataFrame(df_unknown, features)

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
263,STATUS_DDR,DDR Form Status,DDR,NaT,NaT,,CHAR(1),FRMSTAT,,STATUS_DDR,Category,** UNKNOWN **
264,STATUS_TCR,TCR Form Status,TCR,NaT,NaT,,CHAR(1),FRMSTAT,,STATUS_TCR,Category,** UNKNOWN **
265,STATUS_TRR,TRR Form Status,TRR,NaT,NaT,,CHAR(1),FRMSTAT,,STATUS_TRR,Category,** UNKNOWN **


### DRUG & COCAINE & TATTOOS

In [193]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DRUG_DON|COCAINE|TATTOOS', True)

                     count unique top   freq
HIST_COCAINE_DON     28545      3   N  21715
CONTIN_COCAINE_DON    6323      3   Y   3307
CONTIN_OTH_DRUG_DON  15935      3   Y  12073
HIST_OTH_DRUG_DON    28545      3   Y  15935
TATTOOS              28545      3   Y  16092

NaNs:
HIST_COCAINE_DON         206
CONTIN_COCAINE_DON     22428
CONTIN_OTH_DRUG_DON    12816
HIST_OTH_DRUG_DON        206
TATTOOS                  206
dtype: int64

Datatypes:
HIST_COCAINE_DON       object
CONTIN_COCAINE_DON     object
CONTIN_OTH_DRUG_DON    object
HIST_OTH_DRUG_DON      object
TATTOOS                object
dtype: object


                 Feature                                                   Description    FormSection DataType SASAnalysisFormat Comment Information
40    CONTIN_COCAINE_DON          DECEASED DONOR-HISTORY OF COCAINE USE+RECENT 6MO USE  DONOR HISTORY  CHAR(1)                               Unknown
41   CONTIN_OTH_DRUG_DON  DECEASED DONOR-HISTORY OF OTHER DRUGS IN PAST+RECENT 6MO USE  D

In [194]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)

# mapping
colMap = {'CONTIN_COCAINE_DON': 'CocaineUse_DON', 'HIST_COCAINE_DON':'PastCocaineUse_DON', 'TATTOOS':'Tatoos_DON',
          'CONTIN_OTH_DRUG_DON': 'OtherDrugUse_DON', 'HIST_OTH_DRUG_DON':'PastOtherDrugUse_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknow/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column HIST_COCAINE_DON Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column CONTIN_COCAINE_DON Unique Vaue(s) ['Missing', 'No', 'Unknown', 'Yes']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column CONTIN_OTH_DRUG_DON Unique Vaue(s) ['Missing', 'Yes', 'No', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column HIST_OTH_DRUG_DON Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column TATTOOS Unique Vaue(s) ['Yes', 'No', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
40,CocaineUse_DON,DECEASED DONOR-HISTORY OF COCAINE USE+RECENT 6MO USE,DDR,1999-10-25,NaT,DONOR HISTORY,CHAR(1),,,CONTIN_COCAINE_DON,Category,N/Y/U/X to No/Yes/Unknow/Missing
41,OtherDrugUse_DON,DECEASED DONOR-HISTORY OF OTHER DRUGS IN PAST+RECENT 6MO USE,DDR,1994-04-01,NaT,DONOR HISTORY,CHAR(1),,,CONTIN_OTH_DRUG_DON,Category,N/Y/U/X to No/Yes/Unknow/Missing
145,PastCocaineUse_DON,DECEASED DONOR-HISTORY OF COCAINE USE IN PAST,DDR,1999-10-25,NaT,DONOR HISTORY,CHAR(1),,,HIST_COCAINE_DON,Category,N/Y/U/X to No/Yes/Unknow/Missing
149,PastOtherDrugUse_DON,DECEASED DONOR-HISTORY OF OTHER DRUG USE IN PAST,DDR,1994-04-01,NaT,DONOR HISTORY,CHAR(1),,,HIST_OTH_DRUG_DON,Category,N/Y/U/X to No/Yes/Unknow/Missing
267,Tatoos_DON,DECEASED DONOR-TATOOS,DDR,1999-10-25,NaT,DONOR HISTORY,CHAR(1),,,TATTOOS,Category,N/Y/U/X to No/Yes/Unknow/Missing


### LIFE SUPPORT

In [195]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'LIFE_SUP', True)

              count unique top   freq
LIFE_SUP_TCR  28676      2   Y  18169
LIFE_SUP_TRR  28354      2   Y  23303

NaNs:
LIFE_SUP_TCR     75
LIFE_SUP_TRR    397
dtype: int64

Datatypes:
LIFE_SUP_TCR    object
LIFE_SUP_TRR    object
dtype: object


          Feature                                           Description            FormSection DataType SASAnalysisFormat Comment Information
190  LIFE_SUP_TCR                 CANDIDATE LIFE SUPPORT @ REGISTRATION  CANDIDATE INFORMATION  CHAR(1)                               Unknown
191  LIFE_SUP_TRR  RECIPIENT LIFE SUPPORT PRE-TRANSPLANT   @ TRANSPLANT         PATIENT STATUS  CHAR(1)                               Unknown


LIFE_SUP_TCR: ['Y' 'N' nan]
LIFE_SUP_TRR: ['Y' 'N' nan]


In [196]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)

# mapping
colMap = {'LIFE_SUP_TCR': 'LifeSupportRegistration_CAN', 'LIFE_SUP_TRR':'LifeSupportTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknow/Missing")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column LIFE_SUP_TCR Unique Vaue(s) ['Yes', 'No', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']
Converted Column LIFE_SUP_TRR Unique Vaue(s) ['Yes', 'No', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
190,LifeSupportRegistration_CAN,CANDIDATE LIFE SUPPORT @ REGISTRATION,CALCULATED TCR,1987-10-01,NaT,CANDIDATE INFORMATION,CHAR(1),,,LIFE_SUP_TCR,Category,N/Y/U/X to No/Yes/Unknow/Missing
191,LifeSupportTransplant_CAN,RECIPIENT LIFE SUPPORT PRE-TRANSPLANT @ TRANSPLANT,CALCULATED TRR,1987-10-01,NaT,PATIENT STATUS,CHAR(1),,,LIFE_SUP_TRR,Category,N/Y/U/X to No/Yes/Unknow/Missing


### LUNG SURGERY

In [197]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PRIOR_LUNG', True)

                     count unique top   freq
PRIOR_LUNG_SURG_TRR  28329      3   N  28121

NaNs:
PRIOR_LUNG_SURG_TRR    422
dtype: int64

Datatypes:
PRIOR_LUNG_SURG_TRR    object
dtype: object


                 Feature                                                       Description                         FormSection DataType SASAnalysisFormat Comment Information
227  PRIOR_LUNG_SURG_TRR  TRR LUNG SURGERY BETWEEN LISTING AND TRANSPLANT (NON-TRANSPLANT)  PRETRANSPLANT CLINICAL INFORMATION  CHAR(1)                               Unknown


PRIOR_LUNG_SURG_TRR: ['N' 'Y' 'U' nan]


In [198]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)

# mapping
colMap = {'PRIOR_LUNG_SURG_TRR': 'PriorLungSurgeryAfterRegistration_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknow/Missing")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column PRIOR_LUNG_SURG_TRR Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
227,PriorLungSurgeryAfterRegistration_CAN,TRR LUNG SURGERY BETWEEN LISTING AND TRANSPLANT (NON-TRANSPLANT),TRR,1999-10-25,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,PRIOR_LUNG_SURG_TRR,Category,N/Y/U/X to No/Yes/Unknow/Missing


### STEROID

In [199]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'STEROID', True)

                 count unique top   freq
STEROID          28322      3   N  26081
PT_STEROIDS_DON  28546      3   Y  20324

NaNs:
STEROID            429
PT_STEROIDS_DON    205
dtype: int64

Datatypes:
STEROID            object
PT_STEROIDS_DON    object
dtype: object


             Feature                                                         Description                         FormSection DataType SASAnalysisFormat Comment Information
240  PT_STEROIDS_DON  DECEASED DONOR-STEROIDS B/N BRAIN DEATH W/IN 24 HRS OF PROCUREMENT                CLINICAL INFORMATION  CHAR(1)                               Unknown
266          STEROID                              CHRONIC STEROID USE Y/N/U @ TRANSPLANT  PRETRANSPLANT CLINICAL INFORMATION  CHAR(1)                               Unknown


STEROID: ['N' 'Y' 'U' nan]
PT_STEROIDS_DON: ['N' 'Y' nan 'U']


In [200]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'PT_STEROIDS_DON': 'SteroidsUse_DON','STEROID':'SteroidsUse_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Ubkbown/Missing")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, ['SteroidsUse_CAN'])
df_don  = uf.insertIntoDataFrame(df_don, ['SteroidsUse_DON'])
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column STEROID Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column PT_STEROIDS_DON Unique Vaue(s) ['No', 'Yes', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
240,SteroidsUse_DON,DECEASED DONOR-STEROIDS B/N BRAIN DEATH W/IN 24 HRS OF PROCUREMENT,DDR,1994-04-01,NaT,CLINICAL INFORMATION,CHAR(1),,,PT_STEROIDS_DON,Category,N/Y/U/X to No/Yes/Ubkbown/Missing
266,SteroidsUse_CAN,CHRONIC STEROID USE Y/N/U @ TRANSPLANT,TRR,1994-04-01,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,STEROID,Category,N/Y/U/X to No/Yes/Ubkbown/Missing


### BILIRUBIN

In [201]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'TBILI', False)

             count      mean       std  min  25%  50%  75%   max
TBILI      28289.0  0.993461  1.714805  0.1  0.5  0.7  1.1  80.0
TBILI_DON  28542.0  1.043629  1.424194  0.0  0.5  0.7  1.1  45.4

NaNs:
TBILI        462
TBILI_DON    209
dtype: int64

Datatypes:
TBILI        float64
TBILI_DON    float64
dtype: object


       Feature                                     Description                         FormSection DataType SASAnalysisFormat Comment Information
268      TBILI  MOST RECENT SERUM TOTAL BILIRUBIN @ TRANSPLANT  PRETRANSPLANT CLINICAL INFORMATION      NUM                               Unknown
269  TBILI_DON         DECEASED DONOR-TERMINAL TOTAL BILIRUBIN                CLINICAL INFORMATION      NUM                               Unknown




In [202]:
# mapping
colMap = {'TBILI': 'TotalBilirubinTransplant_CAN','TBILI_DON':'TerminalTotalBilirubin_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, ['TotalBilirubinTransplant_CAN'])
df_don  = uf.insertIntoDataFrame(df_don, ['TerminalTotalBilirubin_DON'])
df_numeric  = uf.insertIntoDataFrame(df_numeric, list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
268,TotalBilirubinTransplant_CAN,MOST RECENT SERUM TOTAL BILIRUBIN @ TRANSPLANT,TRR,1994-04-01,NaT,PRETRANSPLANT CLINICAL INFORMATION,NUM,,,TBILI,Numeric,
269,TerminalTotalBilirubin_DON,DECEASED DONOR-TERMINAL TOTAL BILIRUBIN,DDR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,,TBILI_DON,Numeric,


### TRANSFUSIONS

In [203]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'TRANSFUSIONS', False)

              count unique top   freq
TRANSFUSIONS  28327      3   N  22080

NaNs:
TRANSFUSIONS    424
dtype: int64

Datatypes:
TRANSFUSIONS    object
dtype: object


          Feature                                                          Description                         FormSection DataType SASAnalysisFormat Comment Information
275  TRANSFUSIONS  EVENTS OCCURRING BETWEEN LISTING AND TRANSPLANT: TRANSFUSIONS Y/N/U  PRETRANSPLANT CLINICAL INFORMATION  CHAR(1)                               Unknown




In [204]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)

# mapping
colMap = {'TRANSFUSIONS': 'TransfusionAfterRegistration_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Ubkbown/Missing")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column TRANSFUSIONS Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
275,TransfusionAfterRegistration_CAN,EVENTS OCCURRING BETWEEN LISTING AND TRANSPLANT: TRANSFUSIONS Y/N/U,TRR,1994-04-01,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,TRANSFUSIONS,Category,N/Y/U/X to No/Yes/Ubkbown/Missing


### TransfusionNumber

In [205]:
df_flat[['CODE','LABEL']][df_flat.FMTNAME.fillna('').str.contains('(?i)TRANSFUS')]

Unnamed: 0,CODE,LABEL
36264,Null or Missing,Not Reported
36265,0,NONE
36266,1,1 - 5
36267,2,6 - 10
36268,3,GREATER THAN 10
36269,998,UNKNOWN
36270,**OTHER**,Unknown


In [206]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'TRANSFUS_TERM_DON', False)

                     count      mean       std  min  25%  50%  75%    max
TRANSFUS_TERM_DON  28723.0  0.847996  8.370961  0.0  0.0  1.0  1.0  998.0

NaNs:
TRANSFUS_TERM_DON    28
dtype: int64

Datatypes:
TRANSFUS_TERM_DON    float64
dtype: object


               Feature                                                         Description           FormSection DataType SASAnalysisFormat Comment Information
274  TRANSFUS_TERM_DON  DDR:Number of transfusions during this (terminal) hospitalization:  CLINICAL INFORMATION      NUM          TRANSFUS             Unknown




In [207]:
# fill NaN with 999: Missing
df[features] = df[features].fillna(999).astype(int)

# df_flat FMTNAME: TRANSFUS
mapping = {
    0: "NONE",
    1: "1 - 5",
    2: "6 - 10",
    3: "GREATER THAN 10",
    998: "UNKNOWN",
    999: "Missing"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)

# mapping
colMap = {'TRANSFUS_TERM_DON':'TransfusionNumber_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_ordinal = uf.insertIntoDataFrame(df_ordinal, list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column TRANSFUS_TERM_DON Unique Vaue(s) ['1 - 5', '6 - 10', 'NONE', 'GREATER THAN 10', 'Missing', 'UNKNOWN']
Categories (6, object): ['1 - 5', '6 - 10', 'GREATER THAN 10', 'Missing', 'NONE', 'UNKNOWN']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
274,TransfusionNumber_DON,DDR:Number of transfusions during this (terminal) hospitalization:,DDR,2004-06-30,NaT,CLINICAL INFORMATION,NUM,TRANSFUS,,TRANSFUS_TERM_DON,Category,


### VENTILATORY

In [208]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'VENT', False)

                         count unique top   freq
VENT_SUPPORT_TRR         28327      3   N  22820
VENT_SUPPORT_AFTER_LIST  28327      3   N  22820

NaNs:
VENT_SUPPORT_TRR           424
VENT_SUPPORT_AFTER_LIST    424
dtype: int64

Datatypes:
VENT_SUPPORT_TRR           object
VENT_SUPPORT_AFTER_LIST    object
dtype: object


                     Feature                                                                      Description                         FormSection DataType SASAnalysisFormat Comment Information
293  VENT_SUPPORT_AFTER_LIST  EVENTS OCCURRING BETWEEN LISTING AND TRANSPLANT: EPISODE OF VENTILATORY SUPPORT  PRETRANSPLANT CLINICAL INFORMATION  CHAR(1)                               Unknown
294         VENT_SUPPORT_TRR                                               TRR EPISODE OF VENTILATORY SUPPORT                                      CHAR(1)                               Unknown




In [209]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'VENT_SUPPORT_AFTER_LIST': 'VentilatorySupportAfterRegistration_CAN', 'VENT_SUPPORT_TRR': 'VentilatorySupport_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Ubkbown/Missing")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column VENT_SUPPORT_TRR Unique Vaue(s) ['Yes', 'No', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column VENT_SUPPORT_AFTER_LIST Unique Vaue(s) ['Yes', 'No', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
293,VentilatorySupportAfterRegistration_CAN,EVENTS OCCURRING BETWEEN LISTING AND TRANSPLANT: EPISODE OF VENTILATORY SUPPORT,TRR,1999-10-25,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(1),,,VENT_SUPPORT_AFTER_LIST,Category,N/Y/U/X to No/Yes/Ubkbown/Missing
294,VentilatorySupport_CAN,TRR EPISODE OF VENTILATORY SUPPORT,,NaT,NaT,,CHAR(1),,,VENT_SUPPORT_TRR,Category,N/Y/U/X to No/Yes/Ubkbown/Missing


### EPSTEIN BARR

In [210]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'EBV', True)

                 count unique top   freq
EBV_SEROSTATUS   28337      4   P  23469
EBV_IGG_CAD_DON  28727      6   P  25618
EBV_IGM_CAD_DON  28722      6   N  22991

NaNs:
EBV_SEROSTATUS     414
EBV_IGG_CAD_DON     24
EBV_IGM_CAD_DON     29
dtype: int64

Datatypes:
EBV_SEROSTATUS     object
EBV_IGG_CAD_DON    object
EBV_IGM_CAD_DON    object
dtype: object


            Feature                                           Description           FormSection DataType SASAnalysisFormat Comment Information
89  EBV_IGG_CAD_DON  DECEASED DONOR EPSTEIN BARR VIRUS BY IGG TEST RESULT  CLINICAL INFORMATION  CHAR(2)           SERSTAT             Unknown
90  EBV_IGM_CAD_DON  DECEASED DONOR EPSTEIN BARR VIRUS BY IGM TEST RESULT  CLINICAL INFORMATION  CHAR(2)           SERSTAT             Unknown
91   EBV_SEROSTATUS                     RECIPIENT EBV STATUS @ TRANSPLANT                        CHAR(2)           SERSTAT             Unknown


EBV_SEROSTATUS: ['ND' 'P' 'N' 'U' nan]
EBV_IGG_CAD_DON: ['P' 'N' 'N

In [211]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# df_flat FMTNAME: SERSTAT
SERSTAT = {
    'C': 'Cannot Disclose',
    'I': 'Indeterminate',
    'N': 'Negative',
    'ND': 'Not Done',
    'P': 'Positive',
    'PD': 'Pending',
    'U': 'Unknown',
    'X': 'Missing'
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, SERSTAT, display=True)


# mapping
colMap = {'EBV_IGG_CAD_DON':'EpsteinBarr_IGG_DON','EBV_IGM_CAD_DON':'EpsteinBarr_IGM_DON', 'EBV_SEROSTATUS':'EpsteinBarrSeroStatusTransplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: SERSTAT")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, ['EpsteinBarrSeroStatusTransplant_CAN'])
df_don  = uf.insertIntoDataFrame(df_don, ['EpsteinBarr_IGG_DON','EpsteinBarr_IGM_DON'])
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column EBV_SEROSTATUS Unique Vaue(s) ['Not Done', 'Positive', 'Negative', 'Unknown', 'Missing']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column EBV_IGG_CAD_DON Unique Vaue(s) ['Positive', 'Negative', 'Not Done', 'Indeterminate', 'Pending', 'Missing', 'Unknown']
Categories (7, object): ['Indeterminate', 'Missing', 'Negative', 'Not Done', 'Pending', 'Positive', 'Unknown']
Converted Column EBV_IGM_CAD_DON Unique Vaue(s) ['Negative', 'Not Done', 'Positive', 'Indeterminate', 'Pending', 'Missing', 'Unknown']
Categories (7, object): ['Indeterminate', 'Missing', 'Negative', 'Not Done', 'Pending', 'Positive', 'Unknown']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
89,EpsteinBarr_IGG_DON,DECEASED DONOR EPSTEIN BARR VIRUS BY IGG TEST RESULT,DDR,2006-05-03,NaT,CLINICAL INFORMATION,CHAR(2),SERSTAT,,EBV_IGG_CAD_DON,Category,FMTNAME: SERSTAT
90,EpsteinBarr_IGM_DON,DECEASED DONOR EPSTEIN BARR VIRUS BY IGM TEST RESULT,DDR,2006-05-03,NaT,CLINICAL INFORMATION,CHAR(2),SERSTAT,,EBV_IGM_CAD_DON,Category,FMTNAME: SERSTAT
91,EpsteinBarrSeroStatusTransplant_CAN,RECIPIENT EBV STATUS @ TRANSPLANT,CALCULATED,NaT,NaT,,CHAR(2),SERSTAT,,EBV_SEROSTATUS,Category,FMTNAME: SERSTAT


### HBV

In [212]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'HBV', True)

                     count unique top   freq
HBV_CORE             28343      4   N  25699
HBV_SUR_ANTIGEN      28342      4   N  27403
HBV_SURF_TOTAL       19386      4   N  13453
HBV_NAT              13240      4  ND  10697
HBV_CORE_DON         28751      5   N  28146
HBV_SUR_ANTIGEN_DON  28736      4   N  28698
HBV_NAT_DON          20183      4   N  20051

NaNs:
HBV_CORE                 408
HBV_SUR_ANTIGEN          409
HBV_SURF_TOTAL          9365
HBV_NAT                15511
HBV_CORE_DON               0
HBV_SUR_ANTIGEN_DON       15
HBV_NAT_DON             8568
dtype: int64

Datatypes:
HBV_CORE               object
HBV_SUR_ANTIGEN        object
HBV_SURF_TOTAL         object
HBV_NAT                object
HBV_CORE_DON           object
HBV_SUR_ANTIGEN_DON    object
HBV_NAT_DON            object
dtype: object


                 Feature                                        Description                         FormSection DataType SASAnalysisFormat Comment                                 

In [213]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, SERSTAT, display=True)


# mapping
colMap = {'HBV_CORE': 'Hepatitis_B_CoreAntibody_CAN','HBV_CORE_DON':'Hepatitis_B_CoreAntibody_DON', 
          'HBV_NAT':'HBV_NAT_Result_CAN', 'HBV_NAT_DON':'HBV_NAT_Result_DON', 
          'HBV_SURF_TOTAL':'SurfaceHBVAntibodyTotalTransplant_CAN', 
          'HBV_SUR_ANTIGEN':'SurfaceAntigenHEP_B_CAN', 'HBV_SUR_ANTIGEN_DON':'SurfaceAntigenHEP_B_DON'}


# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: SERSTAT")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, ['Hepatitis_B_CoreAntibody_CAN','HBV_NAT_Result_CAN','SurfaceHBVAntibodyTotalTransplant_CAN','SurfaceAntigenHEP_B_CAN'])
df_don  = uf.insertIntoDataFrame(df_don, ['Hepatitis_B_CoreAntibody_DON','HBV_NAT_Result_DON','SurfaceAntigenHEP_B_DON'])
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column HBV_CORE Unique Vaue(s) ['Negative', 'Positive', 'Not Done', 'Unknown', 'Missing']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column HBV_SUR_ANTIGEN Unique Vaue(s) ['Negative', 'Positive', 'Not Done', 'Unknown', 'Missing']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column HBV_SURF_TOTAL Unique Vaue(s) ['Missing', 'Negative', 'Not Done', 'Unknown', 'Positive']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column HBV_NAT Unique Vaue(s) ['Missing', 'Not Done', 'Unknown', 'Negative', 'Positive']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column HBV_CORE_DON Unique Vaue(s) ['Negative', 'Positive', 'Not Done', 'Pending', 'Indeterminate']
Categories (5, object): ['Indeterminate', 'Negative', 'Not Done', 'Pending', 'Positive']
Converted Column HBV_SUR_ANTIGEN_DON Unique Vaue(s) ['Neg

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
117,Hepatitis_B_CoreAntibody_CAN,RECIPIENT HEPATITIS B-CORE ANTIBODY,TRR,1987-10-01,NaT,,CHAR(2),SERSTAT,,HBV_CORE,Category,FMTNAME: SERSTAT
118,Hepatitis_B_CoreAntibody_DON,DONOR HBV CORE ANTIBODY,DDR/LDR,1994-04-01,NaT,,CHAR(2),SERSTAT,,HBV_CORE_DON,Category,FMTNAME: SERSTAT
119,HBV_NAT_Result_CAN,TRR HBV NAT RESULT,TRR,2018-02-28,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(2),,,HBV_NAT,Category,FMTNAME: SERSTAT
120,HBV_NAT_Result_DON,DDR HBV NAT Results:,DDR,2015-03-31,NaT,CLINICAL INFORMATION,CHAR(2),SERSTAT,,HBV_NAT_DON,Category,FMTNAME: SERSTAT
121,SurfaceAntigenHEP_B_CAN,RECIPIENT HEP B SURFACE ANTIGEN,TRR,1987-10-01,NaT,,CHAR(2),SERSTAT,,HBV_SUR_ANTIGEN,Category,FMTNAME: SERSTAT
122,SurfaceAntigenHEP_B_DON,DONOR HEP B SURFACE ANTIGEN,DDR/LDR,1987-10-01,NaT,,CHAR(2),SERSTAT,,HBV_SUR_ANTIGEN_DON,Category,FMTNAME: SERSTAT
123,SurfaceHBVAntibodyTotalTransplant_CAN,RECIPIENT HBV Surface Antibody Total @ TRANSPLANT,TRR,2015-03-31,NaT,CLINICAL INFORMATION,CHAR(2),SERSTAT,,HBV_SURF_TOTAL,Category,FMTNAME: SERSTAT


### CMV

In [214]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CMV', True)

            count unique top   freq
CMV_STATUS  28344      4   P  16029
CMV_IGG      8564      4   P   5010
CMV_IGM      8563      4   N   5869
CMV_DON     28742      5   P  17600

NaNs:
CMV_STATUS      407
CMV_IGG       20187
CMV_IGM       20188
CMV_DON           9
dtype: int64

Datatypes:
CMV_STATUS    object
CMV_IGG       object
CMV_IGM       object
CMV_DON       object
dtype: object


       Feature                                                     Description                         FormSection DataType SASAnalysisFormat                                                                       Comment Information
35     CMV_DON  DONOR SEROLOGY ANTI CMV (FOR LIVING DONOR, PRE UNET DATA ONLY)                CLINICAL INFORMATION  CHAR(2)           SERSTAT  START DATE: DECEASED DONORS: 10/1/87-PRESENT; LIVING DONORS 10/1/90-10/25/99     Unknown
36     CMV_IGG                  RECIPIENT-CMV BY IGG TEST RESULT  @ TRANSPLANT  PRETRANSPLANT CLINICAL INFORMATION  CHAR(2)           SERSTAT   

In [215]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, SERSTAT, display=True)


# mapping
colMap = {'CMV_DON': 'SerologyAntiCMV_DON','CMV_IGG': 'CMV_IGG_Transplant_CAN', 'CMV_IGM': 'CMV_IGM_Transplant_CAN', 'CMV_STATUS': 'CMVStatus_Transplant_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, ['CMV_IGG_Transplant_CAN','CMV_IGM_Transplant_CAN','CMVStatus_Transplant_CAN'])
df_don  = uf.insertIntoDataFrame(df_don, ['SerologyAntiCMV_DON'])
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column CMV_STATUS Unique Vaue(s) ['Positive', 'Negative', 'Unknown', 'Not Done', 'Missing']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column CMV_IGG Unique Vaue(s) ['Positive', 'Negative', 'Not Done', 'Unknown', 'Missing']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column CMV_IGM Unique Vaue(s) ['Negative', 'Not Done', 'Unknown', 'Positive', 'Missing']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column CMV_DON Unique Vaue(s) ['Positive', 'Negative', 'Indeterminate', 'Not Done', 'Missing', 'Unknown']
Categories (6, object): ['Indeterminate', 'Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
35,SerologyAntiCMV_DON,"DONOR SEROLOGY ANTI CMV (FOR LIVING DONOR, PRE UNET DATA ONLY)",DDR/LDR,NaT,NaT,CLINICAL INFORMATION,CHAR(2),SERSTAT,START DATE: DECEASED DONORS: 10/1/87-PRESENT; LIVING DONORS 10/1/90-10/25/99,CMV_DON,Category,
36,CMV_IGG_Transplant_CAN,RECIPIENT-CMV BY IGG TEST RESULT @ TRANSPLANT,TRR,1999-10-25,2015-03-31,PRETRANSPLANT CLINICAL INFORMATION,CHAR(2),SERSTAT,,CMV_IGG,Category,
37,CMV_IGM_Transplant_CAN,RECIPIENT-CMV BY IGM TEST RESULT @ TRANSPLANT,TRR,1999-10-25,2015-03-31,PRETRANSPLANT CLINICAL INFORMATION,CHAR(2),SERSTAT,,CMV_IGM,Category,
38,CMVStatus_Transplant_CAN,RECIPIENT CMV Status @ TRANSPLANT,TRR,2015-03-31,NaT,CLINICAL INFORMATION,CHAR(2),SERSTAT,,CMV_STATUS,Category,


### HIV

In [216]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, '^HIV', True)

                count unique top   freq
HIV_SEROSTATUS  28342      4   N  27604
HIV_NAT         13241      4  ND  10877
HIV_NAT_DON     20183      3   N  20122

NaNs:
HIV_SEROSTATUS      409
HIV_NAT           15510
HIV_NAT_DON        8568
dtype: int64

Datatypes:
HIV_SEROSTATUS    object
HIV_NAT           object
HIV_NAT_DON       object
dtype: object


            Feature                             Description                         FormSection DataType SASAnalysisFormat Comment Information
150         HIV_NAT                      TRR HIV NAT RESULT  PRETRANSPLANT CLINICAL INFORMATION  CHAR(2)                               Unknown
151     HIV_NAT_DON                    DDR HIV NAT Results:                CLINICAL INFORMATION  CHAR(2)           SERSTAT             Unknown
152  HIV_SEROSTATUS  RECIPIENT HIV SEROSTATUS AT TRANSPLANT                CLINICAL INFORMATION  CHAR(2)                               Unknown


HIV_SEROSTATUS: ['N' 'U' 'ND' 'P' nan]
HIV_NAT: [nan 'ND' 'U' 'N' 'P']


In [217]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, SERSTAT, display=True)


# mapping
colMap = {'HIV_NAT':'HIV_NAT_PreTransplant_CAN','HIV_NAT_DON':'HIV_NAT_Result_DON', 'HIV_SEROSTATUS':'HIV_SeroStatusTransplant_CAN'}


# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: SERSTAT")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, ['HIV_NAT_PreTransplant_CAN','HIV_SeroStatusTransplant_CAN'])
df_don  = uf.insertIntoDataFrame(df_don, ['HIV_NAT_Result_DON'])
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column HIV_SEROSTATUS Unique Vaue(s) ['Negative', 'Unknown', 'Not Done', 'Positive', 'Missing']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column HIV_NAT Unique Vaue(s) ['Missing', 'Not Done', 'Unknown', 'Negative', 'Positive']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column HIV_NAT_DON Unique Vaue(s) ['Missing', 'Negative', 'Unknown', 'Not Done']
Categories (4, object): ['Missing', 'Negative', 'Not Done', 'Unknown']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
150,HIV_NAT_PreTransplant_CAN,TRR HIV NAT RESULT,TRR,2018-02-28,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(2),,,HIV_NAT,Category,FMTNAME: SERSTAT
151,HIV_NAT_Result_DON,DDR HIV NAT Results:,DDR,2015-03-31,NaT,CLINICAL INFORMATION,CHAR(2),SERSTAT,,HIV_NAT_DON,Category,FMTNAME: SERSTAT
152,HIV_SeroStatusTransplant_CAN,RECIPIENT HIV SEROSTATUS AT TRANSPLANT,TRR,1987-10-01,NaT,CLINICAL INFORMATION,CHAR(2),,,HIV_SEROSTATUS,Category,FMTNAME: SERSTAT


### HCV

In [218]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'HCV', True)

                count unique top   freq
HCV_SEROSTATUS  28342      4   N  27136
HCV_NAT         13242      4  ND   9983
HCV_NAT_DON     20183      5   N  19230

NaNs:
HCV_SEROSTATUS      409
HCV_NAT           15509
HCV_NAT_DON        8568
dtype: int64

Datatypes:
HCV_SEROSTATUS    object
HCV_NAT           object
HCV_NAT_DON       object
dtype: object


            Feature             Description                         FormSection DataType SASAnalysisFormat Comment Information
124         HCV_NAT      TRR HCV NAT RESULT  PRETRANSPLANT CLINICAL INFORMATION  CHAR(2)                               Unknown
125     HCV_NAT_DON    DDR HCV NAT Results:                CLINICAL INFORMATION  CHAR(2)           SERSTAT             Unknown
126  HCV_SEROSTATUS  RECIPIENT HEP C STATUS                                      CHAR(2)           SERSTAT             Unknown


HCV_SEROSTATUS: ['N' 'U' 'P' 'ND' nan]
HCV_NAT: [nan 'ND' 'U' 'N' 'P']
HCV_NAT_DON: [nan 'N' 'U' 'ND' 'P' 'I']


In [219]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, SERSTAT, display=True)


# mapping
colMap = {'HCV_NAT': 'HCV_NAT_PreTranspant_CAN','HCV_NAT_DON':'HCV_NAT_Result_DON', 'HCV_SEROSTATUS':'HEP_C_SerostatusStatus_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: SERSTAT")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, ['HCV_NAT_PreTranspant_CAN','HEP_C_SerostatusStatus_CAN'])
df_don  = uf.insertIntoDataFrame(df_don, ['HCV_NAT_Result_DON'])
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column HCV_SEROSTATUS Unique Vaue(s) ['Negative', 'Unknown', 'Positive', 'Not Done', 'Missing']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column HCV_NAT Unique Vaue(s) ['Missing', 'Not Done', 'Unknown', 'Negative', 'Positive']
Categories (5, object): ['Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']
Converted Column HCV_NAT_DON Unique Vaue(s) ['Missing', 'Negative', 'Unknown', 'Not Done', 'Positive', 'Indeterminate']
Categories (6, object): ['Indeterminate', 'Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
124,HCV_NAT_PreTranspant_CAN,TRR HCV NAT RESULT,TRR,2018-02-28,NaT,PRETRANSPLANT CLINICAL INFORMATION,CHAR(2),,,HCV_NAT,Category,FMTNAME: SERSTAT
125,HCV_NAT_Result_DON,DDR HCV NAT Results:,DDR,2015-03-31,NaT,CLINICAL INFORMATION,CHAR(2),SERSTAT,,HCV_NAT_DON,Category,FMTNAME: SERSTAT
126,HEP_C_SerostatusStatus_CAN,RECIPIENT HEP C STATUS,TRR,1994-04-01,NaT,,CHAR(2),SERSTAT,,HCV_SEROSTATUS,Category,FMTNAME: SERSTAT


### HBSAB_DON & VDRL_DON
- HBsAb (Hepatitis B Surface Antibody) is a blood test that detects antibodies produced by the immune system in response to the hepatitis B virus (HBV)
- RPR (Rapid Plasma Reagin) and VDRL (Venereal Disease Research Laboratory) are both nontreponemal tests used to screen for syphilis.

In [220]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'HBSAB_DON|VDRL', True)

           count unique top   freq
VDRL_DON   28731      3   N  27968
HBSAB_DON  28706      6  ND  24420

NaNs:
VDRL_DON     20
HBSAB_DON    45
dtype: int64

Datatypes:
VDRL_DON     object
HBSAB_DON    object
dtype: object


       Feature                       Description           FormSection DataType SASAnalysisFormat Comment Information
116  HBSAB_DON  DECEASED DONOR HBSAB TEST RESULT  CLINICAL INFORMATION  CHAR(2)           SERSTAT             Unknown
292   VDRL_DON    DECEASED DONOR-RPR-VDRL RESULT  CLINICAL INFORMATION  CHAR(2)           SERSTAT             Unknown


VDRL_DON: ['N' 'ND' 'P' nan]
HBSAB_DON: ['ND' 'P' 'N' 'U' 'I' nan 'C']


In [221]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, SERSTAT, display=True)


# mapping
colMap = {'HBSAB_DON':'AntibodyResultHBSAB_DON', 'VDRL_DON': 'AntibodyResultRPR_VDRL_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: SERSTAT")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column VDRL_DON Unique Vaue(s) ['Negative', 'Not Done', 'Positive', 'Missing']
Categories (4, object): ['Missing', 'Negative', 'Not Done', 'Positive']
Converted Column HBSAB_DON Unique Vaue(s) ['Not Done', 'Positive', 'Negative', 'Unknown', 'Indeterminate', 'Missing', 'Cannot Disclose']
Categories (7, object): ['Cannot Disclose', 'Indeterminate', 'Missing', 'Negative', 'Not Done', 'Positive', 'Unknown']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
116,AntibodyResultHBSAB_DON,DECEASED DONOR HBSAB TEST RESULT,DDR,2006-05-03,NaT,CLINICAL INFORMATION,CHAR(2),SERSTAT,,HBSAB_DON,Category,FMTNAME: SERSTAT
292,AntibodyResultRPR_VDRL_DON,DECEASED DONOR-RPR-VDRL RESULT,DDR,1987-10-01,NaT,CLINICAL INFORMATION,CHAR(2),SERSTAT,,VDRL_DON,Category,FMTNAME: SERSTAT


##### HEP_C_ANTI_DON

In [222]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'HEP_C_ANTI_DON', True)

                count unique top   freq
HEP_C_ANTI_DON  28748      5   N  27299

NaNs:
HEP_C_ANTI_DON    3
dtype: int64

Datatypes:
HEP_C_ANTI_DON    object
dtype: object


            Feature                                    Description           FormSection DataType SASAnalysisFormat Comment Information
138  HEP_C_ANTI_DON  DECEASED DONOR-ANTIBODY TO HEP C VIRUS RESULT  CLINICAL INFORMATION  CHAR(2)           SERSTAT             Unknown


HEP_C_ANTI_DON: ['N' 'P' nan 'PD' 'ND' 'I']


In [223]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, SERSTAT, display=True)


# mapping
colMap = {'HEP_C_ANTI_DON': 'Antibody_HEP_C_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: SERSTAT")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column HEP_C_ANTI_DON Unique Vaue(s) ['Negative', 'Positive', 'Missing', 'Pending', 'Not Done', 'Indeterminate']
Categories (6, object): ['Indeterminate', 'Missing', 'Negative', 'Not Done', 'Pending', 'Positive']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
138,Antibody_HEP_C_DON,DECEASED DONOR-ANTIBODY TO HEP C VIRUS RESULT,DDR,1994-04-01,NaT,CLINICAL INFORMATION,CHAR(2),SERSTAT,,HEP_C_ANTI_DON,Category,FMTNAME: SERSTAT


### CDC_RISK

In [224]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CDC_RISK', True)

                  count unique top   freq
CDC_RISK_HIV_DON  28747      3   N  21178

NaNs:
CDC_RISK_HIV_DON    4
dtype: int64

Datatypes:
CDC_RISK_HIV_DON    object
dtype: object


             Feature                                                                           Description    FormSection DataType SASAnalysisFormat Comment Information
28  CDC_RISK_HIV_DON  DDR: Per PHS, Does the Donor Have Risk Factors for Blood-Borne Disease Transmission?  DONOR HISTORY  CHAR(1)                               Unknown


CDC_RISK_HIV_DON: ['N' 'Y' nan 'U']


In [225]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'CDC_RISK_HIV_DON': 'HIV_Risk_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column CDC_RISK_HIV_DON Unique Vaue(s) ['No', 'Yes', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
28,HIV_Risk_DON,"DDR: Per PHS, Does the Donor Have Risk Factors for Blood-Borne Disease Transmission?",DDR,2004-06-30,NaT,DONOR HISTORY,CHAR(1),,,CDC_RISK_HIV_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing


### TX (PROCEDURE TYPE)

In [226]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'TX_', True)

                  count unique  top   freq         mean       std     min     25%     50%     75%     max
TX_PROCEDUR_TY  28751.0    NaN  NaN    NaN        501.0       0.0   501.0   501.0   501.0   501.0   501.0
TX_TYPE           28328      2    O  28313          NaN       NaN     NaN     NaN     NaN     NaN     NaN
TX_YEAR         28751.0    NaN  NaN    NaN  2016.564989  3.112244  2011.0  2014.0  2017.0  2019.0  2021.0

NaNs:
TX_PROCEDUR_TY      0
TX_TYPE           423
TX_YEAR             0
dtype: int64

Datatypes:
TX_PROCEDUR_TY     int64
TX_TYPE           object
TX_YEAR            int64
dtype: object


            Feature                            Description FormSection DataType SASAnalysisFormat Comment Information
278  TX_PROCEDUR_TY  RECIPIENT PROCEDURE TYPE - CALCULATED                  NUM            THPROC             Unknown
279         TX_TYPE                     TYPE OF TRANSPLANT              CHAR(1)        TX_TYPE_TH             Unknown
280         TX_YEAR              

In [227]:
# fill NaN with X: Missing
df['TX_TYPE'] = df['TX_TYPE'].fillna('X')

# SASAnalysisFormat: THPROC
mapping = { 
    501: "Heart"
}

# mapping feature
df = uf.mappingCol(df, 'TX_PROCEDUR_TY', mapping, display=True)


# SASAnalysisFormat: TX_TYPE_TH
mapping = { 
    'D': 'Double',
    'H': 'Heterotopic',
    'O': 'Orthotopic',
    'S': 'Single',
    'X': 'Missing'
}

# mapping feature
df = uf.mappingCol(df, 'TX_PROCEDUR_TY', mapping, display=True)


# mapping
colMap = {'TX_PROCEDUR_TY':'TransplantProcedure_CAN','TX_TYPE':'TransplantType_CAN','TX_YEAR':'TransplantYear'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"YYYY")
df_dict = uf.updateDictionaryInformation(df_dict, [279], txt=f'SASAnalysisFormat: TX_TYPE_TH', FeatureType='Category')
df_dict = uf.updateDictionaryInformation(df_dict, [278], txt=f'{DROP} SASAnalysisFormat: THPROC - Same Value', FeatureType='Category').copy()

# update dataframe
df_can = uf.insertIntoDataFrame(df_can, ['TransplantType_CAN'])
df_nominal = uf.insertIntoDataFrame(df_nominal, ['TransplantType_CAN'])
df_both = uf.insertIntoDataFrame(df_both, ['TransplantYear'])
df_drop = uf.insertIntoDataFrame(df_drop, ['TransplantProcedure_CAN', 'TransplantYear'])
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column TX_PROCEDUR_TY Unique Vaue(s) ['Heart']
Categories (1, object): ['Heart']
Converted Column TX_PROCEDUR_TY Unique Vaue(s) ['Heart']
Categories (1, object): ['Heart']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
278,TransplantProcedure_CAN,RECIPIENT PROCEDURE TYPE - CALCULATED,CALCULATED,NaT,NaT,,NUM,THPROC,,TX_PROCEDUR_TY,Category,** DROP ** SASAnalysisFormat: THPROC - Same Value
279,TransplantType_CAN,TYPE OF TRANSPLANT,CALCULATED,NaT,NaT,,CHAR(1),TX_TYPE_TH,,TX_TYPE,Category,SASAnalysisFormat: TX_TYPE_TH
280,TransplantYear,TRANSPLANT YEAR,CALCULATED,NaT,NaT,,NUM,,,TX_YEAR,Numeric,YYYY


### RETYP

In [228]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'RETYP', True)

           count unique top   freq
DON_RETYP  28083      2   Y  17177

NaNs:
DON_RETYP    668
dtype: int64

Datatypes:
DON_RETYP    object
dtype: object


      Feature                          Description       FormSection DataType SASAnalysisFormat Comment Information
78  DON_RETYP  DECEASED DONOR-RETYPED AT TX CENTER  TEST INFORMATION  CHAR(1)                               Unknown


DON_RETYP: ['N' 'Y' nan]


In [229]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'DON_RETYP': 'DeceasedRetyped_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/X to No/Yes/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column DON_RETYP Unique Vaue(s) ['No', 'Yes', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
78,DeceasedRetyped_DON,DECEASED DONOR-RETYPED AT TX CENTER,RH,1987-10-01,NaT,TEST INFORMATION,CHAR(1),,,DON_RETYP,Category,N/Y/X to No/Yes/Missing


### CRSMATCH

In [230]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CRSMATCH', True)

               count unique top   freq
CRSMATCH_DONE  28085      2   Y  26524

NaNs:
CRSMATCH_DONE    666
dtype: int64

Datatypes:
CRSMATCH_DONE    object
dtype: object


          Feature          Description       FormSection DataType SASAnalysisFormat Comment Information
47  CRSMATCH_DONE  CROSSMATCH DONE Y/N  TEST INFORMATION  CHAR(1)                               Unknown


CRSMATCH_DONE: ['N' 'Y' nan]


In [231]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'CRSMATCH_DONE': 'CrossMatchDone'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/X to No/Yes/Missing")

# update dataframe
df_both  = uf.insertIntoDataFrame(df_both, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column CRSMATCH_DONE Unique Vaue(s) ['No', 'Yes', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
47,CrossMatchDone,CROSSMATCH DONE Y/N,RH,1994-04-01,NaT,TEST INFORMATION,CHAR(1),,,CRSMATCH_DONE,Category,N/Y/X to No/Yes/Missing


### CPRA

###### PanelReactiveAntibody
- The Panel Reactive Antibody (PRA), or Calculated Panel Reactive Antibody (cPRA), is a metric used in organ transplantation.
    - cPRA = 0%: The recipient is unlikely to have antibodies against most potential donors and is considered less "sensitized," meaning they have a broad range of compatible donor options.
    - Higher cPRA (e.g., 80%+): The recipient has a high level of sensitization, reducing the likelihood of finding compatible donors.
    - cPRA = 100%: The recipient has antibodies against nearly all potential donors, making finding a compatible organ highly challenging.

- CPRA values typically range from 0% to 100%.
    - The results are often grouped into categories, such as:
        - 0%       No Sensitization
        - 1-20%    Low Sensitization
        - 21-50%   Some Sensitization
        - 51-80%   Moderate Sensitization
        - 81-98%   High Sensitization
        - 99-100%  Extreme Sensitization
    - Higher CPRA values indicate a higher degree of sensitization:
    - CPRA of 80-100% is considered highly sensitized
    - CPRA >98% may receive extra priority in organ allocation system

In [232]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CPRA', False)

             count       mean        std  min  25%  50%   75%    max
CPRA       15523.0  11.305611  23.541431  0.0  0.0  0.0   7.0  100.0
CPRA_PEAK  15511.0  15.264457  26.876729  0.0  0.0  0.0  21.0  100.0

NaNs:
CPRA         13228
CPRA_PEAK    13240
dtype: int64

Datatypes:
CPRA         float64
CPRA_PEAK    float64
dtype: object


      Feature                 Description           FormSection DataType SASAnalysisFormat Comment Information
43       CPRA  Recipient Most Recent CPRA  CLINICAL INFORMATION      NUM                               Unknown
44  CPRA_PEAK          RecipientPeak CPRA  CLINICAL INFORMATION      NUM                               Unknown




In [233]:
# mapping
colMap = {'CPRA': 'CPRA_Recent_CAN', 'CPRA_PEAK':'CPRA_Peak_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_numeric  = uf.insertIntoDataFrame(df_numeric, list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
43,CPRA_Recent_CAN,Recipient Most Recent CPRA,RH,2015-03-31,NaT,CLINICAL INFORMATION,NUM,,,CPRA,Numeric,
44,CPRA_Peak_CAN,RecipientPeak CPRA,RH,2015-03-31,NaT,CLINICAL INFORMATION,NUM,,,CPRA_PEAK,Numeric,


### DA1 & DA2
- DA1 and DA2 refer to specific epitopes associated with HLA-DA molecules, which are a part of the major histocompatibility complex (MHC) class II. HLA-DA molecules play critical roles in the immune system by presenting peptide antigens to CD4+ T helper cells.

In [234]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DA\d', False)

       count       mean         std  min   25%   50%   75%     max
DA1  28748.0  13.910324  125.113362  1.0   2.0   2.0  11.0  6802.0
DA2  28738.0  58.904377  340.186315  0.0  11.0  29.0  36.0  6802.0

NaNs:
DA1     3
DA2    13
dtype: int64

Datatypes:
DA1    float64
DA2    float64
dtype: object


   Feature       Description                             FormSection DataType SASAnalysisFormat Comment Information
48     DA1  DONOR A1 ANTIGEN  DONOR CENTER HISTOCOMPATIBILITY TYPING      NUM            ALOCUS             Unknown
49     DA2  DONOR A2 ANTIGEN  DONOR CENTER HISTOCOMPATIBILITY TYPING      NUM            ALOCUS             Unknown




In [235]:
# fill NaN with X: Missing
df[features] = df[features].fillna(999).astype(int)

# df_flat FMTNAME: ALOCUS
ALOCUS = {
    999: "Missing",
    0: "0",
    1: "1",
    2: "2",
    3: "3",
    9: "9",
    10: "10",
    11: "11",
    19: "19",
    23: "23",
    24: "24",
    25: "25",
    26: "26",
    28: "28",
    29: "29",
    30: "30",
    31: "31",
    32: "32",
    33: "33",
    34: "34",
    36: "36",
    43: "43",
    66: "66",
    68: "68",
    69: "69",
    74: "74",
    80: "80",
    97: "Unknown",
    98: "No second antigen detected",
    99: "Not Tested",
    101: "01:01",
    102: "01:02",
    201: "02:01",
    202: "02:02",
    203: "02:03",
    205: "02:05",
    206: "02:06",
    207: "02:07",
    210: "210",
    211: "02:10",
    218: "02:18",
    301: "03:01",
    302: "03:02",
    1101: "11:01",
    1102: "11:02",
    2402: "24:02",
    2403: "24:03",
    2601: "26:01",
    2602: "26:02",
    2603: "26:03",
    2901: "29:01",
    2902: "29:02",
    3001: "30:01",
    3002: "30:02",
    3204: "32:04",
    3301: "33:01",
    3303: "33:03",
    3401: "34:01",
    3402: "34:02",
    6601: "66:01",
    6602: "66:02",
    6801: "68:01",
    6802: "68:02"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, ALOCUS, display=True)


# mapping
colMap = {'DA1': 'AntigenDA1_DON', 'DA2':'AntigenDA2_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: ALOCUS")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column DA1 Unique Vaue(s) ['2', '33', '3', '36', '1', ..., '26:01', '02:02', '03:02', '01:02', '28']
Length: 41
Categories (41, object): ['01:01', '01:02', '02:01', '02:02', ..., '68:02', '69', '74', 'Missing']
Converted Column DA2 Unique Vaue(s) ['29', '68', '3', '24', '74', ..., '26:01', '01:01', '02:06', '29:01', '0']
Length: 47
Categories (47, object): ['0', '01:01', '02:01', '02:03', ..., '74', '80', 'Missing', 'No second antigen detected']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
48,AntigenDA1_DON,DONOR A1 ANTIGEN,CALCULATED,1987-10-01,NaT,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,ALOCUS,,DA1,Category,FMTNAME: ALOCUS
49,AntigenDA2_DON,DONOR A2 ANTIGEN,CALCULATED,1987-10-01,NaT,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,ALOCUS,,DA2,Category,FMTNAME: ALOCUS


### DB1 & DB2

In [236]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DB\d', False)

       count       mean         std  min   25%   50%   75%     max
DB1  28748.0  57.741095  297.390539  7.0   8.0  35.0  45.0  5603.0
DB2  28736.0  94.724388  433.243047  7.0  44.0  51.0  60.0  8201.0

NaNs:
DB1     3
DB2    15
dtype: int64

Datatypes:
DB1    float64
DB2    float64
dtype: object


   Feature       Description                             FormSection DataType SASAnalysisFormat Comment Information
63     DB1  DONOR B1 ANTIGEN  DONOR CENTER HISTOCOMPATIBILITY TYPING      NUM            BLOCUS             Unknown
64     DB2  DONOR B2 ANTIGEN  DONOR CENTER HISTOCOMPATIBILITY TYPING      NUM            BLOCUS             Unknown




In [237]:
# fill NaN with X: Missing
df[features] = df[features].fillna(999).astype(int)

# df_flat FMTNAME: BLOCUS
BLOCUS = {
   999: "Missing",
    0: '0',
    5: '5',
    7: '7',
    8: '8',
    12: '12',
    13: '13',
    14: '14',
    15: '15',
    16: '16',
    17: '17',
    18: '18',
    21: '21',
    22: '22',
    27: '27',
    35: '35',
    37: '37',
    38: '38',
    39: '39',
    40: '40',
    41: '41',
    42: '42',
    44: '44',
    45: '45',
    46: '46',
    47: '47',
    48: '48',
    49: '49',
    50: '50',
    51: '51',
    52: '52',
    53: '53',
    54: '54',
    55: '55',
    56: '56',
    57: '57',
    58: '58',
    59: '59',
    60: '60',
    61: '61',
    62: '62',
    63: '63',
    64: '64',
    65: '65',
    67: '67',
    70: '70',
    71: '71',
    72: '72',
    73: '73',
    75: '75',
    76: '76',
    77: '77',
    78: '78',
    81: '81',
    82: '82',
    97: 'Unknown',
    98: 'No second antigen detected',
    99: 'Not Tested',
    702: '07:02',
    703: '703',
    704: '07:03',
    714: '07:14',
    801: '08:01',
    802: '08:02',
    803: '08:03',
    804: '08:04',
    1301: '13:01',
    1302: '13:02',
    1304: '13:04',
    1401: '14:01',
    1402: '14:02',
    1501: '15:01',
    1502: '15:02',
    1503: '15:03',
    1504: '15:04',
    1506: '15:06',
    1507: '15:07',
    1510: '15:10',
    1511: '15:11',
    1512: '15:12',
    1513: '15:13',
    1516: '15:16',
    1517: '15:17',
    1518: '15:18',
    1520: '15:20',
    1521: '15:21',
    1522: '15:22',
    1524: '15:24',
    1527: '15:27',
    2703: '27:03',
    2704: '27:04',
    2705: '27:05',
    2706: '27:06',
    2708: '27:08',
    3501: '35:01',
    3502: '35:02',
    3503: '35:03',
    3508: '35:08',
    3512: '35:12',
    3801: '38:01',
    3802: '38:02',
    3901: '39:01',
    3902: '39:02',
    3904: '39:04',
    3905: '39:05',
    3906: '39:06',
    3913: '39:13',
    4001: '40:01',
    4002: '40:02',
    4003: '40:03',
    4004: '40:04',
    4005: '40:05',
    4006: '40:06',
    4101: '41:01',
    4102: '41:02',
    4201: '42:01',
    4202: '42:02',
    4402: '44:02',
    4403: '44:03',
    4415: '44:15',
    4801: '48:01',
    4802: '48:02',
    5001: '50:01',
    5002: '50:02',
    5101: '51:01',
    5102: '51:02',
    5103: '51:03',
    5501: '55:01',
    5502: '55:02',
    5504: '55:04',
    5601: '56:01',
    5603: '56:03',
    5701: '57:01',
    5703: '57:03',
    7801: '78:01',
    8201: '82:01',
    8301: '83:01'
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, BLOCUS, display=True)
    

# mapping
colMap = {'DB1': 'AntigenDB1_DON', 'DB2':'AntigenDB2_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: BLOCUS")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column DB1 Unique Vaue(s) ['44', '49', '7', '39', '61', ..., '41:02', '35:02', '15:16', '15:10', '56:03']
Length: 83
Categories (83, object): ['07:02', '08:01', '13', '13:02', ..., '78', '8', '81', 'Missing']
Converted Column DB2 Unique Vaue(s) ['49', '60', '44', '51', '64', ..., '35:02', '41:02', '15:16', '42:02', '82:01']
Length: 90
Categories (90, object): ['07:02', '08:01', '12', '13', ..., '82', '82:01', 'Missing', 'No second antigen detected']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
63,AntigenDB1_DON,DONOR B1 ANTIGEN,CALCULATED,1987-10-01,NaT,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,BLOCUS,,DB1,Category,FMTNAME: BLOCUS
64,AntigenDB2_DON,DONOR B2 ANTIGEN,CALCULATED,1987-10-01,NaT,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,BLOCUS,,DB2,Category,FMTNAME: BLOCUS


### RA1 & RA2

In [238]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'RA\d', False)

       count        mean         std  min   25%   50%   75%     max
RA1  26549.0   36.913368  276.723984  0.0   2.0   2.0  23.0  6802.0
RA2  26549.0  117.584994  615.089575  0.0  11.0  30.0  68.0  6802.0

NaNs:
RA1    2202
RA2    2202
dtype: int64

Datatypes:
RA1    float64
RA2    float64
dtype: object


    Feature           Description           FormSection DataType SASAnalysisFormat Comment                        Information
248     RA1  RECIPIENT A1 ANTIGEN  RECIPIENT HLA TYPING      NUM            ALOCUS          N/Y/U/X to No/Yes/Unknown/Missing
249     RA2  RECIPIENT A2 ANTIGEN  RECIPIENT HLA TYPING      NUM            ALOCUS          N/Y/U/X to No/Yes/Unknown/Missing




In [239]:
# fill NaN with X: Missing
df[features] = df[features].fillna(999).astype(int)

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, ALOCUS, display=True)
    

# mapping
colMap = {'RA1': 'AntigenRA1_CAN', 'RA2':'AntigenRA2_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: ALOCUS")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column RA1 Unique Vaue(s) ['24', '30', '2', '3', 'Missing', ..., '34:02', '80', '33:01', '66:01', '02:07']
Length: 51
Categories (51, object): ['0', '01:01', '01:02', '02:01', ..., '74', '80', 'Missing', 'Not Tested']
Converted Column RA2 Unique Vaue(s) ['24', '68', '3', '26', 'Missing', ..., '10', '34:01', '02:05', '03:02', '11:02']
Length: 52
Categories (52, object): ['0', '01:01', '02:01', '02:02', ..., '9', 'Missing', 'No second antigen detected', 'Not Tested']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
248,AntigenRA1_CAN,RECIPIENT A1 ANTIGEN,RH,1987-10-01,NaT,RECIPIENT HLA TYPING,NUM,ALOCUS,,RA1,Category,FMTNAME: ALOCUS
249,AntigenRA2_CAN,RECIPIENT A2 ANTIGEN,RH,1987-10-01,NaT,RECIPIENT HLA TYPING,NUM,ALOCUS,,RA2,Category,FMTNAME: ALOCUS


### RB1 & RB1

In [240]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'RB\d')

       count        mean         std  min   25%   50%   75%     max
RB1  26549.0  114.082866  511.823424  0.0   8.0  35.0  49.0  5703.0
RB2  26549.0  176.155034  726.858520  0.0  44.0  52.0  61.0  8201.0

NaNs:
RB1    2202
RB2    2202
dtype: int64

Datatypes:
RB1    float64
RB2    float64
dtype: object


    Feature           Description           FormSection DataType SASAnalysisFormat Comment Information
250     RB1  RECIPIENT B1 ANTIGEN  RECIPIENT HLA TYPING      NUM            BLOCUS             Unknown
251     RB2  RECIPIENT B2 ANTIGEN  RECIPIENT HLA TYPING      NUM            BLOCUS             Unknown




In [241]:
# fill NaN with X: Missing
df[features] = df[features].fillna(999).astype(int)

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, BLOCUS, display=True)
    
# mapping
colMap = {'RB1': 'AntigenRB1_CAN', 'RB2':'AntigenRB2_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: BLOCUS")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column RB1 Unique Vaue(s) ['35', '48', '7', 'Missing', '18', ..., '15:07', '15:17', '15:13', '15:02', '15:24']
Length: 101
Categories (101, object): ['0', '07:02', '07:14', '08:01', ..., '81', '82', 'Missing', 'Not Tested']
Converted Column RB2 Unique Vaue(s) ['61', '65', '44', '38', 'Missing', ..., '42:02', '35:08', '27:03', '27:04', '5']
Length: 99
Categories (99, object): ['0', '07:02', '08:01', '13', ..., '82:01', 'Missing', 'No second antigen detected', 'Not Tested']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
250,AntigenRB1_CAN,RECIPIENT B1 ANTIGEN,RH,1987-10-01,NaT,RECIPIENT HLA TYPING,NUM,BLOCUS,,RB1,Category,FMTNAME: BLOCUS
251,AntigenRB2_CAN,RECIPIENT B2 ANTIGEN,RH,1987-10-01,NaT,RECIPIENT HLA TYPING,NUM,BLOCUS,,RB2,Category,FMTNAME: BLOCUS


### [RDR1 & RDR1](https://www.sciencedirect.com/science/article/pii/S0041134503006481?casa_token=ysGMHfKhFkcAAAAA:_oK775b3pZYzCZuzGrA13GO1c9kLTI7SKGOv9panij8OIQLUyKEDVTEeg-nxnp3R9ouIeaB2Wcp7)

In [242]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'RDR\d', True)

        count       mean         std  min   25%   50%   75%      max
RDR1  26549.0  31.596030  207.411179  0.0   4.0   7.0  13.0  10300.0
RDR2  26549.0  58.861275  240.266451  0.0  11.0  14.0  15.0  10300.0

NaNs:
RDR1    2202
RDR2    2202
dtype: int64

Datatypes:
RDR1    float64
RDR2    float64
dtype: object


    Feature            Description           FormSection DataType SASAnalysisFormat Comment Information
252    RDR1  RECIPIENT DR1 ANTIGEN  RECIPIENT HLA TYPING      NUM           DRLOCUS             Unknown
253    RDR2  RECIPIENT DR2 ANTIGEN  RECIPIENT HLA TYPING      NUM           DRLOCUS             Unknown


RDR1: [4.000e+00 8.000e+00 1.300e+01       nan 1.700e+01 1.800e+01 1.000e+00
 7.000e+00 1.500e+01 1.200e+01 1.400e+01 1.100e+01 9.000e+00 1.600e+01
 1.000e+01 1.030e+02 3.000e+00 1.404e+03 0.000e+00 2.000e+00 3.020e+02
 1.101e+03 1.301e+03 1.201e+03 3.010e+02 1.104e+03 1.303e+03 4.010e+02
 9.010e+02 1.010e+02 1.501e+03 1.503e+03 4.030e+02 1.454e+03 1.020e+02
 4.070e+02 9

In [243]:
# fill NaN with X: Missing
df[features] = df[features].fillna(999).astype(int)

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, DRLOCUS, display=True)
    
# mapping
colMap = {'RDR1': 'AntigenRDR1_CAN', 'RDR2':'AntigenRDR2_CAN'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: DRLOCUS")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column RDR1 Unique Vaue(s) ['4', '8', '13', 'Missing', '17', ..., '15:02', '14:01', '12:02', '04:06', '103']
Length: 54
Categories (54, object): ['0', '01:01', '01:02', '01:03', ..., '8', '9', 'Missing', 'Not Tested']
Converted Column RDR2 Unique Vaue(s) ['4', '9', '13', '11', '14', ..., '13:05', '14:03', '08:02', '14:06', '103']
Length: 57
Categories (57, object): ['0', '01:01', '01:02', '01:03', ..., '9', 'Missing', 'No second antigen detected', 'Not Tested']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
252,AntigenRDR1_CAN,RECIPIENT DR1 ANTIGEN,RH,1987-10-01,NaT,RECIPIENT HLA TYPING,NUM,DRLOCUS,,RDR1,Category,FMTNAME: DRLOCUS
253,AntigenRDR2_CAN,RECIPIENT DR2 ANTIGEN,RH,1987-10-01,NaT,RECIPIENT HLA TYPING,NUM,DRLOCUS,,RDR2,Category,FMTNAME: DRLOCUS


### PRAMR

In [244]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PRAMR', True)

            count      mean        std  min  25%  50%  75%    max
PRAMR_CL1  8142.0  7.132031  18.656263  0.0  0.0  0.0  2.0  100.0
PRAMR_CL2  7987.0  5.980218  17.542667  0.0  0.0  0.0  0.0  100.0

NaNs:
PRAMR_CL1    20609
PRAMR_CL2    20764
dtype: int64

Datatypes:
PRAMR_CL1    float64
PRAMR_CL2    float64
dtype: object


       Feature                                        Description           FormSection DataType SASAnalysisFormat Comment Information
217  PRAMR_CL1   RECIPIENT MOST RECENT PRA% CLASS I @ TRANSPLANT   RECIPIENT HLA TYPING      NUM                               Unknown
218  PRAMR_CL2  RECIPIENT MOST RECENT PRA% CLASS II @ TRANSPLANT   RECIPIENT HLA TYPING      NUM                               Unknown


PRAMR_CL1: [  2.   0.   7.  11.  nan  12.  84.   9.  23.  42.  31.  24.  19.  20.
   1.   5.  73.  13.  52.   3.  10.  83.  46.  87.   4.  14.   8.  27.
   6.  29.  92.  26.  75.  17.  65.  58.  77.  88.  22.  95.  44.  49.
  63.  99.  61.  76.  94.  47.  15.  72.  7

In [245]:
# mapping
colMap = {'PRAMR_CL1': 'Class1PRA_TransplantPercentage_CAN', 'PRAMR_CL2':'Class2PRA_TransplantPercentage_CAN'}


# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"")

# update dataframe
df_can  = uf.insertIntoDataFrame(df_can, list(colMap.values()))
df_numeric  = uf.insertIntoDataFrame(df_numeric, list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
217,Class1PRA_TransplantPercentage_CAN,RECIPIENT MOST RECENT PRA% CLASS I @ TRANSPLANT,RH,2004-06-30,2015-03-31,RECIPIENT HLA TYPING,NUM,,,PRAMR_CL1,Numeric,
218,Class2PRA_TransplantPercentage_CAN,RECIPIENT MOST RECENT PRA% CLASS II @ TRANSPLANT,RH,2004-06-30,2015-03-31,RECIPIENT HLA TYPING,NUM,,,PRAMR_CL2,Numeric,


### DON_TY

In [246]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DON_TY', True)

        count unique top   freq
DON_TY  28751      2   C  28748

NaNs:
DON_TY    0
dtype: int64

Datatypes:
DON_TY    object
dtype: object


   Feature                      Description        FormSection DataType SASAnalysisFormat Comment Information
79  DON_TY  DONOR TYPE - DECEASED OR LIVING  DONOR INFORMATION  CHAR(3)           DON_TYP             Unknown


DON_TY: ['C' 'L']


In [247]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# df_flat FMTNAME: DON_TYP
mapping = {'C': 'Deceased Donor', 'L': 'Living Donor', 'F': 'Foreign Donor'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'DON_TY': 'DeceasedOrLiving_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: DON_TYP")

# update dataframe
df_drop  = uf.insertIntoDataFrame(df_drop, list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column DON_TY Unique Vaue(s) ['Deceased Donor', 'Living Donor']
Categories (2, object): ['Deceased Donor', 'Living Donor']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
79,DeceasedOrLiving_DON,DONOR TYPE - DECEASED OR LIVING,TRR,1987-10-01,NaT,DONOR INFORMATION,CHAR(3),DON_TYP,,DON_TY,Category,FMTNAME: DON_TYP


### HRT

In [248]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'HRT', True)

             count unique top   freq
TXHRT        28751      1   Y  28751
NON_HRT_DON  28746      2   N  28428

NaNs:
TXHRT          0
NON_HRT_DON    5
dtype: int64

Datatypes:
TXHRT          object
NON_HRT_DON    object
dtype: object


         Feature                             Description     FormSection DataType SASAnalysisFormat Comment Information
202  NON_HRT_DON  DECEASED DONOR-NON-HEART BEATING DONOR  ORGAN RECOVERY  CHAR(1)                               Unknown
282        TXHRT                      SIMULTANEOUS HEART                  CHAR(1)                               Unknown


TXHRT: ['Y']
NON_HRT_DON: ['N' nan 'Y']


In [249]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)

# mapping
colMap = {'NON_HRT_DON': 'NonHeartBeating_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknown/Missing")
df_dict = uf.updateDictionaryInformation(df_dict, [282], txt=f'{DROP}').copy()

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
df_drop  = uf.insertIntoDataFrame(df_drop, ['TXHRT'])
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column TXHRT Unique Vaue(s) ['Yes']
Categories (1, object): ['Yes']
Converted Column NON_HRT_DON Unique Vaue(s) ['No', 'Missing', 'Yes']
Categories (3, object): ['Missing', 'No', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
202,NonHeartBeating_DON,DECEASED DONOR-NON-HEART BEATING DONOR,DDR,1994-04-01,NaT,ORGAN RECOVERY,CHAR(1),,,NON_HRT_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
282,TXHRT,SIMULTANEOUS HEART,CALCULATED,NaT,NaT,,CHAR(1),,,TXHRT,Category,** DROP **


### PCO2
- Maintaining appropriate PCO2 levels is crucial for preserving organ function and optimizing outcomes in transplantation. 

In [250]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PCO2_DON', False)


            count       mean      std   min   25%   50%   75%    max
PCO2_DON  28515.0  38.617131  7.06791  10.1  34.0  38.0  42.1  116.3

NaNs:
PCO2_DON    236
dtype: int64

Datatypes:
PCO2_DON    float64
dtype: object


      Feature Description     FormSection DataType SASAnalysisFormat Comment Information
208  PCO2_DON   DDR:pCO2:  ORGAN RECOVERY      NUM                               Unknown




In [251]:
# mapping
colMap = {'PCO2_DON': 'OrganRecovery_PCO2_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"Maintaining appropriate PCO2 levels is crucial for preserving \
organ function and optimizing outcomes in transplantation.")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_numeric  = uf.insertIntoDataFrame(df_numeric, list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
208,OrganRecovery_PCO2_DON,DDR:pCO2:,DDR,2004-06-30,NaT,ORGAN RECOVERY,NUM,,,PCO2_DON,Numeric,Maintaining appropriate PCO2 levels is crucial for preserving organ function and optimizing outcomes in transplantation.


### PT_

In [252]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, '^PT_', True)

                   count unique         top   freq
PT_DIURETICS_DON   28545      3           Y  19986
PT_T3_DON          28545      3           N  28416
PT_T4_DON          28545      3           Y  18755
PT_OTH2_OSTXT_DON  24892  11743       ZOSYN    985
PT_OTH3_OSTXT_DON  16947   7176  ROCURONIUM    915
PT_OTH1_OSTXT_DON  28002  13924       ZOSYN   1281

NaNs:
PT_DIURETICS_DON       206
PT_T3_DON              206
PT_T4_DON              206
PT_OTH2_OSTXT_DON     3859
PT_OTH3_OSTXT_DON    11804
PT_OTH1_OSTXT_DON      749
dtype: int64

Datatypes:
PT_DIURETICS_DON     object
PT_T3_DON            object
PT_T4_DON            object
PT_OTH2_OSTXT_DON    object
PT_OTH3_OSTXT_DON    object
PT_OTH1_OSTXT_DON    object
dtype: object


              Feature                                                                    Description           FormSection DataType SASAnalysisFormat Comment Information
239  PT_DIURETICS_DON            DECEASED DONOR-DIURETICS B/N BRAIN DEATH W/IN 24 HRS OF PROCUR

In [253]:
# get new features
features = ['PT_DIURETICS_DON','PT_T3_DON','PT_T4_DON']

# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'PT_DIURETICS_DON': 'Diuretics_DON', 'PT_T3_DON': 'TriiodothyronineT3_DON', 'PT_T4_DON':'ThyroxineT4_DON',
         'PT_OTH1_OSTXT_DON':'OtherMedsText1_DON', 'PT_OTH2_OSTXT_DON':'OtherMedsText2_DON','PT_OTH3_OSTXT_DON':'OtherMedsText3_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, ['Diuretics_DON','TriiodothyronineT3_DON','ThyroxineT4_DON'])
df_object  = uf.insertIntoDataFrame(df_object, ['OtherMedsText1_DON','OtherMedsText2_DON','OtherMedsText3_DON'])

# convert to category
df = uf.toCategory(df,  ['Diuretics_DON','TriiodothyronineT3_DON','ThyroxineT4_DON'])

# display
df_dict.iloc[idx]

Converted Column PT_DIURETICS_DON Unique Vaue(s) ['Yes', 'No', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column PT_T3_DON Unique Vaue(s) ['No', 'Yes', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column PT_T4_DON Unique Vaue(s) ['No', 'Yes', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
239,Diuretics_DON,DECEASED DONOR-DIURETICS B/N BRAIN DEATH W/IN 24 HRS OF PROCUREMENT,DDR,1994-04-01,NaT,CLINICAL INFORMATION,CHAR(1),,,PT_DIURETICS_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
241,TriiodothyronineT3_DON,DECEASED DONOR-TRIIODOTHYRONINE-T3 B/N BRAIN DEATH W/IN 24 HRS OF PROCUREMENT,DDR,1999-10-25,NaT,CLINICAL INFORMATION,CHAR(1),,,PT_T3_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
242,ThyroxineT4_DON,DECEASED DONOR-THYROXINE-T4 B/N BRAIN DEATH W/IN 24 HRS OF PROCUREMENT,DDR,1994-04-01,NaT,CLINICAL INFORMATION,CHAR(1),,,PT_T4_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing


In [254]:
df_nominal

Unnamed: 0,column
0,AcuteRejectionEpisode
1,AirwayDehiscencePostTransplant
2,StrokePostTransplant
3,PacemakerPostTransplant
4,DialysisPostDischarge
5,GraftFailStatus
6,GraftStatus
7,TransplantStatus
8,RecipientStatus
9,RejectionTreatmentWithinOneYear


### PULM_CATH_DON & PROTEIN_URINE

In [255]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PULM|PROTEIN_URINE', True)

               count unique top   freq
PROTEIN_URINE  28546      3   N  15141
PULM_CATH_DON  28545      2   N  26749

NaNs:
PROTEIN_URINE    205
PULM_CATH_DON    206
dtype: int64

Datatypes:
PROTEIN_URINE    object
PULM_CATH_DON    object
dtype: object


           Feature                      Description           FormSection DataType SASAnalysisFormat Comment    Information
233  PROTEIN_URINE  DECEASED DONOR PROTEIN IN URINE  CLINICAL INFORMATION  CHAR(1)                                  Unknown
244  PULM_CATH_DON                DDR PA CATH (Y,N)        ORGAN RECOVERY  CHAR(1)                            Y/N to Yes/No


PROTEIN_URINE: ['N' 'Y' 'U' nan]
PULM_CATH_DON: ['N' 'Y' nan]


In [256]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'PULM_CATH_DON':'PulmonaryCatheter_DON', 'PROTEIN_URINE':'UrinePortein_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column PROTEIN_URINE Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column PULM_CATH_DON Unique Vaue(s) ['No', 'Yes', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
233,UrinePortein_DON,DECEASED DONOR PROTEIN IN URINE,DDR,1999-10-25,NaT,CLINICAL INFORMATION,CHAR(1),,,PROTEIN_URINE,Category,N/Y/U/X to No/Yes/Unknown/Missing
244,PulmonaryCatheter_DON,"DDR PA CATH (Y,N)",DDR,1999-10-25,NaT,ORGAN RECOVERY,CHAR(1),,,PULM_CATH_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing


### SGOT & SGPT

In [257]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'SGOT|SGPT', True)

            count        mean         std  min   25%   50%   75%      max
SGOT_DON  28541.0  101.952167  300.421156  0.3  26.0  44.0  87.0  10254.0
SGPT_DON  28541.0  114.102099  404.318823  3.0  23.0  41.0  83.0  44117.0

NaNs:
SGOT_DON    210
SGPT_DON    210
dtype: int64

Datatypes:
SGOT_DON    float64
SGPT_DON    float64
dtype: object


      Feature                       Description           FormSection DataType SASAnalysisFormat Comment Information
259  SGOT_DON  DECEASED DONOR-TERMINAL SGOT/AST  CLINICAL INFORMATION      NUM                               Unknown
260  SGPT_DON  DECEASED DONOR-TERMINAL SGPT/ALT  CLINICAL INFORMATION      NUM                               Unknown


SGOT_DON: [  46.   38.  445. ... 4376.  574. 2229.]
SGPT_DON: [ 40. 171. 217. ... 563. 666. 884.]


In [258]:
# mapping
colMap = {'SGOT_DON': 'Level_SGOT_AST_DON', 'SGPT_DON':'Level_SGOT_ALT_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_numeric  = uf.insertIntoDataFrame(df_numeric, list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
259,Level_SGOT_AST_DON,DECEASED DONOR-TERMINAL SGOT/AST,DDR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,,SGOT_DON,Numeric,
260,Level_SGOT_ALT_DON,DECEASED DONOR-TERMINAL SGPT/ALT,DDR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,,,SGPT_DON,Numeric,


### CARDARREST_NEURO & DDAVP_DON

In [259]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CARDARREST|DDAVP', True)

                  count unique top   freq
DDAVP_DON         28545      3   N  24575
CARDARREST_NEURO  28238      2   N  26165

NaNs:
DDAVP_DON           206
CARDARREST_NEURO    513
dtype: int64

Datatypes:
DDAVP_DON           object
CARDARREST_NEURO    object
dtype: object


             Feature                                             Description           FormSection DataType SASAnalysisFormat Comment Information
27  CARDARREST_NEURO          DECEASED DONOR-CARDIAC ARREST POST BRAIN DEATH  CLINICAL INFORMATION  CHAR(1)                               Unknown
65         DDAVP_DON  DECEASED DONOR-SYNTHETIC ANTI DIURETIC HORMONE (DDAVP)  CLINICAL INFORMATION  CHAR(1)                               Unknown


DDAVP_DON: ['Y' 'N' nan 'U']
CARDARREST_NEURO: ['N' 'Y' nan]


In [260]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)

# mapping
colMap = {'CARDARREST_NEURO':'CardiacArrest_DON', 'DDAVP_DON': 'SynthicAntiDiureticHormone_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column DDAVP_DON Unique Vaue(s) ['Yes', 'No', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column CARDARREST_NEURO Unique Vaue(s) ['No', 'Yes', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
27,CardiacArrest_DON,DECEASED DONOR-CARDIAC ARREST POST BRAIN DEATH,DDR,1999-10-25,NaT,CLINICAL INFORMATION,CHAR(1),,,CARDARREST_NEURO,Category,N/Y/U/X to No/Yes/Unknown/Missing
65,SynthicAntiDiureticHormone_DON,DECEASED DONOR-SYNTHETIC ANTI DIURETIC HORMONE (DDAVP),DDR,1999-10-25,NaT,CLINICAL INFORMATION,CHAR(1),,,DDAVP_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing


In [261]:
df_nominal

Unnamed: 0,column
0,AcuteRejectionEpisode
1,AirwayDehiscencePostTransplant
2,StrokePostTransplant
3,PacemakerPostTransplant
4,DialysisPostDischarge
5,GraftFailStatus
6,GraftStatus
7,TransplantStatus
8,RecipientStatus
9,RejectionTreatmentWithinOneYear


### CHEST_XRAY_DON

In [262]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CHEST_XRAY', True)

                  count      mean        std  min  25%  50%  75%    max
CHEST_XRAY_DON  27346.0  6.097235  45.350541  1.0  3.0  5.0  5.0  998.0

NaNs:
CHEST_XRAY_DON    1405
dtype: int64

Datatypes:
CHEST_XRAY_DON    float64
dtype: object


           Feature     Description     FormSection DataType SASAnalysisFormat Comment Information
30  CHEST_XRAY_DON  DDR CHEST XRAY  ORGAN RECOVERY      NUM          LUNGXRAY             Unknown


CHEST_XRAY_DON: [  4.   3.   5.   2. 998.  nan   1.]


In [263]:
# fill NaN with 1000: Unknown
df[features] = df[features].fillna(1000).astype(int)

# SASAnalysisFormat LUNGXRAY
mapping = {
    1: "No chest x-ray",
    2: "Normal",
    3: "Abnormal-left",
    4: "Abnormal-right",
    5: "Abnormal-both",
    998: "Results Unknown",
    999: "Unknown if chest x-ray performed",
    1000: "Missing"
}

# mapping feature
df = uf.mappingCol(df, 'CHEST_XRAY_DON', mapping, display=True)


# mapping
colMap = {'CHEST_XRAY_DON':'ChestXray_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"SASAnalysisFormat LUNGXRAY N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column CHEST_XRAY_DON Unique Vaue(s) ['Abnormal-right', 'Abnormal-left', 'Abnormal-both', 'Normal', 'Results Unknown', 'Missing', 'No chest x-ray']
Categories (7, object): ['Abnormal-both', 'Abnormal-left', 'Abnormal-right', 'Missing', 'No chest x-ray', 'Normal', 'Results Unknown']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
30,ChestXray_DON,DDR CHEST XRAY,DDR,1999-10-25,NaT,ORGAN RECOVERY,NUM,LUNGXRAY,,CHEST_XRAY_DON,Category,SASAnalysisFormat LUNGXRAY N/Y/U/X to No/Yes/Unknown/Missing


### CORONARY_ANGIO

In [264]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'CORONARY_ANGIO', True)

                  count      mean      std  min  25%  50%  75%  max
CORONARY_ANGIO  28542.0  1.398255  0.55103  1.0  1.0  1.0  2.0  3.0

NaNs:
CORONARY_ANGIO    209
dtype: int64

Datatypes:
CORONARY_ANGIO    float64
dtype: object


           Feature                                                                                          Description                     FormSection DataType SASAnalysisFormat Comment Information
42  CORONARY_ANGIO  DECEASED DONOR CORONARY ANGIOGRAM: Y/N IF YES NORMAL: Y/N IF ABNORMAL # VESSELS WITH > 50% STENOSIS  HEART DONOR'S CARDIAC FUNCTION      NUM             ANGIO             Unknown


CORONARY_ANGIO: [ 1.  2.  3. nan]


In [265]:
# fill NaN with 1000: Unknown
df[features] = df[features].fillna(999).astype(int)

# df_flat FMTNAME: ANGIO
mapping = {
    1: "No",
    2: "Yes, normal",
    3: "Yes, not normal",
    999: "Missing"
}

# mapping feature
df = uf.mappingCol(df, 'CORONARY_ANGIO', mapping, display=True)


# mapping
colMap = {'CORONARY_ANGIO':'CoronaryAngiogram_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: ANGIO")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column CORONARY_ANGIO Unique Vaue(s) ['No', 'Yes, normal', 'Yes, not normal', 'Missing']
Categories (4, object): ['Missing', 'No', 'Yes, normal', 'Yes, not normal']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
42,CoronaryAngiogram_DON,DECEASED DONOR CORONARY ANGIOGRAM: Y/N IF YES NORMAL: Y/N IF ABNORMAL # VESSELS WITH > 50% STENOSIS,DDR,1999-10-25,NaT,HEART DONOR'S CARDIAC FUNCTION,NUM,ANGIO,,CORONARY_ANGIO,Category,FMTNAME: ANGIO


### DISTANCE

In [266]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'DISTANCE|LOS', False)

            count        mean         std  min   25%    50%    75%     max
LOS       27840.0   22.514655   26.192994  0.0  11.0   16.0   24.0   823.0
DISTANCE  28751.0  200.200097  221.177082  0.0  20.0  126.0  321.0  2215.0

NaNs:
LOS         911
DISTANCE      0
dtype: int64

Datatypes:
LOS         float64
DISTANCE    float64
dtype: object


      Feature                                             Description                                        FormSection DataType SASAnalysisFormat Comment Information
77   DISTANCE  DISTANCE FROM DONOR HOSP TO TX CENTER (Nautical Miles)                                                         NUM                               Unknown
193       LOS                        RECIPIENT LENGTH OF STAY POST TX  PATIENT STATUS(PRIORITY KIDNEY,THEN PANCREAS TRR)      NUM                               Unknown




In [267]:
# mapping
colMap = {'DISTANCE':'DistanceFromDonorHospitaltoTXCenter', 'LOS':'LengthOfStay'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"{LABEL} N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, ['DistanceFromDonorHospitaltoTXCenter'])
df_label  = uf.insertIntoDataFrame(df_label, ['LengthOfStay'])
df_numeric  = uf.insertIntoDataFrame(df_numeric, list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
77,DistanceFromDonorHospitaltoTXCenter,DISTANCE FROM DONOR HOSP TO TX CENTER (Nautical Miles),CALCULATED,NaT,NaT,,NUM,,,DISTANCE,Numeric,** LABEL ** N/Y/U/X to No/Yes/Unknown/Missing
193,LengthOfStay,RECIPIENT LENGTH OF STAY POST TX,TRR-CALCULATED,1999-10-25,NaT,"PATIENT STATUS(PRIORITY KIDNEY,THEN PANCREAS TRR)",NUM,,,LOS,Numeric,** LABEL ** N/Y/U/X to No/Yes/Unknown/Missing


### ECD_DONOR

In [268]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'ECD', True)

             count      mean       std  min  25%  50%  75%  max
ECD_DONOR  28748.0  0.027028  0.162168  0.0  0.0  0.0  0.0  1.0

NaNs:
ECD_DONOR    3
dtype: int64

Datatypes:
ECD_DONOR    float64
dtype: object


      Feature                                            Description FormSection DataType SASAnalysisFormat Comment Information
92  ECD_DONOR  EXPANDED DONOR PER KIDNEY ALLOCATION DEFINITION 1=YES                  NUM                               Unknown


ECD_DONOR: [ 0.  1. nan]


In [269]:
# fill NaN with X: Missing
df[features] = df[features].fillna(999).astype(int)

# mapping
colMap = {'ECD_DONOR':'KidneyAllocation_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
92,KidneyAllocation_DON,EXPANDED DONOR PER KIDNEY ALLOCATION DEFINITION 1=YES,CALCULATED,1994-04-01,NaT,,NUM,,,ECD_DONOR,Category,


### HEMATOCRIT & ISCHTIME
- Hematocrit is often measured alongside hemoglobin levels as part of a complete blood count (CBC) to assess overall blood health and oxygen-carrying capacity
- Ischemic time is the period when an organ is preserved in a hypothermic state before being transplanted. Shortening the ischemic time can reduce the risk of graft failure and patient mortality after transplantation, and may also shorten the hospital stay. The cold ischemic time (CIT) for an organ donor is the time the organ spends outside of the body between procurement and transplantation.

In [270]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'HEMATOCRIT|ISCHTIME|PH_DON', False)

                  count       mean       std  min    25%    50%    75%    max
ISCHTIME        28187.0   3.233434  1.070273  0.3   2.50   3.30   3.90  12.00
PH_DON          28521.0   7.416000  0.073821  5.0   7.38   7.42   7.46   7.93
HEMATOCRIT_DON  28546.0  28.433506  5.001157  2.3  25.00  28.00  31.30  75.00

NaNs:
ISCHTIME          564
PH_DON            230
HEMATOCRIT_DON    205
dtype: int64

Datatypes:
ISCHTIME          float64
PH_DON            float64
HEMATOCRIT_DON    float64
dtype: object


            Feature             Description           FormSection DataType SASAnalysisFormat Comment Information
127  HEMATOCRIT_DON         DDR:Hematocrit:  CLINICAL INFORMATION      NUM                               Unknown
187        ISCHTIME  ISCHEMIC TIME IN HOURS                            NUM                               Unknown
213          PH_DON           DDR:Blood PH:  CLINICAL INFORMATION      NUM                               Unknown




In [271]:
# mapping
colMap = {'HEMATOCRIT_DON':'Hematocrit_DON', 'ISCHTIME':'IschemicTimeHour_DON', 'PH_DON':'BloodPH_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_numeric  = uf.insertIntoDataFrame(df_numeric, list(colMap.values()))

# display
df_dict.iloc[idx]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
127,Hematocrit_DON,DDR:Hematocrit:,DDR,2004-06-30,NaT,CLINICAL INFORMATION,NUM,,,HEMATOCRIT_DON,Numeric,
187,IschemicTimeHour_DON,ISCHEMIC TIME IN HOURS,CALCULATED,NaT,NaT,,NUM,,,ISCHTIME,Numeric,
213,BloodPH_DON,DDR:Blood PH:,DDR,2004-06-30,NaT,CLINICAL INFORMATION,NUM,,,PH_DON,Numeric,


### HIST_HYPERTENS & HEPARIN & HIST_MI

In [272]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'HEPARIN|HIST_HYPERTENS|HIST_MI|INSULIN|VASODIL', True)

                    count unique top   freq
VASODIL_DON         28545      3   N  23849
HIST_HYPERTENS_DON  28747      3   N  24127
HEPARIN_DON         28545      3   Y  28085
INSULIN_DON         28546      3   Y  16387
HIST_MI             28544      3   N  28116

NaNs:
VASODIL_DON           206
HIST_HYPERTENS_DON      4
HEPARIN_DON           206
INSULIN_DON           205
HIST_MI               207
dtype: int64

Datatypes:
VASODIL_DON           object
HIST_HYPERTENS_DON    object
HEPARIN_DON           object
INSULIN_DON           object
HIST_MI               object
dtype: object


                Feature                                                                     Description                     FormSection DataType SASAnalysisFormat Comment Information
139         HEPARIN_DON  DECEASED DONOR MANAGEMENT - HEPARIN (NO COLLECTION BETWEEN 10/25/99 - 1/27/03)            CLINICAL INFORMATION  CHAR(1)                               Unknown
147  HIST_HYPERTENS_DON                        

In [273]:
# fill NaN with X: Missing
df[features] = df[features].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'HEPARIN_DON':'HeparinManagement_DON', 'HIST_HYPERTENS_DON': 'HypertensionHistory_DON', 'VASODIL_DON':'Vasodilators_DON',
         'HIST_MI':'MyocardialInfarctionHistory_DON', 'INSULIN_DON':'InsulinManagement_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"N/Y/U/X to No/Yes/Unknown/Missing")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column VASODIL_DON Unique Vaue(s) ['No', 'Yes', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column HIST_HYPERTENS_DON Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column HEPARIN_DON Unique Vaue(s) ['Yes', 'No', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column INSULIN_DON Unique Vaue(s) ['Yes', 'No', 'Missing', 'Unknown']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']
Converted Column HIST_MI Unique Vaue(s) ['No', 'Yes', 'Unknown', 'Missing']
Categories (4, object): ['Missing', 'No', 'Unknown', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
139,HeparinManagement_DON,DECEASED DONOR MANAGEMENT - HEPARIN (NO COLLECTION BETWEEN 10/25/99 - 1/27/03),DDR,1994-04-01,NaT,CLINICAL INFORMATION,CHAR(1),,,HEPARIN_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
147,HypertensionHistory_DON,DECEASED DONOR-HISTORY OF HYPERTENSION,DDR,1994-04-01,NaT,DONOR HISTORY,CHAR(1),,,HIST_HYPERTENS_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
148,MyocardialInfarctionHistory_DON,DECEASED DONOR HISTORY OF PREVIOUS MI (MYOCARDIAL INFARCTION),DDR,1999-10-25,NaT,HEART DONOR'S CARDIAC FUNCTION,CHAR(1),,,HIST_MI,Category,N/Y/U/X to No/Yes/Unknown/Missing
185,InsulinManagement_DON,DECEASED DONOR-WAS DONOR GIVEN INSULIN WITHIN 24 HRS PRE CROSS CLAMP?,DDR,2004-06-30,NaT,CLINICAL INFORMATION,CHAR(1),,,INSULIN_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
291,Vasodilators_DON,DECEASED DONOR-VASODILATORS W/IN 24HRS PRE-CROSS CLAMP,DDR,1994-04-01,NaT,CLINICAL INFORMATION,CHAR(1),,,VASODIL_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing


### LV_EJECT & LV_EJECT_METH

In [274]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'LV_EJECT', True)

                 count       mean       std   min   25%   50%   75%   max
LV_EJECT_METH  28704.0   1.041876  0.286358   1.0   1.0   1.0   1.0   3.0
LV_EJECT       28718.0  61.618541  6.660556  10.0  56.0  60.0  65.0  99.0

NaNs:
LV_EJECT_METH    47
LV_EJECT         33
dtype: int64

Datatypes:
LV_EJECT_METH    float64
LV_EJECT         float64
dtype: object


           Feature                                                        Description                     FormSection DataType SASAnalysisFormat Comment Information
195       LV_EJECT                              DECEASED DONOR LV EJECTION FRACTION %  HEART DONOR'S CARDIAC FUNCTION      NUM                               Unknown
196  LV_EJECT_METH  DECEASED DONOR LV EJECTION FRACTION METHOD: ECHO, MUGA, ANGIOGRAM  HEART DONOR'S CARDIAC FUNCTION      NUM          LVEJECTM             Unknown


LV_EJECT_METH: [ 1.  3. nan]
LV_EJECT: [60.   55.   59.   65.   35.   63.   64.   67.   50.   62.   70.   80.
 45.   75.   56.   81.     nan 68

In [275]:
# fill NaN with 999: Missing
df['LV_EJECT_METH'] = df['LV_EJECT_METH'].fillna(999).astype(int)

# df_flat FMTNAME: LVEJECTM
mapping = {
    999: "Missing",
    1: "Echo",
    2: "MUGA",
    3: "Angiogram"
}

# mapping feature
df = uf.mappingCol(df, 'LV_EJECT_METH', mapping, display=True)


 # mapping
colMap = {'LV_EJECT':'LV_EjectionFractionPercent_DON', 'LV_EJECT_METH':'LV_EjectionFractionMedthod_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"")
df_dict = uf.updateDictionaryInformation(df_dict, [196], txt='FMTNAME: LVEJECTM', FeatureType='Category').copy()

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, ['LV_EjectionFractionMedthod_DON'])
df_numeric  = uf.insertIntoDataFrame(df_numeric, ['LV_EjectionFractionPercent_DON'])

# display
df_dict.iloc[idx]

Converted Column LV_EJECT_METH Unique Vaue(s) ['Echo', 'Angiogram', 'Missing']
Categories (3, object): ['Angiogram', 'Echo', 'Missing']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
195,LV_EjectionFractionPercent_DON,DECEASED DONOR LV EJECTION FRACTION %,DDR,1999-10-25,NaT,HEART DONOR'S CARDIAC FUNCTION,NUM,,,LV_EJECT,Numeric,
196,LV_EjectionFractionMedthod_DON,"DECEASED DONOR LV EJECTION FRACTION METHOD: ECHO, MUGA, ANGIOGRAM",DDR,1999-10-25,NaT,HEART DONOR'S CARDIAC FUNCTION,NUM,LVEJECTM,,LV_EJECT_METH,Category,FMTNAME: LVEJECTM


### SHARE_TY

In [276]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'SHARE_TY', True)

            count      mean       std  min  25%  50%  75%  max
SHARE_TY  28751.0  3.790129  0.878173  3.0  3.0  3.0  5.0  6.0

NaNs:
SHARE_TY    0
dtype: int64

Datatypes:
SHARE_TY    int64
dtype: object


      Feature                                                                        Description FormSection DataType SASAnalysisFormat Comment Information
261  SHARE_TY  ALLOCATION TYPE-LOCAL/REGIONAL/NATIONAL - 3=LOCAL/4=REGIONAL/5=NATIONAL/6=FOREIGN                  NUM           SHARETY             Unknown


SHARE_TY: [3 4 5 6]


In [277]:
# df_flat FMTNAME: SHARETY
mapping = {
    1: "Zero Antigen Mismatch",
    2: "Payback",
    3: "Local",
    4: "Regional",
    5: "National",
    6: "Foreign Donor"
}

# iterate
for col in features:
    # mapping feature
    df = uf.mappingCol(df, col, mapping, display=True)


# mapping
colMap = {'SHARE_TY':'AllocationType_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Category', txt=f"FMTNAME: SHARETY")

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_nominal  = uf.insertIntoDataFrame(df_nominal, list(colMap.values()))
# convert to category
df = uf.toCategory(df,  list(colMap.values()))

# display
df_dict.iloc[idx]

Converted Column SHARE_TY Unique Vaue(s) ['Local', 'Regional', 'National', 'Foreign Donor']
Categories (4, object): ['Foreign Donor', 'Local', 'National', 'Regional']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
261,AllocationType_DON,ALLOCATION TYPE-LOCAL/REGIONAL/NATIONAL - 3=LOCAL/4=REGIONAL/5=NATIONAL/6=FOREIGN,CALCULATED,NaT,NaT,,NUM,SHARETY,,SHARE_TY,Category,FMTNAME: SHARETY


### PO2
- Lung PO2 (Partial Pressure of Oxygen) is a measure of the pressure exerted by oxygen in the lungs. It is an important indicator of how well oxygen is being transferred from the air in the lungs to the blood. The value of PO2 is typically measured in millimeters of mercury (mmHg) and provides insight into the efficiency of the respiratory system. In clinical and transplant contexts, particularly with donors, it helps assess the oxygenation status of the donor’s lungs, which is critical when evaluating organs for transplantation.

In [278]:
# display feature info
features, idx = uf.featureInfo(df, df_dict, 'PO2', True)

                count unique  top  freq        mean         std   min    25%     50%      75%    max
PO2           28504.0    NaN  NaN   NaN  280.746738  159.653759  7.38  131.0  256.15  425.025  698.0
PO2_DONE_DON     8726      2    Y  8690         NaN         NaN   NaN    NaN     NaN      NaN    NaN
PO2_FIO2_DON  28425.0    NaN  NaN   NaN    82.11803   25.544685   1.0   60.0   100.0    100.0  100.0

NaNs:
PO2               247
PO2_DONE_DON    20025
PO2_FIO2_DON      326
dtype: int64

Datatypes:
PO2             float64
PO2_DONE_DON     object
PO2_FIO2_DON    float64
dtype: object


          Feature                                          Description     FormSection DataType SASAnalysisFormat Comment Information
214           PO2                           DECEASED DONOR PO2 ON 100%  ORGAN RECOVERY      NUM                               Unknown
215  PO2_DONE_DON                             DDR:Lung - Was pO2 done:  ORGAN RECOVERY  CHAR(1)                               Unknown
216  PO2

In [279]:
# fill NaN with X: Missing
df['PO2_DONE_DON'] = df['PO2_DONE_DON'].fillna('X')

# feature value mapping
mapping = {'N': 'No', 'Y': 'Yes', 'U': 'Unknown', 'X': 'Missing'}

# mapping feature
df = uf.mappingCol(df, 'PO2_DONE_DON', mapping, display=True)


# mapping
colMap = {'PO2':'LungPO2_DON', 'PO2_DONE_DON':'LungPO2_Done_DON', 'PO2_FIO2_DON':'LungPO2_FIO2_DON'}

# update column names & data dictionary
df, df_dict = uf.mappingDataAndDictionary(df, df_dict, colMap, idx, Type='Numeric', txt=f"")
df_dict = uf.updateDictionaryInformation(df_dict, [215], txt='N/Y/U/X to No/Yes/Unknown/Missing', FeatureType='Category')

# update dataframe
df_don  = uf.insertIntoDataFrame(df_don, list(colMap.values()))
df_numeric  = uf.insertIntoDataFrame(df_numeric, ['LungPO2_DON','LungPO2_FIO2_DON'])
df_nominal  = uf.insertIntoDataFrame(df_nominal, ['LungPO2_Done_DON'])

# display
df_dict.iloc[idx]

Converted Column PO2_DONE_DON Unique Vaue(s) ['Yes', 'No', 'Missing']
Categories (3, object): ['Missing', 'No', 'Yes']


Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
214,LungPO2_DON,DECEASED DONOR PO2 ON 100%,DDR,1999-10-25,NaT,ORGAN RECOVERY,NUM,,,PO2,Numeric,
215,LungPO2_Done_DON,DDR:Lung - Was pO2 done:,DDR,1999-10-25,NaT,ORGAN RECOVERY,CHAR(1),,,PO2_DONE_DON,Category,N/Y/U/X to No/Yes/Unknown/Missing
216,LungPO2_FIO2_DON,"DDR:Lung pO2 on Fio2 //If Yes, Lung pO2 on FiO2 of:",DDR,1999-10-25,NaT,ORGAN RECOVERY,NUM,,,PO2_FIO2_DON,Numeric,


### Save

In [280]:
# vertify all the features
dictList = df_dict['Feature'][df_dict.Information.isin(['Unknown'])].to_list()
print(dictList)

[]


In [281]:
# heart dataset
uf.writeToFile(df, 'New_Heart',path='../Data/', format='pkl')

# heart label
uf.writeToFile(df_label, 'New_Label',path='../Data/', format='pkl')

# heart candidate
uf.writeToFile(df_can, 'New_CAN', format='pkl')

# heart donor
uf.writeToFile(df_don, 'New_DON', format='pkl')

# heart drop
uf.writeToFile(df_drop, 'New_DROP', format='pkl')

# heart both
uf.writeToFile(df_both, 'New_BOTH', format='pkl')

# heart nominal
uf.writeToFile(df_nominal, 'New_Nominal', format='pkl')

# heart ordinal
uf.writeToFile(df_ordinal, 'New_Ordinal', format='pkl')

# heart numeric
uf.writeToFile(df_numeric, 'New_Numeric', format='pkl')

# heart object
uf.writeToFile(df_object, 'New_Object', format='pkl')

# heart unknown
uf.writeToFile(df_unknown, 'New_Unknown', format='pkl')

# heart date
uf.writeToFile(df_date, 'New_Date', format='pkl')

# heart data dictionary
uf.writeToFile(df_dict, 'New_Dictionary', format='pkl')

28,751 records written to ../Data/New_Heart.pkl
15 records written to ../Data/New_Label.pkl
133 records written to ../Data/New_CAN.pkl
102 records written to ../Data/New_DON.pkl
29 records written to ../Data/New_DROP.pkl
7 records written to ../Data/New_BOTH.pkl
169 records written to ../Data/New_Nominal.pkl
21 records written to ../Data/New_Ordinal.pkl
45 records written to ../Data/New_Numeric.pkl
11 records written to ../Data/New_Object.pkl
16 records written to ../Data/New_Unknown.pkl
14 records written to ../Data/New_Date.pkl
302 records written to ../Data/New_Dictionary.pkl
