In [1]:
from platform import python_version

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# set seaborn theme
sns.set_theme()

# print version
print("Numpy Version: " + np.__version__)
print("Pandas Version: " + pd.__version__)
print("Seaborn Version: " + sns.__version__)
print("Matplotlib Version: " + plt.matplotlib.__version__)
print("Python Version: " + python_version())

# adjust pandas display options to max
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# adjust pandas display options to ensure full display of content
pd.set_option('display.max_colwidth', None)

Numpy Version: 1.26.4
Pandas Version: 2.2.3
Seaborn Version: 0.13.2
Matplotlib Version: 3.9.2
Python Version: 3.9.20


In [2]:
# import pickle dataset
df = pd.read_pickle("../Data/cleanDF.pkl")

In [3]:
df.dtypes

NUM_PREV_TX                         int64
THORACIC_DGN                        Int64
GENDER                             object
ABO                                object
WGT_KG_TCR                        float64
HGT_CM_TCR                        float64
BMI_TCR                           float64
CITIZENSHIP                         Int64
PERM_STATE                         object
EDUCATION                           Int64
ECMO_TCR                            int64
IABP_TCR                            int64
INOTROPES_TCR                       int64
PGE_TCR                             int64
OTH_LIFE_SUP_TCR                    Int64
VAD_DEVICE_TY_TCR                   Int64
VAD_BRAND1_TCR                    float64
FUNC_STAT_TCR                       Int64
PRI_PAYMENT_TCR                     Int64
TCR_DGN                             Int64
DIAB                                Int64
DIAL_TY_TCR                         Int64
CEREB_VASC                         object
MALIG_TCR                         

## Wrangle Dataset

In [4]:
print(f"There are total of {df.shape[0]:,} records and {df.shape[1]:,} features in the Heart Transplant Dataset.")

There are total of 27,494 records and 288 features in the Heart Transplant Dataset.


### User Function(s)

In [5]:
def corrCols(df, threshold=0.9):
    """
    Determine highly correlated features.

    Computes the pairwise correlation of the numeric columns of ``df`` and
    walks the upper triangle of the matrix. For every pair whose absolute
    correlation is strictly greater than ``threshold`` the pair is printed and
    the second feature of the pair is flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect. Non-numeric columns (object, category, datetime, ...)
        are ignored.
    threshold : float, default 0.9
        Absolute correlation above which a pair counts as highly correlated.

    Returns
    -------
    list
        Flagged feature names in order of first detection, each listed once.
    """
    # numeric_only=True keeps numeric/boolean columns only; excluding just
    # 'object' would still let category or datetime columns reach corr()
    correlation_matrix = df.corr(numeric_only=True)

    # the matrix is square, so index and columns carry the same labels
    names = correlation_matrix.columns
    num_features = len(names)

    # flagged features, kept unique while preserving detection order
    feature = list()

    # iterate over the upper triangular part of the matrix
    for i in range(num_features):
        for j in range(i + 1, num_features):
            correlation = correlation_matrix.iloc[i, j]
            # NaN correlations compare False and are therefore skipped
            if abs(correlation) > threshold:
                print(f"Correlation between {names[i]} and {names[j]}: {correlation:.3f}")
                # a feature correlated with several others is reported once
                if names[j] not in feature:
                    feature.append(names[j])

    return feature
    

def percentageNull(df):
    """
    Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns:
    'percentage' (share of NaN values, 0-100) and 'NaNCount' (number of NaN
    values). Rows are sorted by 'percentage' in descending order and only
    columns containing at least one NaN are kept.
    """
    # share of missing values = 100 minus the share of non-null values
    missing_pct = 100 - (df.count() / len(df) * 100)

    # percentage and raw NaN counter side by side
    summary = missing_pct.to_frame(name='percentage')
    summary['NaNCount'] = df.isna().sum()

    # highest percentage first
    ranked = summary.sort_values(by='percentage', ascending=False)

    # keep only the columns that actually contain NaNs
    return ranked.loc[ranked['NaNCount'] != 0]


def plotBox(df, cols, width=14, height=10):
    """
    Draw one box plot per column on a 4-column grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the columns to plot.
    cols : iterable of str
        Column names; one subplot is drawn per name.
    width, height : numeric
        Figure size passed to ``plt.figure(figsize=...)``.

    The grid keeps the 3 x 4 layout for up to 12 columns and gains extra rows
    beyond that, so longer column lists do not fail in ``plt.subplot``.
    """
    # materialise so the number of panels is known before the grid is built
    cols = list(cols)
    grid_cols = 4
    # never fewer than 3 rows; ceiling division adds rows past 12 panels
    grid_rows = max(3, -(-len(cols) // grid_cols))

    # figure size
    plt.figure(figsize=(width, height))

    # iterate through each column and create a box plot
    for i, col in enumerate(cols, start=1):
        ax = plt.subplot(grid_rows, grid_cols, i)
        df.boxplot(column=col, ax=ax)
        plt.title(f'Boxplot of {col}')
        plt.ylabel(col)

    plt.tight_layout()
    plt.show()


def plotCount(df, cols, width=16, height=10, angle=False):
    """
    Draw one seaborn count plot per column on a 4-column grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the columns to plot.
    cols : iterable of str
        Column names; one subplot is drawn per name.
    width, height : numeric
        Figure size passed to ``plt.figure(figsize=...)``.
    angle : bool, default False
        When True the x tick labels are rotated by 90 degrees.

    The grid keeps the 3 x 4 layout for up to 12 columns and gains extra rows
    beyond that, so longer column lists do not fail in ``plt.subplot``.
    """
    # materialise so the number of panels is known before the grid is built
    cols = list(cols)
    grid_cols = 4
    # never fewer than 3 rows; ceiling division adds rows past 12 panels
    grid_rows = max(3, -(-len(cols) // grid_cols))

    # figure size
    plt.figure(figsize=(width, height))

    # one count plot per column
    for i, col in enumerate(cols, start=1):
        plt.subplot(grid_rows, grid_cols, i)
        sns.countplot(data=df, x=col)
        plt.title(f'Countplot of {col}')
        plt.xlabel(col)
        if angle:
            plt.xticks(rotation=90)

    plt.tight_layout()  # Adjust the layout
    plt.show()


def plotBar(df, cols, width=16, height=10, angle=False):
    """
    Draw one seaborn bar plot per column on a 4-column grid of subplots.

    Each panel plots the column's values (y) against the DataFrame index (x).

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the columns to plot; its index supplies the x values.
    cols : iterable of str
        Column names; one subplot is drawn per name.
    width, height : numeric
        Figure size passed to ``plt.figure(figsize=...)``.
    angle : bool, default False
        When True the x tick labels are rotated by 90 degrees.

    The grid keeps the 3 x 4 layout for up to 12 columns and gains extra rows
    beyond that, so longer column lists do not fail in ``plt.subplot``.
    """
    # materialise so the number of panels is known before the grid is built
    cols = list(cols)
    grid_cols = 4
    # never fewer than 3 rows; ceiling division adds rows past 12 panels
    grid_rows = max(3, -(-len(cols) // grid_cols))

    # figure size
    plt.figure(figsize=(width, height))

    # one bar plot per column
    for i, col in enumerate(cols, start=1):
        plt.subplot(grid_rows, grid_cols, i)
        sns.barplot(data=df, x=df.index, y=col)
        plt.title(f'Barplot of {col}')
        # NOTE(review): the x axis carries the index values but is labelled with the column name — confirm intended
        plt.xlabel(col)
        if angle:
            plt.xticks(rotation=90)

    plt.tight_layout()  # Adjust the layout
    plt.show()

### Create Data Dictionary

In [6]:
# load the column-description table
# NOTE(review): this path is spelled "../data" while the pickle files use "../Data";
# the two may not both resolve on a case-sensitive filesystem — confirm the directory name
colDef = pd.read_csv("../data/thoraticColumnNames.csv")

# one-column frame with the dataset's feature names in sorted order
dfColumns = pd.DataFrame({'featureName': np.sort(df.columns.to_numpy())})

In [7]:
# display random 5
colDef.sample(5)

Unnamed: 0,featureName,desc,form,varStartDate,VarEndDate,formSection,dataType,labelSAS,COMMENT
303,LISTYR,ACTUAL YEAR REGISTRANT LISTED (WITHOUT DATE OFFSET),CALCULATED,1987-10-01 00:00:00,Unknown,Unknown,NUM,ACTUAL YEAR REGISTRANT LISTED (WITHOUT DATE OFFSET),Unknown
400,PST_DIAL,EVENTS PRIOR TO DISCHARGE: DIALYSIS,TRR,1994-04-01 00:00:00,Unknown,POST TRANSPLANT CLINICAL INFORMATION,CHAR,TRR DIALYSIS - POST TRANSPLANT,Unknown
23,BLOOD_INF_DON,DECEASED DONOR-BLOOD AS INFECTION SOURCE,DDR,1994-04-01 00:00:00,Unknown,CLINICAL INFORMATION,NUM,DDR INFECTION BLOOD SOURCE,Unknown
338,OTH_DON_MED3_OSTXT_DON_OLD,DECEASED DONOR OTHER MEDICATIONS3 W/IN 24 HRS PRE-CROSS CLAMP,DDR,1994-04-01 00:00:00,2003-01-27 00:00:00,CLINICAL INFORMATION,CHAR,DDR OTHER MEDICATIONS3 PRE CROSS CLAMP,Unknown
530,WL_ID_CODE,ENCRYPTED REGISTRATION IDENTIFIER,CALCULATED,Unknown,Unknown,Unknown,NUM,ENCRYPTED WL_ID,Unknown


In [8]:
# display
dfColumns.head()

Unnamed: 0,featureName
0,ABO
1,ABO_DON
2,ABO_MAT
3,ACTIVATE_DATE
4,ACUTE_REJ_EPI


In [9]:
# shape of dataframe
dfColumns.shape, colDef.shape

((288, 1), (545, 9))

In [10]:
# columns to rename
selectName = ['CARDARREST_NEURO', 'DDR1', 'DDR2', 'DA1', 'DA2', 'DB1', 'DB2', 'HIST_MI', 'LV_EJECT', 'LV_EJECT_METH', 'CPRA', 'TATTOOS']

# '|'.join(selectName) builds a regex alternation of the names; str.contains matches substrings, so e.g. CPRA_PEAK is listed too - display
colDef[colDef['featureName'].str.contains('|'.join(selectName))]

Unnamed: 0,featureName,desc,form,varStartDate,VarEndDate,formSection,dataType,labelSAS,COMMENT
41,CARDARREST_NEURO,DECEASED DONOR-CARDIAC ARREST POST BRAIN DEATH,DDR,1999-10-25 00:00:00,Unknown,CLINICAL INFORMATION,CHAR,DON CARDIAC ARR. SINCE EVENT THAT LED TO DECL OF DEATH,Unknown
84,CPRA,Recipient Most Recent CPRA,RH,2015-03-31 00:00:00,Unknown,CLINICAL INFORMATION,NUM,RH Most Recent CPRA,Unknown
85,CPRA_PEAK,RecipientPeak CPRA,RH,2015-03-31 00:00:00,Unknown,CLINICAL INFORMATION,NUM,RH Peak CPRA,Unknown
94,DA1,DONOR A1 ANTIGEN,CALCULATED,1987-10-01 00:00:00,Unknown,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,COMPUTED DONOR A1 ANTIGEN,Unknown
95,DA2,DONOR A2 ANTIGEN,CALCULATED,1987-10-01 00:00:00,Unknown,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,COMPUTED DONOR A2 ANTIGEN,Unknown
110,DB1,DONOR B1 ANTIGEN,CALCULATED,1987-10-01 00:00:00,Unknown,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,COMPUTED DONOR B1 ANTIGEN,Unknown
111,DB2,DONOR B2 ANTIGEN,CALCULATED,1987-10-01 00:00:00,Unknown,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,COMPUTED DONOR B2 ANTIGEN,Unknown
113,DDR1,DONOR DR1 ANTIGEN,CALCULATED,1987-10-01 00:00:00,Unknown,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,COMPUTED DONOR DR1 ANTIGEN,Unknown
114,DDR2,DONOR DR2 ANTIGEN,CALCULATED,1987-10-01 00:00:00,Unknown,DONOR CENTER HISTOCOMPATIBILITY TYPING,NUM,COMPUTED DONOR DR2 ANTIGEN,Unknown
235,HIST_MI,DECEASED DONOR HISTORY OF PREVIOUS MI (MYOCARDIAL INFARCTION),DDR,1999-10-25 00:00:00,Unknown,HEART DONOR'S CARDIAC FUNCTION,CHAR,DON HISTORY OF PREVIOUS MI,Unknown


In [11]:
# display the matching feature names present in the data
dfColumns[dfColumns['featureName'].str.contains('|'.join(selectName))]

Unnamed: 0,featureName
27,CARDARREST_NEURO_DON
43,CPRA
44,CPRA_PEAK
48,DA1_DON
49,DA2_DON
60,DB1_DON
61,DB2_DON
63,DDR1_DON
64,DDR2_DON
142,HIST_MI_DON


In [12]:
# map to rename
name_map = {'CARDARREST_NEURO':'CARDARREST_NEURO_DON', 'DA1': 'DA1_DON', 'DA2': 'DA2_DON', 'HIST_MI': 'HIST_MI_DON', 'LV_EJECT': 'LV_EJECT_DON', 
            'DB1':'DB1_DON', 'DB2':'DB2_DON', 'LV_EJECT_METH':'LV_EJECT_METH_DON','DDR1': 'DDR1_DON','DDR2': 'DDR2_DON','TATTOOS':'TATTOOS_DON'}

# Update the featureName column based on the mapping dictionary
colDef['featureName'] = colDef['featureName'].replace(name_map)

In [13]:
# join two DataFrames
dfColDef = pd.merge(colDef, dfColumns, how='right', on='featureName')

# display features in current DataFrame NOT in colDef 
dfColDef[dfColDef.desc.isna()]

Unnamed: 0,featureName,desc,form,varStartDate,VarEndDate,formSection,dataType,labelSAS,COMMENT


In [14]:
# shape column dictionary & Data
dfColDef.shape, df.shape

((288, 9), (27494, 288))

In [15]:
# display random 5 rows
dfColDef.sample(5)

Unnamed: 0,featureName,desc,form,varStartDate,VarEndDate,formSection,dataType,labelSAS,COMMENT
202,PO2,DECEASED DONOR PO2 ON 100%,DDR,1999-10-25 00:00:00,Unknown,ORGAN RECOVERY,NUM,DON LUNG PO2 ON 100%,Unknown
102,FUNC_STAT_TCR,RECIPIENT FUNCTIONAL STATUS @ REGISTRATION,TCR,1994-04-01 00:00:00,Unknown,CANDIDATE INFORMATION,NUM,TCR FUNCTIONAL STATUS @ LISTING,Unknown
27,CARDARREST_NEURO_DON,DECEASED DONOR-CARDIAC ARREST POST BRAIN DEATH,DDR,1999-10-25 00:00:00,Unknown,CLINICAL INFORMATION,CHAR,DON CARDIAC ARR. SINCE EVENT THAT LED TO DECL OF DEATH,Unknown
83,DR53,Candidate Most Recent/at Removal DR53 Antigen From Waiting List,WAITING LIST DATA,1987-10-01 00:00:00,Unknown,WAITING LIST DATA,NUM,Candidate Most Recent/at Removal DR53 Antigen From Waiting List,Unknown
9,ALCOHOL_HEAVY_DON,Heavy Alcohol Use (heavy=2+ drinks/day),DDR,2004-06-30 00:00:00,Unknown,LIFESTYLE FACTORS,CHAR,DDR HEAVY ALCOHOL USE (Y/N/U),Unknown


#### Checking Data for NaNs

In [16]:
# NaN counts
NaNDf = percentageNull(df)

# list features with >= 35% missing values
NaNDf[NaNDf.percentage >= 35]

Unnamed: 0,percentage,NaNCount
PRIOR_CARD_SURG_TYPE_OSTXT_TCR,78.871754,21685
PRIOR_CARD_SURG_TYPE_TRR,77.92973,21426
CONTIN_COCAINE_DON,77.736961,21373
LAST_INACT_REASON,75.092748,20646
PRAMR_CL2,74.899978,20593
PRAMR_CL1,74.438059,20466
TOT_SERUM_ALBUM,73.205063,20127
MALIG_TRR,73.10686,20100
CMV_IGM,73.092311,20096
CMV_IGG,73.088674,20095


In [17]:
# re-index (plain assignment keeps the cell free of in-place mutation)
dfColDef = dfColDef.reset_index(drop=True)

# descriptions of the features that contain missing values;
# isin matches whole feature names, whereas a joined regex with str.contains
# would also list features whose names merely contain one of those names
dfColDef[dfColDef.featureName.isin(NaNDf.index)].head()

Unnamed: 0,featureName,desc,form,varStartDate,VarEndDate,formSection,dataType,labelSAS,COMMENT
4,ACUTE_REJ_EPI,DID RECIPIENT HAVE ANY ACUTE REJECTION EPISODES PRE DISCHARGE?,TRR,2004-06-30 00:00:00,Unknown,POST TRANSPLANT CLINICAL INFORMATION,NUM,ACUTE REJECTION EPISODE,Unknown
5,ADMISSION_DATE,RECIPIENT DATE OF ADMISSION TO TX CENTER,TRR,1999-10-25 00:00:00,Unknown,PATIENT STATUS,NUM,TRR DATE OF ADMISSION TO TX CENTER,Unknown
6,ADMIT_DATE_DON,DONOR ADMIT DATE,DDR,2006-04-26 00:00:00,Unknown,DONOR INFORMATION,NUM,DON ADMIT DATE,Unknown
9,ALCOHOL_HEAVY_DON,Heavy Alcohol Use (heavy=2+ drinks/day),DDR,2004-06-30 00:00:00,Unknown,LIFESTYLE FACTORS,CHAR,DDR HEAVY ALCOHOL USE (Y/N/U),Unknown
10,AMIS,A LOCUS MISMATCH LEVEL,CALCULATED,Unknown,Unknown,Unknown,NUM,A LOCUS MISMATCH LEVEL,Unknown


In [18]:
# determine year & counts
df.TX_YEAR.value_counts().sort_values(ascending=False)

TX_YEAR
2021    3325
2020    3191
2019    3040
2018    2933
2017    2795
2016    2721
2015    2318
2014    2191
2013    2047
2012    1774
2011    1159
Name: count, dtype: int64

In [19]:
# last five years
df_5 = df[df.TX_YEAR > 2016]
# shape
df_5.shape

(15284, 288)

In [20]:
# count NaNs
NaN_5 = percentageNull(df_5)
# display
NaN_5[NaN_5.percentage >= 40]

Unnamed: 0,percentage,NaNCount
PO2_DONE_DON,100.0,15284
CMV_IGM,100.0,15284
CMV_IGG,100.0,15284
MALIG_TRR,100.0,15284
PRAMR_CL2,100.0,15284
PRAMR_CL1,100.0,15284
TOT_SERUM_ALBUM,97.853965,14956
PRIOR_CARD_SURG_TYPE_TRR,80.731484,12339
PRIOR_CARD_SURG_TYPE_OSTXT_TCR,79.776237,12193
LAST_INACT_REASON,76.478671,11689


#### Excessive Missing Data 

In [21]:
# remove features
removeCols = ['TX_YEAR', 'CMV_IGM', 'CMV_IGG', 'PRAMR_CL1', 'PO2_DONE_DON', 
              'PRAMR_CL2', 'MALIG_TRR'
             ]


# describe columns to be removed
df_5[removeCols].describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
TX_YEAR,15284.0,,,,2019.086234,1.412462,2017.0,2018.0,2019.0,2020.0,2021.0
CMV_IGM,0.0,0.0,,,,,,,,,
CMV_IGG,0.0,0.0,,,,,,,,,
PRAMR_CL1,0.0,,,,,,,,,,
PO2_DONE_DON,0.0,0.0,,,,,,,,,
PRAMR_CL2,0.0,,,,,,,,,,
MALIG_TRR,0.0,0.0,,,,,,,,,


In [22]:
# keep the descriptive columns (drop the varStartDate / VarEndDate metadata) for the features still in the data;
# columns are dropped by name so re-running the cell is safe, and rows are matched on the exact
# feature name so a feature whose name merely contains a removed name is not discarded
dfColDef = (
    dfColDef
    .loc[~dfColDef.featureName.isin(removeCols)]
    .drop(columns=['varStartDate', 'VarEndDate'], errors='ignore')
)

# shape
dfColDef.shape

(281, 7)

In [23]:
# remove columns
df_5 = df_5.drop(columns=removeCols)

# shape
df_5.shape

(15284, 281)

In [24]:
# count NaNs from new dataframe
NaN_5 = percentageNull(df_5)

# display 
NaN_5.head(10)

Unnamed: 0,percentage,NaNCount
TOT_SERUM_ALBUM,97.853965,14956
PRIOR_CARD_SURG_TYPE_TRR,80.731484,12339
PRIOR_CARD_SURG_TYPE_OSTXT_TCR,79.776237,12193
LAST_INACT_REASON,76.478671,11689
CONTIN_COCAINE_DON,73.985868,11308
VAD_BRAND1_TCR,71.918346,10992
INO_PROCURE_AGENT_1,65.185815,9963
PRIOR_CARD_SURG_TYPE_TCR,62.974352,9625
VAD_BRAND1_TRR,61.502225,9400
TCR_DUR_ABSTAIN,57.969118,8860


#### Examine `DATE` Features

In [25]:
# display descriptions
dateCols = df_5.columns[df_5.columns.str.contains('DATE')].values

# date column description
dfColDef[dfColDef.featureName.str.contains('|'.join(dateCols))]

Unnamed: 0,featureName,desc,form,formSection,dataType,labelSAS,COMMENT
3,ACTIVATE_DATE,ALLOCATION TIME BEGINNING DATE,WAITING LIST DATA,WAITING LIST DATA,NUM,WL LISTING BACK DATE,Unknown
5,ADMISSION_DATE,RECIPIENT DATE OF ADMISSION TO TX CENTER,TRR,PATIENT STATUS,NUM,TRR DATE OF ADMISSION TO TX CENTER,Unknown
6,ADMIT_DATE_DON,DONOR ADMIT DATE,DDR,DONOR INFORMATION,NUM,DON ADMIT DATE,Unknown
73,DISCHARGE_DATE,RECIPIENT DISCHARGE DATE FROM TX CENTER,TRR,PATIENT STATUS,NUM,TRR DATE OF DISCHARGE FROM TX CENTER,Unknown
94,END_DATE,"EARLIEST OF DATES OF REMOVAL FROM WAITING LIST, TRANSPLANT, DEATH, OR TIME COPY OF DATA CREATED",WAITING LIST DATA,WAITING LIST DATA,NUM,ENDING DATE FOR REGISTRATION,"IF PATIENT TRANSPLANTED OR DIED, BUT WAS REMOVED AFTER THE EVENT, END_DATE IS BACKDATED TO GIVE THE DATE OF EVENT"
157,INIT_DATE,BEGINNING DATE FOR REGISTRATION,WAITING LIST DATA,WAITING LIST DATA,NUM,BEGINNING DATE FOR REGISTRATION,Unknown
235,PX_STAT_DATE,RECIPIENT STATUS DATE,TRR/TRF-CALCULATED,PATIENT STATUS,NUM,"DATE OF DEATH, RE-TX OR LAST FOLLOW-UP",Unknown
242,RECOVERY_DATE_DON,ORGAN RECOVERY DATE,DDR / LDR,ORGAN RECOVERY,NUM,RECOVERY DATE (SENT TO OR),Unknown
244,REFERRAL_DATE,DATE OF REFERRAL CALL,DDR,PROVIDER INFORMATION,NUM,DDR DATE OF REFERRAL CALL,Unknown
266,TX_DATE,TRANSPLANT DATE,TRR,RECIPIENT INFORMATION,NUM,TRR TRANSPLANT DATE,Unknown


In [26]:
# convert the DATE columns to datetime
df_5[dateCols] = df_5[dateCols].apply(pd.to_datetime)

# reset index
df_5.reset_index(inplace=True, drop=True)
dfColDef.reset_index(inplace=True, drop=True)

In [27]:
# full dataframe
# NOTE(review): this is the same path df was loaded from in In [2], and df is not
# modified in the cells shown here, so this appears to rewrite the input unchanged — confirm it is needed
df.to_pickle("../Data/cleanDF.pkl")

# TX_YEAR > 2016 subset with the excessive-missing columns removed and the DATE columns converted to datetime
df_5.to_pickle("../Data/cleanDF_5_years.pkl")

# column definitions for the features that remain after the removal step
dfColDef.to_pickle("../Data/columnDefinition.pkl")