In [1]:
# path to user functions
import sys  
sys.path.append('../Src/')

from platform import python_version

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import importlib 

# import user functions
import UserUtilityFunctions as uf
import UserStatisticalFunctions as usf
import UserVisualization as uv

# set seaborn theme
sns.set_theme()

# initializing variables
REMOVE = '** REMOVE ** 3V2 Clean'
RANDOM_STATE = 1776

# print versions
print("Numpy Version: " + np.__version__)
print("Pandas Version: " + pd.__version__)
print("Seaborn Version: " + sns.__version__)
print("Matplotlib Version: " + plt.matplotlib.__version__)
print("Python Version: " + python_version())

# adjust pandas display options to max
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# adjust pandas display options to ensure full display of content
pd.set_option('display.max_colwidth', None)

Numpy Version: 1.26.4
Pandas Version: 2.2.3
Seaborn Version: 0.13.2
Matplotlib Version: 3.9.2
Python Version: 3.9.20


### Import Data

In [2]:
# import data
df = pd.read_pickle("../Data/Version2_Heart_Engineer.pkl")
df_label = pd.read_pickle("../Data/Version2_Label_Engineer.pkl")
df_can = pd.read_pickle("../Data/Version2_CAN_Engineer.pkl")
df_don = pd.read_pickle("../Data/Version2_DON_Engineer.pkl")
df_both = pd.read_pickle("../Data/Version2_BOTH_Engineer.pkl")
df_nominal = pd.read_pickle("../Data/Version2_Nominal_Engineer.pkl")
df_ordinal = pd.read_pickle("../Data/Version2_Ordinal_Engineer.pkl")
df_numeric = pd.read_pickle("../Data/Version2_Numeric_Engineer.pkl")
df_remove = pd.read_pickle("../Data/Version2_Remove_Engineer.pkl")
df_dict = pd.read_pickle("../Data/Version2_Dictionary_Engineer.pkl")

# checking for duplicated column name
df.columns[df.columns.duplicated()]

Index([], dtype='object')

### Feature Information

In [3]:
print(f"Heart Dataset Rows: {df.shape[0]:,} & Columns: {df.shape[1]:,}")
print(f"Label Features: {df_label.shape[0]:,}")
print(f"Candidate Features: {df_can.shape[0]:,}")
print(f"Donor Features: {df_don.shape[0]:,}")
print(f"Both Features: {df_both.shape[0]:,}")
print(f"Nominal Features: {df_nominal.shape[0]:,}")
print(f"Ordinal Features: {df_ordinal.shape[0]:,}")
print(f"Numeric Features: {df_numeric.shape[0]:,}")
print(f"Remove Features: {df_remove.shape[0]:,}")

Heart Dataset Rows: 16,126 & Columns: 260
Label Features: 14
Candidate Features: 33
Donor Features: 33
Both Features: 9
Nominal Features: 189
Ordinal Features: 189
Numeric Features: 46
Remove Features: 32


### Examine Features

In [4]:
df = df[df.ListingYear.isin([2016,2017,2018,2019,2020, 2021])].copy()

df.shape

AttributeError: 'DataFrame' object has no attribute 'ListingYear'

In [None]:
def getFeatureList(data, string):
    # initialize features
    features = data.columns[data.columns.str.contains(string)].to_list()

    # display
    print(data[features].describe(include='all').T.to_string())

    return features


def contingencyTable(data, group, label, observed=False, fill_value=0):
    # create contingency table
    data = data.groupby(group, observed=observed)[label].value_counts().unstack(fill_value=fill_value)
    
    # get index name
    indexName = data.index.name

     # add row totals
    rowTotals = data.sum(axis=1)
    
    # calculate row-wise percentages
    if 'Dead' in data.columns:
        data['Dead %'] = (data['Dead'] / rowTotals) * 100
    else:
        data['Dead %'] = 0

    if 'Living' in data.columns:
        data['Living %'] = (data['Living'] / rowTotals) * 100
    else:
        data['Living %'] = 0
    
    return data


from sklearn.mixture import GaussianMixture

def GaussianMixtureBinning(data, columList, n_init=10, seed=RANDOM_STATE):
    """
    This function is designed to fit a Gaussian Mixture Model (GMM) with different numbers of 
    components (clusters) and use information criteria (AIC and BIC) to determine the optimal 
    number of components. It then visualizes the results using a plot to help identify the best 
    number of components for the GMM.
    """
    # initialize fit GMM with different number of components and select the best using AIC or BIC
    aic = [] # AIC (Akaike Information Criterion)    Lower the Better
    bic = [] # BIC (Bayesian Information Criterion)  Lower the Better
    components_range = range(1, 11)  # 1 to 10 components
    # remove any NaNs
    data = data[columList].dropna()
    
    for n in components_range:
        gmm = GaussianMixture(n_components=n, n_init=n_init, random_state=seed)
        gmm.fit(data[columList])
        aic.append(gmm.aic(data[columList]))
        bic.append(gmm.bic(data[columList]))
    
    # Plot AIC and BIC to find the optimal number of components
    plt.plot(components_range, aic, label='AIC')
    plt.plot(components_range, bic, label='BIC')
    plt.xlabel('Number of Components')
    plt.ylabel('AIC/BIC')
    plt.legend()
    plt.title('AIC and BIC for GMM')
    plt.show()


def imputeGaussianMixture(dataSeries):
    """
    This function is designed to impute missing values in a given dataSeries 
    (a pandas Series or similar object) using a Gaussian Mixture Model (GMM).
    """
    # change series into Array
    data = np.array(dataSeries.to_list())
    # fit GMM to the observed data
    gmm = GaussianMixture(n_components=2)
    observed_data = data[~np.isnan(data)]
    gmm.fit(observed_data.reshape(-1, 1))
    
    # impute missing values by sampling from the GMM
    missing_data = np.isnan(data)
    imputed_values = gmm.sample(np.sum(missing_data))[0]
    
    # fill missing data with the imputed values
    data[missing_data] = imputed_values.flatten()
    
    return data

In [None]:
NaNsDF = uf.percentageNull(df)

# display NaNs
NaNsDF

In [6]:
# get numeric data
numericCols = df_numeric.column.to_list()

# display
df[numericCols].head()

Unnamed: 0,GraftLifeSpanDay,LastFollowupNumber,TransplantSurvivalDay,Age_DON,Age_CAN,Age_Listing_CAN,BMI_CAN,BMI_DON,BloodUreaNitrogenLevel_DON,WeightKg_DON,WeightKg_CAN,HeightCm_CAN,HeightCm_DON,Creatinine_DON,CreatinineTransplant_CAN,CreatinineListing_CAN,HemodynamicsRegistration_CO_CAN,HemodynamicsTransplant_CO_CAN,HemodynamicsRegistration_PA_DIA_CAN,HemodynamicsTransplant_PA_DIA_CAN,HemodynamicsRegistration_PA_MN_CAN,HemodynamicsTransplant_PA_MN_CAN,HemodynamicsRegistration_PCW_CAN,HemodynamicsTransplant_PCW_CAN,HemodynamicsRegistration_SYS_CAN,HemodynamicsTransplant_SYS_CAN,TotalDayWaitList_CAN,TotalBilirubinTransplant_CAN,TerminalTotalBilirubin_DON,CPRA_Recent_CAN,CPRA_Peak_CAN,OrganRecovery_PCO2_DON,Level_SGOT_AST_DON,Level_SGOT_ALT_DON,DistanceFromDonorHospitaltoTXCenter,LengthOfStay,Hematocrit_DON,IschemicTimeHour_DON,BloodPH_DON,LV_EjectionFractionPercent_DON,LungPO2_DON,LungPO2_FIO2_DON,PanelReactiveAntibody_CPRA_Mean_CAN,Hemodynamics_Mean_CAN,Hemodynamics_CO_Mean_CAN,Creatinine_Mean_CAN
0,1549.0,50,1549.0,56.0,52,52,30.1,27.141582,9.0,73.0,87.1,170.2,164.0,0.6,1.1,1.0,3.0,3.0,22.0,22.0,30.0,30.0,22.0,22.0,44.0,44.0,1,0.7,0.8,2.0,2.0,36.5,21.0,50.0,58.0,11.0,35.9,3.0,7.42,65.0,93.7,40.0,2.0,29.5,2.0,1.05
1,1827.0,50,1827.0,18.0,34,34,24.7,33.832004,56.0,113.3,68.0,166.0,183.0,1.3,0.8,0.8,4.45,4.45,21.0,21.0,25.0,25.0,14.0,14.0,34.0,34.0,18,0.6,2.1,0.0,0.0,46.4,148.0,167.0,102.0,8.0,27.8,3.2,7.3,70.0,105.0,40.0,0.0,23.5,0.0,0.8
2,1677.0,50,1677.0,47.0,43,43,34.6,34.130201,13.0,111.0,85.7,157.5,180.3,1.1,1.3,0.8,4.17,4.17,37.0,37.0,39.0,39.0,30.0,30.0,53.0,53.0,7,3.4,0.3,,,41.0,36.0,73.0,8.0,21.0,29.2,1.8,7.37,73.0,324.0,100.0,,39.75,4.17,1.05
3,9.0,1,9.0,32.0,72,72,23.8,23.055556,20.0,74.7,68.9,170.2,180.0,0.58,1.2,1.4,,5.61,21.0,21.0,29.0,29.0,22.0,22.0,43.0,43.0,9,0.5,4.0,0.0,0.0,40.3,88.0,95.0,616.0,9.0,32.5,4.4,7.45,70.0,236.0,100.0,0.0,28.75,0.0,1.3
4,1840.0,50,1840.0,44.0,65,65,27.4,25.577717,99.0,66.3,81.6,172.7,161.0,3.0,1.3,1.2,5.99,4.34,13.0,25.0,17.0,36.0,11.0,,25.0,54.0,30,0.9,0.9,0.0,0.0,43.0,23.0,18.0,8.0,25.0,28.2,2.7,7.49,55.0,508.0,100.0,0.0,25.857143,0.0,1.25


### Save

In [149]:
# heart dataset
uf.writeToFile(df, 'Version2_Heart_Clean',path='../Data/', format='pkl')

# heart label
uf.writeToFile(df_label, 'Version2_Label_Clean',path='../Data/', format='pkl')

# heart candidate
uf.writeToFile(df_can, 'Version2_CAN_Clean', format='pkl')

# heart donor
uf.writeToFile(df_don, 'Version2_DON_Clean', format='pkl')

# heart both
uf.writeToFile(df_both, 'Version2_BOTH_Clean', format='pkl')

# heart nominal
uf.writeToFile(df_nominal, 'Version2_Nominal_Clean', format='pkl')

# heart ordinal
uf.writeToFile(df_ordinal, 'Version2_Ordinal_Clean', format='pkl')

# heart numeric
uf.writeToFile(df_numeric, 'Version2_Numeric_Clean', format='pkl')

# heart remove
uf.writeToFile(df_remove, 'Version2_Remove_Clean', format='pkl')

# heart data dictionary
uf.writeToFile(df_dict, 'Version2_Dictionary_Clean', format='pkl')

16,126 records written to ../Data/Version2_Heart_Engineer.pkl
14 records written to ../Data/Version2_Label_Engineer.pkl
33 records written to ../Data/Version2_CAN_Engineer.pkl
33 records written to ../Data/Version2_DON__Engineer.pkl
9 records written to ../Data/Version2_BOTH_Engineer.pkl
189 records written to ../Data/Version2_Nominal_Engineer.pkl
189 records written to ../Data/Version2_Ordinal_Engineer.pkl
46 records written to ../Data/Version2_Numeric_Engineer.pkl
32 records written to ../Data/Version2_Remove_Engineer.pkl
302 records written to ../Data/Version2_Dictionary_Engineer.pkl
