In [1]:
# path to user functions
import sys  
sys.path.append('../Src/')

from platform import python_version

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import importlib 
from sklearn.preprocessing import OrdinalEncoder

# import user functions
import UserUtilityFunctions as uf
import UserStatisticalFunctions as usf
import UserVisualization as uv

# set seaborn theme
sns.set_theme()

# initializing variables
REMOVE = '** REMOVE ** CAT ML'
RANDOM_STATE = 1776

# initialize DataFrame
df_drop = pd.DataFrame(columns=['column'])
df_unknown = pd.DataFrame(columns=['column'])
df_object = pd.DataFrame(columns=['column'])
df_date = pd.DataFrame(columns=['column'])

# print versions
print("Numpy Version: " + np.__version__)
print("Pandas Version: " + pd.__version__)
print("Seaborn Version: " + sns.__version__)
print("Matplotlib Version: " + plt.matplotlib.__version__)
print("Python Version: " + python_version())

# adjust pandas display options to max
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# adjust pandas display options to ensure full display of content
pd.set_option('display.max_colwidth', None)

Numpy Version: 1.26.4
Pandas Version: 2.2.3
Seaborn Version: 0.13.2
Matplotlib Version: 3.9.2
Python Version: 3.9.20


####  Mann-Whitney U Test

#####  Mann-Whitney U test
- The Mann-Whitney U test is a non-parametric statistical test used to determine whether there is a significant difference between the distributions of two independent samples. It is often used as an alternative to the t-test when the assumption of normality is not met.
    - Key Assumptions
        - The two samples are independent.
        - The data is ordinal, interval, or ratio scale.
        - The distributions of the two groups are similar in shape if testing medians.
    - Interpretation
        - If the p-value is less than your significance level (e.g., 0.05), you reject the null hypothesis and conclude that there is a statistically significant difference between the two groups.
        - If the p-value is greater than the significance level, you fail to reject the null hypothesis, indicating insufficient evidence to suggest a difference in distributions.

### Import Data

In [2]:
# import data
df = pd.read_pickle("../Data/Clean_Heart_CAT_Engineer.pkl")
df_label = pd.read_pickle("../Data/Clean_Label_CAT_Engineer.pkl")
df_can = pd.read_pickle("../Data/Clean_CAN_CAT_Engineer.pkl")
df_don = pd.read_pickle("../Data/Clean_DON_CAT_Engineer.pkl")
df_both = pd.read_pickle("../Data/Clean_BOTH_CAT_Engineer.pkl")
df_nominal = pd.read_pickle("../Data/Clean_Nominal_CAT_Engineer.pkl")
df_ordinal = pd.read_pickle("../Data/Clean_Ordinal_CAT_Engineer.pkl")
df_numeric = pd.read_pickle("../Data/Clean_Numeric_CAT_Engineer.pkl")
df_dict = pd.read_pickle("../Data/Clean_Dictionary_CAT_Engineer.pkl")

# checking for duplicated column name
df.columns[df.columns.duplicated()]

Index([], dtype='object')

In [3]:
print(f"Heart Dataset Rows: {df.shape[0]:,} & Columns: {df.shape[1]:,}")
print(f"Label Features: {df_label.shape[0]:,}")
print(f"Candidate Features: {df_can.shape[0]:,}")
print(f"Donor Features: {df_don.shape[0]:,}")
print(f"Date Features: {df_date.shape[0]:,}")
print(f"Both Features: {df_both.shape[0]:,}")
print(f"Object Features: {df_object.shape[0]:,}")
print(f"Numeric Features: {df_numeric.shape[0]:,}")
print(f"Ordinal Features: {df_ordinal.shape[0]:,}")
print(f"Nominal Features: {df_nominal.shape[0]:,}")
print(f"Drop Features: {df_drop.shape[0]:,}")
print(f"Unknown Features: {df_unknown.shape[0]:,}")

Heart Dataset Rows: 16,126 & Columns: 220
Label Features: 15
Candidate Features: 104
Donor Features: 94
Date Features: 0
Both Features: 8
Object Features: 0
Numeric Features: 4
Ordinal Features: 47
Nominal Features: 169
Drop Features: 0
Unknown Features: 0


### Examine Features

In [4]:
df[df_label.column.to_list()].head()

Unnamed: 0,AcuteRejectionEpisode,AirwayDehiscencePostTransplant,StrokePostTransplant,PacemakerPostTransplant,DialysisPostDischarge,GraftFailStatus,GraftLifeSpanDay,LastFollowupNumber,GraftStatus,TransplantStatus,TransplantSurvivalDay,RecipientStatus,RejectionTreatmentWithinOneYear,FunctionalStatusFollowUp,LengthOfStay
0,No,No,No,No,No,Success,1549.0,50,Yes,Alive,1549.0,Living,No,"100% - Normal, no complaints, no evidence of disease",11.0
1,No,No,No,No,No,Success,1827.0,50,Yes,Alive,1827.0,Living,No,90% - Able to carry on normal activity: minor symptoms of disease,8.0
2,"Yes, none treated with additional anti-rejection agent",No,No,No,No,Success,1677.0,50,Yes,Alive,1677.0,Living,No,80% - Normal activity with effort: some symptoms of disease,21.0
3,No,No,No,No,No,Failure,9.0,1,No,Dead,9.0,Dead,Missing,60% - Requires occasional assistance but is able to care for needs,9.0
4,No,No,No,No,No,Success,1840.0,50,Yes,Alive,1840.0,Living,No,Unknown,25.0


In [5]:
# remove unwanted labels
removeCols = df_label.column[~df_label.column.isin(['TransplantSurvivalDay'])].to_list()

# remove features
df, df_dict, df_label, df_can, df_don, df_both, df_ordinal, df_nominal, df_numeric, df_drop, df_object, df_unknown, df_date = uf.HouseKeeping(df, removeCols, df_dict, df_label, df_can,\
                                                                df_don, df_both, df_ordinal, df_nominal, df_numeric, df_drop, df_object, df_unknown, df_date, txt=REMOVE, display=True)

Data Dictionary Updated.
Remove 14 row(s) from df_label DataFrame.
Remove 0 row(s) from df_can DataFrame.
Remove 0 row(s) from df_don DataFrame.
Remove 0 row(s) from df_both DataFrame.
Remove 1 row(s) from df_ordinal DataFrame.
Remove 10 row(s) from df_nominal DataFrame.
Remove 3 row(s) from df_numeric DataFrame.
Remove 0 row(s) from df_drop DataFrame.
Remove 0 row(s) from df_object DataFrame.
Remove 0 row(s) from df_unknown DataFrame.
Remove 0 row(s) from df_date DataFrame.

Removed Features: ['AcuteRejectionEpisode', 'AirwayDehiscencePostTransplant', 'DialysisPostDischarge', 'FunctionalStatusFollowUp', 'GraftFailStatus', 'GraftLifeSpanDay', 'GraftStatus', 'LastFollowupNumber', 'LengthOfStay', 'PacemakerPostTransplant', 'RecipientStatus', 'RejectionTreatmentWithinOneYear', 'StrokePostTransplant', 'TransplantStatus']

Total Row(s) & Column(s) Before Removing Column(s): 16,126 & columns: 220
Total Row(s) & Column(s) After Removing Column(s): 16,126 & columns: 206


### Determine Candidate Survival for 600 Days

In [6]:
# initialize
removeCols = ['TransplantSurvivalDay']

# create new feature
df['Survival'] = df.TransplantSurvivalDay >= 600

# remove features
df, df_dict, df_label, df_can, df_don, df_both, df_ordinal, df_nominal, df_numeric, df_drop, df_object, df_unknown, df_date = uf.HouseKeeping(df, removeCols, df_dict, df_label, df_can,\
                                                                df_don, df_both, df_ordinal, df_nominal, df_numeric, df_drop, df_object, df_unknown, df_date, txt=REMOVE, display=True)

Data Dictionary Updated.
Remove 1 row(s) from df_label DataFrame.
Remove 0 row(s) from df_can DataFrame.
Remove 0 row(s) from df_don DataFrame.
Remove 0 row(s) from df_both DataFrame.
Remove 0 row(s) from df_ordinal DataFrame.
Remove 0 row(s) from df_nominal DataFrame.
Remove 1 row(s) from df_numeric DataFrame.
Remove 0 row(s) from df_drop DataFrame.
Remove 0 row(s) from df_object DataFrame.
Remove 0 row(s) from df_unknown DataFrame.
Remove 0 row(s) from df_date DataFrame.

Removed Features: ['TransplantSurvivalDay']

Total Row(s) & Column(s) Before Removing Column(s): 16,126 & columns: 207
Total Row(s) & Column(s) After Removing Column(s): 16,126 & columns: 206


In [7]:
# sanity check
uf.datatypeDF(df, display=True)

Total Data feature count:  206

Boolean feature count: 1
Category feature count: 205
Numeric feature count: 0
Object feature count: 0
Other feature count: 0

Total feature count:  206


In [8]:
# sanity check
print(f"Total Length: {len(df_nominal.column.to_list()) + len(df_ordinal.column.to_list())}")
nominal = df_nominal.column.to_list()
ordinal = df_ordinal.column.to_list()
allCols = df.columns.to_list()
# display the difference
set(allCols).symmetric_difference(set(ordinal)|set(nominal))

Total Length: 205


{'Survival'}

In [9]:
# display NaNs
uf.percentageNull(df)

Unnamed: 0,percentage,NaNCount


In [10]:
# check label
df.Survival.value_counts()

Survival
True     8274
False    7852
Name: count, dtype: int64

In [11]:
# remove unused categories
df = uf.removeCatZeroCount(df).copy()

### Engineer / Remove Features

##### Test of Independence for Categorical Variables
- Null Hypothesis (H0): The two variables are not related.
- Alternative Hypothesis (H1): The two variables are related.
- A low p-value (typically < 0.05) indicates a significant relationship between the variables.
- This is the Chi-Square test statistic. It measures the discrepancy between the observed and expected frequencies under the assumption that the variables are independent.
    - A larger `chi2` value indicates a greater difference between observed and expected counts, suggesting that the variables are more likely to be associated.
    - A small p_value (typically < 0.05) suggests that you can reject the null hypothesis and conclude that there is a significant association between the two variables.

##### Cramer's V values can be interpreted as:
- Cramer's V is a measure of association between two nominal (categorical) variables, based on the Chi-Square statistic and varies from `0` (corresponding to no association between the variables) to `1` (complete association) and can reach 1 only when each variable is completely determined by the other.
    - 0.00 to 0.10: Negligible association
    - 0.10 to 0.30: Weak association
    - 0.30 to 0.50: Moderate association
    - 0.50 to 0.70: Strong association
    - 0.70 to 1.00: Very strong association

#### User Function(s)

In [12]:
def getFeatureList(data, string):
    # initialize features
    features = data.columns[data.columns.str.contains(string)].to_list()

    # display
    print(data[features].describe(include='all').T.to_string())

    return features


def Consolidate_SERSTAT(value):
    if value.startswith('Positive'):
        return 'Positive'
    elif value.startswith('Indeterminate'):
        return 'Indeterminate' 
    elif value.startswith('Negative'):
        return 'Negative'
    elif value.startswith('Not'):
        return 'Not Done'
    else:
        return 'Missing/Unknown'

### Consolidate Categoriess

#### SERSTAT

In [13]:
# get features to consolidate
features = df_dict.Feature[df_dict.Information.str.contains('FMTNAME: SERSTAT')].to_list()

# change datatype
df[features] = df[features].astype(str)

# display
print(features)

# apply new mapping
df[features] = df[features].map(Consolidate_SERSTAT).fillna("Re-Check")

# change to category
df[features] = df[features].astype('category')

['EpsteinBarr_IGG_DON', 'EpsteinBarr_IGM_DON', 'EpsteinBarrSeroStatusTransplant_CAN', 'AntibodyResultHBSAB_DON', 'Hepatitis_B_CoreAntibody_CAN', 'Hepatitis_B_CoreAntibody_DON', 'HBV_NAT_Result_CAN', 'HBV_NAT_Result_DON', 'SurfaceAntigenHEP_B_CAN', 'SurfaceAntigenHEP_B_DON', 'SurfaceHBVAntibodyTotalTransplant_CAN', 'HCV_NAT_PreTranspant_CAN', 'HCV_NAT_Result_DON', 'HEP_C_SerostatusStatus_CAN', 'Antibody_HEP_C_DON', 'HIV_NAT_PreTransplant_CAN', 'HIV_NAT_Result_DON', 'HIV_SeroStatusTransplant_CAN', 'AntibodyResultRPR_VDRL_DON']


### Nominals

In [14]:
print(sorted(df_nominal.column.to_list()))

['AllocationType_DON', 'AntiHypertensive_DON', 'AntibodyResultHBSAB_DON', 'AntibodyResultRPR_VDRL_DON', 'Antibody_HEP_C_DON', 'AntigenBW4_CAN', 'AntigenBW6_CAN', 'AntigenC1_CAN', 'AntigenC2_CAN', 'AntigenDR1_DON', 'AntigenDR2_DON', 'AntigenDR51_2_CAN', 'AntigenDR51_CAN', 'AntigenDR52_2_CAN', 'AntigenDR52_CAN', 'AntigenDR53_2_CAN', 'AntigenDR53_CAN', 'ArginnieManagement_DON', 'Biopsy_DON', 'BloodGroupMatchLevel', 'BloodGroup_CAN', 'BloodGroup_DON', 'BloodInfectionSource_DON', 'BloodPH_CAT_DON', 'BronchoscopyLeft_DON', 'BronchoscopyRight_DON', 'CMVStatus_Transplant_CAN', 'CMV_IGG_Transplant_CAN', 'CMV_IGM_Transplant_CAN', 'CancerExtraCranial_DON', 'CancerHistory_DON', 'CancerIntraCranial_DON', 'CancerSkin_DON', 'CardiacArrest_DON', 'CauseOfDeath_DON', 'CerebroVascularDisease_CAN', 'ChestXray_DON', 'CigaretteHistory_DON', 'CigaretteUse_CAN', 'Citizenship_CAN', 'Citizenship_DON', 'CocaineUse_DON', 'CoronaryAngiogram_DON', 'CrossMatchDone', 'DeathCircumstance_DON', 'DeathMechanism_DON', 'De

#### AllocationType_DON

In [15]:
features = getFeatureList(df, 'AllocationType_DON')

                    count unique    top  freq
AllocationType_DON  16126      4  Local  6605


In [16]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival              Dead  Living  Row Total     Dead %   Living %
AllocationType_DON                                                 
Foreign Donor          4.0    11.0       15.0  26.666667  73.333333
Local               2124.0  4481.0     6605.0  32.157456  67.842544
National            3541.0  2325.0     5866.0  60.364814  39.635186
Regional            2183.0  1457.0     3640.0  59.972527  40.027473
Column Total        7852.0  8274.0    16126.0  48.691554  51.308446




#### AntiHypertensive_DON

In [17]:
features = getFeatureList(df, 'AntiHypertensive_DON')

                      count unique top   freq
AntiHypertensive_DON  16126      4  No  10374


In [18]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                Dead  Living  Row Total     Dead %   Living %
AntiHypertensive_DON                                                 
Missing                181.0    12.0      193.0  93.782383   6.217617
No                    4976.0  5398.0    10374.0  47.966069  52.033931
Unknown                  1.0     5.0        6.0  16.666667  83.333333
Yes                   2694.0  2859.0     5553.0  48.514317  51.485683
Column Total          7852.0  8274.0    16126.0  48.691554  51.308446




#### Antibody
- Hepatitis_B_CoreAntibody_CAN & SurfaceHBVAntibodyTotalTransplant_CAN & Hepatitis_B_CoreAntibody_DON & Antibody_HEP_C_DON & AntibodyResultRPR_VDRL_DON & AntibodyResultHBSAB_DON & PanelReactiveAntibody_CPRA_CAT_CAN

In [19]:
features = getFeatureList(df, 'Antibody')

                                       count unique               top   freq
Hepatitis_B_CoreAntibody_CAN           16126      4          Negative  14454
SurfaceHBVAntibodyTotalTransplant_CAN  16126      4          Negative  10565
Hepatitis_B_CoreAntibody_DON           16126      4          Negative  15744
Antibody_HEP_C_DON                     16126      4          Negative  14735
AntibodyResultRPR_VDRL_DON             16126      4          Negative  15965
AntibodyResultHBSAB_DON                16126      5          Not Done  13954
PanelReactiveAntibody_CPRA_CAT_CAN     16126      7  No Sensitization   7459


In [20]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,Hepatitis_B_CoreAntibody_CAN,SurfaceHBVAntibodyTotalTransplant_CAN,7942.404844,0.0,0.405184
18,AntibodyResultRPR_VDRL_DON,AntibodyResultHBSAB_DON,5783.059078,0.0,0.345744
15,Antibody_HEP_C_DON,AntibodyResultRPR_VDRL_DON,5000.901316,0.0,0.321514
11,Hepatitis_B_CoreAntibody_DON,Antibody_HEP_C_DON,2486.559347,0.0,0.226712
16,Antibody_HEP_C_DON,AntibodyResultHBSAB_DON,846.641275,1.6349210000000002e-173,0.13229
12,Hepatitis_B_CoreAntibody_DON,AntibodyResultRPR_VDRL_DON,341.845434,3.3673940000000002e-68,0.08406
13,Hepatitis_B_CoreAntibody_DON,AntibodyResultHBSAB_DON,232.868147,5.05166e-43,0.069379
4,Hepatitis_B_CoreAntibody_CAN,AntibodyResultHBSAB_DON,197.943445,8.665125e-36,0.063966
5,Hepatitis_B_CoreAntibody_CAN,PanelReactiveAntibody_CPRA_CAT_CAN,186.768384,4.3536999999999996e-30,0.062134
10,SurfaceHBVAntibodyTotalTransplant_CAN,PanelReactiveAntibody_CPRA_CAT_CAN,136.281028,3.3264909999999996e-20,0.053075


In [21]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                        Dead  Living  Row Total     Dead %   Living %
Hepatitis_B_CoreAntibody_CAN                                                 
Missing/Unknown                420.0    22.0      442.0  95.022624   4.977376
Negative                      6860.0  7594.0    14454.0  47.460910  52.539090
Not Done                       218.0   283.0      501.0  43.512974  56.487026
Positive                       354.0   375.0      729.0  48.559671  51.440329
Column Total                  7852.0  8274.0    16126.0  48.691554  51.308446


Survival                                 Dead  Living  Row Total     Dead %   Living %
SurfaceHBVAntibodyTotalTransplant_CAN                                                 
Missing/Unknown                         632.0   358.0      990.0  63.838384  36.161616
Negative                               4890.0  5675.0    10565.0  46.284903  53.715097
Not Done                                182.0   405.0      587.0  31.005111  68.994889
Positive         

#### Antigen

##### AntigenBW
- AntigenBW4_CAN & AntigenBW6_CAN

In [22]:
features = getFeatureList(df, 'AntigenBW')

                count unique top   freq
AntigenBW4_CAN  16126      4   0  10589
AntigenBW6_CAN  16126      4   0  10383


In [23]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,AntigenBW4_CAN,AntigenBW6_CAN,14787.906206,0.0,0.552878


In [24]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival          Dead  Living  Row Total     Dead %    Living %
AntigenBW4_CAN                                                  
0               5093.0  5496.0    10589.0  48.097082   51.902918
Negative         845.0   929.0     1774.0  47.632469   52.367531
Not Tested         0.0     2.0        2.0   0.000000  100.000000
Positive        1914.0  1847.0     3761.0  50.890721   49.109279
Column Total    7852.0  8274.0    16126.0  48.691554   51.308446


Survival          Dead  Living  Row Total      Dead %   Living %
AntigenBW6_CAN                                                  
0               4994.0  5389.0    10383.0   48.097852  51.902148
Negative         381.0   384.0      765.0   49.803922  50.196078
Not Tested         1.0     0.0        1.0  100.000000   0.000000
Positive        2476.0  2501.0     4977.0   49.748845  50.251155
Column Total    7852.0  8274.0    16126.0   48.691554  51.308446




#### ArginnieManagement_DON

In [25]:
features = getFeatureList(df, 'Arginnie')

                        count unique  top   freq
ArginnieManagement_DON  16126      4  Yes  11275


In [26]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                  Dead  Living  Row Total     Dead %   Living %
ArginnieManagement_DON                                                 
Missing                  181.0    12.0      193.0  93.782383   6.217617
No                      2249.0  2405.0     4654.0  48.324022  51.675978
Unknown                    2.0     2.0        4.0  50.000000  50.000000
Yes                     5420.0  5855.0    11275.0  48.070953  51.929047
Column Total            7852.0  8274.0    16126.0  48.691554  51.308446




#### Biopsy_DON

In [27]:
features = getFeatureList(df, 'Biopsy')

            count unique              top   freq
Biopsy_DON  16126      5  Biopsy not done  15920


In [28]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                        Dead  Living  Row Total     Dead %    Living %
Biopsy_DON                                                                    
Biopsy not done               7667.0  8253.0    15920.0  48.159548   51.840452
Missing                        181.0    12.0      193.0  93.782383    6.217617
Unknow                           2.0     2.0        4.0  50.000000   50.000000
Yes, rejection confirmed         2.0     3.0        5.0  40.000000   60.000000
Yes, rejection not confirmed     0.0     4.0        4.0   0.000000  100.000000
Column Total                  7852.0  8274.0    16126.0  48.691554   51.308446




#### BloodGroup
- BloodGroup_CAN & BloodGroup_DON & BloodGroupMatchLevel

In [29]:
features = getFeatureList(df, 'BloodGroup')

                      count unique        top   freq
BloodGroup_CAN        16126      8          A   6419
BloodGroup_DON        16126      8          O   8110
BloodGroupMatchLevel  16126      3  Identical  13718


In [30]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,BloodGroup_CAN,BloodGroup_DON,26338.660587,0.0,0.483042
1,BloodGroup_CAN,BloodGroupMatchLevel,3616.335729,0.0,0.334854
2,BloodGroup_DON,BloodGroupMatchLevel,1007.907794,3.1463500000000005e-206,0.17678


In [31]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival          Dead  Living  Row Total     Dead %    Living %
BloodGroup_CAN                                                  
A               3012.0  3407.0     6419.0  46.923197   53.076803
A1                34.0     9.0       43.0  79.069767   20.930233
A1B                2.0     1.0        3.0  66.666667   33.333333
A2                 5.0     1.0        6.0  83.333333   16.666667
A2B                0.0     2.0        2.0   0.000000  100.000000
AB               417.0   489.0      906.0  46.026490   53.973510
B               1237.0  1311.0     2548.0  48.547881   51.452119
O               3145.0  3054.0     6199.0  50.733989   49.266011
Column Total    7852.0  8274.0    16126.0  48.691554   51.308446


Survival          Dead  Living  Row Total     Dead %   Living %
BloodGroup_DON                                                 
A               1317.0  1413.0     2730.0  48.241758  51.758242
A1              1190.0  1405.0     2595.0  45.857418  54.142582
A1B               44.0    5

#### BloodPH_CAT_DON

In [32]:
features = getFeatureList(df, 'BloodPH')

                 count unique     top  freq
BloodPH_CAT_DON  16126      4  Normal  9454


In [33]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival           Dead  Living  Row Total     Dead %   Living %
BloodPH_CAT_DON                                                 
Acidic            857.0  1057.0     1914.0  44.775340  55.224660
Alkaline         2212.0  2333.0     4545.0  48.668867  51.331133
Missing           188.0    25.0      213.0  88.262911  11.737089
Normal           4595.0  4859.0     9454.0  48.603766  51.396234
Column Total     7852.0  8274.0    16126.0  48.691554  51.308446




#### Bronchoscopy
- BronchoscopyLeft_DON & BronchoscopyRight_DON
- New Feature Bronchoscopy_Combined

In [34]:
features = getFeatureList(df, 'Bronchoscopy')

                       count unique      top  freq
BronchoscopyLeft_DON   16126      9  Missing  8284
BronchoscopyRight_DON  16126      9  Missing  8344


In [35]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,BronchoscopyLeft_DON,BronchoscopyRight_DON,87934.054043,0.0,0.825601


In [36]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                               Dead  Living  Row Total     Dead %    Living %
BronchoscopyLeft_DON                                                                 
Abnormal-anatomy/other lesion          50.0    46.0       96.0  52.083333   47.916667
Abnormal-aspiration of foreign body    29.0    31.0       60.0  48.333333   51.666667
Abnormal-blood                        200.0   274.0      474.0  42.194093   57.805907
Abnormal-purulent secretions          492.0   674.0     1166.0  42.195540   57.804460
Missing                              4304.0  3980.0     8284.0  51.955577   48.044423
No Bronchoscopy                       177.0   185.0      362.0  48.895028   51.104972
Normal                               2595.0  3075.0     5670.0  45.767196   54.232804
Unknown                                 5.0     8.0       13.0  38.461538   61.538462
Unknown if bronchoscopy performed       0.0     1.0        1.0   0.000000  100.000000
Column Total                         7852.0  8274.0   

In [37]:
def combine_bronchoscopy(left, right):
    if left == 'Normal' and right == 'Normal':
        return 'Normal'
    elif left.startswith('Abnormal') or right.startswith('Abnormal'):
        return 'Abnormal'
    elif left == 'No Bronchoscopy' and right == 'No Bronchoscopy':
        return 'No Bronchoscopy'
    else:
        return 'Missing/Unknown'

In [38]:
# new feature
df['Bronchoscopy_Combined_DON'] = df.apply(lambda row: combine_bronchoscopy(row['BronchoscopyLeft_DON'], row['BronchoscopyRight_DON']), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_don  = uf.insertIntoDataFrame(df_don, ['Bronchoscopy_Combined_DON'])
df_nominal  = uf.insertIntoDataFrame(df_nominal, ['Bronchoscopy_Combined_DON'])

# convert to category
df = uf.toCategory(df, ['Bronchoscopy_Combined_DON'])

In [39]:
uf.categoryContingencySurvival(df, 'Bronchoscopy_Combined_DON')

Survival,Dead,Living,Row Total,Dead %,Living %
Bronchoscopy_Combined_DON,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abnormal,1029.0,1338.0,2367.0,43.47275,56.52725
Missing/Unknown,4362.0,4093.0,8455.0,51.590775,48.409225
No Bronchoscopy,165.0,171.0,336.0,49.107143,50.892857
Normal,2296.0,2672.0,4968.0,46.215781,53.784219
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### CMV
- CMVStatus_Transplant_CAN & CMV_IGG_Transplant_CAN & CMV_IGM_Transplant_CAN & SerologyAntiCMV_DON

In [40]:
features = getFeatureList(df, 'CMV')

                          count unique       top   freq
CMVStatus_Transplant_CAN  16126      5  Positive   8692
CMV_IGG_Transplant_CAN    16126      1   Missing  16126
CMV_IGM_Transplant_CAN    16126      1   Missing  16126
SerologyAntiCMV_DON       16126      6  Positive   9880


In [41]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
2,CMVStatus_Transplant_CAN,SerologyAntiCMV_DON,37.081308,0.011442,0.023976
0,CMVStatus_Transplant_CAN,CMV_IGG_Transplant_CAN,0.0,1.0,
1,CMVStatus_Transplant_CAN,CMV_IGM_Transplant_CAN,0.0,1.0,
3,CMV_IGG_Transplant_CAN,CMV_IGM_Transplant_CAN,0.0,1.0,
4,CMV_IGG_Transplant_CAN,SerologyAntiCMV_DON,0.0,1.0,
5,CMV_IGM_Transplant_CAN,SerologyAntiCMV_DON,0.0,1.0,


In [42]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                    Dead  Living  Row Total      Dead %   Living %
CMVStatus_Transplant_CAN                                                  
Missing                    407.0     0.0      407.0  100.000000   0.000000
Negative                  3225.0  3571.0     6796.0   47.454385  52.545615
Not Done                    84.0   121.0      205.0   40.975610  59.024390
Positive                  4124.0  4568.0     8692.0   47.445927  52.554073
Unknown                     12.0    14.0       26.0   46.153846  53.846154
Column Total              7852.0  8274.0    16126.0   48.691554  51.308446


Survival                  Dead  Living  Row Total     Dead %   Living %
CMV_IGG_Transplant_CAN                                                 
Missing                 7852.0  8274.0    16126.0  48.691554  51.308446
Column Total            7852.0  8274.0    16126.0  48.691554  51.308446


Survival                  Dead  Living  Row Total     Dead %   Living %
CMV_IGM_Transplant_CAN              

In [43]:
# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, ['CMV_IGG_Transplant_CAN','CMV_IGM_Transplant_CAN'])

#### ChestXray_DON

In [44]:
features = getFeatureList(df, 'ChestXray')

               count unique            top  freq
ChestXray_DON  16126      7  Abnormal-both  9278


In [45]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival           Dead  Living  Row Total     Dead %   Living %
ChestXray_DON                                                   
Abnormal-both    4607.0  4671.0     9278.0  49.655098  50.344902
Abnormal-left     593.0   714.0     1307.0  45.371079  54.628921
Abnormal-right    823.0   946.0     1769.0  46.523460  53.476540
Missing           439.0   340.0      779.0  56.354300  43.645700
No chest x-ray      5.0     1.0        6.0  83.333333  16.666667
Normal           1358.0  1595.0     2953.0  45.987132  54.012868
Results Unknown    27.0     7.0       34.0  79.411765  20.588235
Column Total     7852.0  8274.0    16126.0  48.691554  51.308446




#### Cigarette
- CigaretteUse_CAN & CigaretteAbstinence_CAN & CigaretteHistory_DON

In [46]:
features = getFeatureList(df, 'Cigarette')

                         count unique      top   freq
CigaretteUse_CAN         16126      3       No   9276
CigaretteAbstinence_CAN  16126     10  Missing   9349
CigaretteHistory_DON     16126      4       No  13924


In [47]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,CigaretteUse_CAN,CigaretteAbstinence_CAN,16126.0,0.0,0.707107
2,CigaretteAbstinence_CAN,CigaretteHistory_DON,19.778772,0.839911,0.02022
1,CigaretteUse_CAN,CigaretteHistory_DON,4.537247,0.604376,0.011861


In [48]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival            Dead  Living  Row Total      Dead %   Living %
CigaretteUse_CAN                                                  
Missing             73.0     0.0       73.0  100.000000   0.000000
No                4540.0  4736.0     9276.0   48.943510  51.056490
Yes               3239.0  3538.0     6777.0   47.794009  52.205991
Column Total      7852.0  8274.0    16126.0   48.691554  51.308446


Survival                   Dead  Living  Row Total     Dead %   Living %
CigaretteAbstinence_CAN                                                 
0-2 months                 81.0    87.0      168.0  48.214286  51.785714
13-24 months              283.0   280.0      563.0  50.266430  49.733570
25-36 months              171.0   168.0      339.0  50.442478  49.557522
3-12 months               502.0   596.0     1098.0  45.719490  54.280510
37-48 months              132.0   123.0      255.0  51.764706  48.235294
49-60 months               99.0   119.0      218.0  45.412844  54.587156
>60 months  

In [49]:
# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, ['CigaretteAbstinence_CAN'])

#### Cancer
- CancerExtraCranial_DON & CancerIntraCranial_DON & CancerHistory_DON & CancerSkin_DON
- Consolidated Cancer_CountTotal_DON

In [50]:
features = getFeatureList(df, 'Cancer')

                        count unique top   freq
CancerExtraCranial_DON  16126      4  No  15812
CancerIntraCranial_DON  16126      4  No  15813
CancerHistory_DON       16126      4  No  15788
CancerSkin_DON          16126      4  No  15831


In [51]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
2,CancerExtraCranial_DON,CancerSkin_DON,30163.91479,0.0,0.789623
0,CancerExtraCranial_DON,CancerIntraCranial_DON,30010.382128,0.0,0.787611
4,CancerIntraCranial_DON,CancerSkin_DON,29703.152245,0.0,0.783569
3,CancerIntraCranial_DON,CancerHistory_DON,5485.081283,0.0,0.336719
5,CancerHistory_DON,CancerSkin_DON,5152.772003,0.0,0.32636
1,CancerExtraCranial_DON,CancerHistory_DON,4710.037669,0.0,0.312024


In [52]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                  Dead  Living  Row Total     Dead %   Living %
CancerExtraCranial_DON                                                 
Missing                  182.0    12.0      194.0  93.814433   6.185567
No                      7604.0  8208.0    15812.0  48.090058  51.909942
Unknown                   53.0    36.0       89.0  59.550562  40.449438
Yes                       13.0    18.0       31.0  41.935484  58.064516
Column Total            7852.0  8274.0    16126.0  48.691554  51.308446


Survival                  Dead  Living  Row Total     Dead %   Living %
CancerIntraCranial_DON                                                 
Missing                  182.0    12.0      194.0  93.814433   6.185567
No                      7600.0  8213.0    15813.0  48.061721  51.938279
Unknown                   57.0    35.0       92.0  61.956522  38.043478
Yes                       13.0    14.0       27.0  48.148148  51.851852
Column Total            7852.0  8274.0    16126.0  48.691554  

In [53]:
# new feature
df['Cancer_CountTotal_DON'] = df[features].apply(lambda row: sum(1 for value in row if value == 'Yes'), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_don  = uf.insertIntoDataFrame(df_don, ['Cancer_CountTotal_DON'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['Cancer_CountTotal_DON'])

# convert to category
df = uf.toCategory(df, ['Cancer_CountTotal_DON'])

In [54]:
print(uf.categoryContingencySurvival(df, 'Cancer_CountTotal_DON').to_string())

Survival                 Dead  Living  Row Total     Dead %   Living %
Cancer_CountTotal_DON                                                 
0                      7741.0  8149.0    15890.0  48.716174  51.283826
1                        93.0   107.0      200.0  46.500000  53.500000
2                        18.0    18.0       36.0  50.000000  50.000000
Column Total           7852.0  8274.0    16126.0  48.691554  51.308446


#### CardiacArrest_DON

In [55]:
features = getFeatureList(df, 'CardiacArrest')

                   count unique top   freq
CardiacArrest_DON  16126      3  No  14523


In [56]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival             Dead  Living  Row Total     Dead %   Living %
CardiacArrest_DON                                                 
Missing             484.0    13.0      497.0  97.384306   2.615694
No                 6806.0  7717.0    14523.0  46.863596  53.136404
Yes                 562.0   544.0     1106.0  50.813743  49.186257
Column Total       7852.0  8274.0    16126.0  48.691554  51.308446




#### Citizenship
- Citizenship_CAN & Citizenship_DON
- New Feature Citizenship_Difference

In [57]:
features = getFeatureList(df, 'Citizenship')

                 count unique         top   freq
Citizenship_CAN  16126      5  US Citizen  15402
Citizenship_DON  16126      5  US Citizen  14084


In [58]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,Citizenship_CAN,Citizenship_DON,599.975559,2.3146e-117,0.096444


In [59]:
# convert to string
df[features] = df[features].astype(str)

# create New Feature (True = 1 & False = 0)
df['Citizenship_Difference'] = ( df.Citizenship_CAN.str.replace(" ", "", regex=False).str.lower() == df.Citizenship_DON.str.replace(" ", "", regex=False).str.lower()).astype(int)

# convert to category
df = uf.toCategory(df, ['Citizenship_Difference']).copy()

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_nominal = uf.insertIntoDataFrame(df_nominal, ['Citizenship_Difference'])
df_both = uf.insertIntoDataFrame(df_both, ['Citizenship_Difference'])

In [60]:
uf.categoryContingencySurvival(df, 'Citizenship_Difference', 'Survival')

Survival,Dead,Living,Row Total,Dead %,Living %
Citizenship_Difference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1334.0,1222.0,2556.0,52.190923,47.809077
1,6518.0,7052.0,13570.0,48.032424,51.967576
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### Cocaine & Drug
- PastCocaineUse_DON & CocaineUse_DON & OtherDrugUse_DON & PastOtherDrugUse_DON
- Consolidated DrugUse_CountTotal_DON

In [61]:
features = getFeatureList(df, 'Cocaine|Drug')

                      count unique      top   freq
PastCocaineUse_DON    16126      4       No  11483
CocaineUse_DON        16126      4  Missing  11961
OtherDrugUse_DON      16126      4      Yes   7746
PastOtherDrugUse_DON  16126      4      Yes   9662


In [62]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
2,PastCocaineUse_DON,PastOtherDrugUse_DON,21907.758852,0.0,0.672938
5,OtherDrugUse_DON,PastOtherDrugUse_DON,16126.0,0.0,0.57735
0,PastCocaineUse_DON,CocaineUse_DON,16126.0,0.0,0.57735
3,CocaineUse_DON,OtherDrugUse_DON,2981.58749,0.0,0.248256
1,PastCocaineUse_DON,OtherDrugUse_DON,1956.932954,0.0,0.201124
4,CocaineUse_DON,PastOtherDrugUse_DON,1733.719204,0.0,0.189306


In [63]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival              Dead  Living  Row Total     Dead %   Living %
PastCocaineUse_DON                                                 
Missing              181.0    12.0      193.0  93.782383   6.217617
No                  5496.0  5987.0    11483.0  47.862057  52.137943
Unknown              172.0   113.0      285.0  60.350877  39.649123
Yes                 2003.0  2162.0     4165.0  48.091236  51.908764
Column Total        7852.0  8274.0    16126.0  48.691554  51.308446


Survival          Dead  Living  Row Total     Dead %   Living %
CocaineUse_DON                                                 
Missing         5849.0  6112.0    11961.0  48.900594  51.099406
No               547.0   667.0     1214.0  45.057661  54.942339
Unknown          271.0   259.0      530.0  51.132075  48.867925
Yes             1185.0  1236.0     2421.0  48.946716  51.053284
Column Total    7852.0  8274.0    16126.0  48.691554  51.308446


Survival            Dead  Living  Row Total     Dead %   Living %
OtherD

In [64]:
# new feature
df['DrugUse_CountTotal_DON'] = df[features].apply(lambda row: sum(1 for value in row if value == 'Yes'), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_don  = uf.insertIntoDataFrame(df_don, ['DrugUse_CountTotal_DON'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['DrugUse_CountTotal_DON'])

# convert to category
df = uf.toCategory(df, ['DrugUse_CountTotal_DON'])

In [65]:
uf.categoryContingencySurvival(df, 'DrugUse_CountTotal_DON')

Survival,Dead,Living,Row Total,Dead %,Living %
DrugUse_CountTotal_DON,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2746.0,3171.0,5917.0,46.408653,53.591347
1,687.0,820.0,1507.0,45.587259,54.412741
2,2909.0,2673.0,5582.0,52.113938,47.886062
3,555.0,602.0,1157.0,47.968885,52.031115
4,955.0,1008.0,1963.0,48.650025,51.349975
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### CoronaryAngiogram_DON

In [66]:
features = getFeatureList(df, 'Coronary')

                       count unique top  freq
CoronaryAngiogram_DON  16126      4  No  9469


In [67]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                 Dead  Living  Row Total     Dead %   Living %
CoronaryAngiogram_DON                                                 
Missing                 184.0    12.0      196.0  93.877551   6.122449
No                     4353.0  5116.0     9469.0  45.971063  54.028937
Yes, normal            3016.0  2898.0     5914.0  50.997633  49.002367
Yes, not normal         299.0   248.0      547.0  54.661792  45.338208
Column Total           7852.0  8274.0    16126.0  48.691554  51.308446




#### CrossMatchDone

In [68]:
features = getFeatureList(df, 'CrossMatch')

                count unique  top   freq
CrossMatchDone  16126      3  Yes  14743


In [69]:
df_dict[df_dict.Feature.isin(features)]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
47,CrossMatchDone,CROSSMATCH DONE Y/N,RH,1994-04-01,NaT,TEST INFORMATION,CHAR(1),,,CRSMATCH_DONE,Category,N/Y/X to No/Yes/Missing


In [70]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival          Dead  Living  Row Total     Dead %   Living %
CrossMatchDone                                                 
Missing          498.0    87.0      585.0  85.128205  14.871795
No               365.0   433.0      798.0  45.739348  54.260652
Yes             6989.0  7754.0    14743.0  47.405548  52.594452
Column Total    7852.0  8274.0    16126.0  48.691554  51.308446




#### Death
- CauseOfDeath_DON & DeathCircumstance_DON & DeathMechanism_DON

In [71]:
features = getFeatureList(df, 'Death')

                       count unique                        top  freq
CauseOfDeath_DON       16126      6                HEAD TRAUMA  6741
DeathCircumstance_DON  16126      8  DEATH FROM NATURAL CAUSES  3569
DeathMechanism_DON     16126     13               BLUNT INJURY  4089


In [72]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
1,CauseOfDeath_DON,DeathMechanism_DON,42005.791298,0.0,0.721782
2,DeathCircumstance_DON,DeathMechanism_DON,35851.210225,0.0,0.563559
0,CauseOfDeath_DON,DeathCircumstance_DON,19011.505969,0.0,0.485579


In [73]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                  Dead  Living  Row Total     Dead %   Living %
CauseOfDeath_DON                                                       
ANOXIA                  3482.0  3225.0     6707.0  51.915909  48.084091
CEREBROVASCULAR/STROKE  1020.0  1222.0     2242.0  45.495094  54.504906
CNS TUMOR                 31.0    32.0       63.0  49.206349  50.793651
HEAD TRAUMA             3140.0  3601.0     6741.0  46.580626  53.419374
Missing                    1.0     1.0        2.0  50.000000  50.000000
OTHER SPECIFY            178.0   193.0      371.0  47.978437  52.021563
Column Total            7852.0  8274.0    16126.0  48.691554  51.308446


Survival                     Dead  Living  Row Total     Dead %    Living %
DeathCircumstance_DON                                                      
Accident, Non-MVA          1766.0  1577.0     3343.0  52.826802   47.173198
CHILD-ABUSE                   0.0     1.0        1.0   0.000000  100.000000
DEATH FROM NATURAL CAUSES  1724.0  1845.0     

In [74]:
# update dataframe
df_drop  = uf.insertIntoDataFrame(df_drop, ['CauseOfDeath_DON','DeathCircumstance_DON'])

#### DeceasedRetyped_DON

In [75]:
features = getFeatureList(df, 'Deceased')

                     count unique  top  freq
DeceasedRetyped_DON  16126      3  Yes  9973


In [76]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival               Dead  Living  Row Total     Dead %   Living %
DeceasedRetyped_DON                                                 
Missing               500.0    87.0      587.0  85.178876  14.821124
No                   2181.0  3385.0     5566.0  39.184333  60.815667
Yes                  5171.0  4802.0     9973.0  51.849995  48.150005
Column Total         7852.0  8274.0    16126.0  48.691554  51.308446




#### DefibrillatorImplantRegistration_CAN

In [77]:
features = getFeatureList(df, 'Defibrillator')

                                      count unique  top   freq
DefibrillatorImplantRegistration_CAN  16126      4  Yes  11388


In [78]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                                Dead  Living  Row Total      Dead %   Living %
DefibrillatorImplantRegistration_CAN                                                  
Missing                                 75.0     0.0       75.0  100.000000   0.000000
No                                    2278.0  2105.0     4383.0   51.973534  48.026466
Unknown                                161.0   119.0      280.0   57.500000  42.500000
Yes                                   5338.0  6050.0    11388.0   46.873902  53.126098
Column Total                          7852.0  8274.0    16126.0   48.691554  51.308446




#### Diabetes
- DiabetesType_CAN & DiabetesHistory_DON & Diabetes_DON & InsulinManagement_DON

In [79]:
features = getFeatureList(df, 'Diabetes|Insulin')

                       count unique  top   freq
DiabetesType_CAN       16126      7   No  11352
DiabetesHistory_DON    16126      7   No  15379
Diabetes_DON           16126      4   No  15379
InsulinManagement_DON  16126      4  Yes   8255


In [80]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
3,DiabetesHistory_DON,Diabetes_DON,48378.0,0.0,1.0
2,DiabetesType_CAN,InsulinManagement_DON,536.46063,2.2115449999999998e-102,0.105304
4,DiabetesHistory_DON,InsulinManagement_DON,409.219597,1.0919850000000001e-75,0.091972
5,Diabetes_DON,InsulinManagement_DON,400.02188,1.3552649999999999e-80,0.090932
0,DiabetesType_CAN,DiabetesHistory_DON,20.949957,0.9785786,0.014715
1,DiabetesType_CAN,Diabetes_DON,9.38666,0.9501015,0.013929


In [81]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                   Dead  Living  Row Total      Dead %   Living %
DiabetesType_CAN                                                         
Diabetes Status Unknown     4.0     6.0       10.0   40.000000  60.000000
Missing                    73.0     0.0       73.0  100.000000   0.000000
No                       5416.0  5936.0    11352.0   47.709655  52.290345
Type I                     97.0    87.0      184.0   52.717391  47.282609
Type II                  2155.0  2146.0     4301.0   50.104627  49.895373
Type Other                 53.0    36.0       89.0   59.550562  40.449438
Type Unknown               54.0    63.0      117.0   46.153846  53.846154
Column Total             7852.0  8274.0    16126.0   48.691554  51.308446


Survival                 Dead  Living  Row Total     Dead %   Living %
DiabetesHistory_DON                                                   
Missing                   1.0     1.0        2.0  50.000000  50.000000
No                     7452.0  7927.0    1537

In [82]:
# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, ['DiabetesHistory_DON'])

#### Diagnosis
- WaitListDiagnosisCode_CAN & DiagnosisAtListing_CAN & PrimaryDiagnosisType_CAN

In [83]:
features = getFeatureList(df, 'Diagnosis')

                           count unique                           top  freq
WaitListDiagnosisCode_CAN  16126     36  DILATED MYOPATHY: IDIOPATHIC  5517
DiagnosisAtListing_CAN     16126     37  DILATED MYOPATHY: IDIOPATHIC  5555
PrimaryDiagnosisType_CAN   16126     37  DILATED MYOPATHY: IDIOPATHIC  5510


In [84]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,WaitListDiagnosisCode_CAN,DiagnosisAtListing_CAN,292228.216938,0.0,0.719555
2,DiagnosisAtListing_CAN,PrimaryDiagnosisType_CAN,293333.189131,0.0,0.71083
1,WaitListDiagnosisCode_CAN,PrimaryDiagnosisType_CAN,237514.891457,0.0,0.648706


In [85]:
df_dict[df_dict.Feature.isin(features)]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
72,PrimaryDiagnosisType_CAN,RECIPIENT PRIMARY DIAGNOSIS,TRR>TCR,1987-10-01,NaT,PATIENT STATUS/CLINICAL INFORMATION,NUM,ALL_DGN,"THIS FIELD DRAWS FROM ""AT TRANSPLANT"" AND IF NOT THERE THEN FROM TCR.",DIAG,Category,FMTNAME: TH_DGN
270,DiagnosisAtListing_CAN,CANDIDATE DIAGNOSIS AT LISTING,TCR,1994-04-01,NaT,CLINICAL INFORMATION,NUM,ALL_DGN,,TCR_DGN,Category,FMTNAME: TH_DGN
272,WaitListDiagnosisCode_CAN,Waitlist CANDIDATE DIAGNOSIS,WL DATA,NaT,NaT,,NUM,ALL_DGN,,THORACIC_DGN,Category,FMTNAME: TH_DGN


In [86]:
# update dataframe
df_drop  = uf.insertIntoDataFrame(df_drop, ['WaitListDiagnosisCode_CAN','DiagnosisAtListing_CAN'])

#### Dialysis
- DialysisBetweenListingTransplant_CAN & DialysisPriorListing_CAN
- New Feature Dialysis_Combined_CAN

In [87]:
features = getFeatureList(df, 'Dialysis')

                                           count unique top   freq
DialysisBetweenRegistrationTransplant_CAN  16126      4  No  14914
DialysisPriorRegistration_CAN              16126      3  No  14781


In [88]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,DialysisBetweenRegistrationTransplant_CAN,DialysisPriorRegistration_CAN,28688.916103,0.0,0.943146


In [89]:
# new feature
df['Dialysis_CountTotal_CAN'] = df[features].apply(lambda row: sum(1 for value in row if value == 'Yes'), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_don  = uf.insertIntoDataFrame(df_don, ['Dialysis_CountTotal_CAN'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['Dialysis_CountTotal_CAN'])

# change datatype to category
df = uf.toCategory(df, ['Dialysis_CountTotal_CAN'])

In [90]:
uf.categoryContingencySurvival(df, 'Dialysis_CountTotal_CAN')

Survival,Dead,Living,Row Total,Dead %,Living %
Dialysis_CountTotal_CAN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,7326.0,7878.0,15204.0,48.184688,51.815312
1,85.0,62.0,147.0,57.823129,42.176871
2,441.0,334.0,775.0,56.903226,43.096774
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### Diuretic
- Diuretics_DON & SynthicAntiDiureticHormone_DON
- New Feature Diuretics_Combined_DON

In [91]:
features = getFeatureList(df, 'Diuretic')

                                count unique  top   freq
Diuretics_DON                   16126      4  Yes  11012
SynthicAntiDiureticHormone_DON  16126      4   No  13893


In [92]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,Diuretics_DON,SynthicAntiDiureticHormone_DON,17203.885241,0.0,0.596334


In [93]:
# new feature
df['Diuretics_CountTotal_DON'] = df[features].apply(lambda row: sum(1 for value in row if value == 'Yes'), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_don  = uf.insertIntoDataFrame(df_don, ['Diuretics_CountTotal_DON'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['Diuretics_CountTotal_DON'])

# change datatype to category
df = uf.toCategory(df, ['Diuretics_CountTotal_DON'])

In [94]:
uf.categoryContingencySurvival(df, 'Diuretics_CountTotal_DON')

Survival,Dead,Living,Row Total,Dead %,Living %
Diuretics_CountTotal_DON,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2447.0,2007.0,4454.0,54.93938,45.06062
1,4834.0,5463.0,10297.0,46.945712,53.054288
2,571.0,804.0,1375.0,41.527273,58.472727
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### EducationLevel_CAN

In [95]:
features = getFeatureList(df, 'Education')

                    count unique                        top  freq
EducationLevel_CAN  16126      8  HIGH SCHOOL (9-12) or GED  5798


In [96]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                             Dead  Living  Row Total      Dead %   Living %
EducationLevel_CAN                                                                 
ASSOCIATE/BACHELOR DEGREE          1662.0  1744.0     3406.0   48.796242  51.203758
ATTENDED COLLEGE/TECHNICAL SCHOOL  2025.0  2278.0     4303.0   47.060191  52.939809
GRADE SCHOOL (0-8)                  224.0   262.0      486.0   46.090535  53.909465
HIGH SCHOOL (9-12) or GED          2794.0  3004.0     5798.0   48.189031  51.810969
Missing                              74.0     0.0       74.0  100.000000   0.000000
NONE                                 18.0    20.0       38.0   47.368421  52.631579
POST-COLLEGE GRADUATE DEGREE        718.0   793.0     1511.0   47.518200  52.481800
UNKNOWN                             337.0   173.0      510.0   66.078431  33.921569
Column Total                       7852.0  8274.0    16126.0   48.691554  51.308446




#### EpsteinBarr
- EpsteinBarrSeroStatusTransplant_CAN & EpsteinBarr_IGG_DON & EpsteinBarr_IGM_DON
- These differences make EpsteinBarr_IGG_DON and EpsteinBarr_IGM_DON complementary markers for diagnosing and staging Epstein-Barr virus infections in donors.

In [97]:
features = getFeatureList(df, 'EpsteinBarr')

                                     count unique       top   freq
EpsteinBarrSeroStatusTransplant_CAN  16126      4  Positive  13466
EpsteinBarr_IGG_DON                  16126      5  Positive  14269
EpsteinBarr_IGM_DON                  16126      5  Negative  12673


In [98]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
2,EpsteinBarr_IGG_DON,EpsteinBarr_IGM_DON,10545.63449,0.0,0.404337
0,EpsteinBarrSeroStatusTransplant_CAN,EpsteinBarr_IGG_DON,28.176229,0.005213,0.024133
1,EpsteinBarrSeroStatusTransplant_CAN,EpsteinBarr_IGM_DON,20.723804,0.054574,0.020697


In [99]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                               Dead  Living  Row Total     Dead %   Living %
EpsteinBarrSeroStatusTransplant_CAN                                                 
Missing/Unknown                       448.0    57.0      505.0  88.712871  11.287129
Negative                              724.0   773.0     1497.0  48.363393  51.636607
Not Done                              251.0   407.0      658.0  38.145897  61.854103
Positive                             6429.0  7037.0    13466.0  47.742462  52.257538
Column Total                         7852.0  8274.0    16126.0  48.691554  51.308446


Survival               Dead  Living  Row Total     Dead %   Living %
EpsteinBarr_IGG_DON                                                 
Indeterminate          71.0    44.0      115.0  61.739130  38.260870
Missing/Unknown         9.0    10.0       19.0  47.368421  52.631579
Negative              555.0   529.0     1084.0  51.199262  48.800738
Not Done              268.0   371.0      639.0  41.940532 

In [100]:
def combine_EpsteinBarrDON(left, right):
    if left.startswith('Positive') or right.startswith('Positive'):
        return 'Positive'
    elif left.startswith('Indeterminate') or right.startswith('Indeterminate'):
        return 'Indeterminate' 
    elif left == 'Negative' and right == 'Negative':
        return 'Negative'
    elif left.startswith('Not') and right.startswith('Not'):
        return 'Negative'
    else:
        return 'Missing/Unknown'

In [101]:
# get features
features = ['EpsteinBarr_IGG_DON', 'EpsteinBarr_IGM_DON']

# new feature
df['EpsteinBarr_Combined_DON'] = df[features].apply(lambda row: combine_EpsteinBarrDON(row['EpsteinBarr_IGG_DON'], row['EpsteinBarr_IGM_DON']), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_don  = uf.insertIntoDataFrame(df_don, ['EpsteinBarr_Combined_DON'])
df_nominal  = uf.insertIntoDataFrame(df_nominal, ['EpsteinBarr_Combined_DON'])

# convert to category
df = uf.toCategory(df, ['EpsteinBarr_Combined_DON'])

In [102]:
uf.categoryContingencySurvival(df, 'EpsteinBarr_Combined_DON')

Survival,Dead,Living,Row Total,Dead %,Living %
EpsteinBarr_Combined_DON,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Indeterminate,71.0,44.0,115.0,61.73913,38.26087
Missing/Unknown,169.0,275.0,444.0,38.063063,61.936937
Negative,655.0,630.0,1285.0,50.972763,49.027237
Positive,6957.0,7325.0,14282.0,48.711665,51.288335
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### Ethnicity

In [103]:
features = getFeatureList(df, 'Ethnicity|Hispani')

               count unique                      top   freq
Hispanic_CAN   16126      2  Non-Hispanic/Non-Latino  14583
Ethnicity_CAN  16126      7      White, Non-Hispanic  10002
Ethnicity_DON  16126      7      White, Non-Hispanic  10211


In [104]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,Hispanic_CAN,Ethnicity_CAN,15942.519677,0.0,0.994295
1,Hispanic_CAN,Ethnicity_DON,191.573778,1.177422e-38,0.108995
2,Ethnicity_CAN,Ethnicity_DON,356.147361,2.606182e-54,0.06067


In [105]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                   Dead  Living  Row Total     Dead %   Living %
Hispanic_CAN                                                            
Hispanic/Latino           794.0   749.0     1543.0  51.458198  48.541802
Non-Hispanic/Non-Latino  7058.0  7525.0    14583.0  48.398821  51.601179
Column Total             7852.0  8274.0    16126.0  48.691554  51.308446


Survival                                  Dead  Living  Row Total     Dead %   Living %
Ethnicity_CAN                                                                          
Amer Ind/Alaska Native                    25.0    22.0       47.0  53.191489  46.808511
Asian                                    310.0   317.0      627.0  49.441786  50.558214
Black                                   1909.0  1862.0     3771.0  50.623177  49.376823
Hispanic                                 784.0   739.0     1523.0  51.477347  48.522653
Multiracial                               52.0    45.0       97.0  53.608247  46.391753
Native Hawaiian/o

In [106]:
# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, ['Hispanic_CAN'])

#### Gender
- New Feature Gender_Difference

In [107]:
features = getFeatureList(df, 'Gender')

            count unique top   freq
Gender_CAN  16126      2   M  11717
Gender_DON  16126      2   M  11394


In [108]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,Gender_CAN,Gender_DON,3221.282943,0.0,0.446942


In [109]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival        Dead  Living  Row Total     Dead %   Living %
Gender_CAN                                                   
F             2081.0  2328.0     4409.0  47.198911  52.801089
M             5771.0  5946.0    11717.0  49.253222  50.746778
Column Total  7852.0  8274.0    16126.0  48.691554  51.308446


Survival        Dead  Living  Row Total     Dead %   Living %
Gender_DON                                                   
F             2189.0  2543.0     4732.0  46.259510  53.740490
M             5663.0  5731.0    11394.0  49.701597  50.298403
Column Total  7852.0  8274.0    16126.0  48.691554  51.308446




In [110]:
# create New Feature (True = 1 & False = 0)
df['Geder_Difference'] = (df.Gender_CAN == df.Gender_DON).astype(int)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_nominal = uf.insertIntoDataFrame(df_nominal, ['Geder_Difference'])
df_both = uf.insertIntoDataFrame(df_both, ['Geder_Difference'])

# convert to category
df = uf.toCategory(df, ['Geder_Difference']).copy()

In [111]:
uf.categoryContingencySurvival(df, 'Geder_Difference')

Survival,Dead,Living,Row Total,Dead %,Living %
Geder_Difference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1690.0,1937.0,3627.0,46.594982,53.405018
1,6162.0,6337.0,12499.0,49.299944,50.700056
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### HBV
- HBV_NAT_Result_CAN & HBV_NAT_Result_DON & SurfaceHBVAntibodyTotalTransplant_CAN

In [112]:
features = getFeatureList(df, 'HBV')

                                       count unique       top   freq
SurfaceHBVAntibodyTotalTransplant_CAN  16126      4  Negative  10565
HBV_NAT_Result_CAN                     16126      4  Not Done  10044
HBV_NAT_Result_DON                     16126      4  Negative  16083


In [113]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,SurfaceHBVAntibodyTotalTransplant_CAN,HBV_NAT_Result_CAN,748.167735,3.028721e-155,0.124359
2,HBV_NAT_Result_CAN,HBV_NAT_Result_DON,269.392553,7.954225e-53,0.074622
1,SurfaceHBVAntibodyTotalTransplant_CAN,HBV_NAT_Result_DON,10.957194,0.2786545,0.01505


In [114]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                                 Dead  Living  Row Total     Dead %   Living %
SurfaceHBVAntibodyTotalTransplant_CAN                                                 
Missing/Unknown                         632.0   358.0      990.0  63.838384  36.161616
Negative                               4890.0  5675.0    10565.0  46.284903  53.715097
Not Done                                182.0   405.0      587.0  31.005111  68.994889
Positive                               2148.0  1836.0     3984.0  53.915663  46.084337
Column Total                           7852.0  8274.0    16126.0  48.691554  51.308446


Survival              Dead  Living  Row Total     Dead %   Living %
HBV_NAT_Result_CAN                                                 
Missing/Unknown      810.0  3004.0     3814.0  21.237546  78.762454
Negative            1422.0   824.0     2246.0  63.312556  36.687444
Not Done            5603.0  4441.0    10044.0  55.784548  44.215452
Positive              17.0     5.0       22.0  77

#### HCV
- HCV_NAT_PreTranspant_CAN & HCV_NAT_Result_DON

In [115]:
features = getFeatureList(df, 'HCV')

                          count unique       top   freq
HCV_NAT_PreTranspant_CAN  16126      4  Not Done   9397
HCV_NAT_Result_DON        16126      4  Negative  15223


In [116]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,HCV_NAT_PreTranspant_CAN,HCV_NAT_Result_DON,249.409174,1.3295109999999999e-48,0.071801


In [117]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                    Dead  Living  Row Total     Dead %   Living %
HCV_NAT_PreTranspant_CAN                                                 
Missing/Unknown            804.0  3004.0     3808.0  21.113445  78.886555
Negative                  1874.0  1011.0     2885.0  64.956672  35.043328
Not Done                  5149.0  4248.0     9397.0  54.794083  45.205917
Positive                    25.0    11.0       36.0  69.444444  30.555556
Column Total              7852.0  8274.0    16126.0  48.691554  51.308446


Survival              Dead  Living  Row Total     Dead %   Living %
HCV_NAT_Result_DON                                                 
Missing/Unknown        1.0     1.0        2.0  50.000000  50.000000
Negative            7313.0  7910.0    15223.0  48.039151  51.960849
Not Done               3.0     5.0        8.0  37.500000  62.500000
Positive             535.0   358.0      893.0  59.910414  40.089586
Column Total        7852.0  8274.0    16126.0  48.691554  51.308446




#### HEP
- SurfaceAntigenHEP_B_CAN & HEP_C_SerostatusStatus_CAN & SurfaceAntigenHEP_B_DON & Antibody_HEP_C_DON

In [118]:
features = getFeatureList(df, 'HEP')

                            count unique       top   freq
SurfaceAntigenHEP_B_CAN     16126      4  Negative  15257
HEP_C_SerostatusStatus_CAN  16126      4  Negative  15101
SurfaceAntigenHEP_B_DON     16126      4  Negative  16086
Antibody_HEP_C_DON          16126      4  Negative  14735


In [119]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,SurfaceAntigenHEP_B_CAN,HEP_C_SerostatusStatus_CAN,17640.396194,0.0,0.603852
5,SurfaceAntigenHEP_B_DON,Antibody_HEP_C_DON,1616.709437,0.0,0.182807
4,HEP_C_SerostatusStatus_CAN,Antibody_HEP_C_DON,43.940632,1e-06,0.030138
1,SurfaceAntigenHEP_B_CAN,SurfaceAntigenHEP_B_DON,40.988075,5e-06,0.029107
2,SurfaceAntigenHEP_B_CAN,Antibody_HEP_C_DON,3.9047,0.917572,0.008984
3,HEP_C_SerostatusStatus_CAN,SurfaceAntigenHEP_B_DON,2.721803,0.974312,0.007501


In [120]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                   Dead  Living  Row Total     Dead %   Living %
SurfaceAntigenHEP_B_CAN                                                 
Missing/Unknown           414.0    16.0      430.0  96.279070   3.720930
Negative                 7226.0  8031.0    15257.0  47.361867  52.638133
Not Done                   81.0   132.0      213.0  38.028169  61.971831
Positive                  131.0    95.0      226.0  57.964602  42.035398
Column Total             7852.0  8274.0    16126.0  48.691554  51.308446


Survival                      Dead  Living  Row Total     Dead %   Living %
HEP_C_SerostatusStatus_CAN                                                 
Missing/Unknown              415.0    18.0      433.0  95.842956   4.157044
Negative                    7155.0  7946.0    15101.0  47.380968  52.619032
Not Done                     101.0   126.0      227.0  44.493392  55.506608
Positive                     181.0   184.0      365.0  49.589041  50.410959
Column Total                785

#### HIV
- HIV_SeroStatusTransplant_CAN & HIV_NAT_PreTransplant_CAN & HIV_NAT_Result_DON & HIV_Risk_DON

In [121]:
features = getFeatureList(df, 'HIV')

                              count unique       top   freq
HIV_SeroStatusTransplant_CAN  16126      4  Negative  15417
HIV_NAT_PreTransplant_CAN     16126      4  Not Done  10226
HIV_NAT_Result_DON            16126      3  Negative  16111
HIV_Risk_DON                  16126      4        No  10996


In [122]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
5,HIV_NAT_Result_DON,HIV_Risk_DON,16126.457728,0.0,0.707117
0,HIV_SeroStatusTransplant_CAN,HIV_NAT_PreTransplant_CAN,2018.755018,0.0,0.204276
4,HIV_NAT_PreTransplant_CAN,HIV_Risk_DON,46.021184,5.967814e-07,0.030843
2,HIV_SeroStatusTransplant_CAN,HIV_Risk_DON,21.776113,0.009616274,0.021216
3,HIV_NAT_PreTransplant_CAN,HIV_NAT_Result_DON,8.328304,0.215024,0.016069
1,HIV_SeroStatusTransplant_CAN,HIV_NAT_Result_DON,0.690465,0.9946945,0.004627


In [123]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                        Dead  Living  Row Total     Dead %   Living %
HIV_SeroStatusTransplant_CAN                                                 
Missing/Unknown                431.0    41.0      472.0  91.313559   8.686441
Negative                      7323.0  8094.0    15417.0  47.499514  52.500486
Not Done                        60.0   104.0      164.0  36.585366  63.414634
Positive                        38.0    35.0       73.0  52.054795  47.945205
Column Total                  7852.0  8274.0    16126.0  48.691554  51.308446


Survival                     Dead  Living  Row Total     Dead %   Living %
HIV_NAT_PreTransplant_CAN                                                 
Missing/Unknown             814.0  3006.0     3820.0  21.308901  78.691099
Negative                   1315.0   756.0     2071.0  63.495896  36.504104
Not Done                   5719.0  4507.0    10226.0  55.926071  44.073929
Positive                      4.0     5.0        9.0  44.444444  55.555556
Co

In [124]:
# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, ['HIV_Risk_DON'])

#### Heart
- NonHeartBeating_DON & HeartProcedureType_CAN

In [125]:
features = getFeatureList(df, 'Heart')

                        count unique                 top   freq
NonHeartBeating_DON     16126      3                  No  15808
HeartProcedureType_CAN  16126      5  Orthotopic Bicaval  12925


In [126]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,NonHeartBeating_DON,HeartProcedureType_CAN,252.361835,5.439004999999999e-50,0.088457


In [127]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival               Dead  Living  Row Total     Dead %   Living %
NonHeartBeating_DON                                                 
Missing                 2.0     1.0        3.0  66.666667  33.333333
No                   7536.0  8272.0    15808.0  47.672065  52.327935
Yes                   314.0     1.0      315.0  99.682540   0.317460
Column Total         7852.0  8274.0    16126.0  48.691554  51.308446


Survival                          Dead  Living  Row Total     Dead %   Living %
HeartProcedureType_CAN                                                         
Heterotopic                        5.0     6.0       11.0  45.454545  54.545455
Missing                          422.0     1.0      423.0  99.763593   0.236407
Orthotopic Bicaval              6031.0  6894.0    12925.0  46.661509  53.338491
Orthotopic Total (Bicaval, PV)   238.0   212.0      450.0  52.888889  47.111111
Orthotopic Traditional          1156.0  1161.0     2317.0  49.892102  50.107898
Column Total            

In [128]:
# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, ['NonHeartBeating_DON'])

#### HeavyAlcoholUse_DON

In [129]:
features = getFeatureList(df, 'Heavy')

                     count unique top   freq
HeavyAlcoholUse_DON  16126      4  No  12796


In [130]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival               Dead  Living  Row Total     Dead %   Living %
HeavyAlcoholUse_DON                                                 
Missing                 3.0     4.0        7.0  42.857143  57.142857
No                   6138.0  6658.0    12796.0  47.968115  52.031885
Unknown               261.0   197.0      458.0  56.986900  43.013100
Yes                  1450.0  1415.0     2865.0  50.610820  49.389180
Column Total         7852.0  8274.0    16126.0  48.691554  51.308446




#### HeparinManagement_DON

In [131]:
features = getFeatureList(df, 'Heparin')

                       count unique  top   freq
HeparinManagement_DON  16126      4  Yes  15711


In [132]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                 Dead  Living  Row Total     Dead %    Living %
HeparinManagement_DON                                                  
Missing                 181.0    12.0      193.0  93.782383    6.217617
No                       81.0   140.0      221.0  36.651584   63.348416
Unknown                   0.0     1.0        1.0   0.000000  100.000000
Yes                    7590.0  8121.0    15711.0  48.310101   51.689899
Column Total           7852.0  8274.0    16126.0  48.691554   51.308446




#### Hepatitis

In [133]:
features = getFeatureList(df, 'Hepatitis')

                              count unique       top   freq
Hepatitis_B_CoreAntibody_CAN  16126      4  Negative  14454
Hepatitis_B_CoreAntibody_DON  16126      4  Negative  15744


In [134]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,Hepatitis_B_CoreAntibody_CAN,Hepatitis_B_CoreAntibody_DON,3.938124,0.915438,0.009022


In [135]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                        Dead  Living  Row Total     Dead %   Living %
Hepatitis_B_CoreAntibody_CAN                                                 
Missing/Unknown                420.0    22.0      442.0  95.022624   4.977376
Negative                      6860.0  7594.0    14454.0  47.460910  52.539090
Not Done                       218.0   283.0      501.0  43.512974  56.487026
Positive                       354.0   375.0      729.0  48.559671  51.440329
Column Total                  7852.0  8274.0    16126.0  48.691554  51.308446


Survival                        Dead  Living  Row Total      Dead %   Living %
Hepatitis_B_CoreAntibody_DON                                                  
Indeterminate                    1.0     0.0        1.0  100.000000   0.000000
Negative                      7651.0  8093.0    15744.0   48.596291  51.403709
Not Done                         3.0     5.0        8.0   37.500000  62.500000
Positive                       197.0   176.0      373.0  

#### HypertensionHistory_DON

In [136]:
features = getFeatureList(df, 'HypertensionH')

                         count unique top   freq
HypertensionHistory_DON  16126      4  No  13483


In [137]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                   Dead  Living  Row Total     Dead %   Living %
HypertensionHistory_DON                                                 
Missing                     1.0     1.0        2.0  50.000000  50.000000
No                       6548.0  6935.0    13483.0  48.564859  51.435141
Unknown                   105.0    46.0      151.0  69.536424  30.463576
Yes                      1198.0  1292.0     2490.0  48.112450  51.887550
Column Total             7852.0  8274.0    16126.0  48.691554  51.308446




#### Infection
- InfectionTherapyIV_CAN & BloodInfectionSource_DON & OtherInfectionSource_DON & PulmonaryInfection_DON & UrineInfection_DON & InfectionClinical_DON

In [138]:
features = getFeatureList(df, 'Infection')

                          count unique  top   freq
InfectionTherapyIV_CAN    16126      4   No  14068
BloodInfectionSource_DON  16126      3    0  14448
OtherInfectionSource_DON  16126      3    0  11424
PulmonaryInfection_DON    16126      3    1  11625
UrineInfection_DON        16126      3    0  14219
InfectionClinical_DON     16126      4  Yes  12616


In [139]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
7,BloodInfectionSource_DON,UrineInfection_DON,16171.085698,0.0,0.708095
12,PulmonaryInfection_DON,UrineInfection_DON,16170.440031,0.0,0.70808
6,BloodInfectionSource_DON,PulmonaryInfection_DON,16153.804078,0.0,0.707716
11,OtherInfectionSource_DON,InfectionClinical_DON,16126.0,0.0,0.707107
9,OtherInfectionSource_DON,PulmonaryInfection_DON,11893.324696,0.0,0.607258
13,PulmonaryInfection_DON,InfectionClinical_DON,11747.098669,0.0,0.603513
4,InfectionTherapyIV_CAN,InfectionClinical_DON,3718.706529,0.0,0.27725
14,UrineInfection_DON,InfectionClinical_DON,766.480005,2.686332e-162,0.15416
8,BloodInfectionSource_DON,InfectionClinical_DON,685.922819,6.694921e-145,0.145834
10,OtherInfectionSource_DON,UrineInfection_DON,608.949773,1.791482e-130,0.137408


In [140]:
# combine features
features = ['BloodInfectionSource_DON','UrineInfection_DON','PulmonaryInfection_DON','InfectionClinical_DON','OtherInfectionSource_DON']

# new feature
df['Infection_CountTotal_DON'] = df[features].apply(lambda row: sum(1 for value in row if value == 1 or value == 'Yes'), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_don  = uf.insertIntoDataFrame(df_don, ['Infection_CountTotal_DON'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['Infection_CountTotal_DON'])

# change datatype to category
df = uf.toCategory(df, ['Infection_CountTotal_DON'])

In [141]:
uf.categoryContingencySurvival(df, 'Infection_CountTotal_DON')

Survival,Dead,Living,Row Total,Dead %,Living %
Infection_CountTotal_DON,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1764.0,1746.0,3510.0,50.25641,49.74359
1,1.0,0.0,1.0,100.0,0.0
2,4474.0,4829.0,9303.0,48.092013,51.907987
3,1393.0,1469.0,2862.0,48.672257,51.327743
4,214.0,215.0,429.0,49.88345,50.11655
5,6.0,15.0,21.0,28.571429,71.428571
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### Inotropic

In [142]:
features = getFeatureList(df, 'Inotropic')

                    count unique      top   freq
InotropicAgent_DON  16126      7  Missing  10484


In [143]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival              Dead  Living  Row Total     Dead %   Living %
InotropicAgent_DON                                                 
Dobutamine           163.0   254.0      417.0  39.088729  60.911271
Dopamine             207.0   343.0      550.0  37.636364  62.363636
Epinephrine           38.0    36.0       74.0  51.351351  48.648649
Levophed             798.0   966.0     1764.0  45.238095  54.761905
Missing             5310.0  5174.0    10484.0  50.648607  49.351393
Neosynephrine       1157.0  1275.0     2432.0  47.574013  52.425987
Other, specify       179.0   226.0      405.0  44.197531  55.802469
Column Total        7852.0  8274.0    16126.0  48.691554  51.308446




#### Intropes

##### IntropesIV
- IntropesIVRegistration_CAN & IntropesIVTransplant_CAN
- Combined IntropesIV_CountTotal_CAN

In [144]:
features = getFeatureList(df, 'IntropesIV')

                            count  unique  top   freq
IntropesIVRegistration_CAN  16126       2    0  10594
IntropesIVTransplant_CAN    16126       2    0   9890


In [145]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,IntropesIVRegistration_CAN,IntropesIVTransplant_CAN,4010.613582,0.0,0.498703


In [146]:
df_dict[df_dict.Feature.isin(features)]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
183,IntropesIVRegistration_CAN,IV INOTROPES @ REGISTRATION,TCR,1994-04-01,NaT,CANDIDATE INFORMATION,NUM,,,INOTROPES_TCR,Category,
184,IntropesIVTransplant_CAN,IV INOTROPES @ TRANSPLANT,TRR,1994-04-01,NaT,PATIENT STATUS,NUM,,,INOTROPES_TRR,Category,


In [147]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                      Dead  Living  Row Total     Dead %   Living %
IntropesIVRegistration_CAN                                                 
0                           5270.0  5324.0    10594.0  49.745139  50.254861
1                           2582.0  2950.0     5532.0  46.673897  53.326103
Column Total                7852.0  8274.0    16126.0  48.691554  51.308446


Survival                    Dead  Living  Row Total     Dead %   Living %
IntropesIVTransplant_CAN                                                 
0                         4888.0  5002.0     9890.0  49.423660  50.576340
1                         2964.0  3272.0     6236.0  47.530468  52.469532
Column Total              7852.0  8274.0    16126.0  48.691554  51.308446




In [148]:
# new feature
df['IntropesIV_CountTotal_CAN'] = df[features].apply(lambda row: sum(1 for value in row if value == 1), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_can  = uf.insertIntoDataFrame(df_can, ['IntropesIV_CountTotal_CAN'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['IntropesIV_CountTotal_CAN'])

# change datatype to category
df = uf.toCategory(df, ['IntropesIV_CountTotal_CAN'])

In [149]:
uf.categoryContingencySurvival(df, 'IntropesIV_CountTotal_CAN')

Survival,Dead,Living,Row Total,Dead %,Living %
IntropesIV_CountTotal_CAN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4186.0,4171.0,8357.0,50.089745,49.910255
1,1786.0,1984.0,3770.0,47.374005,52.625995
2,1880.0,2119.0,3999.0,47.011753,52.988247
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### IntropesVasodilators
- Combined IntropesVasodilators_CountTotal_CAN

In [150]:
features = getFeatureList(df, 'IntropesVasodilators')

                                          count unique top  freq
IntropesVasodilatorsRegistration_SYS_CAN  16126      3  No  9137
IntropesVasodilatorsRegistration_DIA_CAN  16126      3  No  9134
IntropesVasodilatorsRegistration_MN_CAN   16126      3  No  9113
IntropesVasodilatorsRegistration_PCW_CAN  16126      3  No  9060
IntropesVasodilatorsRegistration_CO_CAN   16126      3  No  9031
IntropesVasodilatorsTransplant_CO_CAN     16126      3  No  8554
IntropesVasodilatorsTransplant_DIA_CAN    16126      3  No  8622
IntropesVasodilatorsTransplant_MN_CAN     16126      3  No  8548
IntropesVasodilatorsTransplant_PCW_CAN    16126      3  No  8557
IntropesVasodilatorsTransplant_SYS_CAN    16126      3  No  8628


In [151]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
41,IntropesVasodilatorsTransplant_DIA_CAN,IntropesVasodilatorsTransplant_SYS_CAN,32062.886816,0.0,0.997064
0,IntropesVasodilatorsRegistration_SYS_CAN,IntropesVasodilatorsRegistration_DIA_CAN,31732.976835,0.0,0.991921
43,IntropesVasodilatorsTransplant_MN_CAN,IntropesVasodilatorsTransplant_SYS_CAN,27590.705166,0.0,0.924918
39,IntropesVasodilatorsTransplant_DIA_CAN,IntropesVasodilatorsTransplant_MN_CAN,27580.535778,0.0,0.924747
9,IntropesVasodilatorsRegistration_DIA_CAN,IntropesVasodilatorsRegistration_MN_CAN,26151.246676,0.0,0.900467
1,IntropesVasodilatorsRegistration_SYS_CAN,IntropesVasodilatorsRegistration_MN_CAN,26114.063256,0.0,0.899827
38,IntropesVasodilatorsTransplant_CO_CAN,IntropesVasodilatorsTransplant_SYS_CAN,25561.594146,0.0,0.890258
35,IntropesVasodilatorsTransplant_CO_CAN,IntropesVasodilatorsTransplant_DIA_CAN,25467.177525,0.0,0.888612
36,IntropesVasodilatorsTransplant_CO_CAN,IntropesVasodilatorsTransplant_MN_CAN,23919.380452,0.0,0.861185
3,IntropesVasodilatorsRegistration_SYS_CAN,IntropesVasodilatorsRegistration_CO_CAN,22863.284394,0.0,0.841959


In [152]:
# new feature
df['IntropesVasodilators_CountTotal_CAN'] = df[features].apply(lambda row: sum(1 for value in row if value == 'Yes'), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_can  = uf.insertIntoDataFrame(df_can, ['IntropesVasodilators_CountTotal_CAN'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['IntropesVasodilators_CountTotal_CAN'])

# change datatype to category
df = uf.toCategory(df, ['IntropesVasodilators_CountTotal_CAN'])

In [153]:
uf.categoryContingencySurvival(df, 'IntropesVasodilators_CountTotal_CAN')

Survival,Dead,Living,Row Total,Dead %,Living %
IntropesVasodilators_CountTotal_CAN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3852.0,3679.0,7531.0,51.148586,48.851414
1,13.0,17.0,30.0,43.333333,56.666667
2,11.0,10.0,21.0,52.380952,47.619048
3,54.0,53.0,107.0,50.46729,49.53271
4,236.0,196.0,432.0,54.62963,45.37037
5,1498.0,1955.0,3453.0,43.382566,56.617434
6,25.0,32.0,57.0,43.859649,56.140351
7,23.0,17.0,40.0,57.5,42.5
8,172.0,163.0,335.0,51.343284,48.656716
9,297.0,277.0,574.0,51.74216,48.25784


#### IntropicMedicationProcurement_DON

In [154]:
features = getFeatureList(df, 'Intropic')

                                   count unique top   freq
IntropicMedicationProcurement_DON  16126      4  No  10266


In [155]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                             Dead  Living  Row Total     Dead %   Living %
IntropicMedicationProcurement_DON                                                 
Missing                             180.0    12.0      192.0  93.750000   6.250000
No                                 5115.0  5151.0    10266.0  49.824664  50.175336
Unknown                              14.0    11.0       25.0  56.000000  44.000000
Yes                                2543.0  3100.0     5643.0  45.064682  54.935318
Column Total                       7852.0  8274.0    16126.0  48.691554  51.308446




#### KidneyAllocation_DON

In [156]:
features = getFeatureList(df, 'Kidney')

                      count  unique  top   freq
KidneyAllocation_DON  16126       3    0  15724


In [157]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                Dead  Living  Row Total     Dead %   Living %
KidneyAllocation_DON                                                 
0                     7670.0  8054.0    15724.0  48.778937  51.221063
1                      181.0   219.0      400.0  45.250000  54.750000
999                      1.0     1.0        2.0  50.000000  50.000000
Column Total          7852.0  8274.0    16126.0  48.691554  51.308446




#### LV_Ejection
- LV_EjectionFractionMedthod_DON & LV_EjectionFractionPercent_CAT_DON

In [158]:
features = getFeatureList(df, 'LV_Ejection')

                                    count unique          top   freq
LV_EjectionFractionMedthod_DON      16126      3         Echo  15826
LV_EjectionFractionPercent_CAT_DON  16126      5  Normal LVEF  14831


In [159]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,LV_EjectionFractionMedthod_DON,LV_EjectionFractionPercent_CAT_DON,9001.546943,0.0,0.5283


In [160]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                          Dead  Living  Row Total     Dead %   Living %
LV_EjectionFractionMedthod_DON                                                 
Angiogram                         83.0   190.0      273.0  30.402930  69.597070
Echo                            7754.0  8072.0    15826.0  48.995324  51.004676
Missing                           15.0    12.0       27.0  55.555556  44.444444
Column Total                    7852.0  8274.0    16126.0  48.691554  51.308446


Survival                              Dead  Living  Row Total     Dead %   Living %
LV_EjectionFractionPercent_CAT_DON                                                 
High LVEF                            523.0   537.0     1060.0  49.339623  50.660377
Mild Dysfunction                      80.0    94.0      174.0  45.977011  54.022989
Missing                                7.0    10.0       17.0  41.176471  58.823529
Normal LVEF                         7215.0  7616.0    14831.0  48.648102  51.351898
Reduced LVEF  

In [161]:
# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, ['LV_EjectionFractionMedthod_DON'])

#### LifeSupport
- LifeSupportRegistration_ECMO_CAN & LifeSupportRegistration_IABP_CAN & LifeSupportRegistration_PGE_CAN & LifeSupportMechanismRegistration_OTHER_CAN & LifeSupportRegistration_CAN & LifeSupportInhaledRegistration_CAN
- LifeSupportTransplant_ECMO_CAN & LifeSupportTransplant_PGE_CAN & LifeSupportTransplant_IABP_CAN & LifeSupportMechanismTransplant_OTHER_CAN & LifeSupportInhaledTransplant_CAN & LifeSupportTransplant_CAN
- LifeSupportInhaled_CAN 

In [162]:
features = getFeatureList(df, 'LifeSupportRegistration|LifeSupport')

                                            count unique  top   freq
LifeSupportRegistration_ECMO_CAN            16126      2    0  15682
LifeSupportRegistration_IABP_CAN            16126      2    0  14217
LifeSupportInhaled_CAN                      16126      2    0  16091
LifeSupportRegistration_PGE_CAN             16126      2    0  16119
LifeSupportMechanismRegistration_OTHER_CAN  16126      2    0  15314
LifeSupportRegistration_CAN                 16126      3  Yes  10558
LifeSupportTransplant_ECMO_CAN              16126      2    0  15504
LifeSupportTransplant_PGE_CAN               16126      2    0  16107
LifeSupportTransplant_IABP_CAN              16126      2    0  12851
LifeSupportMechanismTransplant_OTHER_CAN    16126      2    0  15096
LifeSupportInhaledTransplant_CAN            16126      2    0  16088
LifeSupportInhaledRegistration_CAN          16126      2    0  16091
LifeSupportTransplant_CAN                   16126      3  Yes  12891


In [163]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
31,LifeSupportInhaled_CAN,LifeSupportInhaledRegistration_CAN,15667.56032,0.0,0.985683
5,LifeSupportRegistration_ECMO_CAN,LifeSupportTransplant_ECMO_CAN,7235.608979,0.0,0.669845
18,LifeSupportRegistration_IABP_CAN,LifeSupportTransplant_IABP_CAN,4456.840161,0.0,0.525715
56,LifeSupportRegistration_CAN,LifeSupportTransplant_CAN,6880.90529,0.0,0.461896
46,LifeSupportMechanismRegistration_OTHER_CAN,LifeSupportMechanismTransplant_OTHER_CAN,2414.247613,0.0,0.386926
15,LifeSupportRegistration_IABP_CAN,LifeSupportRegistration_CAN,1141.937117,1.075241e-248,0.266108
71,LifeSupportTransplant_IABP_CAN,LifeSupportTransplant_CAN,1031.308775,1.132779e-224,0.25289
42,LifeSupportMechanismRegistration_OTHER_CAN,LifeSupportRegistration_CAN,450.932576,1.205689e-98,0.167222
30,LifeSupportInhaled_CAN,LifeSupportInhaledTransplant_CAN,357.474445,9.988992e-80,0.148888
75,LifeSupportInhaledTransplant_CAN,LifeSupportInhaledRegistration_CAN,357.474445,9.988992e-80,0.148888


In [164]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                            Dead  Living  Row Total     Dead %   Living %
LifeSupportRegistration_ECMO_CAN                                                 
0                                 7553.0  8129.0    15682.0  48.163500  51.836500
1                                  299.0   145.0      444.0  67.342342  32.657658
Column Total                      7852.0  8274.0    16126.0  48.691554  51.308446


Survival                            Dead  Living  Row Total     Dead %   Living %
LifeSupportRegistration_IABP_CAN                                                 
0                                 6713.0  7504.0    14217.0  47.218119  52.781881
1                                 1139.0   770.0     1909.0  59.664746  40.335254
Column Total                      7852.0  8274.0    16126.0  48.691554  51.308446


Survival                  Dead  Living  Row Total     Dead %   Living %
LifeSupportInhaled_CAN                                                 
0                       7833.0  

In [165]:
# new feature
df['LifeSupport_CountTotal_CAN'] = df[features].apply(lambda row: sum(1 for value in row if value == 'Yes' or value == 1), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_can  = uf.insertIntoDataFrame(df_can, ['LifeSupport_CountTotal_CAN'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['LifeSupport_CountTotal_CAN'])

# change datatype to category
df = uf.toCategory(df, ['LifeSupport_CountTotal_CAN'])

In [166]:
uf.categoryContingencySurvival(df, 'LifeSupport_CountTotal_CAN')

Survival,Dead,Living,Row Total,Dead %,Living %
LifeSupport_CountTotal_CAN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1471.0,1135.0,2606.0,56.446662,43.553338
1,927.0,1203.0,2130.0,43.521127,56.478873
2,3268.0,4054.0,7322.0,44.632614,55.367386
3,814.0,878.0,1692.0,48.108747,51.891253
4,1231.0,922.0,2153.0,57.176033,42.823967
5,87.0,58.0,145.0,60.0,40.0
6,44.0,20.0,64.0,68.75,31.25
7,8.0,3.0,11.0,72.727273,27.272727
9,2.0,1.0,3.0,66.666667,33.333333
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### Malignancy
- PreviousMalignancy_CAN & MalignancyBetweenRegistrationTransplant_CAN & Malignancy_CAN
- Combined Malignancy_CountTotal_CAN

In [167]:
features = getFeatureList(df, 'Malignancy')

                                             count unique      top   freq
PreviousMalignancy_CAN                       16126      3       No  14556
MalignancyBetweenRegistrationTransplant_CAN  16126      1  Missing  16126
Malignancy_CAN                               16126      3       No  14556


In [168]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
1,PreviousMalignancy_CAN,Malignancy_CAN,32252.0,0.0,1.0
0,PreviousMalignancy_CAN,MalignancyBetweenRegistrationTransplant_CAN,0.0,1.0,
2,MalignancyBetweenRegistrationTransplant_CAN,Malignancy_CAN,0.0,1.0,


In [169]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                  Dead  Living  Row Total      Dead %   Living %
PreviousMalignancy_CAN                                                  
Missing                   73.0     0.0       73.0  100.000000   0.000000
No                      7037.0  7519.0    14556.0   48.344325  51.655675
Yes                      742.0   755.0     1497.0   49.565798  50.434202
Column Total            7852.0  8274.0    16126.0   48.691554  51.308446


Survival                                       Dead  Living  Row Total     Dead %   Living %
MalignancyBetweenRegistrationTransplant_CAN                                                 
Missing                                      7852.0  8274.0    16126.0  48.691554  51.308446
Column Total                                 7852.0  8274.0    16126.0  48.691554  51.308446


Survival          Dead  Living  Row Total      Dead %   Living %
Malignancy_CAN                                                  
Missing           73.0     0.0       73.0  100.000000   

In [170]:
# new feature
df['Malignancy_CountTotal_CAN'] = df[features].apply(lambda row: sum(1 for value in row if value == 'Yes' or value == 1), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_can  = uf.insertIntoDataFrame(df_can, ['Malignancy_CountTotal_CAN'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['Malignancy_CountTotal_CAN'])

# change datatype to category
df = uf.toCategory(df, ['Malignancy_CountTotal_CAN'])

In [171]:
uf.categoryContingencySurvival(df, 'Malignancy_CountTotal_CAN')

Survival,Dead,Living,Row Total,Dead %,Living %
Malignancy_CountTotal_CAN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,7110.0,7519.0,14629.0,48.602092,51.397908
2,742.0,755.0,1497.0,49.565798,50.434202
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### MyocardialInfarctionHistory_DON

In [172]:
features = getFeatureList(df, 'Myocardial')

                                 count unique top   freq
MyocardialInfarctionHistory_DON  16126      4  No  15610


In [173]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                           Dead  Living  Row Total     Dead %   Living %
MyocardialInfarctionHistory_DON                                                 
Missing                           182.0    12.0      194.0  93.814433   6.185567
No                               7484.0  8126.0    15610.0  47.943626  52.056374
Unknown                           106.0    61.0      167.0  63.473054  36.526946
Yes                                80.0    75.0      155.0  51.612903  48.387097
Column Total                     7852.0  8274.0    16126.0  48.691554  51.308446




#### PreviousTransplant

In [174]:
features = getFeatureList(df, 'PreviousTransplant')

                                 count unique top   freq
PreviousTransplantNumber_CAN     16126      4   0  15616
PreviousTransplantSameOrgan_CAN  16126      2  No  15640
PreviousTransplantAnyOrgan_CAN   16126      2  No  15597


In [175]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,PreviousTransplantNumber_CAN,PreviousTransplantSameOrgan_CAN,14827.98576,0.0,0.95891
2,PreviousTransplantSameOrgan_CAN,PreviousTransplantAnyOrgan_CAN,14743.044146,0.0,0.956159
1,PreviousTransplantNumber_CAN,PreviousTransplantAnyOrgan_CAN,13580.060705,0.0,0.917672


In [176]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                        Dead  Living  Row Total     Dead %    Living %
PreviousTransplantNumber_CAN                                                  
0                             7590.0  8026.0    15616.0  48.603996   51.396004
1                              243.0   236.0      479.0  50.730689   49.269311
2                               19.0    11.0       30.0  63.333333   36.666667
3                                0.0     1.0        1.0   0.000000  100.000000
Column Total                  7852.0  8274.0    16126.0  48.691554   51.308446


Survival                           Dead  Living  Row Total     Dead %   Living %
PreviousTransplantSameOrgan_CAN                                                 
No                               7603.0  8037.0    15640.0  48.612532  51.387468
Yes                               249.0   237.0      486.0  51.234568  48.765432
Column Total                     7852.0  8274.0    16126.0  48.691554  51.308446


Survival                          Dead

In [177]:
# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, ['PreviousTransplantSameOrgan_CAN','PreviousTransplantAnyOrgan_CAN'])

#### PriorCardiac
- PriorCardiacSurgery_CAN & PriorCardiacSurgeryType_CAN & PriorCardiacSurgeryTypeListAndTransplant_CAN & PriorCardiacSurgeryListAndTransplant_CAN

In [178]:
features = getFeatureList(df, 'PriorCardiac')

                                              count unique      top   freq
PriorCardiacSurgery_CAN                       16126      4       No   9608
PriorCardiacSurgeryType_CAN                   16126     23  Missing  10127
PriorCardiacSurgeryTypeListAndTransplant_CAN  16126     16  Missing  13143
PriorCardiacSurgeryListAndTransplant_CAN      16126      4       No  12706


In [179]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,PriorCardiacSurgery_CAN,PriorCardiacSurgeryType_CAN,16126.0,0.0,0.57735
5,PriorCardiacSurgeryTypeListAndTransplant_CAN,PriorCardiacSurgeryListAndTransplant_CAN,16092.890836,0.0,0.576757
3,PriorCardiacSurgeryType_CAN,PriorCardiacSurgeryTypeListAndTransplant_CAN,12954.795759,0.0,0.231423
2,PriorCardiacSurgery_CAN,PriorCardiacSurgeryListAndTransplant_CAN,2531.342846,0.0,0.228745
1,PriorCardiacSurgery_CAN,PriorCardiacSurgeryTypeListAndTransplant_CAN,1376.122431,6.675308999999999e-259,0.168657
4,PriorCardiacSurgeryType_CAN,PriorCardiacSurgeryListAndTransplant_CAN,1001.892307,2.777133e-167,0.143909


In [180]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                   Dead  Living  Row Total     Dead %   Living %
PriorCardiacSurgery_CAN                                                 
Missing                   100.0    28.0      128.0  78.125000  21.875000
No                       4606.0  5002.0     9608.0  47.939217  52.060783
Unknown                   218.0   173.0      391.0  55.754476  44.245524
Yes                      2928.0  3071.0     5999.0  48.808135  51.191865
Column Total             7852.0  8274.0    16126.0  48.691554  51.308446


Survival                                                       Dead  Living  Row Total      Dead %    Living %
PriorCardiacSurgeryType_CAN                                                                                   
CABG                                                          641.0   661.0     1302.0   49.231951   50.768049
CABG; Congenital                                                1.0     2.0        3.0   33.333333   66.666667
CABG; Congenital; Other, specify           

In [181]:
# initialize features
features = ['PriorCardiacSurgery_CAN','PriorCardiacSurgeryListAndTransplant_CAN']

# new feature
df['PriorCardiacSurgery_CountTotal_CAN'] = df[features].apply(lambda row: sum(1 for value in row if value == 'Yes'), axis=1)

# change to category
df['PriorCardiacSurgery_CountTotal_CAN'] = df['PriorCardiacSurgery_CountTotal_CAN'].astype('category')

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_can  = uf.insertIntoDataFrame(df_can, ['PriorCardiacSurgery_CountTotal_CAN'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['PriorCardiacSurgery_CountTotal_CAN'])

# change datatype to category
df = uf.toCategory(df, ['PriorCardiacSurgery_CountTotal_CAN'])

In [182]:
uf.categoryContingencySurvival(df, 'PriorCardiacSurgery_CountTotal_CAN')

Survival,Dead,Living,Row Total,Dead %,Living %
PriorCardiacSurgery_CountTotal_CAN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4429.0,4511.0,8940.0,49.541387,50.458613
1,2555.0,2830.0,5385.0,47.446611,52.553389
2,868.0,933.0,1801.0,48.195447,51.804553
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


In [183]:
def consolidate_PriorSurgery(value):
    if value.startswith('CABG'):
        return 'CABG'
    elif value.startswith('Congenital'):
        return 'Congenital' 
    elif value.startswith('Left Vent'):
        return 'Left Vent'
    elif value.startswith('Valve Replace'):
        return 'Valve Replace'
    elif value.startswith('Other'):
        return 'Other'
    else:
        return 'Missing/Unknown'

In [184]:
# consolidate features
features = ['PriorCardiacSurgeryType_CAN','PriorCardiacSurgeryTypeListAndTransplant_CAN']

# map 
df[features] = df[features].map(consolidate_PriorSurgery).fillna("Re-Examine")

# change datatype to category
df = uf.toCategory(df, features)

In [185]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                       Dead  Living  Row Total     Dead %   Living %
PriorCardiacSurgeryType_CAN                                                 
CABG                          850.0   898.0     1748.0  48.627002  51.372998
Congenital                    240.0   181.0      421.0  57.007126  42.992874
Left Vent                      27.0    12.0       39.0  69.230769  30.769231
Missing/Unknown              4924.0  5203.0    10127.0  48.622494  51.377506
Other                        1287.0  1439.0     2726.0  47.212032  52.787968
Valve Replace                 524.0   541.0     1065.0  49.201878  50.798122
Column Total                 7852.0  8274.0    16126.0  48.691554  51.308446


Survival                                        Dead  Living  Row Total     Dead %   Living %
PriorCardiacSurgeryTypeListAndTransplant_CAN                                                 
CABG                                           276.0   299.0      575.0  48.000000  52.000000
Congenital             

#### PriorLungSurgeryAfterListing_CAN

In [186]:
features = getFeatureList(df, 'PriorLung')

                                       count unique top   freq
PriorLungSurgeryAfterRegistration_CAN  16126      4  No  15646


In [187]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                                 Dead  Living  Row Total      Dead %   Living %
PriorLungSurgeryAfterRegistration_CAN                                                  
Missing                                 422.0     0.0      422.0  100.000000   0.000000
No                                     7397.0  8249.0    15646.0   47.277259  52.722741
Unknown                                   6.0     3.0        9.0   66.666667  33.333333
Yes                                      27.0    22.0       49.0   55.102041  44.897959
Column Total                           7852.0  8274.0    16126.0   48.691554  51.308446




#### PulmonaryCatheter_DON

In [188]:
features = getFeatureList(df, 'PulmonaryCatheter')

                       count unique top   freq
PulmonaryCatheter_DON  16126      3  No  15242


In [189]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                 Dead  Living  Row Total     Dead %   Living %
PulmonaryCatheter_DON                                                 
Missing                 181.0    12.0      193.0  93.782383   6.217617
No                     7400.0  7842.0    15242.0  48.550059  51.449941
Yes                     271.0   420.0      691.0  39.218524  60.781476
Column Total           7852.0  8274.0    16126.0  48.691554  51.308446




#### Residency
- ResidencyStateRegistration_CAN & ResidencyStateTransplant_CAN & ResidencyState_DON

In [190]:
features = getFeatureList(df, 'Residency')

                                count unique top  freq
ResidencyStateRegistration_CAN  16126     55  CA  1945
ResidencyStateTransplant_CAN    16126     55  CA  1903
ResidencyState_DON              16126     55  CA  1597


In [191]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,ResidencyStateRegistration_CAN,ResidencyStateTransplant_CAN,763321.295784,0.0,0.936254
2,ResidencyStateTransplant_CAN,ResidencyState_DON,84369.682892,0.0,0.311267
1,ResidencyStateRegistration_CAN,ResidencyState_DON,81539.183909,0.0,0.306001


In [192]:
# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)

#### Status
- FunctionalStatusRegistration_CAN & StatusAtTransplant_CAN & FunctionalStatusTransplant_CAN
- Coronary artery bypass grafting `(CABG)` is a major surgical procedure used to treat coronary heart disease. The surgery involves creating a new path for blood to flow around blocked or narrowed coronary arteries, improving blood supply to the heart muscle

In [193]:
features = getFeatureList(df, 'FunctionalStatus|StatusAtTransplant')

                                  count unique                                                                                  top  freq
FunctionalStatusRegistration_CAN  16126     19               20% - Very sick, hospitalization necessary: active treatment necessary  4120
StatusAtTransplant_CAN            16126      8  Atrioventricular Septal Defect; Congenitally Corrected Transposition (L-TGA); Other  4728
FunctionalStatusTransplant_CAN    16126     11               20% - Very sick, hospitalization necessary: active treatment necessary  5353


In [194]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
1,FunctionalStatusRegistration_CAN,FunctionalStatusTransplant_CAN,19533.535086,0.0,0.348038
2,StatusAtTransplant_CAN,FunctionalStatusTransplant_CAN,6724.735428,0.0,0.244076
0,FunctionalStatusRegistration_CAN,StatusAtTransplant_CAN,3486.987448,0.0,0.175757


In [195]:
df_dict[df_dict.Feature.isin(features)]

Unnamed: 0,Feature,Description,Form,FeatureStartDate,FeatureEndDate,FormSection,DataType,SASAnalysisFormat,Comment,OrginalFeature,FeatureType,Information
102,StatusAtTransplant_CAN,CANDIDATE STATUS AT TRANSPLANT OFFER/REMOVALCURRENT TIME,TRR>TCR,1990-01-01,NaT,WAITING LIST DATA,NUM,STAT,,END_STAT,Category,FMTNAME: CHDMULT - This Feature could be Ordinal but using as Nominal
108,FunctionalStatusRegistration_CAN,RECIPIENT FUNCTIONAL STATUS @ REGISTRATION,TCR,1994-04-01,NaT,CANDIDATE INFORMATION,NUM,FUNCSTAT,,FUNC_STAT_TCR,Category,FMTNAME: FUNCSTAT
110,FunctionalStatusTransplant_CAN,RECIPIENT FUNCTIONAL STATUS @TRANSPLANT,TRR,1994-04-01,NaT,PATIENT STATUS,NUM,FUNCSTAT,,FUNC_STAT_TRR,Category,FMTNAME: FUNCSTAT


In [196]:
for col in features:
    print(uf.categoryContingencySurvival(df, col).to_string())
    print("\n")

Survival                                                                                               Dead  Living  Row Total      Dead %    Living %
FunctionalStatusRegistration_CAN                                                                                                                      
10% - Moribund, fatal processes progressing rapidly                                                   315.0   180.0      495.0   63.636364   36.363636
100% - Fully active, normal                                                                             4.0     0.0        4.0  100.000000    0.000000
100% - Normal, no complaints, no evidence of disease                                                   27.0    21.0       48.0   56.250000   43.750000
20% - Very sick, hospitalization necessary: active treatment necessary                               2124.0  1996.0     4120.0   51.553398   48.446602
30% - In bed; needs assistance even for quiet play                                            

In [197]:
# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, ['FunctionalStatusRegistration_CAN'])

#### Steroids

In [198]:
features = getFeatureList(df, 'Steroids')

                 count unique  top   freq
SteroidsUse_CAN  16126      4   No  14531
SteroidsUse_DON  16126      4  Yes  11156


In [199]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,SteroidsUse_CAN,SteroidsUse_DON,3655.355346,0.0,0.274879


In [200]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival           Dead  Living  Row Total      Dead %   Living %
SteroidsUse_CAN                                                  
Missing           429.0     0.0      429.0  100.000000   0.000000
No               6898.0  7633.0    14531.0   47.470924  52.529076
Unknown            60.0    37.0       97.0   61.855670  38.144330
Yes               465.0   604.0     1069.0   43.498597  56.501403
Column Total     7852.0  8274.0    16126.0   48.691554  51.308446


Survival           Dead  Living  Row Total     Dead %   Living %
SteroidsUse_DON                                                 
Missing           180.0    12.0      192.0  93.750000   6.250000
No               2265.0  2499.0     4764.0  47.544081  52.455919
Unknown             8.0     6.0       14.0  57.142857  42.857143
Yes              5399.0  5757.0    11156.0  48.395482  51.604518
Column Total     7852.0  8274.0    16126.0  48.691554  51.308446




#### Tatoos

In [201]:
features = getFeatureList(df, 'Tatoos')

            count unique  top  freq
Tatoos_DON  16126      4  Yes  9668


In [202]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival        Dead  Living  Row Total     Dead %   Living %
Tatoos_DON                                                   
Missing        181.0    12.0      193.0  93.782383   6.217617
No            2847.0  3404.0     6251.0  45.544713  54.455287
Unknown          5.0     9.0       14.0  35.714286  64.285714
Yes           4819.0  4849.0     9668.0  49.844849  50.155151
Column Total  7852.0  8274.0    16126.0  48.691554  51.308446




#### ThyroxineT4_DON

In [203]:
features = getFeatureList(df, 'Thyroxine')

                 count unique  top  freq
ThyroxineT4_DON  16126      4  Yes  9723


In [204]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival           Dead  Living  Row Total     Dead %   Living %
ThyroxineT4_DON                                                 
Missing           181.0    12.0      193.0  93.782383   6.217617
No               3426.0  2781.0     6207.0  55.195747  44.804253
Unknown             1.0     2.0        3.0  33.333333  66.666667
Yes              4244.0  5479.0     9723.0  43.649080  56.350920
Column Total     7852.0  8274.0    16126.0  48.691554  51.308446




#### Transfusion
- TransfusionAfterListing_CAN & TransfusionNumber_DON

In [205]:
features = getFeatureList(df, 'Transfusion')

                                  count unique   top   freq
TransfusionAfterRegistration_CAN  16126      4    No  13087
TransfusionNumber_DON             16126      5  NONE   8187


In [206]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,TransfusionAfterRegistration_CAN,TransfusionNumber_DON,10.890692,0.538309,0.015004


In [207]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                            Dead  Living  Row Total      Dead %   Living %
TransfusionAfterRegistration_CAN                                                  
Missing                            424.0     0.0      424.0  100.000000   0.000000
No                                6248.0  6839.0    13087.0   47.742034  52.257966
Unknown                             46.0    32.0       78.0   58.974359  41.025641
Yes                               1134.0  1403.0     2537.0   44.698463  55.301537
Column Total                      7852.0  8274.0    16126.0   48.691554  51.308446


Survival                 Dead  Living  Row Total     Dead %   Living %
TransfusionNumber_DON                                                 
1 - 5                  2443.0  2664.0     5107.0  47.836303  52.163697
6 - 10                  809.0   953.0     1762.0  45.913734  54.086266
GREATER THAN 10         475.0   578.0     1053.0  45.109212  54.890788
Missing                   5.0    12.0       17.0  29.411765  7

#### TransplantType_CAN

In [208]:
features = getFeatureList(df, 'TransplantType')

                    count unique top   freq
TransplantType_CAN  16126      3   O  15692


In [209]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival              Dead  Living  Row Total     Dead %   Living %
TransplantType_CAN                                                 
H                      5.0     6.0       11.0  45.454545  54.545455
O                   7425.0  8267.0    15692.0  47.317104  52.682896
X                    422.0     1.0      423.0  99.763593   0.236407
Column Total        7852.0  8274.0    16126.0  48.691554  51.308446




#### TriiodothyronineT3_DON

In [210]:
features = getFeatureList(df, 'Triiodothyronine')

                        count unique top   freq
TriiodothyronineT3_DON  16126      4  No  15907


In [211]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                  Dead  Living  Row Total     Dead %   Living %
TriiodothyronineT3_DON                                                 
Missing                  181.0    12.0      193.0  93.782383   6.217617
No                      7658.0  8249.0    15907.0  48.142327  51.857673
Unknown                    2.0     2.0        4.0  50.000000  50.000000
Yes                       11.0    11.0       22.0  50.000000  50.000000
Column Total            7852.0  8274.0    16126.0  48.691554  51.308446




#### UrinePortein_DON

In [212]:
features = getFeatureList(df, 'UrinePortein')


                  count unique top  freq
UrinePortein_DON  16126      4  No  8069


In [213]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival            Dead  Living  Row Total     Dead %   Living %
UrinePortein_DON                                                 
Missing            180.0    12.0      192.0  93.750000   6.250000
No                3648.0  4421.0     8069.0  45.210063  54.789937
Unknown             27.0    33.0       60.0  45.000000  55.000000
Yes               3997.0  3808.0     7805.0  51.210762  48.789238
Column Total      7852.0  8274.0    16126.0  48.691554  51.308446




#### Vasodilators_DON

In [214]:
features = getFeatureList(df, 'Vasodilators_DON')

                  count unique top   freq
Vasodilators_DON  16126      4  No  13093


In [215]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival            Dead  Living  Row Total     Dead %    Living %
Vasodilators_DON                                                  
Missing            181.0    12.0      193.0  93.782383    6.217617
No                6195.0  6898.0    13093.0  47.315359   52.684641
Unknown              0.0     6.0        6.0   0.000000  100.000000
Yes               1476.0  1358.0     2834.0  52.081863   47.918137
Column Total      7852.0  8274.0    16126.0  48.691554   51.308446




#### Ventilator
- VentilatorRegistration_CAN & VentilatorySupport_CAN & VentilatorTransplant_CAN & VentilatorySupportAfterRegistration_CAN
- Combined Ventilator_CountTotal_CAN

In [216]:
features = getFeatureList(df, 'Ventilator')

                                         count unique top   freq
VentilatorRegistration_CAN               16126      2   0  15869
VentilatorySupport_CAN                   16126      4  No  13163
VentilatorTransplant_CAN                 16126      2   0  15840
VentilatorySupportAfterRegistration_CAN  16126      4  No  13163


In [217]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
4,VentilatorySupport_CAN,VentilatorySupportAfterRegistration_CAN,48378.0,0.0,1.0
1,VentilatorRegistration_CAN,VentilatorTransplant_CAN,2595.809583,0.0,0.401211
5,VentilatorTransplant_CAN,VentilatorySupportAfterRegistration_CAN,1117.52308,5.746494e-242,0.263248
3,VentilatorySupport_CAN,VentilatorTransplant_CAN,1117.52308,5.746494e-242,0.263248
0,VentilatorRegistration_CAN,VentilatorySupport_CAN,518.262808,5.256683e-112,0.179272
2,VentilatorRegistration_CAN,VentilatorySupportAfterRegistration_CAN,518.262808,5.256683e-112,0.179272


In [218]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                      Dead  Living  Row Total     Dead %   Living %
VentilatorRegistration_CAN                                                 
0                           7710.0  8159.0    15869.0  48.585292  51.414708
1                            142.0   115.0      257.0  55.252918  44.747082
Column Total                7852.0  8274.0    16126.0  48.691554  51.308446


Survival                  Dead  Living  Row Total      Dead %   Living %
VentilatorySupport_CAN                                                  
Missing                  424.0     0.0      424.0  100.000000   0.000000
No                      6256.0  6907.0    13163.0   47.527159  52.472841
Unknown                   42.0    19.0       61.0   68.852459  31.147541
Yes                     1130.0  1348.0     2478.0   45.601291  54.398709
Column Total            7852.0  8274.0    16126.0   48.691554  51.308446


Survival                    Dead  Living  Row Total     Dead %   Living %
VentilatorTransplant_CAN       

In [219]:
# new feature
df['Ventilator_CountTotal_CAN'] = df[features].apply(lambda row: sum(1 for value in row if value == 'Yes' or value == 1), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_can  = uf.insertIntoDataFrame(df_can, ['Ventilator_CountTotal_CAN'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['Ventilator_CountTotal_CAN'])

# change datatype to category
df = uf.toCategory(df, ['Ventilator_CountTotal_CAN'])

In [220]:
uf.categoryContingencySurvival(df, 'Ventilator_CountTotal_CAN')

Survival,Dead,Living,Row Total,Dead %,Living %
Ventilator_CountTotal_CAN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,6669.0,6860.0,13529.0,49.294109,50.705891
1,49.0,62.0,111.0,44.144144,55.855856
2,937.0,1237.0,2174.0,43.100276,56.899724
3,131.0,77.0,208.0,62.980769,37.019231
4,66.0,38.0,104.0,63.461538,36.538462
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


#### WorkIncome
- WorkIncomeRegistration_CAN & WorkIncomeTransplant_CAN

In [221]:
features = getFeatureList(df, 'WorkIncome')

                            count unique top   freq
WorkIncomeRegistration_CAN  16126      4  No  12059
WorkIncomeTransplant_CAN    16126      4  No  12695


In [222]:
# test of Independence for Categorical Variables
usf.pairColsMultiIndependenceCat(df, features)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
0,WorkIncomeRegistration_CAN,WorkIncomeTransplant_CAN,7388.401051,0.0,0.390797


In [223]:
for col in features:
    print(uf.categoryContingencySurvival(df, col, 'Survival').to_string())
    print("\n")

Survival                      Dead  Living  Row Total     Dead %   Living %
WorkIncomeRegistration_CAN                                                 
Missing                       98.0    28.0      126.0  77.777778  22.222222
No                          5675.0  6384.0    12059.0  47.060287  52.939713
Unknown                      230.0   137.0      367.0  62.670300  37.329700
Yes                         1849.0  1725.0     3574.0  51.734751  48.265249
Column Total                7852.0  8274.0    16126.0  48.691554  51.308446


Survival                    Dead  Living  Row Total      Dead %   Living %
WorkIncomeTransplant_CAN                                                  
Missing                    412.0     0.0      412.0  100.000000   0.000000
No                        5834.0  6861.0    12695.0   45.955100  54.044900
Unknown                    214.0   217.0      431.0   49.651972  50.348028
Yes                       1392.0  1196.0     2588.0   53.786708  46.213292
Column Total    

In [224]:
# new feature
df['WorkIncome_CountTotal_CAN'] = df[features].apply(lambda row: sum(1 for value in row if value == 'Yes' or value == 1), axis=1)

# update DataFrame
df_drop  = uf.insertIntoDataFrame(df_drop, features)
df_can  = uf.insertIntoDataFrame(df_can, ['WorkIncome_CountTotal_CAN'])
df_ordinal  = uf.insertIntoDataFrame(df_ordinal, ['WorkIncome_CountTotal_CAN'])

# change datatype to category
df = uf.toCategory(df, ['WorkIncome_CountTotal_CAN'])

In [225]:
uf.categoryContingencySurvival(df, 'WorkIncome_CountTotal_CAN')

Survival,Dead,Living,Row Total,Dead %,Living %
WorkIncome_CountTotal_CAN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5717.0,6255.0,11972.0,47.753091,52.246909
1,1029.0,1117.0,2146.0,47.949674,52.050326
2,1106.0,902.0,2008.0,55.079681,44.920319
Column Total,7852.0,8274.0,16126.0,48.691554,51.308446


### Remove Unwanted Features

In [226]:
# get remove features
removeCols = df_drop.column.to_list()

# remove features
df, df_dict, df_label, df_can, df_don, df_both, df_ordinal, df_nominal, df_numeric, df_drop, df_object, df_unknown, df_date = uf.HouseKeeping(df, removeCols, df_dict, df_label, df_can,\
                                                                df_don, df_both, df_ordinal, df_nominal, df_numeric, df_drop, df_object, df_unknown, df_date, txt=REMOVE, display=True)

Data Dictionary Updated.
Remove 0 row(s) from df_label DataFrame.
Remove 51 row(s) from df_can DataFrame.
Remove 27 row(s) from df_don DataFrame.
Remove 0 row(s) from df_both DataFrame.
Remove 3 row(s) from df_ordinal DataFrame.
Remove 76 row(s) from df_nominal DataFrame.
Remove 0 row(s) from df_numeric DataFrame.
Remove 79 row(s) from df_drop DataFrame.
Remove 0 row(s) from df_object DataFrame.
Remove 0 row(s) from df_unknown DataFrame.
Remove 0 row(s) from df_date DataFrame.

Removed Features: ['BloodInfectionSource_DON', 'BronchoscopyLeft_DON', 'BronchoscopyRight_DON', 'CMV_IGG_Transplant_CAN', 'CMV_IGM_Transplant_CAN', 'CancerExtraCranial_DON', 'CancerHistory_DON', 'CancerIntraCranial_DON', 'CancerSkin_DON', 'CauseOfDeath_DON', 'CigaretteAbstinence_CAN', 'Citizenship_CAN', 'Citizenship_DON', 'CocaineUse_DON', 'DeathCircumstance_DON', 'DiabetesHistory_DON', 'DiagnosisAtListing_CAN', 'DialysisBetweenRegistrationTransplant_CAN', 'DialysisPriorRegistration_CAN', 'Diuretics_DON', 'Epste

### Ordinals

In [227]:
print(sorted(df_ordinal.column.to_list()))

['Age_CAT_CAN', 'Age_CAT_DON', 'AntigenDA1_DON', 'AntigenDA2_DON', 'AntigenDB1_DON', 'AntigenDB2_DON', 'AntigenDQ1_CAN', 'AntigenDQ2_CAN', 'AntigenRA1_CAN', 'AntigenRA2_CAN', 'AntigenRB1_CAN', 'AntigenRB2_CAN', 'AntigenRDR1_CAN', 'AntigenRDR2_CAN', 'BMI_CAT_CAN', 'BMI_CAT_DON', 'BloodUreaNitrogenLevel_CAT_DON', 'Cancer_CountTotal_DON', 'Creatinine_CAT_CAN', 'Creatinine_CAT_DON', 'Dialysis_CountTotal_CAN', 'DistanceFromDonorHospitaltoTXCenter_CAT', 'Diuretics_CountTotal_DON', 'DrugUse_CountTotal_DON', 'EducationLevel_CAN', 'FunctionalStatusTransplant_CAN', 'HeightCm_CAT_CAN', 'HeightCm_CAT_DON', 'Hematocrit_CAT_DON', 'Hemodynamics_CAT_CAN', 'Hemodynamics_CO_CAT_CAN', 'Infection_CountTotal_DON', 'IntropesIV_CountTotal_CAN', 'IntropesVasodilators_CountTotal_CAN', 'IschemicTimeHour_CAT_DON', 'LV_EjectionFractionPercent_CAT_DON', 'Level_SGOT_ALT_CAT_DON', 'Level_SGOT_AST_CAT_DON', 'LifeSupport_CountTotal_CAN', 'LungPO2_CAT_DON', 'LungPO2_FIO2_CAT_DON', 'Malignancy_CountTotal_CAN', 'MedicalC

#### Age

In [228]:
features = getFeatureList(df, 'Age_CAT')

             count unique           top  freq
Age_CAT_CAN  16126      4    Age(46-57)  4439
Age_CAT_DON  16126      4  Age (Min-24)  4377


In [229]:
# Age
orderingCAN = sorted(df.Age_CAT_CAN.unique())
orderingDON = sorted(df.Age_CAT_DON.unique())
# initialize encoder
encoderCAN = OrdinalEncoder(categories=[orderingCAN])
encorerDON = OrdinalEncoder(categories=[orderingDON])

# encode
df['Age_CAT_CAN'] = encoderCAN.fit_transform(df[['Age_CAT_CAN']])
df['Age_CAT_DON'] = encorerDON.fit_transform(df[['Age_CAT_DON']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
0,Age_CAT_CAN,Age_CAT_DON,128018300.5,0.013204


#### AntigenC
- AntigenC1_CAN & AntigenC2_CAN

In [230]:
# custom sort function
def custom_sort_key(x):
    if x == 'Missing':
        return (0, '')
    elif x.isdigit():
        return (1, int(x))  # single numbers sorted numerically
    else:
        return (2, x)  # other values sorted lexicographically

In [231]:
features = getFeatureList(df, 'AntigenC')

               count unique top   freq
AntigenC1_CAN  16126     37   0  10275
AntigenC2_CAN  16126     45   0  10733


In [232]:
# AntigenC1
ordering1 = sorted(df.AntigenC1_CAN.unique(), key=custom_sort_key)
ordering2 = sorted(df.AntigenC2_CAN.unique(), key=custom_sort_key)
# initialize encoder
encorer1 = OrdinalEncoder(categories=[ordering1])
encoder2 = OrdinalEncoder(categories=[ordering2])

# encode
df['AntigenC1_CAN'] = encorer1.fit_transform(df[['AntigenC1_CAN']])
df['AntigenC2_CAN'] = encoder2.fit_transform(df[['AntigenC2_CAN']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
0,AntigenC1_CAN,AntigenC2_CAN,126897327.5,1.1e-05


##### AntigenDA
- AntigenDA1_DON & AntigenDA2_DON

In [233]:
features = getFeatureList(df, 'AntigenDA')

                count unique top  freq
AntigenDA1_DON  16126     39   2  6394
AntigenDA2_DON  16126     44  24  1804


In [234]:
# AntigenDA
ordering1 = sorted(df.AntigenDA1_DON.unique(), key=custom_sort_key)
ordering2 = sorted(df.AntigenDA2_DON.unique(), key=custom_sort_key)
# initialize encoder
encorer1 = OrdinalEncoder(categories=[ordering1])
encoder2 = OrdinalEncoder(categories=[ordering2])

# encode
df['AntigenDA1_DON'] = encorer1.fit_transform(df[['AntigenDA1_DON']])
df['AntigenDA2_DON'] = encoder2.fit_transform(df[['AntigenDA2_DON']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
0,AntigenDA1_DON,AntigenDA2_DON,43529234.0,0.0


##### AntigenDB
- AntigenDB1_DON & AntigenDB2_DON

In [235]:
features = getFeatureList(df, 'AntigenDB')

                count unique top  freq
AntigenDB1_DON  16126     81   7  3083
AntigenDB2_DON  16126     87  44  2418


In [236]:
# AntigenDB
ordering1 = sorted(df.AntigenDB1_DON.unique(), key=custom_sort_key)
ordering2 = sorted(df.AntigenDB2_DON.unique(), key=custom_sort_key)
# initialize encoder
encorer1 = OrdinalEncoder(categories=[ordering1])
encoder2 = OrdinalEncoder(categories=[ordering2])

# encode
df['AntigenDB1_DON'] = encorer1.fit_transform(df[['AntigenDB1_DON']])
df['AntigenDB2_DON'] = encoder2.fit_transform(df[['AntigenDB2_DON']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
0,AntigenDB1_DON,AntigenDB2_DON,63449793.0,0.0


##### AntigenDQ
- AntigenDQ1_CAN & AntigenDQ2_CAN

In [237]:
features = getFeatureList(df, 'AntigenDQ')

                count unique top   freq
AntigenDQ1_CAN  16126     25   0  10286
AntigenDQ2_CAN  16126     25   0  10880


In [238]:
# AntigenDQ
ordering1 = sorted(df.AntigenDQ1_CAN.unique(), key=custom_sort_key)
ordering2 = sorted(df.AntigenDQ2_CAN.unique(), key=custom_sort_key)
# initialize encoder
encorer1 = OrdinalEncoder(categories=[ordering1])
encoder2 = OrdinalEncoder(categories=[ordering2])

# encode
df['AntigenDQ1_CAN'] = encorer1.fit_transform(df[['AntigenDQ1_CAN']])
df['AntigenDQ2_CAN'] = encoder2.fit_transform(df[['AntigenDQ2_CAN']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
0,AntigenDQ1_CAN,AntigenDQ2_CAN,130334467.0,0.66076


##### AntigenDR
- AntigenDR1_DON & AntigenDR2_DON

In [239]:
features = getFeatureList(df, 'AntigenDR.*_DON$')

                count unique top  freq
AntigenDR1_DON  16126     43   4  3982
AntigenDR2_DON  16126     46  15  3386


In [240]:
# AntigenDR
ordering1 = sorted(df.AntigenDR1_DON.unique(), key=custom_sort_key)
ordering2 = sorted(df.AntigenDR2_DON.unique(), key=custom_sort_key)
# initialize encoder
encorer1 = OrdinalEncoder(categories=[ordering1])
encoder2 = OrdinalEncoder(categories=[ordering2])

# encode
df['AntigenDR1_DON'] = encorer1.fit_transform(df[['AntigenDR1_DON']])
df['AntigenDR2_DON'] = encoder2.fit_transform(df[['AntigenDR2_DON']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
0,AntigenDR1_DON,AntigenDR2_DON,87821869.5,0.0


##### AntigenDR5
- AntigenDR51_CAN & AntigenDR51_2_CAN & AntigenDR52_CAN & AntigenDR52_2_CAN + AntigenDR53_CAN + AntigenDR53_2_CAN

In [241]:
features = getFeatureList(df, 'AntigenDR5')

                   count unique top   freq
AntigenDR51_CAN    16126      9   0  11039
AntigenDR51_2_CAN  16126      8   0  14557
AntigenDR52_CAN    16126     10   0  10671
AntigenDR52_2_CAN  16126     10   0  14462
AntigenDR53_CAN    16126      7   0  10848
AntigenDR53_2_CAN  16126      6   0  14551


In [242]:
# AntigenDR
ordering1 = sorted(df.AntigenDR51_CAN.unique(), key=custom_sort_key)
ordering2 = sorted(df.AntigenDR51_2_CAN.unique(), key=custom_sort_key)
ordering3 = sorted(df.AntigenDR52_CAN.unique(), key=custom_sort_key)
ordering4 = sorted(df.AntigenDR52_2_CAN.unique(), key=custom_sort_key)
ordering5 = sorted(df.AntigenDR53_CAN.unique(), key=custom_sort_key)
ordering6 = sorted(df.AntigenDR53_2_CAN.unique(), key=custom_sort_key)
# initialize encoder
encorer1 = OrdinalEncoder(categories=[ordering1])
encoder2 = OrdinalEncoder(categories=[ordering2])
encoder3 = OrdinalEncoder(categories=[ordering3])
encoder4 = OrdinalEncoder(categories=[ordering4])
encoder5 = OrdinalEncoder(categories=[ordering5])
encoder6 = OrdinalEncoder(categories=[ordering6])

# encode
df['AntigenDR51_CAN'] = encorer1.fit_transform(df[['AntigenDR51_CAN']])
df['AntigenDR51_2_CAN'] = encoder2.fit_transform(df[['AntigenDR51_2_CAN']])
df['AntigenDR52_CAN'] = encoder3.fit_transform(df[['AntigenDR52_CAN']])
df['AntigenDR52_2_CAN'] = encoder4.fit_transform(df[['AntigenDR52_2_CAN']])
df['AntigenDR53_CAN'] = encoder5.fit_transform(df[['AntigenDR53_CAN']])
df['AntigenDR53_2_CAN'] = encoder6.fit_transform(df[['AntigenDR53_2_CAN']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
8,AntigenDR51_2_CAN,AntigenDR53_2_CAN,131070082.0,0.01500701
6,AntigenDR51_2_CAN,AntigenDR52_2_CAN,128194718.0,2.674959e-05
13,AntigenDR52_2_CAN,AntigenDR53_2_CAN,131922806.0,1.327315e-05
3,AntigenDR51_CAN,AntigenDR53_CAN,136969444.5,8.14996e-24
1,AntigenDR51_CAN,AntigenDR52_CAN,118314234.5,1.484405e-63
10,AntigenDR52_CAN,AntigenDR53_CAN,143596038.0,9.41186e-84
0,AntigenDR51_CAN,AntigenDR51_2_CAN,161552977.0,0.0
2,AntigenDR51_CAN,AntigenDR52_2_CAN,155703064.5,0.0
4,AntigenDR51_CAN,AntigenDR53_2_CAN,162050626.5,0.0
5,AntigenDR51_2_CAN,AntigenDR52_CAN,95233102.5,0.0


##### AntigenRA
- AntigenRA1_CAN & AntigenRA2_CAN

In [243]:
features = getFeatureList(df, 'AntigenRA')

                count unique top  freq
AntigenRA1_CAN  16126     50   2  5189
AntigenRA2_CAN  16126     52  68  1436


In [244]:
# AntigenRA
ordering1 = sorted(df.AntigenRA1_CAN.unique(), key=custom_sort_key)
ordering2 = sorted(df.AntigenRA2_CAN.unique(), key=custom_sort_key)
# initialize encoder
encorer1 = OrdinalEncoder(categories=[ordering1])
encoder2 = OrdinalEncoder(categories=[ordering2])

# encode
df['AntigenRA1_CAN'] = encorer1.fit_transform(df[['AntigenRA1_CAN']])
df['AntigenRA2_CAN'] = encoder2.fit_transform(df[['AntigenRA2_CAN']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
0,AntigenRA1_CAN,AntigenRA2_CAN,63859313.5,0.0


##### AntigenRB
- AntigenRB1_CAN & AntigenRB2_CAN

In [245]:
features = getFeatureList(df, 'AntigenRB')

                count unique top  freq
AntigenRB1_CAN  16126     97   7  2595
AntigenRB2_CAN  16126     95  44  1937


In [246]:
# AntigenRB
ordering1 = sorted(df.AntigenRB1_CAN.unique(), key=custom_sort_key)
ordering2 = sorted(df.AntigenRB2_CAN.unique(), key=custom_sort_key)
# initialize encoder
encorer1 = OrdinalEncoder(categories=[ordering1])
encoder2 = OrdinalEncoder(categories=[ordering2])

# encode
df['AntigenRB1_CAN'] = encorer1.fit_transform(df[['AntigenRB1_CAN']])
df['AntigenRB2_CAN'] = encoder2.fit_transform(df[['AntigenRB2_CAN']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
0,AntigenRB1_CAN,AntigenRB2_CAN,83028607.5,0.0


##### AntigenRDR
- AntigenRDR1_CAN & AntigenRDR2_CAN

In [247]:
features = getFeatureList(df, 'AntigenRD')

                 count unique top  freq
AntigenRDR1_CAN  16126     54   4  2917
AntigenRDR2_CAN  16126     56  15  3030


In [248]:
# AntigenRDR
ordering1 = sorted(df.AntigenRDR1_CAN.unique(), key=custom_sort_key)
ordering2 = sorted(df.AntigenRDR2_CAN.unique(), key=custom_sort_key)
# initialize encoder
encorer1 = OrdinalEncoder(categories=[ordering1])
encoder2 = OrdinalEncoder(categories=[ordering2])

# encode
df['AntigenRDR1_CAN'] = encorer1.fit_transform(df[['AntigenRDR1_CAN']])
df['AntigenRDR2_CAN'] = encoder2.fit_transform(df[['AntigenRDR2_CAN']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
0,AntigenRDR1_CAN,AntigenRDR2_CAN,110439826.5,5.152113999999999e-122


#### BMI

In [249]:
features = getFeatureList(df, 'BMI_CAT')

             count unique            top  freq
BMI_CAT_CAN  16126      5    Over Weight  5808
BMI_CAT_DON  16126      5  Normal Weight  5817


In [250]:
# order
ordering = ['Missing', 'Under Weight','Normal Weight', 'Over Weight', 'Obesity']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['BMI_CAT_CAN'] = encoder.fit_transform(df[['BMI_CAT_CAN']])
df['BMI_CAT_DON'] = encoder.fit_transform(df[['BMI_CAT_DON']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
0,BMI_CAT_CAN,BMI_CAT_DON,136000824.0,3.764542e-14


#### BloodUreaNitrogenLevel_CAT_DON

In [251]:
features = getFeatureList(df, 'BloodUreaNitrogenLevel_CAT_DON')

                                count unique   top  freq
BloodUreaNitrogenLevel_CAT_DON  16126      4  High  7918


In [252]:
# order
ordering = ['Missing', 'Low', 'Normal', 'High']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['BloodUreaNitrogenLevel_CAT_DON'] = encoder.fit_transform(df[['BloodUreaNitrogenLevel_CAT_DON']])

#### Creatinine_CAT

In [253]:
features = getFeatureList(df, 'Creatinine_CAT_')

                    count unique     top   freq
Creatinine_CAT_CAN  16126      5  Normal   8066
Creatinine_CAT_DON  16126      5  Normal  10200


In [254]:
# order
ordering = ['Missing', 'Normal', 'Mildly Elevated', 'Moderately Elevated', 'Severely Elevated']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['Creatinine_CAT_CAN'] = encoder.fit_transform(df[['Creatinine_CAT_CAN']])
df['Creatinine_CAT_DON'] = encoder.fit_transform(df[['Creatinine_CAT_DON']])

#### DistanceFromDonorHospitaltoTXCenter_CAT

In [255]:
features = getFeatureList(df, 'DistanceFromDonorHospitaltoTXCenter_CAT')

                                         count unique               top  freq
DistanceFromDonorHospitaltoTXCenter_CAT  16126      4  Distance(Min-38)  4076


In [256]:
# order
ordering = ['Distance(Min-38)', 'Distance(38-168)', 'Distance(168-356)', 'Distance(356-Max)']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['DistanceFromDonorHospitaltoTXCenter_CAT'] = encoder.fit_transform(df[['DistanceFromDonorHospitaltoTXCenter_CAT']])

### EducationLevel_CAN

In [257]:
features = getFeatureList(df, 'EducationLevel_CAN')

                    count unique                        top  freq
EducationLevel_CAN  16126      8  HIGH SCHOOL (9-12) or GED  5798


In [258]:
# order
ordering = ['Missing', 'NONE', 'UNKNOWN', 'GRADE SCHOOL (0-8)', 'HIGH SCHOOL (9-12) or GED', 'ATTENDED COLLEGE/TECHNICAL SCHOOL', 'ASSOCIATE/BACHELOR DEGREE', 'POST-COLLEGE GRADUATE DEGREE']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['EducationLevel_CAN'] = encoder.fit_transform(df[['EducationLevel_CAN']])

#### FunctionalStatusTransplant_CAN

In [259]:
features = getFeatureList(df, 'FunctionalStatusTransplant_CAN')

                                count unique                                                                     top  freq
FunctionalStatusTransplant_CAN  16126     11  20% - Very sick, hospitalization necessary: active treatment necessary  5353


In [260]:
# sort the list, placing "Unknown" first
ordering = ['Unknown'] + sorted(
    [status for status in df.FunctionalStatusTransplant_CAN.unique().tolist() if status != 'Unknown'],
    key=lambda x: int(x.split('%')[0])
)
print(ordering)

['Unknown', '10% - Moribund, fatal processes progressing rapidly', '20% - Very sick, hospitalization necessary: active treatment necessary', '30% - Severely disabled: hospitalization is indicated, death not imminent', '40% - Disabled: requires special care and assistance', '50% - Requires considerable assistance and frequent medical care', '60% - Requires occasional assistance but is able to care for needs', '70% - Cares for self: unable to carry on normal activity or active work', '80% - Normal activity with effort: some symptoms of disease', '90% - Able to carry on normal activity: minor symptoms of disease', '100% - Normal, no complaints, no evidence of disease']


In [261]:
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['FunctionalStatusTransplant_CAN'] = encoder.fit_transform(df[['FunctionalStatusTransplant_CAN']])

#### HeightCm

In [262]:
features = getFeatureList(df, 'HeightCm_CAT_')

                  count unique              top  freq
HeightCm_CAT_CAN  16126      4  Height(Min-167)  5026
HeightCm_CAT_DON  16126      4  Height(167-175)  4700


In [263]:
# order
ordering = ['Height(Min-167)','Height(167-175)','Height(175-180)','Height(180-Max)']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['HeightCm_CAT_CAN'] = encoder.fit_transform(df[['HeightCm_CAT_CAN']])
df['HeightCm_CAT_DON'] = encoder.fit_transform(df[['HeightCm_CAT_DON']])

# test
usf.mannwhitneyu_combinations(df, features)

Unnamed: 0,column1,columns2,U_statistic,p_value
0,HeightCm_CAT_CAN,HeightCm_CAT_DON,128418599.0,0.047225


#### Hematocrit_CAT_DON

In [264]:
features = getFeatureList(df, 'Hematocrit_CAT_DON')

                    count unique  top   freq
Hematocrit_CAT_DON  16126      4  Low  14863


In [265]:
# order
ordering = ['Missing', 'Low', 'Normal', 'High']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['Hematocrit_CAT_DON'] = encoder.fit_transform(df[['Hematocrit_CAT_DON']])

#### Hemodynamics

In [266]:
features = getFeatureList(df, 'Hemodynamics')

                         count unique     top   freq
Hemodynamics_CAT_CAN     16126      5  Normal   9132
Hemodynamics_CO_CAT_CAN  16126      5     Low  13677


In [267]:
# order
ordering = ['Missing', 'Low', 'Normal', 'High', 'Extreme']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['Hemodynamics_CAT_CAN'] = encoder.fit_transform(df[['Hemodynamics_CAT_CAN']])
df['Hemodynamics_CO_CAT_CAN'] = encoder.fit_transform(df[['Hemodynamics_CO_CAT_CAN']])

#### IschemicTimeHour_CAT_DON

In [268]:
features = getFeatureList(df, 'IschemicTimeHour_CAT_DON')

                          count unique             top  freq
IschemicTimeHour_CAT_DON  16126      5  Hours(Min-2.6)  4162


In [269]:
# order
ordering = ['Missing', 'Hours(Min-2.6)', 'Hours(2.6-3.3)', 'Hours(3.3-3.9)', 'Hours(3.9-Max)']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['IschemicTimeHour_CAT_DON'] = encoder.fit_transform(df[['IschemicTimeHour_CAT_DON']])

#### LV_EjectionFractionPercent_CAT_DON

In [270]:
features = getFeatureList(df, 'LV_EjectionFractionPercent_CAT_DON')

                                    count unique          top   freq
LV_EjectionFractionPercent_CAT_DON  16126      5  Normal LVEF  14831


In [271]:
# order
ordering = ['Missing', 'Reduced LVEF', 'Normal LVEF', 'Mild Dysfunction', 'High LVEF']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['LV_EjectionFractionPercent_CAT_DON'] = encoder.fit_transform(df[['LV_EjectionFractionPercent_CAT_DON']])

#### Level_SGOT_ALT_CAT

In [272]:
features = getFeatureList(df, 'Level_SGOT')

                        count unique     top  freq
Level_SGOT_AST_CAT_DON  16126      4  Normal  8172
Level_SGOT_ALT_CAT_DON  16126      4  Normal  8308


In [273]:
# order
ordering = ['Missing', 'Low', 'Normal', 'High']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['Level_SGOT_AST_CAT_DON'] = encoder.fit_transform(df[['Level_SGOT_AST_CAT_DON']])
df['Level_SGOT_ALT_CAT_DON'] = encoder.fit_transform(df[['Level_SGOT_ALT_CAT_DON']])

#### LungPO2

In [274]:
features = getFeatureList(df, 'LungPO2')

                      count unique                top   freq
LungPO2_FIO2_CAT_DON  16126      5            Extreme  10779
LungPO2_CAT_DON       16126      6  Severe Impairment   6863


In [275]:
# order
ordering = ['Missing', 'Low', 'Normal', 'High', 'Extreme']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['LungPO2_FIO2_CAT_DON'] = encoder.fit_transform(df[['LungPO2_FIO2_CAT_DON']])

# order
ordering = ['Missing', 'Borderline Normal', 'Normal', 'Mild Impairment', 'Moderate Impairment', 'Severe Impairment']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['LungPO2_CAT_DON'] = encoder.fit_transform(df[['LungPO2_CAT_DON']])

#### MedicalConditionTransplant_CAN

In [276]:
features = getFeatureList(df, 'MedicalConditionTransplant_CAN')

                                count unique                     top  freq
MedicalConditionTransplant_CAN  16126      4  In Intensive Care Unit  7201


In [277]:
# order
ordering = ['Missing', 'Not Hospitalized', 'Hospitalized Not in ICU', 'In Intensive Care Unit']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['MedicalConditionTransplant_CAN'] = encoder.fit_transform(df[['MedicalConditionTransplant_CAN']])

#### MismatchLevel_AMIS

In [278]:
features = getFeatureList(df, 'MismatchLevel_AMIS')

                    count  unique  top  freq
MismatchLevel_AMIS  16126       4    2  7603


In [279]:
# order
ordering = ['999', '0', '1', '2']
# convert to string
df[features] = df[features].astype(str)
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['MismatchLevel_AMIS'] = encoder.fit_transform(df[['MismatchLevel_AMIS']])

#### MismatchLevel_BMIS

In [280]:
features = getFeatureList(df, 'MismatchLevel_BMIS')

                    count  unique  top   freq
MismatchLevel_BMIS  16126       4    2  10670


In [281]:
# convert to string
df[features] = df[features].astype(str)
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['MismatchLevel_BMIS'] = encoder.fit_transform(df[['MismatchLevel_BMIS']])

#### MismatchLevel_DRMIS

In [282]:
features = getFeatureList(df, 'MismatchLevel_DRMIS')

                     count  unique  top  freq
MismatchLevel_DRMIS  16126       4    2  8144


In [283]:
# convert to string
df[features] = df[features].astype(str)
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['MismatchLevel_DRMIS'] = encoder.fit_transform(df[['MismatchLevel_DRMIS']])

#### MismatchLevel_HLMIS

In [284]:
features = getFeatureList(df, 'MismatchLevel_HLMIS')

                     count  unique  top  freq
MismatchLevel_HLMIS  16126       8    5  5532


In [285]:
# order
ordering = ['999', '0', '1', '2', '3', '4', '5', '6']
# convert to string
df[features] = df[features].astype(str)
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['MismatchLevel_HLMIS'] = encoder.fit_transform(df[['MismatchLevel_HLMIS']])

#### OrganRecovery_PCO2_CAT_DON

In [286]:
features = getFeatureList(df, 'OrganRecovery_PCO2_CAT_DON')

                            count unique     top  freq
OrganRecovery_PCO2_CAT_DON  16126      4  Normal  9419


In [287]:
# order
ordering = ['Missing', 'Low', 'Normal', 'High']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['OrganRecovery_PCO2_CAT_DON'] = encoder.fit_transform(df[['OrganRecovery_PCO2_CAT_DON']])

#### PanelReactiveAntibody_CPRA_CAT_CAN

In [288]:
features = getFeatureList(df, 'PanelReactiveAntibody_CPRA_CAT_CAN')

                                    count unique               top  freq
PanelReactiveAntibody_CPRA_CAT_CAN  16126      7  No Sensitization  7459


In [289]:
# order
ordering = ['Missing', 'No Sensitization', 'Low Sensitization', 'Some Sensitization', 'Moderate Sensitization', 'High Sensitization', 'Extreme Sensitization']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['PanelReactiveAntibody_CPRA_CAT_CAN'] = encoder.fit_transform(df[['PanelReactiveAntibody_CPRA_CAT_CAN']])

#### TotalBilirubin_CAT

In [290]:
features = getFeatureList(df, 'TotalBilirubin_CAT')

                        count unique     top   freq
TotalBilirubin_CAT_CAN  16126      4  Normal  13074
TotalBilirubin_CAT_DON  16126      4  Normal  12944


In [291]:
# order
ordering = ['Missing', 'Low', 'Normal', 'High']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['TotalBilirubin_CAT_CAN'] = encoder.fit_transform(df[['TotalBilirubin_CAT_CAN']])
df['TotalBilirubin_CAT_DON'] = encoder.fit_transform(df[['TotalBilirubin_CAT_DON']])

#### TotalDayWaitList_CAT_CAN

In [292]:
features = getFeatureList(df, 'TotalDayWaitList_CAT_CAN')

                          count unique           top  freq
TotalDayWaitList_CAT_CAN  16126      4  Days(Min-13)  4221


In [293]:
# order
ordering = ['Days(Min-13)', 'Days(13-44)', 'Days(44-163)', 'Days(163-Max)']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['TotalDayWaitList_CAT_CAN'] = encoder.fit_transform(df[['TotalDayWaitList_CAT_CAN']])

#### TransfusionNumber_DON

In [294]:
features = getFeatureList(df, 'TransfusionNumber_DON')

                       count unique   top  freq
TransfusionNumber_DON  16126      5  NONE  8187


In [295]:
# order
ordering = ['Missing', 'NONE', '1 - 5', '6 - 10', 'GREATER THAN 10']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['TransfusionNumber_DON'] = encoder.fit_transform(df[['TransfusionNumber_DON']])

#### WeightKg

In [296]:
features = getFeatureList(df, 'WeightKg')

                  count unique           top  freq
WeightKg_CAT_CAN  16126      5  Normal Weght  4054
WeightKg_CAT_DON  16126      5  Under Weight  4136


In [297]:
# order
ordering = ['Missing', 'Under Weight', 'Normal Weght', 'Over Weight', 'Obesity']
# initialize encoder
encoder = OrdinalEncoder(categories=[ordering])
# encode
df['WeightKg_CAT_CAN'] = encoder.fit_transform(df[['WeightKg_CAT_CAN']])
df['WeightKg_CAT_DON'] = encoder.fit_transform(df[['WeightKg_CAT_DON']])

### Change DataTypes to Integer & Display Ordinal Features

In [298]:
# convert to float
df[df_ordinal.column.to_list()] = df[df_ordinal.column.to_list()].astype('int')
# dispaly
df[df_ordinal.column.to_list()].head()

Unnamed: 0,PreviousTransplantNumber_CAN,EducationLevel_CAN,FunctionalStatusTransplant_CAN,AntigenDQ1_CAN,AntigenDQ2_CAN,MedicalConditionTransplant_CAN,TransfusionNumber_DON,AntigenDA1_DON,AntigenDA2_DON,AntigenDB1_DON,AntigenDB2_DON,AntigenRA1_CAN,AntigenRA2_CAN,AntigenRB1_CAN,AntigenRB2_CAN,AntigenRDR1_CAN,AntigenRDR2_CAN,PanelReactiveAntibody_CPRA_CAT_CAN,Hemodynamics_CAT_CAN,Hemodynamics_CO_CAT_CAN,Creatinine_CAT_CAN,Creatinine_CAT_DON,IschemicTimeHour_CAT_DON,TotalBilirubin_CAT_CAN,TotalBilirubin_CAT_DON,LungPO2_FIO2_CAT_DON,LungPO2_CAT_DON,OrganRecovery_PCO2_CAT_DON,Level_SGOT_AST_CAT_DON,Level_SGOT_ALT_CAT_DON,BloodUreaNitrogenLevel_CAT_DON,Hematocrit_CAT_DON,LV_EjectionFractionPercent_CAT_DON,BMI_CAT_CAN,BMI_CAT_DON,WeightKg_CAT_DON,WeightKg_CAT_CAN,Age_CAT_CAN,Age_CAT_DON,DistanceFromDonorHospitaltoTXCenter_CAT,HeightCm_CAT_CAN,HeightCm_CAT_DON,TotalDayWaitList_CAT_CAN,Cancer_CountTotal_DON,DrugUse_CountTotal_DON,Dialysis_CountTotal_CAN,Diuretics_CountTotal_DON,Infection_CountTotal_DON,IntropesIV_CountTotal_CAN,IntropesVasodilators_CountTotal_CAN,LifeSupport_CountTotal_CAN,Malignancy_CountTotal_CAN,PriorCardiacSurgery_CountTotal_CAN,Ventilator_CountTotal_CAN,WorkIncome_CountTotal_CAN
0,0,6,3,5,5,1,1,2,10,15,37,6,9,2,46,16,13,2,2,1,1,1,2,2,2,2,5,2,2,3,2,1,2,4,3,2,3,1,3,1,1,0,0,2,1,0,1,0,0,0,2,0,0,0,0
1,0,6,2,0,0,3,2,1,2,1,29,3,9,34,8,18,4,1,2,1,1,2,2,2,3,2,5,3,3,3,3,1,2,2,4,4,1,0,0,1,0,3,1,0,0,0,0,0,2,0,2,0,0,0,0
2,0,2,1,0,0,3,1,1,18,7,33,2,3,3,33,5,15,0,3,1,1,1,1,3,2,4,3,2,2,3,2,1,4,4,4,4,3,0,3,0,0,2,0,0,0,0,1,2,0,10,0,0,0,0,0
3,0,5,6,0,0,1,2,2,18,1,23,8,9,10,24,13,12,1,2,1,2,1,4,2,3,4,4,2,3,3,2,1,2,2,2,2,1,3,2,3,1,2,0,1,1,0,0,2,0,4,0,0,2,0,0
4,0,5,1,0,0,2,2,18,18,11,24,6,16,39,10,2,9,1,2,1,2,4,2,2,2,4,2,2,2,2,3,1,2,3,3,1,2,3,3,0,1,0,1,0,0,0,2,2,1,9,1,0,1,0,0


### Manning Whitney U Test

In [299]:
# test
mwutDF = usf.mannwhitneyu_combinations(df, df_ordinal.column.to_list())

# display p_value >= 0.05
mwutDF[mwutDF.p_value >= 0.05]

Unnamed: 0,column1,columns2,U_statistic,p_value
408,AntigenDA2_DON,AntigenRB1_CAN,129965227.0,0.943864
1383,HeightCm_CAT_CAN,DrugUse_CountTotal_DON,130143572.0,0.881759
159,AntigenDQ1_CAN,AntigenDQ2_CAN,130334467.0,0.66076
1335,Age_CAT_CAN,HeightCm_CAT_DON,129650969.0,0.644727
253,AntigenDQ2_CAN,IntropesIV_CountTotal_CAN,129621300.0,0.586448
1135,Level_SGOT_AST_CAT_DON,BloodUreaNitrogenLevel_CAT_DON,129578111.0,0.542999
1367,DistanceFromDonorHospitaltoTXCenter_CAT,TotalDayWaitList_CAT_CAN,130606054.0,0.472049
1295,WeightKg_CAT_DON,WeightKg_CAT_CAN,129426716.0,0.460651
130,FunctionalStatusTransplant_CAN,LungPO2_CAT_DON,130902863.5,0.285199
1334,Age_CAT_CAN,HeightCm_CAT_CAN,131004560.0,0.225307


### Nominal Association Testing

In [300]:
nominalDF = usf.pairColsMultiIndependenceCat(df, df_nominal.column.to_list())
nominalDF[(nominalDF.cramer_v >= .5) | (nominalDF.p_value >= .05)].sample(20)

Unnamed: 0,column1,column2,chi2,p_value,cramer_v
170,BloodGroup_DON,Geder_Difference,5.553789,0.592704,0.018558
269,InotropicAgent_DON,AntigenC2_CAN,244.476331,0.800205,0.050267
350,HeavyAlcoholUse_DON,AntigenC1_CAN,115.657968,0.28967,0.048895
1154,InfectionTherapyIV_CAN,HIV_NAT_Result_DON,0.774661,0.992736,0.004901
3006,HBV_NAT_Result_CAN,HIV_NAT_PreTransplant_CAN,26552.170869,0.0,0.740843
110,BloodGroup_DON,AntigenDR53_2_CAN,38.486667,0.314618,0.021848
84,BloodGroup_CAN,EpsteinBarr_Combined_DON,14.493487,0.847484,0.017309
2778,SteroidsUse_DON,UrinePortein_DON,16128.017564,0.0,0.577386
1145,InfectionTherapyIV_CAN,Hepatitis_B_CoreAntibody_DON,1.752112,0.994809,0.006018
476,MismatchLevel_AMIS,AntibodyResultRPR_VDRL_DON,5.941481,0.745761,0.011082


### Dummy Encoding

In [301]:
# encode
df_dummy = pd.get_dummies(df, columns=df_nominal.column.to_list(), drop_first=True)
# print shape
print(f"Shape: {df_dummy.shape}")
# add columns to list
dummyCols = df_dummy.columns.tolist()
# remove label
dummyCols.remove('Survival')
# create DataFrame
varDF = pd.DataFrame(columns=['ColumnName', 'Variance'])

Shape: (16126, 575)


In [302]:
for col in dummyCols:
    variance = df_dummy[col].var()
    # store the result
    result = {'ColumnName': col, 'Variance': variance}
    varDF.loc[len(varDF)] = result

varDF = varDF.sort_values(by='Variance', ascending=False)

In [303]:
lowVar = varDF.ColumnName[varDF.Variance < .01].to_list()
len(lowVar)

251

In [304]:
# encoded DataFrame
df_dummy = df_dummy.drop(columns=lowVar)

In [305]:
# heart encoded dataset
uf.writeToFile(df_dummy, 'Clean_ML_Heart_CAT',path='../Data/', format='pkl')

# heart label
uf.writeToFile(df, 'Clean_Full_CAT',path='../Data/', format='pkl')

16,126 records written to ../Data/Clean_ML_Heart_CAT.pkl
16,126 records written to ../Data/Clean_Full_CAT.pkl
