In [1]:
import numpy as np
import pandas as pd
import pyreadstat

In [2]:
# Load the yearly PTR files for 2021, 2022 and 2023.
# Each read_sas7bdat call yields a (DataFrame, metadata) pair, unpacked per year.
(df2021, meta2021), (df2022, meta2022), (df2023, meta2023) = [
    pyreadstat.read_sas7bdat(sas_path)
    for sas_path in (
        'D:\\NYU_Hospital\\ptr_hr_20210101_20211231_pub.sas7bdat',
        'D:\\NYU_Hospital\\ptr_hr_20220101_20221231_pub.sas7bdat',
        'D:\\NYU_Hospital\\ptr_hr_20230101_20231231_pub.sas7bdat',
    )
]

In [3]:
## Concatenate the three yearly frames into a single PTR dataframe.
## ignore_index=True gives the result a fresh 0..N-1 row index; without it each
## yearly frame keeps its own labels, so the same label would appear up to three
## times in PTR and any later label-based lookup would be ambiguous.
PTR = pd.concat([df2021, df2022, df2023], axis=0, ignore_index=True)

In [4]:
## Keep only the rows whose MATCH_ORG is 'HR', then remove every column that
## holds nothing but NaN in the rows that remain.
is_hr_match = PTR['MATCH_ORG'].eq('HR')
PTR = PTR.loc[is_hr_match]
all_na_columns = PTR.columns[PTR.isna().all(axis=0)]
PTR = PTR.drop(all_na_columns, axis=1)

In [5]:
## Response variable: PTR_OFFER_ACPT, which records whether a heart was accepted.
## Tabulate how many rows carry each of its codes.
offer_acpt_column = PTR['PTR_OFFER_ACPT']
unique_values_count = offer_acpt_column.value_counts()
print(unique_values_count)

     957549
N    534024
Z     94191
B     14969
Y      3521
Name: PTR_OFFER_ACPT, dtype: int64


In [6]:
# Columns of PTR carried into the sampling step.
columns_needed = ['MATCH_ID', 'MATCH_ORG', 'DONOR_ID', 'PTR_SEQUENCE_NUM', 'PX_ID', 
                  'PTR_TOT_SCORE', 'PTR_STAT_CD', 'PTR_OFFER_ID', 
                  'PTR_OFFER_ACPT', 'PTR_PRIME_OPO_REFUSAL_ID', 'PTR_SECOND_OPO_REFUSAL_ID', 
                  'PTR_DISTANCE', 'PTR_OLD_STAT1_TM_SEC']
df_filtered = PTR[columns_needed]

# Fixed seed: without one, every Restart & Run All draws different rows, so the
# sample (and everything derived from it downstream) could not be reproduced.
SAMPLE_SEED = 42


def sample_offers(frame, acpt_code, n_rows, seed=SAMPLE_SEED):
    """Draw a reproducible random sample of rows with a given PTR_OFFER_ACPT code.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing a 'PTR_OFFER_ACPT' column.
    acpt_code : str
        Value of PTR_OFFER_ACPT to keep (e.g. 'Y', 'N', 'Z', 'B').
    n_rows : int
        Number of rows to draw, without replacement.
    seed : int, optional
        Passed to DataFrame.sample as random_state.

    Returns
    -------
    pandas.DataFrame
        n_rows rows of frame whose PTR_OFFER_ACPT equals acpt_code.

    Raises
    ------
    ValueError
        Raised by DataFrame.sample when fewer than n_rows rows carry acpt_code.
    """
    matching = frame[frame['PTR_OFFER_ACPT'] == acpt_code]
    return matching.sample(n=n_rows, random_state=seed)


# Fixed quota per acceptance code (400 + 400 + 100 + 100 = 1000 rows). This is a
# deliberate over-sampling of the rarer codes, not a draw that mirrors the
# code frequencies in PTR.
df_success = sample_offers(df_filtered, 'Y', 400)
df_fail = sample_offers(df_filtered, 'N', 400)
df_provisional_yes = sample_offers(df_filtered, 'Z', 100)
df_bypass = sample_offers(df_filtered, 'B', 100)

In [7]:
# Stack the four per-code samples into one frame, in the order
# success, fail, provisional yes, bypass; row labels are carried over as-is.
sampled_parts = (df_success, df_fail, df_provisional_yes, df_bypass)
df_sampled = pd.concat(sampled_parts, axis=0)

In [8]:
# Summarize the sampled frame: number of rows, per-column non-null counts and dtypes.
df_sampled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1276681 to 1229677
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   MATCH_ID                   1000 non-null   float64
 1   MATCH_ORG                  1000 non-null   object 
 2   DONOR_ID                   1000 non-null   float64
 3   PTR_SEQUENCE_NUM           1000 non-null   float64
 4   PX_ID                      1000 non-null   float64
 5   PTR_TOT_SCORE              1000 non-null   float64
 6   PTR_STAT_CD                1000 non-null   float64
 7   PTR_OFFER_ID               1000 non-null   float64
 8   PTR_OFFER_ACPT             1000 non-null   object 
 9   PTR_PRIME_OPO_REFUSAL_ID   500 non-null    float64
 10  PTR_SECOND_OPO_REFUSAL_ID  81 non-null     float64
 11  PTR_DISTANCE               1000 non-null   float64
 12  PTR_OLD_STAT1_TM_SEC       1000 non-null   float64
dtypes: float64(11), object(2)
memory usage:

In [9]:
import pyreadstat

# Load the auxiliary SAS tables. Every call returns a (DataFrame, metadata) pair.
read_sas = pyreadstat.read_sas7bdat

# Tables read in the same order as before; only the call spelling differs.
df_tx_HR, meta_tx_HR = read_sas('D:\\NYU_Hospital\\TX_HR.sas7bdat')
df_immuno, meta_immuno = read_sas('D:\\NYU_Hospital\\IMMUNO.sas7bdat')
df_rec, meta_rec = read_sas('D:\\NYU_Hospital\\REC_HISTO.sas7bdat')
df_rec_x, meta_rec_x = read_sas('D:\\NYU_Hospital\\REC_HISTO_XMAT.sas7bdat')
df_deceased, meta_deceased = read_sas('D:\\NYU_Hospital\\donor_deceased.sas7bdat')
df_disposition, meta_disposition = read_sas('D:\\NYU_Hospital\\donor_disposition.sas7bdat')
df_cand, meta_cand = read_sas('D:\\NYU_Hospital\\cand_thor.sas7bdat')
df_hist, meta_hist = read_sas('D:\\NYU_Hospital\\stathist_thor.sas7bdat')
df_statjusta, meta_statjusta = read_sas('D:\\NYU_Hospital\\statjust_hr1a.sas7bdat')
df_statjustb, meta_statjustb = read_sas('D:\\NYU_Hospital\\statjust_hr1b.sas7bdat')
df_txf_HR, meta_txf_HR = read_sas('D:\\NYU_Hospital\\txf_hr.sas7bdat')

# Not loaded at present (kept for reference):
#df_malig, meta_malig = pyreadstat.read_sas7bdat('D:\\NYU_Hospital\\malig.sas7bdat')
#df_treat, meta_treat = pyreadstat.read_sas7bdat('D:\\NYU_Hospital\\treatment.sas7bdat')


# Columns to keep from each auxiliary table, keyed by the name of the
# variable that holds that table.
columns_to_include = {
    'df_tx_HR': [
        # CAN_* columns
        'CAN_ABO', 'CAN_AGE_AT_LISTING', 'CAN_AGE_IN_MONTHS_AT_LISTING',
        'CAN_CARDIAC_OUTPUT', 'CAN_CITIZENSHIP', 'CAN_DGN', 'CAN_EDUCATION',
        'CAN_ETHNICITY_SRTR', 'CAN_GENDER', 'CAN_HGT_CM', 'CAN_INIT_ACT_STAT_CD',
        'CAN_INIT_STAT', 'CAN_LAST_STAT', 'CAN_LISTING_DT', 'CAN_WGT_KG',
        # DON_* columns and DONOR_ID
        'DON_AGE', 'DON_AGE_IN_MONTHS', 'DON_ANTI_HCV', 'DON_CAD_DON_COD',
        'DON_CARDIAC_ARREST_AFTER_DEATH', 'DON_CITIZENSHIP', 'DON_CONT_CIGARETTE',
        'DON_CONT_COCAINE', 'DON_CONT_IV_DRUG', 'DON_CREAT', 'DON_DOBUTAMINE',
        'DON_DOPAMINE', 'DON_ETHNICITY_SRTR', 'DON_GENDER', 'DON_HGT_CM',
        'DON_HIST_CANCER', 'DON_HIST_COCAINE', 'DON_HIST_DIAB', 'DON_HIST_HYPERTEN',
        'DON_HIST_IV_DRUG', 'DON_HIST_OTHER_DRUG', 'DON_INOTROP_AGENT_GE3',
        'DON_INOTROP_SUPPORT', 'DON_NON_HR_BEAT', 'DON_RACE_SRTR', 'DON_RECOV_DT',
        'DON_WARM_ISCH_TM_MINS', 'DON_WGT_KG', 'DONOR_ID',
        # REC_* columns and ID keys
        'REC_CARDIAC_OUTPUT', 'REC_CREAT',
        'PX_ID', 'TRR_ID', 'TX_ID', 'REC_HISTO_TX_ID',
    ],
    'df_cand': [
        'CAN_ABO', 'CAN_ACPT_ABO_INCOMP', 'CAN_AGE_AT_LISTING',
        'CAN_AGE_IN_MONTHS_AT_LISTING', 'CAN_CARDIAC_OUTPUT', 'CAN_CITIZENSHIP', 'CAN_DGN',
        'CAN_EDUCATION', 'CAN_ETHNICITY_SRTR', 'CAN_GENDER', 'CAN_HGT_CM',
        'CAN_INIT_ACT_STAT_CD', 'CAN_INIT_STAT', 'CAN_LAST_STAT', 'CAN_LISTING_CTR_CD',
        'CAN_LISTING_DT', 'CAN_PCW_MEAN', 'CAN_PERM_STATE', 'CAN_PRIMARY_PAY',
        'CAN_SECONDARY_PAY', 'CAN_WGT_KG',
        'DONOR_ID', 'PX_ID',
    ],
    'df_hist': [
        'CAN_INIT_ACT_STAT_CD',
        'CAN_INIT_STAT',
        'CAN_LAST_STAT',
        'CAN_LISTING_DT',
        'PX_ID',
    ],
    'df_statjustb': ['CAN_LISTING_CTR_CD', 'PX_ID'],
    'df_statjusta': ['CAN_LISTING_CTR_CD', 'CANHX_INTRP_NOREPINE', 'PX_ID'],
    'df_deceased': [
        'DON_ABNORM_LVH', 'DON_ABNORM_VALVES', 'DON_AGE', 'DON_AGE_IN_MONTHS',
        'DON_ANTI_HBC', 'DON_ANTI_HCV', 'DON_ANTI_HIV', 'DON_ANTI_HTLV',
        'DON_BUN', 'DON_CAD_DON_COD', 'DON_CANCER_OTHER_OSTXT',
        'DON_CARDIAC_ARREST_AFTER_DEATH', 'DON_CHEST_XRAY', 'DON_CITIZENSHIP',
        'DON_CLAMP_DT', 'DON_CLAMP_TM', 'DON_CLAMP_TM_ZONE',
        'DON_CO_FINAL', 'DON_CO_INIT', 'DON_COD_DON_STROKE',
        'DON_CONT_ALCOHOL', 'DON_CONT_CIGARETTE', 'DON_CONT_COCAINE', 'DON_CONT_IV_DRUG',
        'DON_CORONARY_ANGIO', 'DON_DOBUTAMINE', 'DON_DOPAMINE',
        'DON_EJECT_FRACT', 'DON_EJECT_FRACT_METH',
        'DON_ETHNICITY_SRTR', 'DON_FEEDBACK_DONE', 'DON_GENDER',
        'DON_HBV_NAT', 'DON_HBV_SURF_ANTIGEN', 'DON_HCV_NAT', 'DON_HCV_STAT',
        'DON_HGT_CM', 'DON_HIST_CANCER', 'DON_HIST_PREV_MI', 'DON_HOME_STATE',
        'DON_INFECT_BLOOD', 'DON_INFECT_BLOOD_CONFIRM',
        'DON_INFECT_LU', 'DON_INFECT_LU_CONFIRM',
        'DON_INFECT_OTHER', 'DON_INFECT_OTHER_CONFIRM',
        'DON_INFECT_URINE', 'DON_INFECT_URINE_CONFIRM',
        'DON_LEGALLY_BRAIN_DEAD', 'DON_MEET_CDC_HIGH_RISK', 'DON_NON_HR_BEAT',
        'DON_RECOV_DT', 'DON_TROPONIN_I', 'DON_TROPONIN_T',
        'DON_WALL_ABNORM_SEG', 'DON_WARM_ISCH_TM_MINS', 'DON_WGT_KG', 'DONOR_ID',
    ],
    'df_disposition': ['DON_RECOV_DT', 'DONOR_ID', 'PX_ID'],
    'df_rec': ['REC_HLA_INTERPRET_I', 'REC_HLA_TYP_DONE', 'REC_HISTO_TX_ID'],
    'df_rec_x': ['REC_HISTO_TX_ID'],
    'df_immuno': ['TRR_ID'],
    'df_txf_HR': ['PX_ID', 'TRR_ID', 'TX_ID'],
}

In [10]:
# Trim each auxiliary table to the columns listed for it in columns_to_include.
# A missing column raises KeyError, exactly as plain bracket selection would.
df_tx_HR = df_tx_HR.loc[:, columns_to_include['df_tx_HR']]
df_cand = df_cand.loc[:, columns_to_include['df_cand']]
df_hist = df_hist.loc[:, columns_to_include['df_hist']]
df_statjustb = df_statjustb.loc[:, columns_to_include['df_statjustb']]
df_statjusta = df_statjusta.loc[:, columns_to_include['df_statjusta']]
df_deceased = df_deceased.loc[:, columns_to_include['df_deceased']]
df_disposition = df_disposition.loc[:, columns_to_include['df_disposition']]
df_rec = df_rec.loc[:, columns_to_include['df_rec']]
df_rec_x = df_rec_x.loc[:, columns_to_include['df_rec_x']]
df_immuno = df_immuno.loc[:, columns_to_include['df_immuno']]
df_txf_HR = df_txf_HR.loc[:, columns_to_include['df_txf_HR']]

In [11]:
# Pre-filter df_tx_HR to rows whose DONOR_ID occurs in the sample and whose
# PX_ID occurs in the sample (the two conditions are tested independently).
donor_in_sample = df_tx_HR['DONOR_ID'].isin(df_sampled['DONOR_ID'])
px_in_sample = df_tx_HR['PX_ID'].isin(df_sampled['PX_ID'])
df_tx_HR_filtered = df_tx_HR.loc[donor_in_sample & px_in_sample]

# Left join: every sampled row is kept; columns from df_tx_HR are NaN where
# no (DONOR_ID, PX_ID) pair matches.
df_merged = pd.merge(df_sampled, df_tx_HR_filtered, how='left', on=['DONOR_ID', 'PX_ID'])

# Counts per PTR_OFFER_ACPT code after the merge.
offer_acpt_distribution = df_merged['PTR_OFFER_ACPT'].value_counts()
print("PTR_OFFER_ACPT distribution in merged data:", offer_acpt_distribution, sep="\n")

# Distinct MATCH_ORG values; only "HR" is expected here.
match_org_check = df_merged['MATCH_ORG'].unique()
print("MATCH_ORG values in merged data:", match_org_check, sep="\n")

# Uniqueness of each ID column taken on its own (not of the pair).
unique_donor_ids = df_merged['DONOR_ID'].is_unique
unique_px_ids = df_merged['PX_ID'].is_unique
print("Are donor_id values unique?", unique_donor_ids)
print("Are px_id values unique?", unique_px_ids)

# Persist the merged frame as CSV, without the row index.
df_merged.to_csv('merged_result.csv', index=False)

# Quick look at the first five merged rows.
print(df_merged.head(5))

PTR_OFFER_ACPT distribution in merged data:
Y    400
N    400
Z    100
B    100
Name: PTR_OFFER_ACPT, dtype: int64
MATCH_ORG values in merged data:
['HR']
Are donor_id values unique? False
Are px_id values unique? False
    MATCH_ID MATCH_ORG  DONOR_ID  PTR_SEQUENCE_NUM      PX_ID  PTR_TOT_SCORE  \
0  1538093.0        HR  675697.0              74.0  1562702.0            0.0   
1  1485606.0        HR  658993.0               3.0  1517717.0            0.0   
2  1389358.0        HR  628307.0               1.0  1441565.0            0.0   
3  1316233.0        HR  604243.0              21.0  1387734.0            0.0   
4  1452018.0        HR  648246.0               4.0  1498557.0            0.0   

   PTR_STAT_CD  PTR_OFFER_ID PTR_OFFER_ACPT  PTR_PRIME_OPO_REFUSAL_ID  ...  \
0       2130.0         324.0              Y                       NaN  ...   
1       2120.0           1.0              Y                       NaN  ...   
2       2120.0          31.0              Y                      