In [None]:
import os
import pandas as pd

## TX_KI Subset Cleaning

1. **Load the raw subset**  
   Read in the extracted `tx_ki_subset.csv`, which contains a summary table of all Kidney Transplant records.


2. **Drop unneeded columns**  
   We remove several features that have high missingness or are only useful when grouping:
   - **CANHX_ALLOC_PRA** (Allocation PRA; 60.3% missing)  

3. **Retain the core PRA features**  
   Keep the essential fields with low missingness or primary importance:
   - **PX_ID** – Patient identifier (key)  
   - **WL_ORG** – Organ type  
   - **CANHX_BEGIN_DT** – Date the PRA record was last changed  
   - **CANHX_CPRA** – Calculated PRA (raw fraction; only 3.4% missing)
   - **CANHX_CUR_PRA** – Current PRA (57.9% missing)  
   - **CANHX_SRTR_PEAK_PRA** – Peak PRA (57.7% missing)


4. **Save the cleaned dataset**  
   Write out to `clean_subsets_ver1/tx_ki_clean.csv` for downstream analysis.


In [None]:
# Folders holding the extracted (raw) subsets and the cleaned outputs.
# NOTE(review): these are absolute, machine-specific paths; anyone running the
# notebook elsewhere must edit them.
SUBSET_FOLDER = "/Users/chanyoungwoo/Thesis/Data_Extraction/extracted_subsets"
CLEAN_FOLDER = "/Users/chanyoungwoo/Thesis/Data_Extraction/clean_subsets_ver1"
os.makedirs(CLEAN_FOLDER, exist_ok=True)  # create the output folder if it does not exist yet

# Load the raw TX_KI subset.
in_path = os.path.join(SUBSET_FOLDER, "tx_ki_subset.csv")
tx = pd.read_csv(in_path)

# Columns to remove from the subset.
# The list previously carried empty-string placeholders; DataFrame.drop raises
# KeyError for any label that is not a column, so those entries made the cell
# fail before anything was saved. Only real column names belong here.
# NOTE(review): the markdown above names CANHX_ALLOC_PRA as the dropped column,
# while this cell drops DON_CMV_IGG — confirm which one is intended.
to_drop = [
    "DON_CMV_IGG",
]
tx_clean = tx.drop(columns=to_drop)  # returns a new frame; `tx` is left untouched

# Persist the cleaned table without the pandas index column.
out_path = os.path.join(CLEAN_FOLDER, "tx_ki_clean.csv")
tx_clean.to_csv(out_path, index=False)

print(f"Saved cleaned TX_KI to {out_path} (shape {tx_clean.shape})")