In [1]:
import os
import pandas as pd

**Drop (remove) the following features**

- TFL_COD2 & TFL_COD3: 99% Missing value, we are only interested in TFL_COD

- TFL_BK_THERAPY_TY: 99% Missing value, no importance, only use simpler TFL_BK_THERAPY

- TFL_FAIL_BK: 99% Missing value, no importance

- TFL_CAD: 98% Missing value, no importance

- TFL_AVN: 98% Missing value, no importance

- TFL_ANTIVRL_THERAPY_TY: 95% Missing value, use only TFL_ANTIVRL_THERAPY for simpler Analysis

- TFL_URINE_PROTEIN: 82% Missing value, no importance

- TFL_IMMUNO_DISCONT: 72% Missing value, no importance


**Features that are potentially redundant but are retained for now**

- TFL_BMI: 80% Missing value; open question: does BMI hold any information after TX?



**Features with high Missing value but with potential importance** 

The high share of missing values could be explained by the fact that no entry is recorded for patients who had no negative effects after TX.

- TFL_REJ_EVENT_NUM: 99% Missing value, but informative

- TFL_ACUTE_REJ_BIOPSY_CONFIRMED: 99% Missing value, but important for Graft Failure

- TFL_MALIG_RECUR_TUMOR & TFL_MALIG_DON_RELATED & TFL_MALIG_TUMOR & TFL_MALIG_LYMPH: 97% Missing value, Malignancy after TX

- TFL_FAIL_REJ_CHRONIC: 97% Missing value, Chronic Rejection

- TFL_FAIL_RECUR_DISEASE: 97% Missing value, Disease Recurrence

- TFL_FAIL_UROL_COMPL: 97% Missing value, Urological problems

- TFL_FAIL_GRAFT_THROMB: 97% Missing value, Graft Thrombosis

- TFL_FAIL_INFECT: 97% Missing value, Infection

- TFL_FAIL_REJ_ACUTE: 97% Missing value, Acute Rejection

- TFL_COD: 97% Missing value, Follow up Cause of Death

- TFL_FAIL_CAUSE_TY: 96% Missing value, Primary Cause of Graft Failure

- TFL_FAIL_DT: 95% Missing value, Date of Graft Failure

- TFL_DISEASE_RECUR: 94% Missing value, Disease Recurrence

- TFL_CMV_IGM & TFL_CMV_IGG: 84% Missing value, Important lab results 

- TFL_BK_THERAPY: 82% Missing value

- TFL_REJ_TREAT: 79% Missing value

- TFL_ANTIVRL_THERAPY: 73% Missing value

**Keep (retain) the following core features**

- TFL_FUNCTN_STAT: 33% Missing value

- TFL_MALIG: 12% Missing value

- TFL_CREAT: 12% Missing value

- TFL_GRAFT_STAT: 0% Missing value

- TFL_PX_STAT_DT: 0% Missing value

- TFL_PX_STAT: 0% Missing value

- PX_ID: 0.0% Missing value

- REC_TX_TY: 0.0% Missing value

- ORG_AR: 0.0% Missing value

- TX_ID: 0.0% Missing value

- TRR_FOL_ID: 0.0% Missing value

- TRR_ID: 0.0% Missing value

- ORG_TY: 0.0% Missing value

- REC_TX_DT: 0.0% Missing value, IMPORTANT: separates different entries per PX_ID

- PERS_ID: 0.0% Missing value

- TFL_FOL_CD: 0.0% Missing value

- REC_TX_ORG_TY: 0.0% Missing value


In [2]:
# Input folder with the extracted subsets and output folder for the cleaned ones.
SUBSET_FOLDER = "/Users/chanyoungwoo/Thesis/extracted_subsets"
CLEAN_FOLDER = "/Users/chanyoungwoo/Thesis/Data_Extraction/clean_subsets_ver2"
os.makedirs(CLEAN_FOLDER, exist_ok=True)

in_path = os.path.join(SUBSET_FOLDER, "txf_ki_subset_ver1.csv")
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunk-wise inference is what raises the mixed-type
# DtypeWarning on this read and leaves columns with inconsistent value types.
txf = pd.read_csv(in_path, low_memory=False)

# Follow-up features that are almost entirely missing and not used downstream.
to_drop = [
    "TFL_COD2",
    "TFL_COD3",
    "TFL_BK_THERAPY_TY",
    "TFL_FAIL_BK",
    "TFL_CAD",
    "TFL_AVN",
    "TFL_ANTIVRL_THERAPY_TY",
    "TFL_URINE_PROTEIN",
    "TFL_IMMUNO_DISCONT",
]
# drop() raises KeyError if a listed column is absent, so a changed input
# schema is noticed here instead of being silently ignored.
txf_clean = txf.drop(columns=to_drop)

# index=False keeps the row index out of the written file.
out_path = os.path.join(CLEAN_FOLDER, "txf_ki_subset_ver2.csv")
txf_clean.to_csv(out_path, index=False)

print(f"Saved cleaned TXF_KI to {out_path} (shape {txf_clean.shape})")
txf_clean.head()

  txf = pd.read_csv(in_path)


Saved cleaned TXF_KI to /Users/chanyoungwoo/Thesis/Data_Extraction/clean_subsets_ver2/txf_ki_subset_ver2.csv (shape (4794289, 39))


Unnamed: 0,TRR_FOL_ID,TRR_ID,TFL_FOL_CD,PERS_ID,REC_TX_DT,ORG_TY,TFL_PX_STAT,TFL_PX_STAT_DT,TFL_COD,TFL_FUNCTN_STAT,...,TFL_FAIL_REJ_CHRONIC,TFL_FAIL_GRAFT_THROMB,TFL_FAIL_INFECT,TFL_FAIL_UROL_COMPL,TFL_FAIL_RECUR_DISEASE,PX_ID,TX_ID,ORG_AR,REC_TX_TY,REC_TX_ORG_TY
0,32.0,7.0,6.0,2955041.0,1994-04-03,KI,A,1994-09-19,,1.0,...,,,,,,-1999979.0,1096391.0,KI,1.0,KI
1,33.0,7.0,10.0,2955041.0,1994-04-03,KI,A,1995-08-01,,1.0,...,,,,,,-1999979.0,1096391.0,KI,1.0,KI
2,34.0,7.0,20.0,2955041.0,1994-04-03,KI,A,1996-03-01,,1.0,...,,,,,,-1999979.0,1096391.0,KI,1.0,KI
3,35.0,7.0,30.0,2955041.0,1994-04-03,KI,A,1997-05-20,,1.0,...,,,,,,-1999979.0,1096391.0,KI,1.0,KI
4,36.0,7.0,40.0,2955041.0,1994-04-03,KI,A,1997-10-01,,1.0,...,,,,,,-1999979.0,1096391.0,KI,1.0,KI


Analyse whether there are several entries for the same PX_ID:

In [4]:
# Load the cleaned TXF_KI subset (ver2) from disk.
file_path = "/Users/chanyoungwoo/Thesis/Data_Extraction/clean_subsets_ver2/txf_ki_subset_ver2.csv"
# low_memory=False: infer dtypes from the whole file rather than per chunk,
# which avoids the mixed-type DtypeWarning this read otherwise emits.
txf = pd.read_csv(file_path, low_memory=False)

# Rows per PX_ID; value_counts() ignores NaN and sorts by count, descending.
px_counts = txf["PX_ID"].value_counts()
# PX_IDs that occur in more than one row.
duplicates = px_counts[px_counts > 1]

print(f"Total records: {len(txf)}")
print(f"Unique PX_IDs: {px_counts.size}")
print(f"PX_IDs with duplicates: {len(duplicates)}")

# The messages below are emitted in German: "All PX_IDs occur only once." /
# "Examples of PX_IDs occurring more than once and their counts:".
if duplicates.empty:
    print("\nAlle PX_IDs kommen nur einmal vor.")
else:
    print("\nBeispiele für mehrfach vorkommende PX_IDs und deren Counts:")
    print(duplicates.head(10))

  txf = pd.read_csv(file_path)


Total records: 4794289
Unique PX_IDs: 576506
PX_IDs with duplicates: 542286

Beispiele für mehrfach vorkommende PX_IDs und deren Counts:
PX_ID
-1646033.0    40
-1882894.0    39
-1628776.0    39
-1644384.0    39
-1633770.0    39
-1885297.0    39
-1636138.0    39
-1897925.0    39
-1885041.0    39
-1887870.0    39
Name: count, dtype: int64
