In [1]:
import pandas as pd
import numpy as np
import copy


In [2]:
# Location of the raw bone-marrow transplant CSV (hosted on GitHub).
dataset_path = 'https://raw.githubusercontent.com/PhilippRamjoue/Leukemia_Classification/main/dataset/bone-marrow-dataset.csv'

# The file is comma-separated, which is already read_csv's default delimiter.
dataset = pd.read_csv(dataset_path)

In [4]:
# Preview only the first rows instead of asking Jupyter to render the whole frame.
dataset.head(10)

Unnamed: 0,donor_age,donor_age_below_35,donor_ABO,donor_CMV,recipient_age,recipient_age_below_10,recipient_age_int,recipient_gender,recipient_body_mass,recipient_ABO,...,time_to_ANC_recovery,PLT_recovery,time_to_PLT_recovery,acute_GvHD_II_III_IV,acute_GvHD_III_IV,time_to_acute_GvHD_III_IV,extensive_chronic_GvHD,relapse,survival_time,survival_status
0,22.830137,yes,A,present,9.6,yes,5_10,male,35.0,A,...,19.0,yes,51.0,yes,yes,32.0,no,no,999.0,0
1,23.342466,yes,B,absent,4.0,yes,0_5,male,20.6,B,...,16.0,yes,37.0,yes,no,?,no,yes,163.0,1
2,26.394521,yes,B,absent,6.6,yes,5_10,male,23.4,B,...,23.0,yes,20.0,yes,no,?,no,yes,435.0,1
3,39.684932,no,A,present,18.1,no,10_20,female,50.0,AB,...,23.0,yes,29.0,yes,yes,19.0,?,no,53.0,1
4,33.358904,yes,A,absent,1.3,yes,0_5,female,9.0,AB,...,14.0,yes,14.0,no,no,?,no,no,2043.0,0
5,27.391781,yes,AB,?,8.9,yes,5_10,male,40.0,0,...,16.0,yes,70.0,no,no,?,no,no,2800.0,0
6,34.520548,yes,0,absent,14.4,no,10_20,female,51.0,A,...,17.0,yes,29.0,yes,yes,18.0,?,no,41.0,1
7,21.435616,yes,0,present,18.2,no,10_20,male,56.0,A,...,22.0,yes,58.0,yes,yes,22.0,?,no,45.0,1
8,32.641096,yes,AB,absent,7.9,yes,5_10,male,20.5,0,...,15.0,yes,14.0,no,no,?,no,no,671.0,0
9,28.783562,yes,A,absent,4.7,yes,0_5,male,16.5,0,...,16.0,yes,17.0,yes,no,?,no,no,676.0,0


- __donor_age__ - Age of the donor at the time of hematopoietic stem cells apheresis
- __donor_age_below_35__ - Is donor age less than 35 (yes, no)
- __donor_ABO__ - ABO blood group of the donor of hematopoietic stem cells (0, A, B, AB)
- __donor_CMV__ - Presence of cytomegalovirus infection in the donor of hematopoietic stem cells prior to transplantation (present, absent)
- __recipient_age__ - Age of the recipient of hematopoietic stem cells at the time of transplantation
- __recipient_age_below_10__ - Is recipient age below 10 (yes, no)
- __recipient_age_int__ - Age of the recipient discretized to intervals (0,5], (5, 10], (10, 20]
- __recipient_gender__ - Gender of the recipient (female, male)
- __recipient_body_mass__ - Body mass of the recipient of hematopoietic stem cells at the time of the transplantation
- __recipient_ABO__ - ABO blood group of the recipient of hematopoietic stem cells (0, A, B, AB)
- __recipient_rh__ - Presence of the Rh factor on recipient’s red blood cells (plus, minus)
- __recipient_CMV__ - Presence of cytomegalovirus infection in the recipient of hematopoietic stem cells prior to transplantation (present, absent)
- __disease__ - Type of disease (ALL, AML, chronic, nonmalignant, lymphoma)
- __disease_group__ - Type of disease (malignant, nonmalignant)
- __gender_match__ - Compatibility of the donor and recipient according to their gender (female to male, other)
- __ABO_match__ - Compatibility of the donor and the recipient of hematopoietic stem cells according to ABO blood group (matched, mismatched)
- __CMV_status__ - Serological compatibility of the donor and the recipient of hematopoietic stem cells according to cytomegalovirus infection prior to transplantation (the higher the value, the lower the compatibility)
- __HLA_match__ - Compatibility of antigens of the main histocompatibility complex of the donor and the recipient of hematopoietic stem cells (10/10, 9/10, 8/10, 7/10)
- __HLA_mismatch__ - HLA matched or mismatched
- __antigen__ - In how many antigens there is a difference between the donor and the recipient (0-3)
- __allel__ - In how many alleles there is a difference between the donor and the recipient (0-4)
- __HLA_group_1__ - The difference type between the donor and the recipient (HLA matched, one antigen, one allel, DRB1 cell, two allele or allel+antigen, two antigenes+allel, mismatched)
- __risk_group__ - Risk group (high, low)
- __stem_cell_source__ - Source of hematopoietic stem cells (peripheral blood, bone marrow)
- __tx_post_relapse__ - The second bone marrow transplantation after relapse (yes, no)
- __CD34_x1e6_per_kg__ - CD34kgx10d6 - CD34+ cell dose per kg of recipient body weight (10^6/kg)
- __CD3_x1e8_per_kg__ - CD3+ cell dose per kg of recipient body weight (10^8/kg)
- __CD3_to_CD34_ratio__ - CD3+ cell to CD34+ cell ratio
- __ANC_recovery__ - Neutrophils recovery defined as neutrophils count >0.5 x 10^9/L (yes, no)
- __time_to_ANC_recovery__ - Time in days to neutrophils recovery
- __PLT_recovery__ - Platelet recovery defined as platelet count >50000/mm3 (yes, no)
- __time_to_PLT_recovery__ - Time in days to platelet recovery
- __acute_GvHD_II_III_IV__ - Development of acute graft versus host disease stage II or III or IV (yes, no)
- __acute_GvHD_III_IV__ - Development of acute graft versus host disease stage III or IV (yes, no)
- __time_to_acute_GvHD_III_IV__ - Time in days to development of acute graft versus host disease stage III or IV
- __extensive_chronic_GvHD__ - Development of extensive chronic graft versus host disease (yes, no)
- __relapse__ - Relapse of the disease (yes, no)
- __survival_time__ - Time of observation (if alive) or time to event (if dead) in days
- __survival_status__ - Survival status (0 - alive, 1 - dead)


In [3]:
# 1. Convert the '?' missing-value markers to NaN.
# replace() returns a new frame, so the previously loaded frame is not mutated
# and re-running this cell gives the same result. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
dataset = dataset.replace('?', np.nan)

# Count the missing values per column, most affected column first.
summed_nans_cols = dataset.isna().sum().sort_values(ascending=False)

print(summed_nans_cols.head(20))

# Report the share of missing values in the most affected column.
# The column name is read from the index instead of being hard-coded, and the
# count is fetched with .iloc because integer keys on a string-labelled Series
# are deprecated as positional lookups.
worst_col = summed_nans_cols.index[0]
worst_share = (summed_nans_cols.iloc[0] / dataset.shape[0]) * 100

# %d truncates the float share to a whole percent.
print("%d %% of the '%s' data is nan" % (worst_share, worst_col))



time_to_acute_GvHD_III_IV    147
extensive_chronic_GvHD        31
time_to_PLT_recovery          17
CMV_status                    16
recipient_CMV                 14
CD3_x1e8_per_kg                5
time_to_ANC_recovery           5
CD3_to_CD34_ratio              5
recipient_body_mass            2
donor_CMV                      2
recipient_rh                   2
allel                          1
recipient_ABO                  1
antigen                        1
ABO_match                      1
recipient_gender               0
recipient_age_int              0
gender_match                   0
recipient_age_below_10         0
recipient_age                  0
dtype: int64
78 % of the 'time_to_acute_GvHD_III_IV ' data is nan


In [6]:
def _percent_lost(kept, total):
    """Return the percentage of `total` that is no longer present in `kept`."""
    return 100 - ((kept / total) * 100)


# Reference shape of the frame before anything is dropped.
shape_org = dataset.shape

# Variant 1: discard every column that contains at least one NaN.
dropped_columns_frame = dataset.dropna(axis='columns')
shape_col_drop = dropped_columns_frame.shape
col_drop_loss = _percent_lost(shape_col_drop[1], shape_org[1])

# Variant 2: discard every row that contains at least one NaN.
dropped_rows_frame = dataset.dropna(axis='index')
shape_row_drop = dropped_rows_frame.shape
row_drop_loss = _percent_lost(shape_row_drop[0], shape_org[0])

# %d truncates the float loss values to whole percentages.
print("Shape of original frame: %s" % str(shape_org))
print("Shape of frame with dropped nan columns: %s; Loss: %d %%" % (str(shape_col_drop), col_drop_loss))
print("Shape of frame with dropped nan rows: %s; Loss: %d %%" % (str(shape_row_drop), row_drop_loss))



Shape of original frame: (187, 39)
Shape of frame with dropped nan columns: (187, 24); Loss: 38 %
Shape of frame with dropped nan rows: (27, 39); Loss: 85 %
