# 0. Setting Up the Environment

## 0.1 Import Packages

In [2]:
import pandas as pd
import numpy as np
from  lightgbm import LGBMRegressor,LGBMClassifier,log_evaluation,early_stopping
# from catboost import CatBoostRegressor,CatBoostClassifier
# from lifelines import KaplanMeierFitter
import warnings
warnings.filterwarnings('ignore')
import random


## 0.2 Custom Functions

In [10]:
def get_feature_types(df):
    """
    Group the columns of a DataFrame by dtype family and count each group.

    Parameters:
    df (pd.DataFrame): The input DataFrame

    Returns:
    tuple: ``(type_len, feature_types)`` where
        - type_len (dict): number of columns in each group, keyed by
          'object_len', 'int_len', 'float_len' and 'boolean_len'
        - feature_types (dict): list of column names in each group, keyed by
          'object', 'int', 'float' and 'boolean'

    Columns whose dtype matches none of the four selectors below appear in
    neither dictionary.
    """
    # Output key -> dtype selector handed to DataFrame.select_dtypes.
    dtype_selectors = {
        "object": "object",
        "int": "int",
        "float": "float",
        "boolean": "bool",
    }

    # Run each dtype selection exactly once; the counts are derived from the
    # resulting lists so the two dictionaries can never disagree.
    feature_types = {
        key: df.select_dtypes(include=[selector]).columns.tolist()
        for key, selector in dtype_selectors.items()
    }
    type_len = {f"{key}_len": len(cols) for key, cols in feature_types.items()}
    return type_len, feature_types

## 0.3 Other Setups

In [4]:
# Notebook-wide pandas display settings.
pd.options.display.max_columns = None  # no limit: show every column
pd.options.display.width = 1000  # widen the display so rows wrap later

In [5]:
def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    random.seed(seed)
    np.random.seed(seed)
# Seed both generators once, before any later cell draws random numbers.
seed_everything(seed=99)

# 1. Import Data

In [6]:
# Directory holding the competition CSV files. Both reads below are built from
# this single value, so it is the only line to change on another machine.
data_dir = "D:/01 Work/01 Coding/01 Hackathons/06 CIBMTR/01 Data"
train = pd.read_csv(f"{data_dir}/train.csv")
test = pd.read_csv(f"{data_dir}/test.csv")
print(" Train Data Shape : ", train.shape)
print(" Test Data Shape : ", test.shape)

train.head(3)

 Train Data Shape :  (28800, 60)
 Test Data Shape :  (3, 58)


Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,vent_hist,renal_issue,pulm_severe,prim_disease_hct,hla_high_res_6,cmv_status,hla_high_res_10,hla_match_dqb1_high,tce_imm_match,hla_nmdp_6,hla_match_c_low,rituximab,hla_match_drb1_low,hla_match_dqb1_low,prod_type,cyto_score_detail,conditioning_intensity,ethnicity,year_hct,obesity,mrd_hct,in_vivo_tcd,tce_match,hla_match_a_high,hepatic_severe,donor_age,prior_tumor,hla_match_b_low,peptic_ulcer,age_at_hct,hla_match_a_low,gvhd_proph,rheum_issue,sex_match,hla_match_b_high,race_group,comorbidity_score,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,Bone marrow,No,No,No,IEA,6.0,+/+,,2.0,,6.0,2.0,No,2.0,2.0,BM,,,Not Hispanic or Latino,2016,No,,Yes,,2.0,No,,No,2.0,No,9.942,2.0,FKalone,No,M-F,2.0,More than one race,0.0,90.0,No,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,No,No,No,AML,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,PB,Intermediate,MAC,Not Hispanic or Latino,2008,No,Positive,No,Permissive,2.0,No,72.29,No,2.0,No,43.705,2.0,Other GVHD Prophylaxis,No,F-F,2.0,Asian,3.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,Bone marrow,No,No,No,HIS,6.0,+/+,10.0,2.0,P/P,6.0,2.0,No,2.0,2.0,BM,,,Not Hispanic or Latino,2019,No,,Yes,,2.0,No,,No,2.0,No,33.997,2.0,Cyclophosphamide alone,No,F-M,2.0,More than one race,0.0,90.0,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793


In [7]:
# Columns that exist in train but not in test.
set(train.columns).difference(test.columns)

{'efs', 'efs_time'}

# 2. Exploratory Data Analysis (EDA)

In [8]:
# Select the targets by name rather than by position, so the split does not
# silently break if the column order of train.csv ever changes. 'efs' and
# 'efs_time' are the two columns train has that test does not.
target_cols = ["efs", "efs_time"]
y_train = train[target_cols]
X_train = train.drop(columns=target_cols)

X_train.dtypes


ID                          int64
dri_score                  object
psych_disturb              object
cyto_score                 object
diabetes                   object
hla_match_c_high          float64
hla_high_res_8            float64
tbi_status                 object
arrhythmia                 object
hla_low_res_6             float64
graft_type                 object
vent_hist                  object
renal_issue                object
pulm_severe                object
prim_disease_hct           object
hla_high_res_6            float64
cmv_status                 object
hla_high_res_10           float64
hla_match_dqb1_high       float64
tce_imm_match              object
hla_nmdp_6                float64
hla_match_c_low           float64
rituximab                  object
hla_match_drb1_low        float64
hla_match_dqb1_low        float64
prod_type                  object
cyto_score_detail          object
conditioning_intensity     object
ethnicity                  object
year_hct      

In [12]:
# Per-dtype column counts next to the total column count, to check whether the
# four dtype groups account for every feature column.
ft_count, ft_cat = get_feature_types(X_train)
ft_count, X_train.shape[1]

({'object_len': 35, 'int_len': 2, 'float_len': 21, 'boolean_len': 0}, 58)

# 3. Numerical Columns

# 4. Categorical Columns

# END