A strong **CatBoost** baseline to get started in the **WiDS Datathon 2024** competition. 

* I use a lot of the same basic tricks as others (inc. a fork), or my own strong WIDS 2020 baseline kernel (used for our winning approach then): 
https://www.kaggle.com/code/danofer/wids-2020-starter-catboost-0-9045-lb?scriptVersionId=28356656
Enjoy!

In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score

In [2]:
train = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/training.csv")
print("train:",train.shape)
test = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/test.csv")
print("test:",test.shape)
sample_submission = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv")
print("sample_submission shape:",sample_submission.shape)

train: (12906, 83)
test: (5792, 82)
sample_submission shape: (5792, 2)


# Basic EDA

In [3]:
train

Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02,DiagPeriodL90D
0,475714,,MEDICAID,CA,924,84,F,,C50919,Malignant neoplasm of unsp site of unspecified...,...,12.871429,22.542857,10.100000,27.814286,11.200000,3.500000,52.237210,8.650555,18.606528,1
1,349367,White,COMMERCIAL,CA,928,62,F,28.49,C50411,Malig neoplm of upper-outer quadrant of right ...,...,8.957576,10.109091,8.057576,30.606061,7.018182,4.103030,42.301121,8.487175,20.113179,1
2,138632,White,COMMERCIAL,TX,760,43,F,38.09,C50112,Malignant neoplasm of central portion of left ...,...,11.253333,9.663333,3.356667,31.394915,15.066667,7.446667,40.108207,7.642753,14.839351,1
3,617843,White,COMMERCIAL,CA,926,45,F,,C50212,Malig neoplasm of upper-inner quadrant of left...,...,8.845238,8.688095,5.280952,27.561905,4.404762,4.809524,42.070075,7.229393,15.894123,0
4,817482,,COMMERCIAL,ID,836,55,F,,1749,"Malignant neoplasm of breast (female), unspeci...",...,15.276000,11.224000,1.946000,26.170213,12.088000,13.106000,41.356058,4.110749,11.722197,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12901,674178,White,,OH,436,50,F,32.11,C50411,Malig neoplm of upper-outer quadrant of right ...,...,17.400000,23.600000,0.864706,19.841176,6.300000,6.247059,38.753055,8.068682,21.140731,1
12902,452909,,COMMERCIAL,CA,945,50,F,,C50912,Malignant neoplasm of unspecified site of left...,...,11.243210,7.837037,5.411250,34.700000,3.845679,5.671605,36.469947,6.265266,10.728732,1
12903,357486,,COMMERCIAL,CA,926,61,F,29.24,C50912,Malignant neoplasm of unspecified site of left...,...,8.845238,8.688095,5.280952,27.561905,4.404762,4.809524,42.070075,7.229393,15.894123,1
12904,935417,,,NY,112,37,F,31.00,1749,"Malignant neoplasm of breast (female), unspeci...",...,10.194737,18.642105,14.173684,42.502632,6.392105,1.755263,37.722740,7.879795,27.496367,0


In [4]:
train.select_dtypes(["O"]).nunique()

patient_race                              5
payer_type                                3
patient_state                            50
patient_gender                            1
breast_cancer_diagnosis_code             50
breast_cancer_diagnosis_desc             50
metastatic_cancer_diagnosis_code         43
metastatic_first_novel_treatment          2
metastatic_first_novel_treatment_type     1
Region                                    4
Division                                  9
dtype: int64

In [5]:
train.patient_zip3.nunique()

739

In [6]:
# Get categorical columns
cat_cols = train.select_dtypes(exclude='number').columns
print("Cat cols:\n",cat_cols)

### fill missing cols
train[cat_cols] = train[cat_cols].fillna("")

# same transformation for test data
test[cat_cols] = test[cat_cols].fillna("")

Cat cols:
 Index(['patient_race', 'payer_type', 'patient_state', 'patient_gender',
       'breast_cancer_diagnosis_code', 'breast_cancer_diagnosis_desc',
       'metastatic_cancer_diagnosis_code', 'metastatic_first_novel_treatment',
       'metastatic_first_novel_treatment_type', 'Region', 'Division'],
      dtype='object')


In [7]:
### some cat is actually text:
text_cols = ["breast_cancer_diagnosis_desc"]
cat_cols = [c for c in cat_cols if c not in text_cols]

# Store the indices of categorical columns in a list
cat_indices = [idx for idx, col in enumerate(train.columns) if col in cat_cols]

In [8]:
# Separate features (X) and target variable (y) from the dataset
X = train.drop(columns=['DiagPeriodL90D'])  # Features
y = train['DiagPeriodL90D']  # Target variable

# Handle NaN values
X.fillna("NaN", inplace=True)
test.fillna("NaN", inplace=True)

# Split the data into training and validation sets
# - X_train: Features for training
# - X_val: Features for validation
# - y_train: Target variable for training
# - y_val: Target variable for validation
# The validation set is 20% of the entire dataset, and the random_state ensures reproducibility
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [9]:
X_val[cat_cols].nunique()

patient_race                              6
payer_type                                4
patient_state                            45
patient_gender                            1
breast_cancer_diagnosis_code             37
metastatic_cancer_diagnosis_code         26
metastatic_first_novel_treatment          2
metastatic_first_novel_treatment_type     2
Region                                    5
Division                                  9
dtype: int64

In [10]:
# Create CatBoost pools for training, validation and testing data
train_pool = Pool(X_train, y_train, cat_features=cat_indices,text_features=text_cols)
val_pool_catboost = Pool(X_val, y_val, cat_features=cat_indices,text_features=text_cols)
test_pool_catboost = Pool(test, cat_features=cat_indices,text_features=text_cols)

#### Code from:WiDS 2020 Starter - Catboost
* https://www.kaggle.com/code/danofer/wids-2020-starter-catboost-0-9045-lb?scriptVersionId=28356656


Optional: Search for better model hyperparams.

In [11]:
# Create a CatBoostClassifier model
model = CatBoostClassifier(iterations=2000, verbose=0, eval_metric='AUC', use_best_model=True)

In [12]:
## disable following if running hyperparam search, wasteful
# Train a CatBoost model for classification
model.fit(train_pool, eval_set=val_pool_catboost, plot=True)

# Make predictions on the validation pool using the trained CatBoost classifier
y_pred_catboost = model.predict_proba(val_pool_catboost)[:, 1]

# Calculate ROC score between predicted and actual labels
roc_catboost = roc_auc_score(y_val, y_pred_catboost)
print(f'ROC (CatBoost): {100*roc_catboost:.3f}')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

ROC (CatBoost): 80.388


In [13]:
%%time
### hyperparameter tuning example grid for catboost : (Note, this isn't using an eval set split)
grid = {'learning_rate': [0.04, 0.01],
        'depth': [4,7, 11],
        'l2_leaf_reg': [1, 3,15],
#        "iterations": [500],
       "custom_metric":['AUC',"Logloss"]
       } 

model = CatBoostClassifier(iterations=2000, verbose=0, eval_metric='AUC')

## can also do randomized search - more efficient typically, especially for large search space - `randomized_search`
grid_search_result = model.grid_search(grid, 
                                       train_pool,
                                       plot=True,verbose=0,
                                       refit = True, #  refit best model on all data
                                      partition_random_seed=42)

print(model.get_best_score())

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


bestTest = 0.8069509598
bestIteration = 762

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.

bestTest = 0.8091888451
bestIteration = 1956

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.

bestTest = 0.8056387167
bestIteration = 498

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.

bestTest = 0.8059506691
bestIteration = 1997

Metric AUC


KeyboardInterrupt



In [14]:
# Make predictions on the validation pool using the trained CatBoost classifier
y_pred_catboost = model.predict_proba(val_pool_catboost)[:, 1]

# Calculate ROC score between predicted and actual labels
roc_catboost = roc_auc_score(y_val, y_pred_catboost)
print(f'ROC (CatBoost): {100*roc_catboost:.3f}')

CatBoostError: There is no trained model to use predict_proba(). Use fit() to train model. Then use this method.

# Submission

In [None]:
# Use the trained CatBoost model for prediction on the actual test set
y_pred = model.predict_proba(test_pool_catboost)[:, 1]

# Set index and set 'DiagPeriodL90D ' column in 'test' DataFrame to the average predictions
test.set_index('patient_id', inplace=True)
test['DiagPeriodL90D'] = y_pred

# Extract the 'DiagPeriodL90D ' column
submission = test['DiagPeriodL90D']

# Save the submission to a CSV file
submission.to_csv('submission.csv')

### Top features

#### Shap features importance
* Better for generalizing to other models

In [None]:
import shap
shap.initjs()

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(train_pool)

In [None]:
# summarize the effects of all the features
# grey features = categoricals or text
shap.summary_plot(shap_values, X_train)


In [None]:
## model specific
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    if score > 0.11:
        print('{0}: {1:.2f}'.format(name, score))