In [1]:
# Necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score
import optuna
import logging

optuna.logging.set_verbosity(optuna.logging.WARNING)


In [2]:
# Exploratory Data Analysis
# Load Data
women_train = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/training.csv')
women_test = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/test.csv")
sample_submission = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv")


In [3]:

# Train Data Info
women_train.head(10)


Unnamed: 0,patient_id,patient_race,payer_type,patient_state,patient_zip3,patient_age,patient_gender,bmi,breast_cancer_diagnosis_code,breast_cancer_diagnosis_desc,...,disabled,poverty,limited_english,commute_time,health_uninsured,veteran,Ozone,PM25,N02,DiagPeriodL90D
0,475714,,MEDICAID,CA,924,84,F,,C50919,Malignant neoplasm of unsp site of unspecified...,...,12.871429,22.542857,10.1,27.814286,11.2,3.5,52.23721,8.650555,18.606528,1
1,349367,White,COMMERCIAL,CA,928,62,F,28.49,C50411,Malig neoplm of upper-outer quadrant of right ...,...,8.957576,10.109091,8.057576,30.606061,7.018182,4.10303,42.301121,8.487175,20.113179,1
2,138632,White,COMMERCIAL,TX,760,43,F,38.09,C50112,Malignant neoplasm of central portion of left ...,...,11.253333,9.663333,3.356667,31.394915,15.066667,7.446667,40.108207,7.642753,14.839351,1
3,617843,White,COMMERCIAL,CA,926,45,F,,C50212,Malig neoplasm of upper-inner quadrant of left...,...,8.845238,8.688095,5.280952,27.561905,4.404762,4.809524,42.070075,7.229393,15.894123,0
4,817482,,COMMERCIAL,ID,836,55,F,,1749,"Malignant neoplasm of breast (female), unspeci...",...,15.276,11.224,1.946,26.170213,12.088,13.106,41.356058,4.110749,11.722197,0
5,111545,White,MEDICARE ADVANTAGE,NY,141,66,F,,1749,"Malignant neoplasm of breast (female), unspeci...",...,13.717143,8.888235,0.638235,25.0,4.797143,7.745714,40.107248,6.181812,13.562528,0
6,914071,,COMMERCIAL,CA,900,51,F,29.05,C50912,Malignant neoplasm of unspecified site of left...,...,11.901538,20.76,14.7375,30.709375,10.341538,3.030769,41.186992,11.166898,21.644261,1
7,479368,White,COMMERCIAL,IL,619,60,F,,C50512,Malig neoplasm of lower-outer quadrant of left...,...,15.26,10.89,0.503333,24.275862,8.753333,7.506667,37.64677,7.295977,12.914805,1
8,994014,White,MEDICARE ADVANTAGE,,973,82,F,,1744,Malignant neoplasm of upper-outer quadrant of ...,...,19.371875,14.593651,1.620968,26.015254,6.645313,10.955385,36.323573,4.744352,10.439314,0
9,155485,,COMMERCIAL,IL,617,64,F,,C50912,Malignant neoplasm of unspecified site of left...,...,11.816981,8.443396,0.190566,23.843396,4.684906,9.016981,37.77383,7.299998,14.942968,1


In [4]:
# Missing Values Check
print(women_train.isnull().sum())



patient_id           0
patient_race      6385
payer_type        1803
patient_state       51
patient_zip3         0
                  ... 
veteran              1
Ozone               29
PM25                29
N02                 29
DiagPeriodL90D       0
Length: 83, dtype: int64


In [5]:
print(women_test.isnull().sum())

patient_id             0
patient_race        2901
payer_type           760
patient_state         21
patient_zip3           0
                    ... 
health_uninsured       0
veteran                0
Ozone                 14
PM25                  14
N02                   14
Length: 82, dtype: int64


In [6]:
# Feature Engineering
# Drop unnecessary columns for modeling
drop_columns = ['metastatic_first_novel_treatment', 'metastatic_first_novel_treatment_type', 'bmi','patient_race']
women_train = women_train.drop(columns=drop_columns)
women_test = women_test.drop(columns=drop_columns)


In [7]:
# Define features and target variable
X = women_train.drop(columns=['DiagPeriodL90D'])
y = women_train['DiagPeriodL90D']


In [8]:
# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Preprocessing for numerical and categorical data
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [10]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [11]:
# Model Building and Evaluation
# Define XGBoost model with Optuna hyperparameter optimization
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'random_state': 42
    }

    xgb_model = XGBClassifier(**params)

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', xgb_model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, y_pred)

    return auc

In [12]:
# Define CatBoost model with Optuna hyperparameter optimization
def objective_catboost(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0),
        'random_state': 42,
        'verbose': 0
    }

    catboost_model = CatBoostClassifier(**params)

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', catboost_model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, y_pred)

    return auc

In [13]:
# Create and run the Optuna study for XGBoost
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=30)

# Get the best parameters for XGBoost
best_params_xgb = study_xgb.best_params

In [14]:
# Train XGBoost model with best hyperparameters
xgb_model = XGBClassifier(**best_params_xgb, random_state=42)
clf_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', xgb_model)])
clf_xgb.fit(X_train, y_train)

In [15]:
# Evaluate XGBoost model
y_pred_xgb = clf_xgb.predict_proba(X_valid)[:, 1]
auc_xgb = roc_auc_score(y_valid, y_pred_xgb)
print(f'XGBoost: AUC = {auc_xgb:.4f}')

XGBoost: AUC = 0.8013


In [16]:
# Create and run the Optuna study for CatBoost
study_catboost = optuna.create_study(direction='maximize')
study_catboost.optimize(objective_catboost, n_trials=30)

# Get the best parameters for CatBoost
best_params_catboost = study_catboost.best_params


In [17]:
# Train CatBoost model with best hyperparameters
catboost_model = CatBoostClassifier(**best_params_catboost, random_state=42, verbose=0)
clf_catboost = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', catboost_model)])
clf_catboost.fit(X_train, y_train)

In [18]:
# Evaluate CatBoost model
y_pred_catboost = clf_catboost.predict_proba(X_valid)[:, 1]
auc_catboost = roc_auc_score(y_valid, y_pred_catboost)
print(f'CatBoost: AUC = {auc_catboost:.4f}')

CatBoost: AUC = 0.8008


In [19]:
# Ensemble XGBoost and CatBoost models
voting_clf = VotingClassifier([('xgb', clf_xgb), ('catboost', clf_catboost)], voting='soft')
voting_clf.fit(X_train, y_train)



In [20]:
# Evaluate the ensemble model
y_pred_ensemble = voting_clf.predict_proba(X_valid)[:, 1]
auc_ensemble = roc_auc_score(y_valid, y_pred_ensemble)
print(f'Ensemble (XGBoost + CatBoost): AUC = {auc_ensemble:.4f}')

Ensemble (XGBoost + CatBoost): AUC = 0.8018


In [21]:
# Make predictions on test set
final_predictions_ensemble = voting_clf.predict_proba(women_test)[:, 1]

In [22]:
# Prepare submission file
submission_df = pd.DataFrame({'patient_id': women_test['patient_id'], 'DiagPeriodL90D': final_predictions_ensemble})
submission_df.to_csv('submission.csv', index=False)