In [1]:
# Necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score
import optuna
import logging

optuna.logging.set_verbosity(optuna.logging.WARNING)

In [2]:
# Exploratory Data Analysis
# Load Data
women_train = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/training.csv')
women_test = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/test.csv")
sample_submission = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv")

In [3]:
# Feature Engineering
# Drop unnecessary columns for modeling
drop_columns = [
    'metastatic_first_novel_treatment', 'metastatic_first_novel_treatment_type', 'bmi', 'patient_race',
    'housing_units', 'density',
    'income_household_median', 'income_household_six_figure',  'home_value', 'rent_median',
    'education_college_or_above', 'unemployment_rate','rent_burden',
    'education_less_highschool', 'education_highschool', 'education_some_college',
    'education_stem_degree', 'farmer',  'limited_english',
    'commute_time', 'health_uninsured',  'PM25', 'N02'
]

women_train = women_train.drop(columns=drop_columns)
women_test = women_test.drop(columns=drop_columns)



In [4]:
# Display the updated columns after dropping
print("Remaining columns after dropping unnecessary columns:")
print(women_train.columns.tolist())


Remaining columns after dropping unnecessary columns:
['patient_id', 'payer_type', 'patient_state', 'patient_zip3', 'patient_age', 'patient_gender', 'breast_cancer_diagnosis_code', 'breast_cancer_diagnosis_desc', 'metastatic_cancer_diagnosis_code', 'Region', 'Division', 'population', 'age_median', 'age_under_10', 'age_10_to_19', 'age_20s', 'age_30s', 'age_40s', 'age_50s', 'age_60s', 'age_70s', 'age_over_80', 'male', 'female', 'married', 'divorced', 'never_married', 'widowed', 'family_size', 'family_dual_income', 'income_household_under_5', 'income_household_5_to_10', 'income_household_10_to_15', 'income_household_15_to_20', 'income_household_20_to_25', 'income_household_25_to_35', 'income_household_35_to_50', 'income_household_50_to_75', 'income_household_75_to_100', 'income_household_100_to_150', 'income_household_150_over', 'income_individual_median', 'home_ownership', 'education_bachelors', 'education_graduate', 'labor_force_participation', 'self_employed', 'race_white', 'race_black

In [5]:
# Define features and target variable
X = women_train.drop(columns=['DiagPeriodL90D'])
y = women_train['DiagPeriodL90D']

In [6]:
# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Preprocessing for numerical and categorical data
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [8]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [9]:
# Model Building and Evaluation
# Define XGBoost model with Optuna hyperparameter optimization
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'random_state': 42
    }

    xgb_model = XGBClassifier(**params)

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', xgb_model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, y_pred)

    return auc

In [10]:
# Define CatBoost model with Optuna hyperparameter optimization
def objective_catboost(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0),
        'random_state': 42,
        'verbose': 0
    }

    catboost_model = CatBoostClassifier(**params)

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', catboost_model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, y_pred)

    return auc

In [11]:
# Create and run the Optuna study for XGBoost
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=70)

# Get the best parameters for XGBoost
best_params_xgb = study_xgb.best_params

In [12]:
# Train XGBoost model with best hyperparameters
xgb_model = XGBClassifier(**best_params_xgb, random_state=42)
clf_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', xgb_model)])
clf_xgb.fit(X_train, y_train)

In [13]:
# Evaluate XGBoost model
y_pred_xgb = clf_xgb.predict_proba(X_valid)[:, 1]
auc_xgb = roc_auc_score(y_valid, y_pred_xgb)
print(f'XGBoost: AUC = {auc_xgb:.4f}')

XGBoost: AUC = 0.8042


In [14]:
# Create and run the Optuna study for CatBoost
study_catboost = optuna.create_study(direction='maximize')
study_catboost.optimize(objective_catboost, n_trials=70)

# Get the best parameters for CatBoost
best_params_catboost = study_catboost.best_params


In [15]:
# Train CatBoost model with best hyperparameters
catboost_model = CatBoostClassifier(**best_params_catboost, random_state=42, verbose=0)
clf_catboost = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', catboost_model)])
clf_catboost.fit(X_train, y_train)

In [16]:
# Evaluate CatBoost model
y_pred_catboost = clf_catboost.predict_proba(X_valid)[:, 1]
auc_catboost = roc_auc_score(y_valid, y_pred_catboost)
print(f'CatBoost: AUC = {auc_catboost:.4f}')

CatBoost: AUC = 0.8001


In [17]:
# Ensemble XGBoost and CatBoost models
voting_clf = VotingClassifier([('xgb', clf_xgb), ('catboost', clf_catboost)], voting='soft')
voting_clf.fit(X_train, y_train)

In [18]:
# Evaluate the ensemble model
y_pred_ensemble = voting_clf.predict_proba(X_valid)[:, 1]
auc_ensemble = roc_auc_score(y_valid, y_pred_ensemble)
print(f'Ensemble (XGBoost + CatBoost): AUC = {auc_ensemble:.4f}')

Ensemble (XGBoost + CatBoost): AUC = 0.8030


In [19]:
# Make predictions on test set
final_predictions_ensemble = voting_clf.predict_proba(women_test)[:, 1]

In [20]:
# Prepare submission file
submission_df = pd.DataFrame({'patient_id': women_test['patient_id'], 'DiagPeriodL90D': final_predictions_ensemble})
submission_df.to_csv('submission.csv', index=False)