In [1]:
# Necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import optuna
import logging

optuna.logging.set_verbosity(optuna.logging.WARNING)

In [2]:
# Exploratory Data Analysis
# Load Data
women_train = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/training.csv')
women_test = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/test.csv")
sample_submission = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv")


In [3]:
# Feature Engineering
# Drop unnecessary columns for modeling
drop_columns = ['metastatic_first_novel_treatment', 'metastatic_first_novel_treatment_type', 'bmi','patient_race']
women_train = women_train.drop(columns=drop_columns)
women_test = women_test.drop(columns=drop_columns)

In [4]:
# Define features and target variable
X = women_train.drop(columns=['DiagPeriodL90D'])
y = women_train['DiagPeriodL90D']


In [5]:
# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Preprocessing for numerical and categorical data
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [7]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [8]:
# Apply one-hot encoding and SMOTE for handling imbalance
preprocessed_X_train = preprocessor.fit_transform(X_train)
preprocessed_X_valid = preprocessor.transform(X_valid)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(preprocessed_X_train, y_train)


In [9]:
# Hyperparameter Optimization with Optuna
def objective(trial):
    # Define XGBoost parameters to tune
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': 42
    }
    
    # Initialize XGBoost classifier with the parameters
    xgb_model = XGBClassifier(**params)
    
    # Fit the model
    xgb_model.fit(X_train_resampled, y_train_resampled)
    
    # Predict probabilities
    y_pred = xgb_model.predict_proba(preprocessed_X_valid)[:, 1]
    
    # Calculate AUC
    auc = roc_auc_score(y_valid, y_pred)
    
    return auc


In [10]:
# Run optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Get best parameters
best_params = study.best_params


In [11]:
# Model Building and Evaluation
# Define XGBoost model
xgb_model = XGBClassifier(random_state=42)

clf = Pipeline(steps=[('classifier', xgb_model)])


In [12]:
# Fit the model
clf.fit(X_train_resampled, y_train_resampled)


In [13]:
# Evaluate the model
y_pred = clf.predict_proba(preprocessed_X_valid)[:, 1]
auc = roc_auc_score(y_valid, y_pred)
print(f'XGBoost with SMOTE and Optuna Tuning: AUC = {auc:.4f}')

XGBoost with SMOTE and Optuna Tuning: AUC = 0.7901


In [14]:
# Make predictions on test set
preprocessed_X_test = preprocessor.transform(women_test)
final_predictions = clf.predict_proba(preprocessed_X_test)[:, 1]


In [15]:

# Prepare submission file
submission_df = pd.DataFrame({'patient_id': women_test['patient_id'], 'DiagPeriodL90D': final_predictions})
submission_df.to_csv('submission_xgb_with_smote_and_optuna.csv', index=False)