In [1]:
# Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import optuna

# Set Optuna verbosity to warning
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [2]:
# Exploratory Data Analysis
# Load Data
women_train = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/training.csv')
women_test = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/test.csv")
sample_submission = pd.read_csv("/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv")


In [3]:
# Feature Engineering
# Drop unnecessary columns for modeling
drop_columns = ['metastatic_first_novel_treatment', 'metastatic_first_novel_treatment_type', 'bmi', 'patient_race']
women_train = women_train.drop(columns=drop_columns)
women_test = women_test.drop(columns=drop_columns)

In [4]:
# Text preprocessing for 'breast_cancer_diagnosis_desc'
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text

women_train['breast_cancer_diagnosis_desc'] = women_train['breast_cancer_diagnosis_desc'].apply(preprocess_text)
women_test['breast_cancer_diagnosis_desc'] = women_test['breast_cancer_diagnosis_desc'].apply(preprocess_text)


In [5]:
# Define features and target variable
X = women_train.drop(columns=['DiagPeriodL90D'])
y = women_train['DiagPeriodL90D']
# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Preprocessing for numerical and categorical data
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [7]:
# Model Building and Evaluation
# Define XGBoost model with Optuna hyperparameter optimization
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'random_state': 42
    }

    xgb_model = XGBClassifier(**params)

    clf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', xgb_model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, y_pred)

    return auc



In [8]:
# Create and run the Optuna study for XGBoost
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=100)

# Get the best parameters for XGBoost
best_params_xgb = study_xgb.best_params


In [9]:
# Train XGBoost model with best hyperparameters
xgb_model = XGBClassifier(**best_params_xgb, random_state=42)
clf_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', xgb_model)])
clf_xgb.fit(X_train, y_train)


In [10]:
# Evaluate XGBoost model
y_pred_xgb = clf_xgb.predict_proba(X_valid)[:, 1]
auc_xgb = roc_auc_score(y_valid, y_pred_xgb)
print(f'XGBoost: AUC = {auc_xgb:.4f}')

XGBoost: AUC = 0.8051


In [11]:
# Define KFold object
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform k-fold cross-validation for XGBoost
auc_scores_xgb = []

for train_index, valid_index in kf.split(X):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    xgb_model = XGBClassifier(**best_params_xgb, random_state=42)
    clf_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', xgb_model)])
    clf_xgb.fit(X_train, y_train)
    
    y_pred_xgb = clf_xgb.predict_proba(X_valid)[:, 1]
    auc_xgb = roc_auc_score(y_valid, y_pred_xgb)
    auc_scores_xgb.append(auc_xgb)

print("XGBoost AUC Scores:", auc_scores_xgb)

XGBoost AUC Scores: [0.8069460155857214, 0.8114586742529142, 0.8163395212010551, 0.7703049597008514, 0.8063359727932591, 0.782199534133296, 0.7931495177221348, 0.793353573181595, 0.8120867307614296, 0.7940206937205262]


In [12]:
# Make predictions on test set with XGBoost
final_predictions_xgb = []

for train_index, valid_index in kf.split(X):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    xgb_model = XGBClassifier(**best_params_xgb, random_state=42)
    clf_xgb = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', xgb_model)])
    clf_xgb.fit(X_train, y_train)
    
    final_predictions_xgb.append(clf_xgb.predict_proba(women_test)[:, 1])

# Average predictions across folds
final_predictions_xgb = np.mean(final_predictions_xgb, axis=0)



In [13]:
# Prepare submission file
submission_df_xgb = pd.DataFrame({'patient_id': women_test['patient_id'], 'DiagPeriodL90D': final_predictions_xgb})
submission_df_xgb.to_csv('submission_xgb.csv', index=False)