In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import f1_score, make_scorer

In [3]:
# --- A. LOAD DATA AND SPLIT ---
try:
    df_train = pd.read_csv('data/train_processed.csv')
    df_test = pd.read_csv('data/test_processed.csv')
    test_ids = pd.read_csv('data/ids.csv')
except FileNotFoundError:
    print("FATAL ERROR: Could not find files. Please check file paths ('data/').")
    raise

# Define X/y
X_train_full = df_train.drop(['has_copd_risk'], axis=1, errors='ignore')
y_train_full = df_train['has_copd_risk']

# Split for Preprocessor fitting (80% Train, 20% Val)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
)

# Calculate ratio for class imbalance handling
ratio = y_train.value_counts()[0] / y_train.value_counts()[1]

# --- B. PREPROCESSING PIPELINE (Fit/Transform) ---
all_features = X_train.columns.tolist()

numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[('num', numerical_pipeline, all_features)],
    remainder='passthrough'
)
X_train_processed = preprocessor.fit_transform(X_train)

# ----------------------------------------------------------------------
# C. TRAIN MODEL, PREDICT, AND SAVE SUBMISSION
# ----------------------------------------------------------------------

# 1. Train XGBoost Model
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=300,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=ratio # Key for Imbalance
)
xgb_model.fit(X_train_processed, y_train)
print("✅ XGBoost Model trained successfully.")

# 2. Prepare Test Features and Predict
X_test_full = df_test.drop('patient_id', axis=1, errors='ignore')
X_test_full = X_test_full[all_features] # Ensure column order

# Apply Preprocessor to Test Data (Transform only!)
X_test_processed = preprocessor.transform(X_test_full)

# Prediction
y_test_pred = xgb_model.predict(X_test_processed)

# 3. Combine IDs and Predictions and Save
submission_df = test_ids.copy()
submission_df['has_copd_risk'] = y_test_pred.astype(int)

submission_df.to_csv('submission/xgboost.csv', index=False)
print("\n✅ XGBoost submission file saved to 'submission/xgboost.csv'")
print("\nFirst 5 predictions in submission:")
print(submission_df.head())

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ XGBoost Model trained successfully.

✅ XGBoost submission file saved to 'submission/xgboost.csv'

First 5 predictions in submission:
   patient_id  has_copd_risk
0       42427              0
1       27412              0
2       19283              1
3       45261              1
4       11155              0


In [5]:
# --- A. LOAD DATA AND SPLIT (Replicate Setup) ---
try:
    df_train = pd.read_csv('data/train_processed.csv')
    df_test = pd.read_csv('data/test_processed.csv') # <-- Loaded for final prediction
    test_ids = pd.read_csv('data/ids.csv')           # <-- Loaded for final prediction
except FileNotFoundError:
    print("FATAL ERROR: Could not find 'train_processed.csv' or 'test_processed.csv'. Please check file paths.")
    raise

X_train_full = df_train.drop(['has_copd_risk', 'patient_id'], axis=1, errors='ignore')
y_train_full = df_train['has_copd_risk']

# Split for Preprocessor fitting (80% Train, 20% Val)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
)

# Calculate class weight ratio for imbalance handling
ratio = y_train.value_counts()[0] / y_train.value_counts()[1] 

# --- B. PREPROCESSING PIPELINE ---
all_features = X_train.columns.tolist()
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_pipeline, all_features)],
    remainder='passthrough'
)
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)

# ----------------------------------------------------------------------
# C. HYPERPARAMETER TUNING 
# ----------------------------------------------------------------------

# 1. Define Model (Base Model includes imbalance handling)
xgb_base = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=ratio, # Critical for F1 Score optimization
    n_jobs=-1
)

# 2. Define Parameter Grid
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.05, 0.01],
    'gamma': [0, 0.1, 0.5],
    'subsample': [0.7, 0.9, 1.0]
}

# Use F1 score as the primary metric for optimization
f1_scorer = make_scorer(f1_score)

# 3. Perform Randomized Search CV
xgb_search = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_grid,
    n_iter=30, 
    scoring=f1_scorer,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit the search to the preprocessed training data
xgb_search.fit(X_train_processed, y_train)

# 4. Get the Best Model and Score
best_xgb_model = xgb_search.best_estimator_
print("\n✅ XGBoost Hyperparameter Tuning Complete.")
print(f"Best parameters found: {xgb_search.best_params_}")

# Evaluate the final best model on the dedicated validation set
y_pred_val = best_xgb_model.predict(X_val_processed)
f1_tuned = f1_score(y_val, y_pred_val)
print(f"Validation F1 Score (TUNED XGBoost): {f1_tuned:.4f}")

# ----------------------------------------------------------------------
# D. PREDICT ON TEST DATA AND SAVE SUBMISSION
# ----------------------------------------------------------------------

# 5. Prepare Test Features
X_test_full = df_test.drop('patient_id', axis=1, errors='ignore')
X_test_full = X_test_full[all_features] # Ensure column order

# Apply Preprocessor to Test Data (Transform only!)
X_test_processed = preprocessor.transform(X_test_full)

# Prediction using the BEST TUNED MODEL
y_test_pred = best_xgb_model.predict(X_test_processed)

# 6. Combine IDs and Predictions and Save
submission_df = test_ids.copy()
submission_df['has_copd_risk'] = y_test_pred.astype(int)

submission_df.to_csv('submission/tuned_xgboost.csv', index=False) 
print("\n✅ Final submission file saved to 'submission/tuned_xgboost.csv'")
print("\nFirst 5 predictions in submission:")
print(submission_df.head())

Fitting 3 folds for each of 30 candidates, totalling 90 fits

✅ XGBoost Hyperparameter Tuning Complete.
Best parameters found: {'subsample': 0.7, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0}
Validation F1 Score (TUNED XGBoost): 0.7454

✅ Final submission file saved to 'submission/tuned_xgboost.csv'

First 5 predictions in submission:
   patient_id  has_copd_risk
0       42427              0
1       27412              0
2       19283              1
3       45261              1
4       11155              0
