In [3]:
# =====================================================
# üîß XGBoost Hyperparameter Tuning using RandomizedSearchCV
# =====================================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
from sklearn.metrics import f1_score, accuracy_score

# --- 1Ô∏è‚É£ Load data ---
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# --- 2Ô∏è‚É£ Feature Engineering ---
train_df['BMI'] = train_df['Weight'] / ((train_df['Height'] / 100.0) ** 2)
test_df['BMI'] = test_df['Weight'] / ((test_df['Height'] / 100.0) ** 2)

# --- 3Ô∏è‚É£ Target and feature separation ---
target = "WeightCategory"
le = LabelEncoder()
y = le.fit_transform(train_df[target])
X = train_df.drop(columns=['id', target], errors='ignore')

# --- 4Ô∏è‚É£ Handle categorical features (One-hot encoding) ---
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# --- 5Ô∏è‚É£ Scale numeric features ---
num_cols = ['Height', 'Weight', 'BMI']
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=len(le.classes_),
    tree_method='hist',
    eval_metric='mlogloss',
    seed=38,
    use_label_encoder=False
)


param_distributions = {
    'learning_rate': uniform(0.03, 0.04),       # around 0.05 ‚Üí 0.03‚Äì0.07
    'max_depth': randint(5, 8),                 # around 6 ‚Üí 5, 6, 7
    'min_child_weight': randint(2, 5),          # around 3 ‚Üí 2‚Äì4
    'subsample': uniform(0.8, 0.2),             # around 0.9 ‚Üí 0.8‚Äì1.0
    'colsample_bytree': uniform(0.8, 0.2),      # around 0.9 ‚Üí 0.8‚Äì1.0
    'gamma': uniform(0.05, 0.1)                 # around 0.1 ‚Üí 0.05‚Äì0.15
}


random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=30,                 # number of random combinations to test
    scoring='f1_macro',        # scoring metric
    cv=3,                      # 3-fold cross-validation
    verbose=2,                 # print progress
    random_state=38,
    n_jobs=-1                  # use all CPU cores
)

print("\nüîé Starting RandomizedSearchCV tuning...\n")
random_search.fit(X, y)


print("\n‚úÖ Best Parameters Found:")
print(random_search.best_params_)

print(f"\nüèÜ Best F1-macro Score: {random_search.best_score_:.4f}")


xgb_params = {
    'objective': 'multi:softprob',
    'num_class': len(le.classes_),
    'tree_method': 'hist',
    'eval_metric': 'mlogloss',
    'seed': 38,
    **random_search.best_params_
}

print("\nüîß Final tuned xgb_params to use in training:")
print(xgb_params)



üîé Starting RandomizedSearchCV tuning...

Fitting 3 folds for each of 30 candidates, totalling 90 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



‚úÖ Best Parameters Found:
{'colsample_bytree': np.float64(0.8510679370361254), 'gamma': np.float64(0.08299983049123688), 'learning_rate': np.float64(0.06665234059545111), 'max_depth': 7, 'min_child_weight': 4, 'subsample': np.float64(0.8172742477700612)}

üèÜ Best F1-macro Score: 0.8913

üîß Final tuned xgb_params to use in training:
{'objective': 'multi:softprob', 'num_class': 7, 'tree_method': 'hist', 'eval_metric': 'mlogloss', 'seed': 38, 'colsample_bytree': np.float64(0.8510679370361254), 'gamma': np.float64(0.08299983049123688), 'learning_rate': np.float64(0.06665234059545111), 'max_depth': 7, 'min_child_weight': 4, 'subsample': np.float64(0.8172742477700612)}
