In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from category_encoders import MEstimateEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
import optuna
from functools import partial
import warnings

# ---------------------
# Settings
# ---------------------
warnings.filterwarnings("ignore", message="X does not have valid feature names")
optuna.logging.set_verbosity(optuna.logging.WARNING)

TARGET = "WeightCategory"
targetMap = {
    'Insufficient_Weight':0,'Normal_Weight':1,
    'Overweight_Level_I':2,'Overweight_Level_II':3, 
    'Obesity_Type_I':4,'Obesity_Type_II':5 ,'Obesity_Type_III':6
}
targetMap_reversed = {v:k for k,v in targetMap.items()}

# ---------------------
# Cross-validation function
# ---------------------
def cross_val_score_pipeline(X_tr, y_tr, pipeline, skf, verbose=False):
    """
    Custom cross-validation for a pipeline. Returns mean accuracy.
    """
    X = X_tr.copy()
    y = y_tr.copy()
    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        model = clone(pipeline)
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        val_scores.append(acc)

        if verbose:
            print(f"FOLD {fold}: Accuracy = {acc:.5f}")

    mean_acc = np.mean(val_scores)
    if verbose:
        print(f"Mean CV Accuracy: {mean_acc:.5f}")
    return mean_acc

# ---------------------
# Pipeline preparation
# ---------------------
def xgb_pipeline(df_train, df_test, xgb_model):
    """
    Prepares train/test features and returns pipeline.
    Drops 'id' column automatically.
    """
    X_train = df_train.copy()
    X_test = df_test.copy()

    # Drop ID if exists
    if 'id' in X_train.columns:
        X_train = X_train.drop(columns=['id'])
    if 'id' in X_test.columns:
        X_test = X_test.drop(columns=['id'])
    
    y_train = X_train.pop(TARGET).map(targetMap)

    categorical_columns = list(X_train.select_dtypes(include='object').columns)
    numerical_columns = list(X_train.select_dtypes(exclude='object').columns)

    cat_transformer = make_pipeline(MEstimateEncoder())
    num_transformer = make_pipeline(StandardScaler())

    preprocessor = ColumnTransformer([
        ("num", num_transformer, numerical_columns),
        ("cat", cat_transformer, categorical_columns)
    ])

    pipeline = make_pipeline(preprocessor, xgb_model)

    return X_train, y_train, X_test, pipeline

# ---------------------
# Optuna objective
# ---------------------
def objective(trial, pipeline, X_train, y_train, skf):
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 10.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0)
    }
    pipeline.named_steps['xgbclassifier'].set_params(**params)

    mean_acc = cross_val_score_pipeline(X_train, y_train, pipeline, skf)
    print(f"Trial {trial.number}: Accuracy = {mean_acc:.5f}")
    return mean_acc

# ---------------------
# Load data
# ---------------------
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Save test IDs for submission
test_ids = df_test['id'].copy() if 'id' in df_test.columns else pd.Series(range(len(df_test)))

# ---------------------
# Prepare pipeline & model
# ---------------------
xgb_model = XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    eval_metric="mlogloss"
)

X_train, y_train, X_test, pipeline = xgb_pipeline(df_train, df_test, xgb_model)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ---------------------
# Optuna hyperparameter tuning
# ---------------------
study = optuna.create_study(direction="maximize")
study.optimize(
    partial(objective, pipeline=pipeline, X_train=X_train, y_train=y_train, skf=skf),
    n_trials=20
)

# ---------------------
# Best trial results
# ---------------------
trial = study.best_trial
print("\nBest CV Accuracy:", trial.value)
print("Best Hyperparameters:")
for k, v in trial.params.items():
    print(f"{k}: {v}")

# ---------------------
# Train final model with best hyperparameters
# ---------------------
best_params = trial.params
xgb_model_final = XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric="mlogloss",
    **best_params
)

X_train, y_train, X_test, final_pipeline = xgb_pipeline(df_train, df_test, xgb_model_final)

# Fit on full training data
final_pipeline.fit(X_train, y_train)

# Predict on test set
test_preds_numeric = final_pipeline.predict(X_test)
test_preds_labels = [targetMap_reversed[i] for i in test_preds_numeric]

# Save submission
submission = pd.DataFrame({
    'id': test_ids,
    'WeightCategory': test_preds_labels
})
submission.to_csv("submission.csv", index=False)
print("\nSaved predictions to submission.csv")


Trial 0: Accuracy = 0.76373
Trial 1: Accuracy = 0.89410
Trial 2: Accuracy = 0.88592
Trial 3: Accuracy = 0.88991
Trial 4: Accuracy = 0.63928
Trial 5: Accuracy = 0.85096
Trial 6: Accuracy = 0.89397
Trial 7: Accuracy = 0.88045
Trial 8: Accuracy = 0.82206
Trial 9: Accuracy = 0.26833
Trial 10: Accuracy = 0.90195
Trial 11: Accuracy = 0.90272
Trial 12: Accuracy = 0.90523
Trial 13: Accuracy = 0.90530
Trial 14: Accuracy = 0.89532
Trial 15: Accuracy = 0.90601
Trial 16: Accuracy = 0.90684
Trial 17: Accuracy = 0.90536
Trial 18: Accuracy = 0.90472
Trial 19: Accuracy = 0.89558

Best CV Accuracy: 0.9068437160050907
Best Hyperparameters:
max_depth: 12
min_child_weight: 5
learning_rate: 0.028530248692968278
subsample: 0.790747100681893
colsample_bytree: 0.5668480674558728
reg_alpha: 1.3824572149260739
reg_lambda: 3.0432020762518794
gamma: 0.7593682249421783


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Saved predictions to submission.csv


**the above gave 91% accuracy on kaggle**

In [3]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from category_encoders import MEstimateEncoder
from sklearn.preprocessing import StandardScaler, RobustScalery6
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
import optuna
from functools import partial
import warnings

# ---------------------
# Settings
# ---------------------
warnings.filterwarnings("ignore", message="X does not have valid feature names")
optuna.logging.set_verbosity(optuna.logging.WARNING)

TARGET = "WeightCategory"
targetMap = {
    'Insufficient_Weight':0,'Normal_Weight':1,
    'Overweight_Level_I':2,'Overweight_Level_II':3, 
    'Obesity_Type_I':4,'Obesity_Type_II':5 ,'Obesity_Type_III':6
}
targetMap_reversed = {v:k for k,v in targetMap.items()}

# ---------------------
# Cross-validation function
# ---------------------
def cross_val_score_pipeline(X_tr, y_tr, pipeline, skf, verbose=False):
    """
    Custom cross-validation for a pipeline. Returns mean accuracy.
    """
    X = X_tr.copy()
    y = y_tr.copy()
    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        model = clone(pipeline)
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        val_scores.append(acc)

        if verbose:
            print(f"FOLD {fold}: Accuracy = {acc:.5f}")

    mean_acc = np.mean(val_scores)
    if verbose:
        print(f"Mean CV Accuracy: {mean_acc:.5f}")
    return mean_acc

# ---------------------
# Pipeline preparation
# ---------------------
def xgb_pipeline(df_train, df_test, xgb_model):
    """
    Prepares train/test features and returns pipeline.
    Drops 'id' column automatically.
    """
    X_train = df_train.copy()
    X_test = df_test.copy()

    # Drop ID if exists
    if 'id' in X_train.columns:
        X_train = X_train.drop(columns=['id'])
    if 'id' in X_test.columns:
        X_test = X_test.drop(columns=['id'])
    
    y_train = X_train.pop(TARGET).map(targetMap)

    categorical_columns = list(X_train.select_dtypes(include='object').columns)
    numerical_columns = list(X_train.select_dtypes(exclude='object').columns)

    cat_transformer = make_pipeline(MEstimateEncoder())
    num_transformer = make_pipeline(RobustScaler())

    preprocessor = ColumnTransformer([
        ("num", num_transformer, numerical_columns),
        ("cat", cat_transformer, categorical_columns)
    ])

    pipeline = make_pipeline(preprocessor, xgb_model)

    return X_train, y_train, X_test, pipeline

# ---------------------
# Optuna objective
# ---------------------
def objective(trial, pipeline, X_train, y_train, skf):
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 10.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0)
    }
    pipeline.named_steps['xgbclassifier'].set_params(**params)

    mean_acc = cross_val_score_pipeline(X_train, y_train, pipeline, skf)
    print(f"Trial {trial.number}: Accuracy = {mean_acc:.5f}")
    return mean_acc

# ---------------------
# Load data
# ---------------------
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Save test IDs for submission
test_ids = df_test['id'].copy() if 'id' in df_test.columns else pd.Series(range(len(df_test)))

# ---------------------
# Prepare pipeline & model
# ---------------------
xgb_model = XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    eval_metric="mlogloss"
)

X_train, y_train, X_test, pipeline = xgb_pipeline(df_train, df_test, xgb_model)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ---------------------
# Optuna hyperparameter tuning
# ---------------------
study = optuna.create_study(direction="maximize")
study.optimize(
    partial(objective, pipeline=pipeline, X_train=X_train, y_train=y_train, skf=skf),
    n_trials=20
)

# ---------------------
# Best trial results
# ---------------------
trial = study.best_trial
print("\nBest CV Accuracy:", trial.value)
print("Best Hyperparameters:")
for k, v in trial.params.items():
    print(f"{k}: {v}")

# ---------------------
# Train final model with best hyperparameters
# ---------------------
best_params = trial.params
xgb_model_final = XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric="mlogloss",
    **best_params
)

X_train, y_train, X_test, final_pipeline = xgb_pipeline(df_train, df_test, xgb_model_final)

# Fit on full training data
final_pipeline.fit(X_train, y_train)

# Predict on test set
test_preds_numeric = final_pipeline.predict(X_test)
test_preds_labels = [targetMap_reversed[i] for i in test_preds_numeric]

# Save submission
submission = pd.DataFrame({
    'id': test_ids,
    'WeightCategory': test_preds_labels
})
submission.to_csv("submission_robust_scaler.csv", index=False)
print("\nSaved predictions to submission.csv")


Trial 0: Accuracy = 0.88058
Trial 1: Accuracy = 0.88341
Trial 2: Accuracy = 0.87137
Trial 3: Accuracy = 0.86860
Trial 4: Accuracy = 0.78691
Trial 5: Accuracy = 0.89049
Trial 6: Accuracy = 0.87311
Trial 7: Accuracy = 0.88959
Trial 8: Accuracy = 0.90073
Trial 9: Accuracy = 0.89802
Trial 10: Accuracy = 0.86867
Trial 11: Accuracy = 0.89738
Trial 12: Accuracy = 0.88360
Trial 13: Accuracy = 0.89384
Trial 14: Accuracy = 0.89925
Trial 15: Accuracy = 0.89519
Trial 16: Accuracy = 0.89339
Trial 17: Accuracy = 0.89551
Trial 18: Accuracy = 0.85482
Trial 19: Accuracy = 0.88772

Best CV Accuracy: 0.9007277876784056
Best Hyperparameters:
max_depth: 10
min_child_weight: 7
learning_rate: 0.07521423681092154
subsample: 0.9580620215961363
colsample_bytree: 0.23429837650701
reg_alpha: 6.908145127016929
reg_lambda: 4.307601433813856
gamma: 0.5854726407556704


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Saved predictions to submission.csv


**The above gave 90.9% accuracy on kaggle**

In [5]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
import optuna
from functools import partial
import warnings

# ---------------------
# Settings
# ---------------------
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

TARGET = "WeightCategory"
targetMap = {
    'Insufficient_Weight':0,'Normal_Weight':1,
    'Overweight_Level_I':2,'Overweight_Level_II':3, 
    'Obesity_Type_I':4,'Obesity_Type_II':5 ,'Obesity_Type_III':6
}
targetMap_reversed = {v:k for k,v in targetMap.items()}

# ---------------------
# Cross-validation function
# ---------------------
def cross_val_score_pipeline(X_tr, y_tr, pipeline, skf, verbose=False):
    X = X_tr.copy()
    y = y_tr.copy()
    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        model = clone(pipeline)
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        val_scores.append(acc)

        if verbose:
            print(f"FOLD {fold}: Accuracy = {acc:.5f}")

    mean_acc = np.mean(val_scores)
    if verbose:
        print(f"Mean CV Accuracy: {mean_acc:.5f}")
    return mean_acc

# ---------------------
# Pipeline preparation
# ---------------------
def xgb_pipeline(df_train, df_test, xgb_model):
    X_train = df_train.copy()
    X_test = df_test.copy()

    # Drop ID if exists
    if 'id' in X_train.columns:
        X_train = X_train.drop(columns=['id'])
    if 'id' in X_test.columns:
        X_test = X_test.drop(columns=['id'])
    
    y_train = X_train.pop(TARGET).map(targetMap)

    # numeric and categorical columns
    numeric_cols = list(X_train.select_dtypes(exclude='object').columns)
    categorical_cols = list(X_train.select_dtypes(include='object').columns)

    numeric_cols_without_age = [c for c in numeric_cols if c != 'Age']

    # Numeric transformer
    num_transformer = ColumnTransformer([
        ('boxcox', PowerTransformer(method='box-cox'), ['Age']),
        ('scale', StandardScaler(), numeric_cols_without_age)
    ])

    # Categorical transformer
    cat_transformer = OrdinalEncoder()

    # Full preprocessor
    preprocessor = ColumnTransformer([
        ('num', num_transformer, numeric_cols),
        ('cat', cat_transformer, categorical_cols)
    ])

    pipeline = make_pipeline(preprocessor, xgb_model)
    return X_train, y_train, X_test, pipeline

# ---------------------
# Optuna objective
# ---------------------
def objective(trial, pipeline, X_train, y_train, skf):
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 10.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0)
    }
    pipeline.named_steps['xgbclassifier'].set_params(**params)

    mean_acc = cross_val_score_pipeline(X_train, y_train, pipeline, skf)
    print(f"Trial {trial.number}: Accuracy = {mean_acc:.5f}")
    return mean_acc

# ---------------------
# Load data
# ---------------------
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Save test IDs for submission
test_ids = df_test['id'].copy() if 'id' in df_test.columns else pd.Series(range(len(df_test)))

# ---------------------
# Prepare pipeline & model
# ---------------------
xgb_model = XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    eval_metric="mlogloss"
)

X_train, y_train, X_test, pipeline = xgb_pipeline(df_train, df_test, xgb_model)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ---------------------
# Optuna hyperparameter tuning
# ---------------------
study = optuna.create_study(direction="maximize")
study.optimize(
    partial(objective, pipeline=pipeline, X_train=X_train, y_train=y_train, skf=skf),
    n_trials=30
)

# ---------------------
# Best trial results
# ---------------------
trial = study.best_trial
print("\nBest CV Accuracy:", trial.value)
print("Best Hyperparameters:")
for k, v in trial.params.items():
    print(f"{k}: {v}")

# ---------------------
# Train final model with best hyperparameters
# ---------------------
best_params = trial.params
xgb_model_final = XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric="mlogloss",
    **best_params
)

X_train, y_train, X_test, final_pipeline = xgb_pipeline(df_train, df_test, xgb_model_final)
final_pipeline.fit(X_train, y_train)

# ---------------------
# Predict on test set
# ---------------------
test_preds_numeric = final_pipeline.predict(X_test)
test_preds_labels = [targetMap_reversed[i] for i in test_preds_numeric]

submission = pd.DataFrame({
    'id': test_ids,
    'WeightCategory': test_preds_labels
})
submission.to_csv("submission_boxcox_std_label.csv", index=False)
print("\nSaved predictions to submission_boxcox_std_label.csv")


Trial 0: Accuracy = 0.86313
Trial 1: Accuracy = 0.86094
Trial 2: Accuracy = 0.87961
Trial 3: Accuracy = 0.86345
Trial 4: Accuracy = 0.89757
Trial 5: Accuracy = 0.41203
Trial 6: Accuracy = 0.72871
Trial 7: Accuracy = 0.88315
Trial 8: Accuracy = 0.88328
Trial 9: Accuracy = 0.88547
Trial 10: Accuracy = 0.86738
Trial 11: Accuracy = 0.88792
Trial 12: Accuracy = 0.89609
Trial 13: Accuracy = 0.90317
Trial 14: Accuracy = 0.90485
Trial 15: Accuracy = 0.90060
Trial 16: Accuracy = 0.88747
Trial 17: Accuracy = 0.88721
Trial 18: Accuracy = 0.90021
Trial 19: Accuracy = 0.90053
Trial 20: Accuracy = 0.90652
Trial 21: Accuracy = 0.90581
Trial 22: Accuracy = 0.90633
Trial 23: Accuracy = 0.90620
Trial 24: Accuracy = 0.90356
Trial 25: Accuracy = 0.89841
Trial 26: Accuracy = 0.90176
Trial 27: Accuracy = 0.90697
Trial 28: Accuracy = 0.90511
Trial 29: Accuracy = 0.89912

Best CV Accuracy: 0.9069724161071184
Best Hyperparameters:
max_depth: 14
min_child_weight: 9
learning_rate: 0.05094902452143167
subsample: 

**The above gave 91.267% accuracy on kaggle**

In [3]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
import optuna
from functools import partial
import warnings
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, StandardScaler

# ---------------------
# Settings
# ---------------------
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

TARGET = "WeightCategory"
targetMap = {
    'Insufficient_Weight':0,'Normal_Weight':1,
    'Overweight_Level_I':2,'Overweight_Level_II':3, 
    'Obesity_Type_I':4,'Obesity_Type_II':5 ,'Obesity_Type_III':6
}
targetMap_reversed = {v:k for k,v in targetMap.items()}

# ---------------------
# Cross-validation function
# ---------------------
def cross_val_score_pipeline(X_tr, y_tr, pipeline, skf, verbose=False):
    X = X_tr.copy()
    y = y_tr.copy()
    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        model = clone(pipeline)
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        val_scores.append(acc)

        if verbose:
            print(f"FOLD {fold}: Accuracy = {acc:.5f}")

    mean_acc = np.mean(val_scores)
    if verbose:
        print(f"Mean CV Accuracy: {mean_acc:.5f}")
    return mean_acc

# ---------------------
# Pipeline preparation
# ---------------------
def xgb_pipeline(df_train, df_test, xgb_model):
    X_train = df_train.copy()
    X_test = df_test.copy()

    # Drop ID if exists
    if 'id' in X_train.columns:
        X_train = X_train.drop(columns=['id'])
    if 'id' in X_test.columns:
        X_test = X_test.drop(columns=['id'])
    
    y_train = X_train.pop(TARGET).map(targetMap)

    # numeric and categorical columns
    numeric_cols = list(X_train.select_dtypes(exclude='object').columns)
    categorical_cols = list(X_train.select_dtypes(include='object').columns)

    numeric_cols_without_age = [c for c in numeric_cols if c != 'Age']

    # Numeric transformer (same as before)
    num_transformer = ColumnTransformer([
        ('boxcox', PowerTransformer(method='box-cox'), ['Age']),
        ('scale', StandardScaler(), numeric_cols_without_age)
    ])

    # 🔹 Replace OrdinalEncoder with TargetEncoder
    cat_transformer = TargetEncoder()

    # Full preprocessor
    preprocessor = ColumnTransformer([
        ('num', num_transformer, numeric_cols),
        ('cat', cat_transformer, categorical_cols)
    ])

    pipeline = make_pipeline(preprocessor, xgb_model)
    return X_train, y_train, X_test, pipeline

# ---------------------
# Optuna objective
# ---------------------
def objective(trial, pipeline, X_train, y_train, skf):
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 10.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0)
    }
    pipeline.named_steps['xgbclassifier'].set_params(**params)

    mean_acc = cross_val_score_pipeline(X_train, y_train, pipeline, skf)
    print(f"Trial {trial.number}: Accuracy = {mean_acc:.5f}")
    return mean_acc

# ---------------------
# Load data
# ---------------------
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Save test IDs for submission
test_ids = df_test['id'].copy() if 'id' in df_test.columns else pd.Series(range(len(df_test)))

# ---------------------
# Prepare pipeline & model
# ---------------------
xgb_model = XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    eval_metric="mlogloss"
)

X_train, y_train, X_test, pipeline = xgb_pipeline(df_train, df_test, xgb_model)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ---------------------
# Optuna hyperparameter tuning
# ---------------------
study = optuna.create_study(direction="maximize")
study.optimize(
    partial(objective, pipeline=pipeline, X_train=X_train, y_train=y_train, skf=skf),
    n_trials=30
)

# ---------------------
# Best trial results
# ---------------------
trial = study.best_trial
print("\nBest CV Accuracy:", trial.value)
print("Best Hyperparameters:")
for k, v in trial.params.items():
    print(f"{k}: {v}")

# ---------------------
# Train final model with best hyperparameters
# ---------------------
best_params = trial.params
xgb_model_final = XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric="mlogloss",
    **best_params
)

X_train, y_train, X_test, final_pipeline = xgb_pipeline(df_train, df_test, xgb_model_final)
final_pipeline.fit(X_train, y_train)

# ---------------------
# Predict on test set
# ---------------------
test_preds_numeric = final_pipeline.predict(X_test)
test_preds_labels = [targetMap_reversed[i] for i in test_preds_numeric]

submission = pd.DataFrame({
    'id': test_ids,
    'WeightCategory': test_preds_labels
})
submission.to_csv("submission_boxcox_std_target.csv", index=False)
print("\nSaved predictions to submission_boxcox_std_target.csv")


Trial 0: Accuracy = 0.86789
Trial 1: Accuracy = 0.89062
Trial 2: Accuracy = 0.87684
Trial 3: Accuracy = 0.86931
Trial 4: Accuracy = 0.86139
Trial 5: Accuracy = 0.89513
Trial 6: Accuracy = 0.87665
Trial 7: Accuracy = 0.90272
Trial 8: Accuracy = 0.87897
Trial 9: Accuracy = 0.83229
Trial 10: Accuracy = 0.90517
Trial 11: Accuracy = 0.90678
Trial 12: Accuracy = 0.90562
Trial 13: Accuracy = 0.89435
Trial 14: Accuracy = 0.89056
Trial 15: Accuracy = 0.86435
Trial 16: Accuracy = 0.86345
Trial 17: Accuracy = 0.90575
Trial 18: Accuracy = 0.90684
Trial 19: Accuracy = 0.90098
Trial 20: Accuracy = 0.89996
Trial 21: Accuracy = 0.90588
Trial 22: Accuracy = 0.90549
Trial 23: Accuracy = 0.89500
Trial 24: Accuracy = 0.89410
Trial 25: Accuracy = 0.89249
Trial 26: Accuracy = 0.86480
Trial 27: Accuracy = 0.90684
Trial 28: Accuracy = 0.86841
Trial 29: Accuracy = 0.66883

Best CV Accuracy: 0.9068437367297448
Best Hyperparameters:
max_depth: 14
min_child_weight: 9
learning_rate: 0.062837454333107
subsample: 0.

**The above gave 91.1% accuracy**

In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
import optuna
from functools import partial
import warnings
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, StandardScaler

# ---------------------
# Settings
# ---------------------
warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

TARGET = "WeightCategory"
targetMap = {
    'Insufficient_Weight':0,'Normal_Weight':1,
    'Overweight_Level_I':2,'Overweight_Level_II':3, 
    'Obesity_Type_I':4,'Obesity_Type_II':5 ,'Obesity_Type_III':6
}
targetMap_reversed = {v:k for k,v in targetMap.items()}

# ---------------------
# Cross-validation function
# ---------------------
def cross_val_score_pipeline(X_tr, y_tr, pipeline, skf, verbose=False):
    X = X_tr.copy()
    y = y_tr.copy()
    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        model = clone(pipeline)
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        val_scores.append(acc)

        if verbose:
            print(f"FOLD {fold}: Accuracy = {acc:.5f}")

    mean_acc = np.mean(val_scores)
    if verbose:
        print(f"Mean CV Accuracy: {mean_acc:.5f}")
    return mean_acc

# ---------------------
# Pipeline preparation
# ---------------------
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, PowerTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

def xgb_pipeline(df_train, df_test, xgb_model):
    X_train = df_train.copy()
    X_test = df_test.copy()

    # Drop ID if exists
    if 'id' in X_train.columns:
        X_train = X_train.drop(columns=['id'])
    if 'id' in X_test.columns:
        X_test = X_test.drop(columns=['id'])
    
    y_train = X_train.pop(TARGET).map(targetMap)

    numeric_cols = list(X_train.select_dtypes(exclude='object').columns)
    categorical_cols = list(X_train.select_dtypes(include='object').columns)

    numeric_cols_without_age = [c for c in numeric_cols if c != 'Age']

    # Numeric transformer with interaction terms
    num_transformer = ColumnTransformer([
        ('boxcox', PowerTransformer(method='box-cox'), ['Age']),
        ('poly', make_pipeline(StandardScaler(), PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)), numeric_cols_without_age)
    ])

    # Use OrdinalEncoder since TargetEncoder didn’t help
    cat_transformer = OrdinalEncoder()

    # Full preprocessor
    preprocessor = ColumnTransformer([
        ('num', num_transformer, numeric_cols),
        ('cat', cat_transformer, categorical_cols)
    ])

    pipeline = make_pipeline(preprocessor, xgb_model)
    return X_train, y_train, X_test, pipeline


# ---------------------
# Optuna objective
# ---------------------
def objective(trial, pipeline, X_train, y_train, skf):
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 10.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0)
    }
    pipeline.named_steps['xgbclassifier'].set_params(**params)

    mean_acc = cross_val_score_pipeline(X_train, y_train, pipeline, skf)
    print(f"Trial {trial.number}: Accuracy = {mean_acc:.5f}")
    return mean_acc

# ---------------------
# Load data
# ---------------------
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Save test IDs for submission
test_ids = df_test['id'].copy() if 'id' in df_test.columns else pd.Series(range(len(df_test)))

# ---------------------
# Prepare pipeline & model
# ---------------------
xgb_model = XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    eval_metric="mlogloss"
)

X_train, y_train, X_test, pipeline = xgb_pipeline(df_train, df_test, xgb_model)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ---------------------
# Optuna hyperparameter tuning
# ---------------------
study = optuna.create_study(direction="maximize")
study.optimize(
    partial(objective, pipeline=pipeline, X_train=X_train, y_train=y_train, skf=skf),
    n_trials=30
)

# ---------------------
# Best trial results
# ---------------------
trial = study.best_trial
print("\nBest CV Accuracy:", trial.value)
print("Best Hyperparameters:")
for k, v in trial.params.items():
    print(f"{k}: {v}")

# ---------------------
# Train final model with best hyperparameters
# ---------------------
best_params = trial.params
xgb_model_final = XGBClassifier(
    n_estimators=500,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric="mlogloss",
    **best_params
)

X_train, y_train, X_test, final_pipeline = xgb_pipeline(df_train, df_test, xgb_model_final)
final_pipeline.fit(X_train, y_train)

# ---------------------
# Predict on test set
# ---------------------
test_preds_numeric = final_pipeline.predict(X_test)
test_preds_labels = [targetMap_reversed[i] for i in test_preds_numeric]

submission = pd.DataFrame({
    'id': test_ids,
    'WeightCategory': test_preds_labels
})
submission.to_csv("submission_boxcox_std_label_poly.csv", index=False)
print("\nSaved predictions to submission_boxcox_std_label_poly.csv")


Trial 0: Accuracy = 0.82624
Trial 1: Accuracy = 0.87910
Trial 2: Accuracy = 0.86358
Trial 3: Accuracy = 0.75137
Trial 4: Accuracy = 0.88109
Trial 5: Accuracy = 0.85109
