In [3]:
# NHANES Age Group Prediction using XGBoost

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Load datasets
train_df = pd.read_csv("Train_Data.csv")
test_df = pd.read_csv("Test_Data.csv")

# 🔧 Clean age_group (convert 'Senior' to 1, others to 0)
print("Unique values before:", train_df['age_group'].unique())

train_df['age_group'] = train_df['age_group'].astype(str).str.strip().map(
    lambda x: 1 if x.lower() == 'senior' else 0
)

print("Unique values after:", train_df['age_group'].unique())

# ✅ Prepare features and labels
X = train_df.drop(columns=["age_group", "SEQN"])
y = train_df["age_group"]

# Drop SEQN from test set
X_test = test_df.drop(columns=["SEQN"])

# 🧠 Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 🚀 Train XGBoost model
model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
model.fit(X_train, y_train)

# 🔍 Evaluate
val_preds = model.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {val_acc:.4f}")

# 📈 Predict on test set
test_preds = model.predict(X_test)

# 📦 Save submission
submission = pd.DataFrame(test_preds, columns=["is_senior"])
submission.to_csv("xgboost_fast_submission.csv", index=False)
print("Submission file saved as 'xgboost_fast_submission.csv'")


Unique values before: ['Adult' 'Senior' nan]
Unique values after: [0 1]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Validation Accuracy: 0.8046
Submission file saved as 'xgboost_fast_submission.csv'
