In [16]:
# ✅ STEP 1: Libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [17]:

# ✅ STEP 2: Load Data
# Read the training and test CSVs from the current working directory.
train, test = (
    pd.read_csv(csv_name) for csv_name in ("Train_Data.csv", "Test_Data.csv")
)


In [18]:

# ✅ STEP 3: Cleaning Function
def clean_data(df, is_train=True, medians=None):
    """Return a cleaned copy of ``df``; the input frame is never modified.

    Steps, in order:
      1. (training only) encode ``age_group`` as 0 (``"Adult"``) / 1
         (``"Senior"``) and drop rows whose label is missing or is not one
         of those two values.
      2. Drop the ``SEQN`` column when present.
      3. Fill missing values in numeric feature columns with a per-column
         median.
      4. Clip ``LBXGLU``, ``LBXIN``, ``LBXGLT`` and ``BMXBMI`` (when present)
         to ``[0, cap]`` and apply ``log1p``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data.
    is_train : bool, default True
        When True, ``df`` must contain an ``age_group`` column, which is
        encoded and used to filter rows.
    medians : mapping or pandas.Series, optional
        Column name -> fill value for step 3. Columns not listed fall back to
        the median computed from ``df`` itself. When omitted, every column
        uses its own median from ``df``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (index is not reset after row drops).

    Raises
    ------
    KeyError
        If ``is_train`` is True and ``df`` has no ``age_group`` column.
    """
    label_col = "age_group"
    label_codes = {"Adult": 0, "Senior": 1}
    caps = {'LBXGLU': 200, 'LBXIN': 100, 'LBXGLT': 250, 'BMXBMI': 50}

    df = df.copy()
    if is_train:
        # Map first, then drop: Series.map turns unknown labels into NaN, so a
        # single dropna removes both missing and unrecognised labels instead
        # of letting them be median-imputed further down.
        df[label_col] = df[label_col].map(label_codes)
        df = df.dropna(subset=[label_col])
        df[label_col] = df[label_col].astype("int64")

    df = df.drop(columns=["SEQN"], errors="ignore")

    # The encoded target is numeric but must never be imputed as a feature.
    num_cols = [
        col
        for col in df.select_dtypes(include='number').columns
        if not (is_train and col == label_col)
    ]
    if num_cols:
        fill_values = df[num_cols].median()
        if medians is not None:
            # Caller-supplied values win; anything missing falls back to the
            # frame's own median.
            fill_values = pd.Series(medians).reindex(num_cols).fillna(fill_values)
        df[num_cols] = df[num_cols].fillna(fill_values)

    # Cap extreme values, then compress the scale with log1p.
    for col, cap in caps.items():
        if col in df.columns:
            df[col] = np.log1p(np.clip(df[col], 0, cap))
    return df


In [19]:

# ✅ STEP 4: Clean
# Run the cleaning function on both frames, then separate the target column
# from the training features.
train_clean, test_clean = (
    clean_data(train, is_train=True),
    clean_data(test, is_train=False),
)

y = train_clean["age_group"]
X = train_clean.drop(columns=["age_group"])
X_test = test_clean.copy()


In [20]:

# ✅ STEP 5: Imbalance Handling via scale_pos_weight
# Count label 0 (neg) and label 1 (pos); their ratio is later passed to the
# classifier as `scale_pos_weight`.
neg, pos = np.bincount(y.to_numpy())
scale = neg / pos
print("⚖️ scale_pos_weight:", scale)


⚖️ scale_pos_weight: 5.2165605095541405


In [21]:

# ✅ STEP 6: Train-Test Split
# Hold out 20% of the rows for validation; stratifying on `y` keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)


In [22]:

# ✅ STEP 7: Model
# XGBoost classifier trained on the training split. `scale_pos_weight` uses the
# neg/pos ratio computed in STEP 5 to up-weight the positive class.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# reports it as an unused parameter, and the labels are already encoded as 0/1.
model = xgb.XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    scale_pos_weight=scale,
    random_state=42
)
model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\MukundKumarGupta\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python313\\site-packages\\sklearn\\utils\\_repr_html\\estimator.js'

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\MukundKumarGupta\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python313\\site-packages\\sklearn\\utils\\_repr_html\\estimator.js'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.03, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=400, n_jobs=None,
              num_parallel_tree=None, ...)

In [23]:

# ✅ STEP 8: Evaluation
# Score the held-out validation split: overall accuracy first, then the
# per-class precision / recall / F1 table.
val_preds = model.predict(X_val)
print("✅ Validation Accuracy:", accuracy_score(y_true=y_val, y_pred=val_preds))
print(classification_report(y_true=y_val, y_pred=val_preds))


✅ Validation Accuracy: 0.7595907928388747
              precision    recall  f1-score   support

           0       0.87      0.84      0.85       328
           1       0.30      0.37      0.33        63

    accuracy                           0.76       391
   macro avg       0.59      0.60      0.59       391
weighted avg       0.78      0.76      0.77       391



In [25]:

# ✅ STEP 9: Submission
# Predict on the cleaned test features and write a single-column CSV.
# The output path is defined once so the confirmation message always names
# the file that was actually written.
submission_path = "lastHackethon.csv"
test_preds = model.predict(X_test)
submission = pd.DataFrame({'age_group': test_preds})
submission.to_csv(submission_path, index=False)
print(f"🚀 Submission file saved as {submission_path}")

🚀 Submission file saved as lasthackthon.csv
