In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, accuracy_score, classification_report
import xgboost as xgb

# === Load Data ===
train_df = pd.read_csv("/kaggle/input/datasetxl/Train_Data.csv")
test_df = pd.read_csv("/kaggle/input/datasetxl/Test_Data.csv")

# === Encode Target ===
train_df = train_df.dropna(subset=['age_group']).copy()
le_target = LabelEncoder()
train_df['age_group'] = le_target.fit_transform(train_df['age_group'])  # Adult=0, Senior=1

# === Feature Engineering ===
train_df['GLU_INS_RATIO'] = train_df['LBXGLU'] / (train_df['LBXIN'] + 1e-3)
test_df['GLU_INS_RATIO'] = test_df['LBXGLU'] / (test_df['LBXIN'] + 1e-3)

def bmi_category(bmi):
    if pd.isna(bmi): return 'missing'
    elif bmi < 18.5: return 'underweight'
    elif bmi < 25: return 'normal'
    elif bmi < 30: return 'overweight'
    else: return 'obese'

train_df['BMI_CAT'] = train_df['BMXBMI'].apply(bmi_category)
test_df['BMI_CAT'] = test_df['BMXBMI'].apply(bmi_category)

le_bmi = LabelEncoder()
train_df['BMI_CAT'] = le_bmi.fit_transform(train_df['BMI_CAT'])
test_df['BMI_CAT'] = le_bmi.transform(test_df['BMI_CAT'])

# === Prepare Features ===
X = train_df.drop(columns=['SEQN', 'age_group'])
y = train_df['age_group']
X_test = test_df.drop(columns=['SEQN'])

# === Impute + Scale ===
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# === Manual Oversampling ===
df_combined = pd.DataFrame(X_scaled)
df_combined['target'] = y.values
df_minority = df_combined[df_combined['target'] == 1]
df_majority = df_combined[df_combined['target'] == 0]

df_minority_upsampled = df_minority.sample(n=len(df_majority), replace=True, random_state=42)
df_balanced = pd.concat([df_majority, df_minority_upsampled])

X_bal = df_balanced.drop(columns='target').values
y_bal = df_balanced['target'].values

# === Train-Test Split for Validation ===
X_train, X_val, y_train, y_val = train_test_split(X_bal, y_bal, test_size=0.2, stratify=y_bal, random_state=42)

# === Train XGBoost Model ===
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    n_estimators=150,
    max_depth=5,
    learning_rate=0.08,
    subsample=0.8,
    colsample_bytree=0.8
)
model.fit(X_train, y_train)

# === Predict with Custom Threshold ===
y_proba = model.predict_proba(X_val)[:, 1]
threshold = 0.4  # Favor recall of class 1
y_pred = (y_proba >= threshold).astype(int)

# === Evaluation ===
f1 = f1_score(y_val, y_pred)
acc = accuracy_score(y_val, y_pred)
print(f"✅ F1 Score: {f1:.4f}")
print(f"✅ Accuracy: {acc:.4f}")
print("\nClassification Report:\n", classification_report(y_val, y_pred, target_names=['Adult (0)', 'Senior (1)']))

# === Retrain on Full Data ===
model.fit(X_bal, y_bal)
final_probs = model.predict_proba(X_test_scaled)[:, 1]
final_preds = (final_probs >= threshold).astype(int)

# === Save Submission ===
submission = pd.DataFrame({'age_group': final_preds})
submission.to_csv("submission_improved_no_smote.csv", index=False)
print("📄 Saved: submission_improved_no_smote.csv")


✅ F1 Score: 0.8549
✅ Accuracy: 0.8338

Classification Report:
               precision    recall  f1-score   support

   Adult (0)       0.97      0.69      0.81       328
  Senior (1)       0.76      0.98      0.85       328

    accuracy                           0.83       656
   macro avg       0.86      0.83      0.83       656
weighted avg       0.86      0.83      0.83       656

📄 Saved: submission_improved_no_smote.csv
