In [1]:
!pip install pandas numpy scikit-learn xgboost matplotlib seaborn




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier

print("✅ Step 0 complete: Libraries installed and imported.")


✅ Step 0 complete: Libraries installed and imported.


In [10]:
# Reload data to start fresh
train = pd.read_csv("Train_Data.csv")
test = pd.read_csv("Test_Data.csv")

# Drop rows with missing target
train = train.dropna(subset=['age_group'])

# Map target
train['age_group'] = train['age_group'].map({'Adult': 0, 'Senior': 1})

# Combine for consistent preprocessing
test['age_group'] = -1  # Dummy column to allow concat
df = pd.concat([train, test], axis=0).reset_index(drop=True)

# Categorical columns
cat_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']

# Fill categorical with mode
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Fill numerical with median
num_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Convert categorical to int
for col in cat_cols:
    df[col] = df[col].astype(int)

# Drop SEQN (ID column)
df = df.drop(columns=['SEQN'])

# Final split
train_df = df[df['age_group'] != -1]
test_df = df[df['age_group'] == -1].drop(columns=['age_group'])

X = train_df.drop(columns=['age_group'])
y = train_df['age_group']
X_test = test_df.reset_index(drop=True)

print("✅ Data cleaned and encoded. Shape of X:", X.shape)


✅ Data cleaned and encoded. Shape of X: (1952, 7)


In [11]:
from collections import Counter
from sklearn.model_selection import train_test_split

# Check class distribution
counter = Counter(y)
print(f"Class distribution: {counter}")

# Calculate scale_pos_weight = majority / minority
scale_pos_weight = counter[0] / counter[1]
print(f"scale_pos_weight: {round(scale_pos_weight, 2)}")

# Split data
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


Class distribution: Counter({0: 1638, 1: 314})
scale_pos_weight: 5.22


In [12]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# Train XGBoost with tuned hyperparameters
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=scale_pos_weight,  # Handle imbalance
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train, y_train)

# Predict on validation set
y_val_pred = model.predict(X_val)

# Evaluate
f1 = f1_score(y_val, y_val_pred)
print("📊 Validation F1 Score:", round(f1 * 100, 2), "%")


📊 Validation F1 Score: 30.3 %


In [14]:
# Predict on cleaned test data
final_predictions = model.predict(X_test)

# Prepare submission DataFrame
submission = pd.DataFrame({
    "age_group": final_predictions  # Already 0 (Adult) or 1 (Senior)
})

# Save submission file
submission.to_csv("submission.csv", index=False)

print("✅ submission.csv file created successfully!")


✅ submission.csv file created successfully!
