In [10]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

train_df = pd.read_csv("Train_Data.csv")
test_df = pd.read_csv("Test_Data.csv")

X = train_df.drop(columns=["age_group"])
y = train_df["age_group"]
X_test = test_df.copy()

X = X[~y.isna()]
y = y[~y.isna()]

imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)

le = LabelEncoder()
y_encoded = le.fit_transform(y)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.03,
    max_depth=6,
    scale_pos_weight=2.5,
    subsample=0.9,
    colsample_bytree=0.9,
    gamma=0.2,
    reg_alpha=0.3,
    reg_lambda=0.7,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

rf = RandomForestClassifier(n_estimators=150, max_depth=10, random_state=42)
lr = LogisticRegression(max_iter=1000, C=0.7, solver='lbfgs')

voting_clf = VotingClassifier(
    estimators=[('xgb', xgb), ('rf', rf), ('lr', lr)],
    voting='soft'
)

voting_clf.fit(X_train_smote, y_train_smote)

y_val_pred = voting_clf.predict(X_val)
print("🔍 Evaluation on Validation Set (Voting Ensemble + SMOTE):")
print("F1 Score:", f1_score(y_val, y_val_pred, average='weighted'))
print("Precision:", precision_score(y_val, y_val_pred, average='weighted'))
print("Recall:", recall_score(y_val, y_val_pred, average='weighted'))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred, target_names=le.classes_))

X_full_smote, y_full_smote = smote.fit_resample(X_scaled, y_encoded)
voting_clf.fit(X_full_smote, y_full_smote)

test_pred_encoded = voting_clf.predict(X_test_scaled)
test_pred_labels = le.inverse_transform(test_pred_encoded)
submission_df = pd.DataFrame({
    "age_group": test_pred_labels  
})

submission_df.to_csv("submission.csv", index=False)




🔍 Evaluation on Validation Set (Voting Ensemble + SMOTE):
F1 Score: 0.7103225513376066
Precision: 0.7681288398561735
Recall: 0.6751918158567775

Classification Report:
               precision    recall  f1-score   support

       Adult       0.87      0.72      0.79       328
      Senior       0.23      0.44      0.31        63

    accuracy                           0.68       391
   macro avg       0.55      0.58      0.55       391
weighted avg       0.77      0.68      0.71       391

