In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import ConvergenceWarning


In [8]:
def load_and_clean(train_path='Train_Data.csv', test_path='Test_Data.csv'):
    """Read the train/test CSVs, normalise column names, drop unlabeled rows.

    Parameters
    ----------
    train_path : str or path-like, default 'Train_Data.csv'
        Location of the labelled training CSV. Must contain an
        ``age_group`` column.
    test_path : str or path-like, default 'Test_Data.csv'
        Location of the unlabelled test CSV.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both frames with ``column_map`` applied; ``train`` additionally has
        every row with a missing ``age_group`` removed.

    Raises
    ------
    FileNotFoundError
        If either CSV does not exist.
    KeyError
        If the training file has no ``age_group`` column.
    """
    # DataFrame.rename ignores keys that are absent from the frame, so only
    # the entries that match real column headers have any effect.
    # NOTE(review): 'BWKBWL', 'LENGUU' and 'SEQW' look like misspelled source
    # headers -- confirm they actually occur in the CSV files.
    column_map = {
        'BWKBWL': 'BMXBMI',
        'LENGUU': 'LBXGLU',
        'SEQW': 'SEQN',
        'RIAGENDR': 'Gender',
        'PAQ605': 'PhysicalActivity'
    }

    train = pd.read_csv(train_path).rename(columns=column_map)
    test = pd.read_csv(test_path).rename(columns=column_map)

    # Rows without a label cannot be used for supervised training.
    train = train.dropna(subset=['age_group'])

    return train, test

# Load both CSVs once; these frames feed the feature-engineering cell and
# test_data later supplies the SEQN ids for the submission file.
train_data, test_data = load_and_clean()

In [9]:
def create_features(df):
    """Return a copy of ``df`` with derived BMI and glucose features added.

    Missing ``BMXBMI`` / ``LBXGLU`` values are replaced by the column median
    of ``df`` only for the purpose of deriving the new features; the raw
    columns themselves are returned unchanged (NaNs included).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``BMXBMI`` and ``LBXGLU`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with three extra columns:
        ``BMI_Category`` (categorical), ``High_Glucose`` (0/1 int) and
        ``Glucose_Level`` (categorical).
    """
    # Work on a copy so the caller's frame is never mutated; this also avoids
    # chained-assignment warnings when ``df`` is itself a filtered frame.
    out = df.copy()

    bmi = out['BMXBMI'].fillna(out['BMXBMI'].median())
    # The top bin is open-ended so extreme readings still land in 'Obese'
    # instead of becoming a NaN category. Non-positive values stay NaN.
    out['BMI_Category'] = pd.cut(
        bmi,
        bins=[0, 18.5, 25, 30, np.inf],
        labels=['Underweight', 'Normal', 'Overweight', 'Obese']
    )

    glucose = out['LBXGLU'].fillna(out['LBXGLU'].median())
    out['High_Glucose'] = (glucose > 125).astype(int)
    # Open-ended top bin keeps Glucose_Level consistent with High_Glucose
    # for very large readings (both flag the row as elevated).
    out['Glucose_Level'] = pd.cut(
        glucose,
        bins=[0, 99, 125, 200, np.inf],
        labels=['Normal', 'Prediabetes', 'Diabetes', 'Severe']
    )

    return out

In [10]:
# Engineer features, then split predictors from the id and target columns.
train_features = create_features(train_data)
X_train = train_features.drop(columns=['SEQN', 'age_group'])
# Encode the target as 0/1 so that scoring='f1' treats 'Senior' as positive.
y_train = train_data['age_group'].map({'Adult': 0, 'Senior': 1})
X_test = create_features(test_data).drop(columns=['SEQN'])

# Column groups are derived from dtypes so the engineered categorical
# features are picked up automatically.
numeric_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()

# Numeric columns: median imputation followed by standardisation.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: mode imputation followed by dense one-hot encoding;
# categories unseen during fit are encoded as all zeros.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
)

In [11]:
# Candidate estimators and their hyper-parameter grids. The 'model__' prefix
# targets the 'model' step of the Pipeline built in the training loop.
models = {
    'RandomForest': {
        # random_state is fixed so repeated runs build the same forest and the
        # reported CV score is reproducible (the CV splitter is seeded too).
        'model': RandomForestClassifier(class_weight='balanced', random_state=42),
        'params': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [3, 5, None],
            'model__min_samples_split': [2, 5]
        }
    }
}

# Running best across all candidates. Start from -inf rather than 0 so that a
# candidate is still selected when every F1 score is exactly 0; otherwise
# best_model would stay None and the prediction cell would fail.
best_model = None
best_score = float('-inf')

# Shared, seeded splitter so every candidate is scored on identical folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [12]:
# Tune every candidate with a grid search and remember the best pipeline.
for name, config in models.items():
    # Preprocessing lives inside the pipeline, so it is re-fit on the
    # training part of every CV fold.
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', config['model']),
    ])

    grid = GridSearchCV(
        estimator=pipe,
        param_grid=config['params'],
        scoring='f1',
        cv=cv,
        n_jobs=-1,
        error_score='raise',  # fail loudly instead of recording a NaN score
    )
    grid.fit(X_train, y_train)

    # Candidates that do not beat the running best are skipped silently.
    if not (grid.best_score_ > best_score):
        continue

    best_score, best_model = grid.best_score_, grid.best_estimator_
    print(f"\n{name}")
    print(f"{best_score:.4f}")
            


RandomForest
0.4278


In [13]:
# Predict with the best tuned pipeline. The predictions are the 0/1 codes
# produced by the y_train mapping (Adult=0, Senior=1).
test_preds = best_model.predict(X_test)

# Pair each prediction with its SEQN id and write the submission file.
submission = test_data[['SEQN']].assign(age_group=test_preds)
submission.to_csv('final_submission.csv', index=False)