<a href="https://colab.research.google.com/github/Niranjan0311/sa_2025/blob/main/Improved_Hackathon_Solution_for_Age_Group_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
from collections import Counter

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier

# Silence library warnings so the progress messages below stay readable.
warnings.filterwarnings('ignore')

print("Starting improved hackathon solution script...")

# Load the three competition CSV files from the current working directory.
try:
    train_df = pd.read_csv('Train_Data.csv')
    test_df = pd.read_csv('Test_Data.csv')
    sample_submission_df = pd.read_csv('Sample_Submission.csv')
except FileNotFoundError:
    print("Error: Make sure 'Train_Data.csv', 'Test_Data.csv', and 'Sample_Submission.csv' are in the same directory.")
    # `exit()` is a convenience injected by the `site` module for interactive
    # sessions and is not meant for programs. Raise SystemExit directly, with
    # a non-zero status so a caller can tell that loading failed.
    raise SystemExit(1)
else:
    print("Initial data loading complete.")

print("Train_Data.csv head:")
print(train_df.head())
print("\nTest_Data.csv head:")
print(test_df.head())

# Work on copies so the frames as loaded from disk stay untouched.
train_df_processed = train_df.copy()
test_df_processed = test_df.copy()

# --- Data Preprocessing ---

# 1. Handle Missing Values
print("\nHandling missing values...")

# 'DIQ010' is treated as a discrete/categorical feature, so its gaps are
# filled with the most frequent value. The mode is taken from the training
# data only so that nothing from the test set leaks into preprocessing.
diq010_mode_train = train_df_processed['DIQ010'].mode()[0]

# Assign the filled column back rather than calling fillna(inplace=True) on
# the column selection: with pandas Copy-on-Write the in-place call mutates a
# temporary object and leaves the DataFrame unchanged (and it emits a
# chained-assignment warning on pandas 2.x).
train_df_processed['DIQ010'] = train_df_processed['DIQ010'].fillna(diq010_mode_train)
test_df_processed['DIQ010'] = test_df_processed['DIQ010'].fillna(diq010_mode_train)
print(f"Missing values in 'DIQ010' imputed with mode: {diq010_mode_train}")


# Split the training frame into the feature matrix and the label column.
X_train_raw = train_df_processed.drop(columns='age_group')
y_train_raw = train_df_processed['age_group']

# Every numeric feature except the 'SEQN' identifier goes through the median
# imputer. Note that 'DIQ010' is part of this list as well.
numerical_cols = [
    col
    for col in X_train_raw.select_dtypes(include=np.number).columns
    if col != 'SEQN'
]

# Median imputer: statistics are learned from the training rows and then
# re-used unchanged for the test rows.
imputer_numerical = SimpleImputer(strategy='median')

X_train_imputed = pd.DataFrame(
    imputer_numerical.fit_transform(X_train_raw[numerical_cols]),
    index=X_train_raw.index,
    columns=numerical_cols,
)
test_df_imputed = pd.DataFrame(
    imputer_numerical.transform(test_df_processed[numerical_cols]),
    index=test_df_processed.index,
    columns=numerical_cols,
)

# Build the working feature frames: overwrite 'DIQ010' with the column from
# the processed frames and carry 'SEQN' along (it is dropped further down).
X_train_final = X_train_imputed.assign(
    DIQ010=train_df_processed['DIQ010'],
    SEQN=train_df_processed['SEQN'],
)
test_df_final = test_df_imputed.assign(
    DIQ010=test_df_processed['DIQ010'],
    SEQN=test_df_processed['SEQN'],
)

# Rows with a missing 'age_group' label receive the most frequent label.
y_train_imputed_series = y_train_raw.fillna(y_train_raw.mode()[0])
print("Missing values in features imputed using median/mode strategy.")


# 2. Encode Target Variable
# Labels become integers: 'Adult' -> 0, 'Senior' -> 1.
label_to_code = {'Adult': 0, 'Senior': 1}
y_train_encoded = y_train_imputed_series.map(label_to_code)
print("Target variable 'age_group' encoded to 0 (Adult) and 1 (Senior).")

# Show how many rows fall in each class to make any imbalance visible.
print("Encoded target value counts (before SMOTE):")
print(y_train_encoded.value_counts())


# 3. Feature Engineering
print("\nPerforming feature engineering...")
epsilon = 1e-6 # Small value to prevent division by zero


def _add_engineered_features(frame, eps):
    """Add the three derived feature columns to *frame* in place.

    Keeping this in one helper guarantees that the training and test frames
    receive exactly the same transformations.
    """
    # Glucose to Insulin Ratio (eps keeps the denominator away from zero)
    frame['GLU_to_IN_Ratio'] = frame['LBXGLU'] / (frame['LBXIN'] + eps)
    # Glucose Tolerance Difference
    frame['Glucose_Tolerance_Diff'] = frame['LBXGLT'] - frame['LBXGLU']
    # BMI and Physical Activity Interaction
    frame['BMI_PAQ_Interaction'] = frame['BMXBMI'] * frame['PAQ605']


_add_engineered_features(X_train_final, epsilon)
_add_engineered_features(test_df_final, epsilon)

print("New features created: GLU_to_IN_Ratio, Glucose_Tolerance_Diff, BMI_PAQ_Interaction.")

# 4. Drop 'SEQN' column (identifier)
X_train_final = X_train_final.drop('SEQN', axis=1)
test_df_final = test_df_final.drop('SEQN', axis=1)
print("'SEQN' column dropped from training and test feature sets.")

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\nPreprocessed training data info (with new features):")
X_train_final.info()
print("\nPreprocessed test data info (with new features):")
test_df_final.info()

# --- Model Training ---

# Hold out 20% of the training rows for validation. The split is stratified on
# the encoded label so both parts keep the same Adult/Senior proportions.
split_parts = train_test_split(
    X_train_final,
    y_train_encoded,
    test_size=0.2,
    stratify=y_train_encoded,
    random_state=42,
)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts
print("\nTraining data split into training (80%) and validation (20%) sets.")
print(f"Original training split class distribution: {Counter(y_train_split)}")

# 5. Handle Class Imbalance with SMOTE on the training split
# Only the training part is resampled; the validation part is left as is.
print("Applying SMOTE to the training data to handle class imbalance...")
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_split, y_train_split)
print(f"Resampled training split class distribution: {Counter(y_train_smote)}")


# Base XGBoost classifier.
# scale_pos_weight is the ratio of class-0 (Adult) to class-1 (Senior) rows,
# measured on the SMOTE-resampled labels.
# NOTE(review): because the counts come from the resampled split, the ratio is
# presumably 1.0 whenever SMOTE has equalised the classes - confirm.
smote_class_counts = y_train_smote.value_counts()
neg_count = smote_class_counts[0] # rows labelled 0 (Adult)
pos_count = smote_class_counts[1] # rows labelled 1 (Senior)
scale_pos_weight_value = neg_count / pos_count
print(f"Calculated scale_pos_weight for XGBoost: {scale_pos_weight_value:.2f}")


xgb_base_params = {
    'objective': 'binary:logistic',              # binary classification
    'eval_metric': 'logloss',                    # evaluation metric
    'use_label_encoder': False,                  # suppress warning
    'n_jobs': -1,                                # use all available CPU cores
    'random_state': 42,
    'scale_pos_weight': scale_pos_weight_value,  # class weighting
}
xgb_model = XGBClassifier(**xgb_base_params)

# 6. Hyperparameter Tuning for XGBoost using RandomizedSearchCV
print("\nPerforming RandomizedSearchCV for hyperparameter tuning (this may take a moment)...")

# Search space, keyed by XGBClassifier's scikit-learn parameter names
# (`reg_lambda` / `reg_alpha` are the wrapper's names for the L2 / L1 penalty).
param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_lambda': [0.5, 1, 1.5], # L2 regularization
    'reg_alpha': [0, 0.1, 0.2],  # L1 regularization
}

# Resample INSIDE the cross-validation loop. Fitting the search on data that
# was oversampled up front lets synthetic points interpolated from a fold's
# held-out rows end up in that fold's training part, which inflates the CV F1
# and biases the parameter choice. Wrapping SMOTE and the classifier in an
# imblearn Pipeline applies the resampling to each training fold only.
tuning_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('clf', xgb_model),
])
# Pipeline parameters are addressed as '<step>__<parameter>'.
search_space = {f'clf__{name}': values for name, values in param_distributions.items()}

# n_iter controls how many different combinations are tried. Increase for more thorough search.
random_search = RandomizedSearchCV(
    estimator=tuning_pipeline,
    param_distributions=search_space,
    n_iter=50, # Number of parameter settings that are sampled.
    scoring='f1', # Optimize for F1 score
    cv=5, # 5-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1 # Use all available cores
)

# Fit on the un-resampled training split; the pipeline oversamples per fold
# and once more on the whole split for the final refit.
random_search.fit(X_train_split, y_train_split)

# Expose the fitted classifier itself (not the pipeline) so later code keeps
# working with a plain XGBClassifier trained on the resampled training split.
best_xgb_model = random_search.best_estimator_.named_steps['clf']
best_params = {name.split('__', 1)[1]: value for name, value in random_search.best_params_.items()}
print(f"\nBest XGBoost parameters found: {best_params}")
print("XGBoostClassifier model trained with best parameters.")

# --- Model Evaluation ---

# Score the tuned model on the held-out validation rows, which were never
# passed through SMOTE.
y_pred_val = best_xgb_model.predict(X_val_split)
# Column 1 of predict_proba holds the probability of the positive class.
y_pred_proba_val = best_xgb_model.predict_proba(X_val_split)[:, 1]

# Headline metric: F1 score
f1 = f1_score(y_val_split, y_pred_val)
print(f"\nF1 Score on the validation set: {f1:.4f}")

# Per-class precision / recall / F1
val_report = classification_report(y_val_split, y_pred_val, target_names=['Adult', 'Senior'])
print("\nClassification Report on Validation Set:")
print(val_report)

# Raw confusion counts
val_confusion = confusion_matrix(y_val_split, y_pred_val)
print("\nConfusion Matrix on Validation Set:")
print(val_confusion)

# Possible follow-up (not executed): tune the decision threshold for F1.
# Compute sklearn.metrics.precision_recall_curve on (y_val_split,
# y_pred_proba_val), derive F1 for each candidate threshold, pick the
# threshold with the highest F1, and re-score the validation predictions
# obtained with `y_pred_proba_val >= best_threshold`.


# --- Prediction and Submission ---

# Predict class codes for the preprocessed test rows.
test_predictions = best_xgb_model.predict(test_df_final)
print("\nPredictions made on the test dataset.")

# Translate the codes back to their labels (0 -> 'Adult', 1 -> 'Senior') and
# put them into a single-column submission frame.
code_to_label = {0: 'Adult', 1: 'Senior'}
submission_df = pd.DataFrame({
    'age_group': pd.Series(test_predictions).map(code_to_label)
})

# Write the submission without the index column.
submission_df.to_csv('submission.csv', index=False)

print("\nSubmission file 'submission.csv' created successfully.")
print("Submission file head:")
print(submission_df.head())
print("\n--- Solution Completed ---")

Starting improved hackathon solution script...
Initial data loading complete.
Train_Data.csv head:
      SEQN  RIAGENDR  PAQ605  BMXBMI  LBXGLU  DIQ010  LBXGLT  LBXIN age_group
0  73564.0       2.0     2.0    35.7   110.0     2.0   150.0  14.91     Adult
1  73568.0       2.0     2.0    20.3    89.0     2.0    80.0   3.85     Adult
2  73576.0       1.0     2.0    23.2    89.0     2.0    68.0   6.14     Adult
3  73577.0       1.0     2.0    28.9   104.0     NaN    84.0  16.15     Adult
4  73580.0       2.0     1.0    35.9   103.0     2.0    81.0  10.92     Adult

Test_Data.csv head:
      SEQN  RIAGENDR  PAQ605  BMXBMI  LBXGLU  DIQ010  LBXGLT  LBXIN
0  77017.0       1.0     1.0    32.2    96.0     2.0   135.0  15.11
1  75580.0       2.0     2.0    26.3   100.0     2.0   141.0  15.26
2  73820.0       1.0     2.0    28.6   107.0     2.0   136.0   8.82
3  80489.0       2.0     1.0    22.1    93.0     2.0   111.0  12.13
4  82047.0       1.0     1.0    24.7    91.0     2.0   105.0   3.12

Han