In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#reading the csv file
df=pd.read_csv("Train_Data.csv")
df['age_group']=df['age_group'].map({'Adult':0,'Senior':1})

def impute_with_model(df, target_column, drop_cols=None, categorical_columns=None):
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    from sklearn.preprocessing import LabelEncoder

    df_copy = df.copy()
    
    if drop_cols is None:
        drop_cols = []
    if categorical_columns is None:
        categorical_columns = []

    # Spliting into known and unknown
    df_known = df_copy[df_copy[target_column].notna()].copy()
    df_unknown = df_copy[df_copy[target_column].isna()].copy()

    # Determining if target is categorical
    is_categorical = (
        df_copy[target_column].dtype == 'object' or
        str(df_copy[target_column].dtype).startswith('category') or
        target_column in categorical_columns
    )

    # Encoding target if categorical
    le = None
    if is_categorical:
        le = LabelEncoder()
        df_known[target_column] = le.fit_transform(df_known[target_column])

    # Preparing X and y
    X_train = df_known.drop(columns=[target_column] + drop_cols).dropna()
    y_train = df_known.loc[X_train.index, target_column]
    X_test = df_unknown.drop(columns=[target_column] + drop_cols).dropna()

    if X_train.empty or X_test.empty:
        print(f" Skipping '{target_column}' — insufficient data.")
        return df_copy

    # Fiting model
    if is_categorical:
        model = RandomForestClassifier(random_state=42)
    else:
        model = RandomForestRegressor(random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if le:
        y_pred = le.inverse_transform(y_pred.astype(int))

    df_copy.loc[X_test.index, target_column] = y_pred
    print(f" Imputed {len(y_pred)} missing values in '{target_column}'")
    return df_copy

# Explicitly mentioniing categorical numeric columns
categorical_cols = ['RIAGENDR', 'PAQ605', 'DIQ010', 'age_group']

df = impute_with_model(df, 'RIAGENDR', drop_cols=['SEQN'], categorical_columns=categorical_cols)
df = impute_with_model(df, 'age_group', drop_cols=['SEQN'], categorical_columns=categorical_cols)
df = impute_with_model(df, 'PAQ605', drop_cols=['SEQN'], categorical_columns=categorical_cols)
df = impute_with_model(df, 'DIQ010', drop_cols=['SEQN'], categorical_columns=categorical_cols)
df = impute_with_model(df, 'BMXBMI', drop_cols=['SEQN'], categorical_columns=categorical_cols) 
df = impute_with_model(df, 'LBXGLU', drop_cols=['SEQN'], categorical_columns=categorical_cols) 
df = impute_with_model(df, 'LBXGLT', drop_cols=['SEQN'], categorical_columns=categorical_cols) 
df = impute_with_model(df, 'LBXIN', drop_cols=['SEQN'], categorical_columns=categorical_cols)  

df1=df.copy()
#making features
df1['has_diabetes'] = df1['DIQ010'].map({1: 1, 2: 0, 3: 0})
df1['borderline_diabetes'] = df1['DIQ010'].map({1: 0, 2: 0, 3: 1})
df1['glu_insulin_ratio'] = df1['LBXGLU'] / (df1['LBXIN'] + 1e-3)
df1['active'] = df1['PAQ605'].replace({7: np.nan}).map({1: 1, 2: 0})
df1['high_glucose'] = (df1['LBXGLU'] > 125).astype(int)  # Fasting blood sugar >125 = diabetes



from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
from collections import Counter
import numpy as np
import pandas as pd


# Selected best features based on model performance your results
features = [ 'PAQ605', 'LBXGLU', 'LBXGLT', 'LBXIN', 'has_diabetes']
X = df1[features]
y = df1['age_group']

# Handling class imbalance
counter = Counter(y)
scale_pos_weight = counter[0] / counter[1]  # Adult / Senior ratio

# Model with imbalance handling
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

# Stratified K-Fold setup
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

f1_scores = []

# Cross-validation loop
for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_val)[:, 1]

    #  Trying threshold tuning 
    threshold = 0.26
    y_pred = (y_prob > threshold).astype(int)

    score = f1_score(y_val, y_pred)
    f1_scores.append(score)

    print(f"\n Fold {fold} F1 Score: {score:.4f}")
    print(classification_report(y_val, y_pred))

# Final score
print(f"\n Average F1 Score across folds: {np.mean(f1_scores):.4f}")



#  Predicting on test.csv
test_df=pd.read_csv('Test_Data.csv')
#####################################################################
#imputation
def impute_with_model(df, target_column, drop_cols=None, categorical_columns=None):
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    from sklearn.preprocessing import LabelEncoder

    df_copy = test_df.copy()
    
    if drop_cols is None:
        drop_cols = []
    if categorical_columns is None:
        categorical_columns = []

    #  Spliting into known and unknown
    df_known = df_copy[df_copy[target_column].notna()].copy()
    df_unknown = df_copy[df_copy[target_column].isna()].copy()

    #  Determining if target is categorical
    is_categorical = (
        df_copy[target_column].dtype == 'object' or
        str(df_copy[target_column].dtype).startswith('category') or
        target_column in categorical_columns
    )

    #  Encoding target if categorical
    le = None
    if is_categorical:
        le = LabelEncoder()
        df_known[target_column] = le.fit_transform(df_known[target_column])

    #  Preparing X and y
    X_train = df_known.drop(columns=[target_column] + drop_cols).dropna()
    y_train = df_known.loc[X_train.index, target_column]
    X_test = df_unknown.drop(columns=[target_column] + drop_cols).dropna()

    if X_train.empty or X_test.empty:
        print(f" Skipping '{target_column}' — insufficient data.")
        return df_copy

    #  Fiting model
    if is_categorical:
        model = RandomForestClassifier(random_state=42)
    else:
        model = RandomForestRegressor(random_state=42)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    if le:
        y_pred = le.inverse_transform(y_pred.astype(int))

    df_copy.loc[X_test.index, target_column] = y_pred
    print(f" Imputed {len(y_pred)} missing values in '{target_column}'")
    return df_copy
##################################################################3
# Explicitly mention categorical numeric columns
categorical_cols = ['RIAGENDR', 'PAQ605', 'DIQ010', 'age_group']

test_df = impute_with_model(test_df, 'RIAGENDR', drop_cols=['SEQN'], categorical_columns=categorical_cols)
test_df = impute_with_model(test_df, 'PAQ605', drop_cols=['SEQN'], categorical_columns=categorical_cols)
test_df = impute_with_model(test_df, 'DIQ010', drop_cols=['SEQN'], categorical_columns=categorical_cols)
test_df = impute_with_model(test_df, 'BMXBMI', drop_cols=['SEQN'], categorical_columns=categorical_cols)  
test_df = impute_with_model(test_df, 'LBXGLU', drop_cols=['SEQN'], categorical_columns=categorical_cols)  
test_df = impute_with_model(test_df, 'LBXGLT', drop_cols=['SEQN'], categorical_columns=categorical_cols)  
test_df = impute_with_model(test_df, 'LBXIN', drop_cols=['SEQN'], categorical_columns=categorical_cols)  

######################################################################


test_df['has_diabetes'] = test_df['DIQ010'].map({1: 1, 2: 0, 3: 0})
test_df['glu_insulin_ratio'] = test_df['LBXGLU'] / (test_df['LBXIN'] + 1e-3)
test_df['active'] = test_df['PAQ605'].replace({7: np.nan}).map({1: 1, 2: 0})


######################################################################
model.fit(X,y)
######################################################################

test_features = test_df[features]
test_pred = model.predict(test_features)

#  Creating submission file
submission = pd.DataFrame({'age_group': test_pred})
submission.to_csv("submission.csv", index=False)

print("\n Submission file 'submission.csv' created!")

 Imputed 18 missing values in 'RIAGENDR'
 Imputed 14 missing values in 'age_group'
 Imputed 13 missing values in 'PAQ605'
 Imputed 18 missing values in 'DIQ010'
 Imputed 18 missing values in 'BMXBMI'
 Imputed 13 missing values in 'LBXGLU'
 Imputed 11 missing values in 'LBXGLT'
 Imputed 9 missing values in 'LBXIN'

 Fold 1 F1 Score: 0.3827
              precision    recall  f1-score   support

         0.0       0.89      0.79      0.84       331
         1.0       0.31      0.49      0.38        63

    accuracy                           0.75       394
   macro avg       0.60      0.64      0.61       394
weighted avg       0.80      0.75      0.77       394



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 Fold 2 F1 Score: 0.3497
              precision    recall  f1-score   support

         0.0       0.89      0.73      0.80       331
         1.0       0.26      0.52      0.35        62

    accuracy                           0.70       393
   macro avg       0.58      0.62      0.58       393
weighted avg       0.79      0.70      0.73       393


 Fold 3 F1 Score: 0.3905
              precision    recall  f1-score   support

         0.0       0.90      0.78      0.83       330
         1.0       0.31      0.52      0.39        63

    accuracy                           0.74       393
   macro avg       0.60      0.65      0.61       393
weighted avg       0.80      0.74      0.76       393



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



 Fold 4 F1 Score: 0.2932
              precision    recall  f1-score   support

         0.0       0.87      0.70      0.77       330
         1.0       0.22      0.44      0.29        63

    accuracy                           0.66       393
   macro avg       0.54      0.57      0.53       393
weighted avg       0.76      0.66      0.70       393


 Fold 5 F1 Score: 0.3575
              precision    recall  f1-score   support

         0.0       0.89      0.75      0.81       330
         1.0       0.28      0.51      0.36        63

    accuracy                           0.71       393
   macro avg       0.58      0.63      0.58       393
weighted avg       0.79      0.71      0.74       393


 Average F1 Score across folds: 0.3547


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


 Imputed 2 missing values in 'RIAGENDR'
 Imputed 1 missing values in 'PAQ605'
 Imputed 1 missing values in 'DIQ010'
 Imputed 1 missing values in 'BMXBMI'
 Imputed 1 missing values in 'LBXGLU'
 Imputed 2 missing values in 'LBXGLT'
 Imputed 1 missing values in 'LBXIN'

 Submission file 'submissiona.csv' created!


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
