In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, f1_score

df=pd.read_csv("Train_Data.csv")

In [2]:
df.head()

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,73580.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult


In [3]:
df=df[df['age_group'].notna()].copy()

In [4]:
df['age_group']= df['age_group'].map({'Adult': 0,'Senior': 1})
df['PAQ605'] =df['PAQ605'].map({1: 1,2: 0})
df['DIQ010'] =df['DIQ010'].map({1: 1, 2: 0})

In [5]:
df =df.drop(columns=['SEQN'])

In [6]:
base_features =['RIAGENDR','PAQ605','BMXBMI','LBXGLU','DIQ010','LBXGLT','LBXIN']
X =df[base_features].copy()
y =df['age_group']

In [7]:
X =pd.DataFrame(SimpleImputer(strategy='median').fit_transform(X),columns=base_features)

In [8]:
X['BMI_GT_30'] =(X['BMXBMI']>30).astype(int)
X['GLU_BY_INS'] =X['LBXGLU']/(X['LBXIN']+1e-5)
X['GLU_BY_TOL'] =X['LBXGLU']/(X['LBXGLT']+1e-5)
X['INS_BY_BMI'] =X['LBXIN']/(X['BMXBMI']+1e-5)

In [9]:
features =X.columns.tolist()
scaler= StandardScaler()
X_scaled =scaler.fit_transform(X)

In [10]:
X_train,X_test,y_train,y_test =train_test_split(X_scaled, y,stratify=y,test_size=0.2,random_state=42)

In [11]:
model =LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
model.fit(X_train, y_train)

y_proba=model.predict_proba(X_test)[:, 1]
bestf1=0
bestthresh=0.5
for t in np.linspace(0.3,0.9,61):
    preds =(y_proba >= t).astype(int)
    f1 =f1_score(y_test, preds, average='macro')
    if f1>bestf1:
        bestf1=f1
        bestthresh=t

print(f"Best F1: {bestf1:} at threshold {bestthresh:}")
print(classification_report(y_test,(y_proba >= bestthresh).astype(int),target_names=["Adult","Senior"]))

Best F1: 0.6150398913597013 at threshold 0.6000000000000001
              precision    recall  f1-score   support

       Adult       0.88      0.85      0.87       328
      Senior       0.34      0.40      0.36        63

    accuracy                           0.78       391
   macro avg       0.61      0.62      0.62       391
weighted avg       0.79      0.78      0.78       391



In [12]:
test_df=pd.read_csv("Test_Data.csv")
test_df=test_df.copy()
test_df['PAQ605'] =test_df['PAQ605'].map({1: 1,2: 0})
test_df['DIQ010'] =test_df['DIQ010'].map({1: 1,2: 0})

test_df =test_df.drop(columns=['SEQN'])

X_test_final =test_df[base_features].copy()

X_test_final=pd.DataFrame(SimpleImputer(strategy='median').fit(X[base_features]).transform(X_test_final),columns=base_features)

X_test_final['BMI_GT_30'] =(X_test_final['BMXBMI']>30).astype(int)
X_test_final['GLU_BY_INS'] =X_test_final['LBXGLU']/(X_test_final['LBXIN']+1e-5)
X_test_final['GLU_BY_TOL']=X_test_final['LBXGLU']/(X_test_final['LBXGLT']+1e-5)
X_test_final['INS_BY_BMI'] =X_test_final['LBXIN']/(X_test_final['BMXBMI']+1e-5)

X_test_scaled = scaler.transform(X_test_final)

y_test_proba = model.predict_proba(X_test_scaled)[:, 1]
y_test_final = (y_test_proba>=0.60).astype(int)

submission = pd.DataFrame({'age_group':y_test_final})
    
submission.to_csv("final_submission.csv",index=False)