In [19]:
import pandas as pd

# Read the labelled training data, the unlabelled test data and the
# submission template from the working directory.
train = pd.read_csv("Train_Data.csv")
test = pd.read_csv("Test_Data.csv")
sample = pd.read_csv("Sample_Submission.csv")

# Report (rows, columns) for each frame.
for caption, frame in (
    ("Train shape:", train),
    ("Test shape:", test),
    ("Sample Submission shape:", sample),
):
    print(caption, frame.shape)

# Preview the first rows of each frame; display() renders its arguments in order.
display(train.head(), test.head(), sample.head())

Train shape: (1966, 9)
Test shape: (312, 8)
Sample Submission shape: (312, 1)


Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,73580.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult


Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
0,77017.0,1.0,1.0,32.2,96.0,2.0,135.0,15.11
1,75580.0,2.0,2.0,26.3,100.0,2.0,141.0,15.26
2,73820.0,1.0,2.0,28.6,107.0,2.0,136.0,8.82
3,80489.0,2.0,1.0,22.1,93.0,2.0,111.0,12.13
4,82047.0,1.0,1.0,24.7,91.0,2.0,105.0,3.12


Unnamed: 0,age_group
0,0
1,0
2,0
3,0
4,0


In [20]:
# Remove the SEQN column from both frames before modelling (presumably a
# record identifier with no predictive value; confirm against the data
# dictionary). errors="ignore" keeps the cell re-runnable: running it again on
# frames that have already lost SEQN is a no-op instead of a KeyError.
train = train.drop(columns=["SEQN"], errors="ignore")
test = test.drop(columns=["SEQN"], errors="ignore")

# Per-column count of missing values in each frame.
print("Missing values in Train:\n", train.isnull().sum())
print("\nMissing values in Test:\n", test.isnull().sum())

# Column dtypes of the training frame.
print("\nTrain dtypes:\n", train.dtypes)

Missing values in Train:
 RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group    14
dtype: int64

Missing values in Test:
 RIAGENDR    2
PAQ605      1
BMXBMI      1
LBXGLU      1
DIQ010      1
LBXGLT      2
LBXIN       1
dtype: int64

Train dtypes:
 RIAGENDR     float64
PAQ605       float64
BMXBMI       float64
LBXGLU       float64
DIQ010       float64
LBXGLT       float64
LBXIN        float64
age_group     object
dtype: object


In [4]:
# Impute missing feature values with medians computed on the training frame
# only, and reuse those same medians for the test frame. fillna() with a
# Series fills each column from the entry with the matching label, so
# 'age_group' (not present in the medians) is left untouched.
feature_medians = train.drop(columns=['age_group']).median()
train = train.fillna(feature_medians)
test = test.fillna(feature_medians)

# Rows without a label are dropped from the training frame. The explicit
# copy makes the column assignment below operate on an independent frame
# rather than on the result of a filter.
train = train.dropna(subset=['age_group']).copy()

# Encode the labels only while they are not yet integers. Re-running this
# cell on already-encoded labels would otherwise map every value to NaN and
# make astype(int) fail.
if not pd.api.types.is_integer_dtype(train['age_group']):
    train['age_group'] = train['age_group'].map({'Adult': 0, 'Senior': 1}).astype(int)

print("Train shape after cleaning:", train.shape)
print("Test shape after cleaning:", test.shape)
print("Any NaNs left in train:", train.isnull().sum().sum())
print("Any NaNs left in test:", test.isnull().sum().sum())


Train shape after cleaning: (1952, 8)
Test shape after cleaning: (312, 7)
Any NaNs left in train: 0
Any NaNs left in test: 0


In [6]:
from sklearn.preprocessing import StandardScaler

# Separate the encoded target from the predictor columns.
y = train['age_group']
X = train.drop(columns='age_group')

# Fit the scaler on the training predictors, then apply that same fitted
# transformation to the training predictors and to the test frame.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
test_scaled = scaler.transform(test)


In [7]:
# Map the string labels to 0/1 only if they have not been encoded yet.
# Passing already-integer labels through this string-keyed dictionary would
# replace every value with NaN.
if not pd.api.types.is_integer_dtype(y):
    y = y.map({'Adult': 0, 'Senior': 1})


In [11]:
from sklearn.model_selection import train_test_split

# Rebuild the predictor frame and the integer target from the cleaned data.
y = train['age_group'].astype(int)
X = train.drop(columns='age_group')

# Hold out 20% of the rows for validation, stratified on the target, with a
# fixed seed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: logistic regression fitted on the training split
# (fit() returns the estimator itself, so the calls can be chained).
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out validation split.
y_pred = model.predict(X_valid)
accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred)
print("Validation Accuracy:", accuracy)


Validation Accuracy: 0.8286445012787724


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest with 100 trees and a fixed seed, fitted on the training split
# (fit() returns the estimator itself, so the calls can be chained).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out validation split.
y_pred = model.predict(X_valid)
acc = accuracy_score(y_true=y_valid, y_pred=y_pred)

print("Validation Accuracy with Random Forest:", acc)


Validation Accuracy with Random Forest: 0.8235294117647058


In [16]:
# Keep every test row: the submission needs one prediction per test row, in
# the original order. Dropping rows with missing values here would silently
# shorten the file and misalign it with the submission template. Instead,
# select the columns in training order and fill any remaining gaps with the
# medians of the training features.
test_df = test[X.columns].fillna(X.median())

# Refit the random forest on all labelled rows before predicting the test set.
final_model = RandomForestClassifier(n_estimators=100, random_state=42)
final_model.fit(X, y)
final_preds = final_model.predict(test_df)

submission = pd.DataFrame({'age_group': final_preds})

# Fail loudly rather than write a file whose length differs from the template.
assert len(submission) == len(sample), "submission row count must match Sample_Submission.csv"
submission.to_csv("Final_Submission.csv", index=False)
