In [1]:
import numpy as np
import pandas as pd
import sklearn 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

In [2]:
# Read the training table from disk and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer="Train_data.csv")
df.head(10)

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,73580.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult
5,73581.0,1.0,2.0,23.6,110.0,2.0,100.0,6.08,Adult
6,73587.0,1.0,2.0,38.7,94.0,2.0,202.0,21.11,Adult
7,73596.0,2.0,2.0,38.3,107.0,2.0,164.0,20.93,Adult
8,73607.0,1.0,2.0,38.9,89.0,2.0,113.0,17.47,Senior
9,73610.0,1.0,1.0,28.9,90.0,2.0,95.0,3.24,Adult


In [3]:
# Simple median imputation: each numeric column's missing values are replaced
# by that column's median. Non-numeric columns (age_group) are not covered by
# the numeric-only median, so their missing entries remain in the count below.
# NOTE(review): the medians are computed on the whole table, before any
# train/validation split is made. This part has a lot of scope for improvement.
# Keep in mind that the data is inherently noisy and the test dataset is not.
df = df.fillna(df.median(numeric_only=True))
df.isnull().sum()

SEQN          0
RIAGENDR      0
PAQ605        0
BMXBMI        0
LBXGLU        0
DIQ010        0
LBXGLT        0
LBXIN         0
age_group    14
dtype: int64

In [4]:
# Drop the ID column so it is not used as a feature
df = df.drop(columns=['SEQN'])

# Encode the target: 'Adult' -> 0, 'Senior' -> 1 (any other value becomes NaN)
df['age_group'] = df['age_group'].map({'Adult': 0, 'Senior': 1})

# Drop rows where mapping failed (i.e., where age_group is now NaN)
df = df.dropna(subset=['age_group'])

# Final y and X
y = df['age_group'].astype(int)
X = df.drop(columns=['age_group'])


# Stratified 80/20 split; X_test / y_test act as the hold-out validation set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Seed the forest as well as the split: without random_state every run grows
# different trees, so the metrics and the submission would not be reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predicted class probabilities; column 1 belongs to class 1 ('Senior')
y_pred_prob = model.predict_proba(X_test)

# Label a row as class 1 when its probability reaches the cut-off
threshold = 0.11
y_pred = (y_pred_prob[:, 1] >= threshold).astype(int)

# y_pred already holds one prediction per row of X_test, so no slicing is needed
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.89      0.51      0.65       328
           1       0.21      0.68      0.32        63

    accuracy                           0.54       391
   macro avg       0.55      0.60      0.49       391
weighted avg       0.78      0.54      0.60       391



In [5]:
# Class balance of the encoded target column
class_counts = df['age_group'].value_counts()
print(class_counts)

age_group
0.0    1638
1.0     314
Name: count, dtype: int64


In [6]:
# Load the test data
test_data = pd.read_csv("Test_Data.csv")

# Keep exactly the training features, in the training column order.
# This also discards the SEQN ID column, so no separate drop is required.
test_data = test_data[X_train.columns]

# Fill missing values with medians taken from the training features rather
# than from the test set itself, so the imputation statistics do not depend
# on which rows happen to be in the test file.
test_data = test_data.fillna(X_train.median())

In [7]:
# Class-probability matrix for the test features
y_test_prob = model.predict_proba(test_data)

# Hard-coded decision cut-off applied to the class-1 column.
# NOTE(review): this differs from the 0.11 used for the validation metrics;
# confirm which value is intended for the submission.
threshold = 0.13
y_test_pred = np.greater_equal(y_test_prob[:, 1], threshold).astype(int)

In [8]:
# One-column frame holding the predicted labels for the test rows
result = pd.Series(y_test_pred, name='age_group').to_frame()
result

Unnamed: 0,age_group
0,0
1,1
2,1
3,0
4,0
...,...
307,0
308,0
309,0
310,1


In [9]:
# F1 on the hold-out split: threshold the class-1 probabilities at 0.11
y_pred = (model.predict_proba(X_test)[:, 1] >= 0.11).astype(int)
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

F1 Score: 0.3233082706766917


In [10]:
import matplotlib.pyplot as plt

# Class-1 probabilities for the hold-out validation split
y_probs = model.predict_proba(X_test)[:, 1]

# Candidate cut-offs from 0.0 to 1.0 in 0.01 steps
thresholds = np.arange(0, 1.01, 0.01)

# F1 score obtained at each candidate cut-off, in the same order as `thresholds`
f1_scores = [
    f1_score(y_test, (y_probs >= cutoff).astype(int))
    for cutoff in thresholds
]

# Pick the cut-off with the highest F1 (argmax keeps the first one on ties)
best_idx = np.argmax(f1_scores)
best_thresh = thresholds[best_idx]
best_f1 = f1_scores[best_idx]

print(f"Best Threshold: {best_thresh:.2f}")
print(f"Best F1 Score: {best_f1:.4f}")


Best Threshold: 0.17
Best F1 Score: 0.3671


In [11]:
# Write the predicted labels to the submission file, omitting the row index
result.to_csv(path_or_buf="Super_submission.csv", index=False)