In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import pickle
# Read the training set into a DataFrame (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='Train_Data.csv')


In [2]:
# Fill missing values in SEQN, RIDAGEYR and RIAGENDR with each column's mean.
# fillna aligns the Series of means on column names, so other columns are untouched.
# NOTE(review): RIAGENDR looks like a coded category (the test preview shows 1.0/2.0),
# so a mean may yield a value that is not a valid code — confirm this is intended.
mean_imputed_columns = ['SEQN', 'RIDAGEYR', 'RIAGENDR']
column_means = data[mean_imputed_columns].mean()
data = data.fillna(value=column_means)

In [3]:
# Fill missing values in the remaining feature columns with each column's median.
# fillna aligns the Series of medians on column names, so other columns are untouched.
median_imputed_columns = ['PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']
column_medians = data[median_imputed_columns].median()
data = data.fillna(value=column_medians)

In [4]:
# Per-column count of values that are still missing after imputation.
data.isna().sum()

SEQN          0
RIDAGEYR      0
RIAGENDR      0
PAQ605        0
BMXBMI        0
LBXGLU        0
DIQ010        0
LBXGLT        0
LBXIN         0
age_group    14
dtype: int64

In [5]:
# Encode the target as integers: 'Adult' -> 0, 'Senior' -> 1.
# Rows whose label is missing (14 in the null count above) are dropped instead of
# being assigned to class 0: a made-up label adds noise to both training and the
# held-out accuracy.
age_group_codes = {'Adult': 0, 'Senior': 1}
data = (
    data
    .dropna(subset=['age_group'])
    .assign(age_group=lambda frame: frame['age_group'].map(age_group_codes))
)

# map() turns any value outside the mapping into NaN (including already-encoded
# 0/1 values if this cell is run twice); fail loudly rather than train on NaN labels.
assert data['age_group'].notna().all(), 'unexpected age_group label'

In [6]:
# Columns fed to the model, in the order the scaler and classifier will see them.
# NOTE(review): age_group is presumably derived from RIDAGEYR; if so, keeping it as a
# feature leaks the target and would explain the 100% hold-out accuracy — confirm.
feature_columns = [
    'RIDAGEYR',
    'RIAGENDR',
    'PAQ605',
    'BMXBMI',
    'LBXGLU',
    'DIQ010',
    'LBXGLT',
    'LBXIN',
]
X = data[feature_columns]

# Target column.
y = data['age_group']

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Learn the scaling statistics from the training split only, then apply that same
# fitted transform to both splits so held-out rows do not influence the scaler.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [9]:
# Random forest with 100 trees; the fixed seed makes repeated fits reproducible.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [10]:
# Train the forest on the scaled training split.
rf_model.fit(X=X_train, y=y_train)

In [11]:
# Predict labels for the scaled hold-out split.
y_pred = rf_model.predict(X=X_test)

In [12]:
# Fraction of hold-out rows whose predicted label equals the true label, shown as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Random Forest Accuracy: {accuracy * 100:.2f}%')

Random Forest Accuracy: 100.00%


In [13]:
# Persist the fitted model and the fitted scaler so inference can reuse both
# without retraining. Each object goes to its own pickle file.
artifacts = (
    ('random_forest_model.pkl', rf_model),
    ('scaler.pkl', scaler),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

## Test data

In [14]:
# Load the test set and preview its first five rows.
test_data = pd.read_csv(filepath_or_buffer='Test_Data.csv')
test_data.head(n=5)

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
0,77017.0,34.0,1.0,1.0,32.2,96.0,2.0,135.0,15.11
1,75580.0,12.0,2.0,2.0,26.3,100.0,2.0,141.0,15.26
2,73820.0,56.0,1.0,2.0,28.6,107.0,2.0,136.0,8.82
3,80489.0,20.0,2.0,1.0,22.1,93.0,2.0,111.0,12.13
4,82047.0,64.0,1.0,1.0,24.7,91.0,2.0,105.0,3.12


In [15]:
# Per-column count of missing values in the test set before imputation.
test_data.isna().sum()

SEQN        2
RIDAGEYR    3
RIAGENDR    2
PAQ605      1
BMXBMI      1
LBXGLU      1
DIQ010      1
LBXGLT      2
LBXIN       1
dtype: int64

In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import joblib

# Load the fitted model and scaler that the training section wrote to disk.
rf_model = joblib.load('random_forest_model.pkl')
scaler = joblib.load('scaler.pkl')

# Load the test data. The file name uses the same spelling as the earlier read in
# this section ('Test_Data.csv'); a lowercase spelling only resolves on
# case-insensitive filesystems.
test_data = pd.read_csv('Test_Data.csv')

# Impute missing values with the same recipe as training: column means for
# SEQN/RIDAGEYR/RIAGENDR and column medians for the remaining features.
# The frame is reassigned rather than mutated in place so the cell can be re-run safely.
# NOTE(review): the fill values are computed from the test file itself, not from the
# training data — confirm this is intended.
mean_fill = test_data[['SEQN', 'RIDAGEYR', 'RIAGENDR']].mean()
median_fill = test_data[['PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']].median()
test_data = test_data.fillna(mean_fill).fillna(median_fill)

# Select features: same columns, in the same order, as the training feature matrix.
X_test = test_data[['RIDAGEYR', 'RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']]

# A column that is entirely missing cannot be imputed from its own mean/median;
# stop here instead of passing NaN into the scaler and the model.
assert not X_test.isna().any().any(), 'test features still contain missing values'

# Scale features using the saved scaler (transform only; it is not re-fitted here).
X_test_scaled = scaler.transform(X_test)

# Make predictions
predictions = rf_model.predict(X_test_scaled)

# Create submission DataFrame with one predicted age_group code per test row.
submission = pd.DataFrame({'age_group': predictions})

# Save to CSV without the index column.
submission.to_csv('submission1.csv', index=False)

print("Sample submission file created successfully!")
print(submission.head())


Sample submission file created successfully!
   age_group
0          0
1          0
2          0
3          0
4          0
