# Predicting Age from National Health and Nutrition Health Survey 2013-2014

by Ismail (Husain) Bhinderwala, Rashid Mammadov, Sienko Ikhabi, Dongchun Chen

# Methods and Results

In [22]:
import numpy as np
import requests
import zipfile
import pandas as pd
import altair as alt
from sklearn.model_selection import train_test_split
from sklearn import set_config

In [23]:
# downloading data as zip and extracting
url = "https://archive.ics.uci.edu/static/public/887/national+health+and+nutrition+health+survey+2013-2014+(nhanes)+age+prediction+subset.zip"

request = requests.get(url)
with open("../data/raw/national+health+and+nutrition+health+survey+2013-2014+(nhanes)+age+prediction+subset.zip", 'wb') as f:
    f.write(request.content)

with zipfile.ZipFile("../data/raw/national+health+and+nutrition+health+survey+2013-2014+(nhanes)+age+prediction+subset.zip", 'r') as zip_ref:
    zip_ref.extractall("../data/raw")

In [24]:
# pre-processing data
col_names = [
    "id",
    "age_group",
    "age",
    "gender",
    "weekly_physical_activity",
    "bmi",
    "blood_glucose_fasting",
    "diabetic",
    "oral",
    "insulin_level"
]

data = pd.read_csv("../data/raw/NHANES_age_prediction.csv", names=col_names, skiprows=1).drop(columns=["id","age"])

data["gender"] = data["gender"].replace({
    1 : "Male",
    2 : "Female"
})

data["weekly_physical_activity"] = data["weekly_physical_activity"].replace({
    1 : "Yes",
    2 : "No"
})

data["diabetic"] = data["diabetic"].replace({
    1 : "Yes",
    2 : "No",
    3 : "Borderline"
})

data

Unnamed: 0,age_group,gender,weekly_physical_activity,bmi,blood_glucose_fasting,diabetic,oral,insulin_level
0,Adult,Female,No,35.7,110.0,No,150.0,14.91
1,Adult,Female,No,20.3,89.0,No,80.0,3.85
2,Adult,Male,No,23.2,89.0,No,68.0,6.14
3,Adult,Male,No,28.9,104.0,No,84.0,16.15
4,Adult,Female,Yes,35.9,103.0,No,81.0,10.92
...,...,...,...,...,...,...,...,...
2273,Adult,Female,No,33.5,100.0,No,73.0,6.53
2274,Adult,Male,No,30.0,93.0,No,208.0,13.02
2275,Adult,Male,No,23.7,103.0,No,124.0,21.41
2276,Adult,Female,No,27.4,90.0,No,108.0,4.99


In [25]:
data.describe()

Unnamed: 0,bmi,blood_glucose_fasting,oral,insulin_level
count,2278.0,2278.0,2278.0,2278.0
mean,27.95518,99.553117,114.978929,11.834794
std,7.248962,17.889834,47.061239,9.718812
min,14.5,63.0,40.0,0.14
25%,22.8,91.0,87.0,5.86
50%,26.8,97.0,105.0,9.04
75%,31.2,104.0,130.0,14.44
max,70.1,405.0,604.0,102.29


In [26]:
data["age_group"].value_counts() #target variable
#class imbalance in the data

age_group
Adult     1914
Senior     364
Name: count, dtype: int64

In [27]:
data["gender"].value_counts()

gender
Female    1165
Male      1113
Name: count, dtype: int64

In [28]:
data["weekly_physical_activity"].value_counts()

weekly_physical_activity
No     1868
Yes     409
7.0       1
Name: count, dtype: int64

In [29]:
data["diabetic"].value_counts()

diabetic
No            2199
Borderline      58
Yes             21
Name: count, dtype: int64

In [30]:
np.random.seed(522)
set_config(transform_output="pandas")

# creating the train test split
data_train, data_test = train_test_split(
    data, train_size=0.75, stratify=data["age_group"] #using stratify for class imbalance in target variable
)

data_train.to_csv("../data/processed/data_train.csv")
data_test.to_csv("../data/processed/data_test.csv")

# References

NA N. National Health and Nutrition Health Survey 2013-2014 (NHANES) Age Prediction Subset [dataset]. 2019. UCI Machine Learning Repository. Available from: https://doi.org/10.24432/C5BS66.

Harris CR, Millman KJ, Van Der Walt SJ, Gommers R, Virtanen P, Cournapeau D, Wieser E, Taylor J, Berg S, Smith NJ, Kern R. Array programming with NumPy. Nature. 2020 Sep 17;585(7825):357-62.

VanderPlas J, Granger B, Heer J, Moritz D, Wongsuphasawat K, Satyanarayan A, Lees E, Timofeev I, Welsh B, Sievert S. Altair: interactive statistical visualizations for Python. Journal of open source software. 2018 Dec 10;3(32):1057.

Pedregosa F. Scikit‐learn: Machine learning in python Fabian. Journal of machine learning research. 2011;12:2825.

Van Rossum G, Drake FL. Introduction to python 3: python documentation manual part 1. CreateSpace; 2009 Mar 20.