# Webscraping NHANES 2017-2018 Data Files

This notebook automates downloading XPT files from the National Health and Nutrition Examination Survey (NHANES) for the 2017-2018 cycle. It uses the web-scraping libraries `requests` and BeautifulSoup to extract the XPT file links from the NHANES data pages (Laboratory and Questionnaire), then uses the `wget` library to download each file and save it to a designated directory. The Demographic and Examination (Body Measures) files are downloaded from direct links.

In [2]:
import os
import requests
from bs4 import BeautifulSoup
import wget

In [89]:
# Scrape the NHANES 2017-18 Laboratory data page for XPT links and download
# every linked file into ../Dataset/Lab_XPT.
from urllib.parse import urljoin

# NHANES 2017-18 Laboratory Data URL
url = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Laboratory&CycleBeginYear=2017"

# A timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() stops us from silently scraping an HTTP error page.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Parsing the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Finding all anchor tags with links
anchor_tags = soup.find_all("a")

# Extracting the XPT file links. The extension is matched case-insensitively
# so that both ".XPT" and ".xpt" hrefs are picked up.
xpt_links = [
    tag["href"]
    for tag in anchor_tags
    if tag.get("href", "").lower().endswith(".xpt")
]

# Specifying download location (exist_ok avoids the check-then-create race)
download_dir = "../Dataset/Lab_XPT"
os.makedirs(download_dir, exist_ok=True)

if not xpt_links:
    print(f"No XPT links found at {url}")

# Downloading each XPT file
for xpt_link in xpt_links:
    # urljoin handles site-relative, page-relative and absolute hrefs alike,
    # whereas plain string concatenation only works for site-relative ones.
    xpt_url = urljoin(url, xpt_link)
    xpt_file = os.path.basename(xpt_url)
    save_path = os.path.join(download_dir, xpt_file)

    # NOTE(review): wget.download appears to save a renamed " (1)" copy when
    # the target already exists instead of overwriting it - confirm. Skipping
    # existing files keeps re-runs of this cell idempotent either way.
    if os.path.exists(save_path):
        print(f" {xpt_file} already exists, skipping")
        continue

    wget.download(xpt_url, save_path)
    print(f" {xpt_file} downloaded")

print("Download completed!")

100% [........................................................] 509760 / 509760 ALB_CR_J.XPT downloaded
100% [..........................................................] 96640 / 96640 UTAS_J.XPT downloaded
100% [........................................................] 336400 / 336400 UAS_J.XPT downloaded
100% [........................................................] 179680 / 179680 HDL_J.XPT downloaded
100% [........................................................] 245040 / 245040 TRIGLY_J.XPT downloaded
100% [........................................................] 179680 / 179680 TCHOL_J.XPT downloaded
100% [..........................................................] 96640 / 96640 UCM_J.XPT downloaded
100% [........................................................] 207680 / 207680 CRCO_J.XPT downloaded
100% [......................................................] 1476320 / 1476320 CBC_J.XPT downloaded
100% [........................................................] 318880 / 318880 C

In [87]:
# Downloading Demographic Data
# Fetches the single NHANES 2017-18 demographics file (DEMO_J.XPT) into
# ../Dataset/Demographic_XPT.
download_dir = "../Dataset/Demographic_XPT"
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(download_dir, exist_ok=True)

demo_link = 'https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT'
demo_file = os.path.basename(demo_link)
save_path = os.path.join(download_dir, demo_file)

# NOTE(review): wget.download appears to save a renamed " (1)" copy when the
# target already exists instead of overwriting it - confirm. Skipping an
# existing file keeps re-runs of this cell idempotent either way.
if os.path.exists(save_path):
    print(f" {demo_file} already exists, skipping")
else:
    wget.download(demo_link, save_path)
    print(f" {demo_file} downloaded")

100% [......................................................] 3412720 / 3412720 DEMO_J.XPT downloaded


In [5]:
# Downloading Examination Data
# Fetches the NHANES 2017-18 body-measures file (BMX_J.XPT) into
# ../Dataset/Examination_XPT.
download_dir = "../Dataset/Examination_XPT"
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(download_dir, exist_ok=True)

# We only download the Body Measurements
ex_link = 'https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/BMX_J.XPT'
ex_file = os.path.basename(ex_link)
ex_path = os.path.join(download_dir, ex_file)

# NOTE(review): wget.download appears to save a renamed " (1)" copy when the
# target already exists instead of overwriting it - confirm. Skipping an
# existing file keeps re-runs of this cell idempotent either way.
if os.path.exists(ex_path):
    print(f" {ex_file} already exists, skipping")
else:
    wget.download(ex_link, ex_path)
    print(f" {ex_file} downloaded")

100% [......................................................] 1466000 / 1466000 BMX_J.XPT downloaded


In [95]:
# Downloading Questionnaire Data
# Scrape the NHANES 2017-18 Questionnaire data page for XPT links and download
# every linked file into ../Dataset/Questionnaire_XPT.
from urllib.parse import urljoin

q_url = 'https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Questionnaire&CycleBeginYear=2017'

# A timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() stops us from silently scraping an HTTP error page.
q_response = requests.get(q_url, timeout=60)
q_response.raise_for_status()

q_soup = BeautifulSoup(q_response.content, "html.parser")

q_anchor_tags = q_soup.find_all("a")

# The extension is matched case-insensitively so that both ".XPT" and ".xpt"
# hrefs are picked up.
q_xpt_links = [
    tag["href"]
    for tag in q_anchor_tags
    if tag.get("href", "").lower().endswith(".xpt")
]

# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
q_download_dir = "../Dataset/Questionnaire_XPT"
os.makedirs(q_download_dir, exist_ok=True)

if not q_xpt_links:
    print(f"No XPT links found at {q_url}")

for q_xpt_link in q_xpt_links:
    # urljoin handles site-relative, page-relative and absolute hrefs alike,
    # whereas plain string concatenation only works for site-relative ones.
    q_xpt_url = urljoin(q_url, q_xpt_link)
    q_xpt_file = os.path.basename(q_xpt_url)
    q_save_path = os.path.join(q_download_dir, q_xpt_file)

    # NOTE(review): wget.download appears to save a renamed " (1)" copy when
    # the target already exists instead of overwriting it - confirm. Skipping
    # existing files keeps re-runs of this cell idempotent either way.
    if os.path.exists(q_save_path):
        print(f" {q_xpt_file} already exists, skipping")
        continue

    wget.download(q_xpt_url, q_save_path)
    print(f" {q_xpt_file} downloaded")

print("Download completed!")

100% [........................................................] 405840 / 405840 ACQ_J.XPT downloaded
100% [........................................................] 444800 / 444800 ALQ_J.XPT downloaded
100% [......................................................] 4137120 / 4137120 AUQ_J.XPT downloaded
100% [........................................................] 544560 / 544560 BPQ_J.XPT downloaded
100% [........................................................] 531120 / 531120 CDQ_J.XPT downloaded
100% [........................................................] 445840 / 445840 CBQ_J.XPT downloaded
100% [......................................................] 2789440 / 2789440 CBQPFA_J.XPT downloaded
100% [......................................................] 1188720 / 1188720 CBQPFC_J.XPT downloaded
100% [........................................................] 604400 / 604400 HSQ_J.XPT downloaded
100% [........................................................] 248240 / 248240 DEQ_J