In [5]:
import pandas as pd
import json
import requests
import re
from bs4 import BeautifulSoup

In [6]:
def get_bs_doc(url = 'https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Dietary&CycleBeginYear=1999'):
  result = requests.get(url)
  doc = BeautifulSoup(result.text, "html.parser")
  return doc

def read_xpt(file_path):
    with open(file_path, "rb") as f:
        file = pd.read_sas(f, format="xport")
    return file


def read_json(file_path):
    with open(file_path, "r") as f:
        file = json.load(f)
    return file


def save_json(file, file_path):
    with open(file_path, "w") as f:
        json.dump(file, f)

In [8]:
### Helper functions for the cell below


def get_table(table):
  # Return a dictionary where each key is a header and each value is a list of that header's values
  if not table:
    return ""

  header_tags = table.find_all("th")
  headers = [header_tag.text.strip() for header_tag in header_tags]
  cell_tags = table.find_all("td")
  cells = [cell_tag.text.strip() for cell_tag in cell_tags]

  tabledict = dict()
  for i in range(len(headers)):
    value_list = [cells[j] for j in range(len(cells)) if j % len(headers) == i]
    tabledict[headers[i]] = value_list

  return tabledict


def matchbyindex_items_tables(pagebreaks):
  items = [""]*len(pagebreaks)
  tables = [""]*len(pagebreaks)
  for i, pagebreak in enumerate(pagebreaks):
    item = pagebreak.find("dl")
    table = pagebreak.find("table")
    # if table and/or item is missing from a pagebreak, store with empty string placeholder
    if item:
      items[i] = item
    if table:
      tables[i] = table

  return items, tables


In [9]:
### Formatting description file into JSON ###

# this cell will be a function later, I'm just getting it work
# The cell returns a json-formatted dictionary for the codebook of the description file

### start URLS I'm testing
acq_j_url = 'https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/ACQ_J.htm' # original url in example
acq_i_url= 'https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/ACQ_I.htm' # another url to test on, a bit older
### end URLS I'm testing


# Getting the entire page into bs4 object
response = requests.get(acq_i_url) # params={'format':'json'})
html_content = response.content
soup = BeautifulSoup(html_content, "html.parser")

# Target_div is the Codebook div (this cell only returns codebook json, at least for now)
target_div = soup.find(id="Codebook")


title_tags = target_div.find_all("h3")
titles = [title.text.strip() for title in title_tags] # titles of each feature name (eg SEQN, ACQBOX1, etc.)

pagebreak_tags = target_div.find_all(class_="pagebreak") # sections are conveniently divided by pagebreak divs

raw_codebook_items, raw_codebook_tables = matchbyindex_items_tables(pagebreak_tags) # this fxn ensures that corresponding items & tables are stored together (some tables are missing)

# I want a list of dictionaries(key=title of block, value=dictionary(key=term, value=defn))
processed_codebook_items = []


for i in range(len(titles)):
  item = raw_codebook_items[i]
  term_tags = item.find_all("dt") # dt tags hold a "description term"
  terms = [term.text.strip() for term in term_tags]
  def_tags = item.find_all("dd") # dd tags hold a "description definition"
  definitions = [defn.text.strip() for defn in def_tags]
  terms.insert(0, "title")
  definitions.insert(0, re.search(r"- (.*)", titles[i]).group()[2:])
  terms.append("table")
  definitions.append(get_table(raw_codebook_tables[i]))
  term_def_pairs = {terms[i] : definitions[i] for i in range(len(terms))}
  processed_codebook_items.append({re.search(r"(.*?) -", titles[i]).group()[:-1]: term_def_pairs})


json_version = json.dumps(processed_codebook_items, indent=4)
print(json_version)


[
    {
        "SEQN ": {
            "title": "Respondent sequence number",
            "Variable Name:": "SEQN",
            "SAS Label:": "Respondent sequence number",
            "English Text:": "Respondent sequence number.",
            "Target:": "Both males and females 3 YEARS -\r\n\t\t\t150 YEARS",
            "table": ""
        }
    },
    {
        "ACQBOX1 ": {
            "title": "CHECK ITEM",
            "Variable Name:": "ACQBOX1",
            "English Instructions:": "BOX 1. CHECK ITEM: IF THE PARTICIPANT SELF-IDENTIFIED AS \"MEXICAN AMERICAN\" OR \"OTHER HISPANIC\" (i.e., RIDRETH3=1, OR 2), GO TO ACD040;  IF THE PARTICIPANT SELF-IDENTIFIED AS \"NON-HISPANIC ASIAN\" (i.e., RIDRETH3=6), GO TO ACD110;  IF THE PARTICIPANT SELF-IDENTIFIED AS \"NON-HISPANIC WHITE\", \"NON-HISPANIC BLACK\", OR \"OTHER RACE - INCLUDING MULTI-RACIAL\" (i.e., RIDRETH3=3, 4, OR 7), CONTINUE WITH ACD011A.",
            "Target:": "Both males and females 3 YEARS -\r\n\t\t\t150 YEARS",
         

### **BEGIN 5/31 WORK**

In [10]:

### Formatting description file into JSON ###

# This function returns a json-formatted dictionary for the codebook section of a doc file

def format_codebook_as_json(url):
  # Getting the entire page into bs4 object
  response = requests.get(url)
  html_content = response.content
  soup = BeautifulSoup(html_content, "html.parser")

  # Target_div is the Codebook div (this cell only returns codebook json)
  target_div = soup.find(id="Codebook")


  title_tags = target_div.find_all("h3")
  titles = [title.text.strip() for title in title_tags] # titles of each feature name (eg SEQN, ACQBOX1, etc.)

  pagebreak_tags = target_div.find_all(class_="pagebreak") # sections are conveniently divided by pagebreak divs

  raw_codebook_items, raw_codebook_tables = matchbyindex_items_tables(pagebreak_tags) # this fxn ensures that corresponding items & tables are stored together (some tables are missing)

  # I want a list of dictionaries(key=title of block, value=dictionary(key=term, value=defn))
  processed_codebook_items = dict()


  for i in range(len(titles)):
    item = raw_codebook_items[i]
    term_tags = item.find_all("dt") # dt tags hold a "description term"
    terms = [term.text.strip() for term in term_tags]
    def_tags = item.find_all("dd") # dd tags hold a "description definition"
    definitions = [defn.text.strip() for defn in def_tags]
    terms.insert(0, "title")
    definitions.insert(0, re.search(r"- (.*)", titles[i]).group()[2:])
    terms.append("table")
    definitions.append(get_table(raw_codebook_tables[i]))
    term_def_pairs = {terms[i] : definitions[i] for i in range(len(terms))}
    processed_codebook_items[re.search(r"(.*?) -", titles[i]).group()[:-1]] = term_def_pairs


  return processed_codebook_items



In [11]:
formatted_codebook = format_codebook_as_json('https://wwwn.cdc.gov/Nchs/Nhanes/2009-2010/ALQ_F.htm')
json_version = json.dumps(formatted_codebook, indent=4)
print(json_version)


{
    "SEQN ": {
        "title": "Respondent sequence number",
        "Variable Name:": "SEQN",
        "SAS Label:": "Respondent sequence number",
        "English Text:": "Respondent sequence number.",
        "Target:": "Both males and females 20 YEARS -\r\n\t\t\t150 YEARS",
        "table": ""
    },
    "ALQ101 ": {
        "title": "Had at least 12 alcohol drinks/1 yr?",
        "Variable Name:": "ALQ101",
        "SAS Label:": "Had at least 12 alcohol drinks/1 yr?",
        "English Text:": "The next questions are about drinking alcoholic beverages.  Included are liquor (such as whiskey or gin), beer, wine, wine coolers, and any other type of alcoholic beverage.In any one year, {have you/has SP} had at least 12 drinks of any type of alcoholic beverage?  By a drink, I mean a 12 oz. beer, a 5 oz. glass of wine, or one and half ounces of liquor.",
        "Target:": "Both males and females 20 YEARS -\r\n\t\t\t150 YEARS",
        "table": {
            "Code or Value": [
         

In [12]:
### TODO: make functions for the rest of the categories on a doc page, including:
#Component Description
#Eligible Sample
#Interview Setting and Mode of Administration
#Quality Assurance & Quality Control
#Data Processing and Editing
#Analytic Notes

# Already complete:
# Codebook

# after that, join the above list together in a dictionary, with each item of the above list being a key with value=dict returned from their respective fxns







In [17]:
def format_ComponentDescription_as_json(url):

  # I want a dict: {}


   # Getting the entire page into bs4 object
  response = requests.get(url)
  html_content = response.content
  soup = BeautifulSoup(html_content, "html.parser")

  # Target_div is the ComponentDescription
  target_header = soup.find("h2", id="Component_Description")
  paragraph = target_header.find_next('p')
  print(paragraph.text)


In [18]:
format_ComponentDescription_as_json('https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/ACQ_J.htm')

The Acculturation section (variable name prefix ACQ) provides personal interview data on language use in the home. Questions asked and response categories used were customized, based on self-identified race and Hispanic origin, as shown in the table below.
