In [29]:
import pandas as pd
import json

# Reads in the file as a panda data frame 
def read_xpt(file_path):
    with open(file_path, "rb") as f:
        file = pd.read_sas(f, format="xport")
    return file

# this reads in a file and reads it as a json file 
def read_json(file_path):
    with open(file_path, "r") as f:
        file = json.load(f)
    return file

# this saves the json file on the described directory 
def save_json(file, file_path):
    with open(file_path, "w") as f:
        json.dump(file, f,indent=4)

In [30]:
import requests 
from bs4 import BeautifulSoup 
import html5lib
import csv

URL = 'https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/ACQ_J.htm'
html = requests.get(URL)
ACQ_G_file = BeautifulSoup(html.text,'html.parser')


In [31]:
# original code to get paragraph in component_description

# div_element = ACQ_G_file.find('div',id ="Sections")
# h2_element = div_element.find('p')
# p_content = h2_element.get_text(strip=True)

# imrpoved code to find component_descirption

# component description 
component_description = ACQ_G_file.find(id='Component_Description').find_next('p').text.strip()
print(component_description)

# first table 
table = ACQ_G_file.find('table')
thread = table.find('tr')
tbody = table.find('tbody')

table1_col_name = [th.get_text(strip=True) for th in thread.find_all('th')]
table1_data = []

# get the four coulumns into four different lists
self_identification = [tr.find_all('td')[0].text.strip() for tr in tbody.find_all('tr')]
questions_asked = [tr.find_all('td')[1].text.strip() for tr in tbody.find_all('tr')]
response_categories = [tr.find_all('td')[2].text.strip().replace('\r', '').replace('            ', '') for tr in tbody.find_all('tr')]
variable_name = [tr.find_all('td')[3].text.strip() for tr in tbody.find_all('tr')]

table1_data = [self_identification,questions_asked,response_categories,variable_name]
print(table1_data)
print(table1_col_name)
# print(self_identification)
# print(questions_asked)
# print(response_categories)
# print(variable_name)



The Acculturation section (variable name prefix ACQ) provides personal interview data on language use in the home. Questions asked and response categories used were customized, based on self-identified race and Hispanic origin, as shown in the table below.
[['Non-Hispanic white, Non-Hispanic black, or other race - including multi-racial', 'Mexican American or other Hispanic', 'Non-Hispanic Asian'], ['What language(s) do you usually speak at home?', 'What language(s) do you usually speak at home? Do you speak only Spanish, more Spanish than English, both equally, more English than Spanish, or only English?', 'Do you speak only (NON-ENGLISH LANGUAGE), more (NON-ENGLISH LANGUAGE) than English, both equally, more English than (NON-ENGLISH LANGUAGE), or only English?\n\r\n            Note: Participant self-reported "NON-ENGLISH LANGUAGE" used at home is pre-filled in this question during the interview.'], ['Check all that apply:\nEnglish (ACD011A)\nSpanish (ACD011B)\nOther languages (ACD011

In [32]:
# for data processing and editing 
data_processing = ACQ_G_file.find(id="Data_Processing_and_Editing").find_next('p').text.strip()
print(data_processing)    

Edits were made to ensure the completeness, consistency and analytic usefulness of the data.


In [33]:
# Codebook and Frequencies 

all_variable_keys = [] # contains the bolded text for codebook and frequencies 
all_variable_values = [] # contains the unbolded text that described the bolded text 
Data_For_tables = [] # contains the data for each table of the page. 
table_col_names = [] # contains the names for each column of each table. 
has_table = [] #use to see if block as table or not table, 0 for no and 1 for yes. 
list_dict = []
name_block = []

''' each codebook block starts with 'div',class='pagebreak'. So I iterate through creating list for each set of data. 
created a condtional within the for loop to check if they have a table. IF the block has a table it pulls all the data 
from the table and makes a nested list. 
'''
for variable_name in ACQ_G_file.find_all('div',class_="pagebreak"):

    codebook_dict = {}
    title_text = variable_name.find('h3').get_text()
    title_part = [part.strip() for part in title_text.split('-')]
    name_block.append(title_part[0])
    codebook_dict['Title']=title_part[1]
    # print(title_part)
    # gets all the bolded text from the block like Variable Name and SAS label 
    variable_keys = [dt.get_text(strip=True) for dt in variable_name.find_all('dt')]
    # gets all the unbolded text fromn the block which is used to describe the variable. 
    variable_attributes = [dd.get_text(strip=True).replace('\r\n\t\t\t',' ').replace('\n','') for dd in variable_name.find_all('dd')]

    for i in range(len(variable_keys)):
        codebook_dict[variable_keys[i]] = variable_attributes[i]
    
    # append both list to global list 
    # all_variable_keys.append(variable_keys)
    # all_variable_values.append(variable_attributes)

    # checks to see if the block has a table 
    if variable_name.find('table'):

        #goes to code block named table then tr  
        table = variable_name.find('table')
        thread = table.find('tr')

        # pulls the columns names of the table 
        column_name = [th.get_text(strip=True) for th in thread.find_all('th')]
        #table_col_names.append(column_name)


        # print(column_name)
        # print()

        # empty list for data in the table 
        data_for_col = []

        # Goes into the body of the table 
        tbody = table.find('tbody')

        # for loop to iterate through each column 
        for i in range(len(column_name)):

            # gets data for each olumn stores it in a list
            column_data = [tr.find_all('td')[i].text.strip() for tr in tbody.find_all('tr')]
            data_for_col.append(column_data)
            print(column_data)

        table_dict = {}
        for i in range(len(column_name)):
            table_dict[column_name[i]] = data_for_col[i]
        Data_For_tables.append(data_for_col)
        has_table.append(1)
        codebook_dict['Table:'] = table_dict
    else:
        has_table.append(0)   
    list_dict.append(codebook_dict)
print(list_dict)   
#     print(variable_keys)
#     print(variable_attributes)
#     print()
#     print(table_col_names)
#     print()
#     print(Data_For_tables)
#     print()
#     print(all_variable_keys)
#     print()
#     print(all_variable_values)
# print(has_table)

['1', '77', '99', '.']
['English', 'Refused', "Don't know", 'Missing']
['5230', '0', '0', '3191']
['5230', '5230', '5230', '8421']
['', '', '', '']
['8', '.']
['Spanish', 'Missing']
['16', '8405']
['16', '8421']
['', '']
['9', '.']
['Other', 'Missing']
['159', '8262']
['159', '8421']
['', '']
['1', '2', '3', '4', '5', '7', '9', '.']
['Only Spanish,', 'More Spanish than English', 'Both equally', 'More English than Spanish', 'Only English', 'Refused', "Don't know", 'Missing']
['529', '322', '334', '353', '414', '2', '3', '6464']
['529', '851', '1185', '1538', '1952', '1954', '1957', '8421']
['', '', '', '', '', '', '', '']
['1', '2', '3', '4', '5', '7', '9', '.']
['Only Non-English language', 'More Non-English than English', 'Both equally', 'More English than Non-English', 'Only English', 'Refused', "Don't know", 'Missing']
['439', '107', '124', '134', '327', '0', '0', '7290']
['439', '546', '670', '804', '1131', '1131', '1131', '8421']
['', '', '', '', '', '', '', '']
[{'Title': 'Respon

In [34]:
res = {
    "Component Description": 
    {
        "Description": component_description,
        "Table":{} 
    },
    "Data Processsing and Editing":
        {
        "Description": data_processing
        },
    "Codebook and Frequencies":{}
    
}
comp_des_table = {}
for i in range(len(table1_col_name)):
    comp_des_table[table1_col_name[i]] = table1_data[i]
res["Component Description"]["Table"] = comp_des_table 

codebook_freq = {}
for i in range(len(list_dict)):
    codebook_freq[name_block[i]] = list_dict[i]
print(codebook_freq)
res["Codebook and Frequencies"] = codebook_freq

save_path = "CODEBOOK_ACQ_J.json"

save_json(res, save_path)
print(res)

{'SEQN': {'Title': 'Respondent sequence number', 'Variable Name:': 'SEQN', 'SAS Label:': 'Respondent sequence number', 'English Text:': 'Respondent sequence number.', 'Target:': 'Both males and females 3 YEARS - 150 YEARS'}, 'ACQBOX1': {'Title': 'CHECK ITEM', 'Variable Name:': 'ACQBOX1', 'English Instructions:': 'BOX 1. CHECK ITEM: IF THE PARTICIPANT SELF-IDENTIFIED AS "MEXICAN AMERICAN" OR "OTHER HISPANIC" (i.e., RIDRETH3=1, OR 2), GO TO ACD040;  IF THE PARTICIPANT SELF-IDENTIFIED AS "NON-HISPANIC ASIAN" (i.e., RIDRETH3=6), GO TO ACD110;  IF THE PARTICIPANT SELF-IDENTIFIED AS "NON-HISPANIC WHITE", "NON-HISPANIC BLACK", OR "OTHER RACE - INCLUDING MULTI-RACIAL" (i.e., RIDRETH3=3, 4, OR 7), CONTINUE WITH ACD011A.', 'Target:': 'Both males and females 3 YEARS - 150 YEARS'}, 'ACD011A': {'Title': 'Speak English at home', 'Variable Name:': 'ACD011A', 'SAS Label:': 'Speak English at home - NHW or NHB', 'English Text:': 'What language(s) {do you/does SP} usually speak at home?', 'English Instru