In [None]:
import requests
import pandas as pd
import numpy as np
import json
import regex as re
pd.set_option('display.max_columns', 500)

## Explore all the ACS Variables and Determine the Appropriate Variables

#### This is simply an exploratory script. There is a significant amount of variety in the ACS variables, as they change every couple of years. In addition, there are different levels of specification for the Occupation, Education, and Income categories.

### Set up ACS (Census) API requests

In [None]:
def get_request(endpoint, parameters=None, timeout=60):
    '''Build and request an ACS GET endpoint with the appropriate parameters.

    Parameters
    ----------
    endpoint : str
        Path appended to ``https://api.census.gov/data/``,
        e.g. ``'2019/acs/acs1/variables.json'``.
    parameters : dict, optional
        Query-string parameters forwarded to ``requests.get``. Defaults to
        ``None`` (no query string) rather than a shared mutable ``dict()``.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the response status code is not 200. The message carries the URL,
        the status code and the response body.
    '''
    api_url = 'https://api.census.gov/data/'
    request_url = api_url + endpoint
    response = requests.get(request_url, params=parameters, timeout=timeout)
    if response.status_code != 200:
        # Fail here with the real cause instead of letting json.loads choke
        # on a non-JSON error body further down.
        raise requests.HTTPError(
            "Request to {} failed. Error code {}:{}".format(request_url, response.status_code, response.text),
            response=response,
        )

    return json.loads(response.text)

### Education Attainment

In [9]:
# Retrieve the variable catalogue published for the 2019 ACS 1-year estimates
url ='/acs/acs1/variables.json'
response = get_request(str(2019) + url)
variables = (response['variables'])

# The target attainment levels and the label regex do not depend on the
# variable being inspected, so they are built once instead of per iteration.
levels = ["less than high school graduate", "bachelor's degree", "high school graduate (includes equivalency)", "some college or associate's degree", "graduate or professional degree"]
edu_str = re.compile('^estimate!!total.*!!.+')  #regex for educational attainment

# Retrieve the list of education attainment variables from the geographical
# mobility (current residence in the United States) table
education = list()
for var in variables.keys():
    label = variables[var]['label'].lower()
    # Labels are spelled "Total:" in some vintages and "Total" in others;
    # strip either prefix so only the attainment level itself is compared.
    level = label.replace('estimate!!total!!', '').replace('estimate!!total:!!', '')
    if edu_str.match(label) and level in levels:
        # Variables without a concept can never match the table filter.
        concept = variables[var].get('concept', '').lower()
        if 'geographical mobility' in concept and 'united states' in concept:
            education.append(var)
            print("{: <70} {}".format(variables[var]['label'], variables[var]['concept']))
print(len(education))

Estimate!!Total:!!Less than high school graduate                       GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR CURRENT RESIDENCE IN THE UNITED STATES
Estimate!!Total:!!Graduate or professional degree                      GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR CURRENT RESIDENCE IN THE UNITED STATES
Estimate!!Total:!!Some college or associate's degree                   GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR CURRENT RESIDENCE IN THE UNITED STATES
Estimate!!Total:!!Bachelor's degree                                    GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR CURRENT RESIDENCE IN THE UNITED STATES
Estimate!!Total:!!High school graduate (includes equivalency)          GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR CURRENT RESIDENCE IN THE UNITED STATES
5


In [99]:
# Variable IDs from table B19001: the total (001) plus suffixes 010 through 017
income_vars = ["B19001_{:03.0f}E".format(i) for i in [1, 10, 11, 12, 13, 14, 15, 16, 17]]

# Collect the label attached to each of those IDs in every yearly catalogue
# from 2005 through 2019.
# NOTE: the accumulator keeps the name `education` used throughout the
# notebook, although in this cell it holds household income labels.
education = []
for year in range(2005, 2020):
    # Retrieve the variable catalogue for this year
    url = '/acs/acs1/variables.json'
    response = get_request("{}{}".format(year, url))
    variables = response['variables']
    for var, info in variables.items():
        if var not in income_vars:
            continue
        # Dropping colons makes the "Total:" and "Total" spellings identical
        education.append(info['label'].replace(':',''))

# Distinct labels seen for these IDs across all years
np.unique(education)

array(['Estimate!!Total', 'Estimate!!Total!!$100,000 to $124,999',
       'Estimate!!Total!!$125,000 to $149,999',
       'Estimate!!Total!!$150,000 to $199,999',
       'Estimate!!Total!!$200,000 or more',
       'Estimate!!Total!!$45,000 to $49,999',
       'Estimate!!Total!!$50,000 to $59,999',
       'Estimate!!Total!!$60,000 to $74,999',
       'Estimate!!Total!!$75,000 to $99,999'], dtype='<U37')

In [48]:
# Table B07009 variable IDs picked out by the educational attainment search
education_vars = ['B07009_002E', 'B07009_006E', 'B07009_004E', 'B07009_005E', 'B07009_003E', 'B07009_001E']

# For each year from 2005 through 2019, print the label each of those IDs
# carries in that year's catalogue, and record every hit.
education = []
for year in range(2005, 2020):
    print(year)
    # Retrieve the variable catalogue for this year
    url = '/acs/acs1/variables.json'
    response = get_request("{}{}".format(year, url))
    variables = response['variables']
    for var, info in variables.items():
        if var not in education_vars:
            continue
        education.append(var)
        print("{: <70} {}".format(info['label'], var))

# Total number of (year, variable) hits
print(len(education))

2005
Estimate!!Total!!Less than high school graduate                        B07009_002E
Estimate!!Total                                                        B07009_001E
Estimate!!Total!!Graduate or professional degree                       B07009_006E
Estimate!!Total!!Some college or associate's degree                    B07009_004E
Estimate!!Total!!Bachelor's degree                                     B07009_005E
Estimate!!Total!!High school graduate (includes equivalency)           B07009_003E
2006
Estimate!!Total!!Less than high school graduate                        B07009_002E
Estimate!!Total                                                        B07009_001E
Estimate!!Total!!Graduate or professional degree                       B07009_006E
Estimate!!Total!!Some college or associate's degree                    B07009_004E
Estimate!!Total!!Bachelor's degree                                     B07009_005E
Estimate!!Total!!High school graduate (includes equivalency)           B07009

### Employment Status

In [41]:
# Retrieve the variable catalogue for the 2005 ACS 1-year estimates
url ='/acs/acs1/variables.json'
response = get_request(str(2005) + url)
variables = (response['variables'])

# Built once: neither the pattern nor the concept changes between variables.
# The (female) / (not in) groups select one sex and one labor-force status;
# presumably they are edited by hand to list the other combinations.
# Earlier experiments (removed) filtered on 'employment status' appearing in
# the label or concept, or on the 'selected economic characteristics' concept.
emp_re = re.compile('^estimate!!total!!(female)!!.*!!(not in) labor force$')
emp_concept = 'SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER'

# Retrieve list of employment status variables
employment = list()
for var in variables.keys():
    # .get() yields None for variables without a concept, which never equals
    # the target concept, so those variables are skipped.
    if variables[var].get('concept') == emp_concept and emp_re.match(variables[var]['label'].lower()):
        employment.append(var)
        print("{: <60} {}".format(variables[var]['label'], var))
print(len(employment))

Estimate!!Total!!Female!!16 to 19 years!!Not in labor force  B23001_095E
Estimate!!Total!!Female!!20 and 21 years!!Not in labor force B23001_102E
Estimate!!Total!!Female!!22 to 24 years!!Not in labor force  B23001_109E
Estimate!!Total!!Female!!25 to 29 years!!Not in labor force  B23001_116E
Estimate!!Total!!Female!!30 to 34 years!!Not in labor force  B23001_123E
Estimate!!Total!!Female!!35 to 44 years!!Not in labor force  B23001_130E
Estimate!!Total!!Female!!45 to 54 years!!Not in labor force  B23001_137E
Estimate!!Total!!Female!!55 to 59 years!!Not in labor force  B23001_144E
Estimate!!Total!!Female!!60 and 61 years!!Not in labor force B23001_151E
Estimate!!Total!!Female!!62 to 64 years!!Not in labor force  B23001_158E
Estimate!!Total!!Female!!65 to 69 years!!Not in labor force  B23001_163E
Estimate!!Total!!Female!!70 to 74 years!!Not in labor force  B23001_168E
Estimate!!Total!!Female!!75 years and over!!Not in labor force B23001_173E
13


In [47]:
# Table B23001 variable IDs grouped by sex and labor-force status.
# NOTE(review): the IDs in `unemployed_female` are exactly the ones the 2005
# search printed with the label "Not in labor force", so "unemployed" here
# appears to mean "not in labor force"; the other lists are presumably the
# analogous selections - confirm before relying on the names.
employed_male = ['B23001_067E', 'B23001_060E', 'B23001_074E', 'B23001_079E', 'B23001_084E', 'B23001_004E', 'B23001_018E', 'B23001_011E', 'B23001_025E', 'B23001_032E', 'B23001_039E', 'B23001_046E', 'B23001_053E']
employed_female = ['B23001_097E', 'B23001_090E', 'B23001_104E', 'B23001_118E', 'B23001_111E', 'B23001_125E', 'B23001_139E', 'B23001_132E', 'B23001_146E', 'B23001_153E', 'B23001_160E', 'B23001_165E', 'B23001_170E']
unemployed_male = ['B23001_058E', 'B23001_065E', 'B23001_077E', 'B23001_072E', 'B23001_087E', 'B23001_082E', 'B23001_009E', 'B23001_016E', 'B23001_023E', 'B23001_030E', 'B23001_037E', 'B23001_044E', 'B23001_051E']
unemployed_female = ['B23001_095E', 'B23001_102E', 'B23001_109E', 'B23001_116E', 'B23001_123E', 'B23001_130E', 'B23001_137E', 'B23001_144E', 'B23001_151E', 'B23001_158E', 'B23001_163E', 'B23001_168E', 'B23001_173E']

# One flat list of every employment variable of interest
employment = employed_male + employed_female + unemployed_female + unemployed_male

# For each year from 2005 through 2019, print the label and concept of every
# selected ID present in that year's catalogue, plus the per-year hit count.
# NOTE: the accumulator keeps the name `education` used throughout the
# notebook, although in this cell it holds employment variable IDs.
education = []
for year in range(2005, 2020):
    print(year)
    # Retrieve the variable catalogue for this year
    url = '/acs/acs1/variables.json'
    response = get_request("{}{}".format(year, url))
    variables = response['variables']
    count = 0
    for var, info in variables.items():
        if var not in employment:
            continue
        education.append(var)
        # Fall back to the variable ID when the entry has no concept
        print("{: <70} {}".format(info['label'], info.get('concept', var)))
        count += 1
    print(count)

# Total number of (year, variable) hits
print(len(education))

2005
Estimate!!Total!!Male!!55 to 59 years!!Not in labor force              SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER
Estimate!!Total!!Male!!62 to 64 years!!In labor force                  SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER
Estimate!!Total!!Male!!60 and 61 years!!Not in labor force             SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER
Estimate!!Total!!Male!!60 and 61 years!!In labor force                 SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER
Estimate!!Total!!Male!!65 to 69 years!!In labor force                  SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER
Estimate!!Total!!Male!!70 to 74 years!!In labor force                  SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER
Estimate!!Total!!Male!!65 to 69 years!!Not in labor force              SEX BY AGE BY EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS AND OVER
Estimate!!Total

### Income search

In [64]:
# Retrieve the variable catalogue for the 2005 ACS 1-year estimates
url ='/acs/acs1/variables.json'
response = get_request(str(2005) + url)
variables = (response['variables'])

# Earlier experiments (removed) matched labels such as
# "percent!!income and benefits...!!families!!..." and
# "estimate!!income and benefits...!!families!!(mean|median)..." under the
# 'selected economic characteristics' concept. The search that was kept simply
# selects every variable whose concept equals the household income table name,
# so no regex is needed.
income_concept = 'HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2005 INFLATION-ADJUSTED DOLLARS)'

income = list()
for var in variables.keys():
    # .get() yields None for variables without a concept, which never equals
    # the target concept, so those variables are skipped.
    if variables[var].get('concept') == income_concept:
        income.append(var)
        print("{: <50} {}".format( variables[var]['label'], var))

print(len(income))

Estimate!!Total!!$200,000 or more                  C19001_011E
Estimate!!Total!!$150,000 to $199,999              C19001_010E
Estimate!!Total!!$100,000 to $149,999              C19001_009E
Estimate!!Total!!$75,000 to $99,999                C19001_008E
Estimate!!Total!!$10,000 to $14,999                C19001_003E
Estimate!!Total!!Less than $10,000                 C19001_002E
Estimate!!Total                                    C19001_001E
Estimate!!Total!!$50,000 to $74,999                C19001_007E
Estimate!!Total!!$35,000 to $49,999                C19001_006E
Estimate!!Total!!$25,000 to $34,999                C19001_005E
Estimate!!Total!!$15,000 to $24,999                C19001_004E
Estimate!!Total!!$25,000 to $29,999                B19001_006E
Estimate!!Total!!$30,000 to $34,999                B19001_007E
Estimate!!Total!!$35,000 to $39,999                B19001_008E
Estimate!!Total!!$40,000 to $44,999                B19001_009E
Estimate!!Total!!Less than $10,000                 B190

In [80]:
# Table B19001 variable IDs kept from the income search
income = ['B19001_001E', 'B19001_014E', 'B19001_015E', 'B19001_016E', 'B19001_017E', 'B19001_010E', 'B19001_011E', 'B19001_012E', 'B19001_013E']

# For each year from 2005 through 2019, print the label of every selected ID
# present in that year's catalogue, plus the per-year hit count.
# NOTE: the accumulator keeps the name `education` used throughout the
# notebook, although in this cell it holds income variable IDs.
education = []
for year in range(2005, 2020):
    print(year)
    # Retrieve the variable catalogue for this year
    url = '/acs/acs1/variables.json'
    response = get_request("{}{}".format(year, url))
    variables = response['variables']
    count = 0
    for var, info in variables.items():
        if var not in income:
            continue
        education.append(var)
        print("{: <70} {}".format(info['label'], var))
        count += 1
    print(count)

# Total number of (year, variable) hits
print(len(education))

2005
Estimate!!Total                                                        B19001_001E
Estimate!!Total!!$100,000 to $124,999                                  B19001_014E
Estimate!!Total!!$125,000 to $149,999                                  B19001_015E
Estimate!!Total!!$150,000 to $199,999                                  B19001_016E
Estimate!!Total!!$200,000 or more                                      B19001_017E
Estimate!!Total!!$45,000 to $49,999                                    B19001_010E
Estimate!!Total!!$50,000 to $59,999                                    B19001_011E
Estimate!!Total!!$60,000 to $74,999                                    B19001_012E
Estimate!!Total!!$75,000 to $99,999                                    B19001_013E
9
2006
Estimate!!Total                                                        B19001_001E
Estimate!!Total!!$100,000 to $124,999                                  B19001_014E
Estimate!!Total!!$125,000 to $149,999                                  B190

### Occupation search

In [90]:
# Retrieve the variable catalogue for the 2005 ACS 1-year estimates
url ='/acs/acs1/variables.json'
response = get_request(str(2005) + url)
variables = (response['variables'])

# Earlier experiments (removed) matched "percent male/female" labels under the
# 'OCCUPATION BY SEX FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER'
# concept, or searched for 'employment status' / 'occupation' in the label or
# concept. The search that was kept needs no regex.
# The double space in "CIVILIAN  EMPLOYED" is part of the concept string being
# matched and must be kept as is.
occupation_concept = 'INDUSTRY BY OCCUPATION FOR THE CIVILIAN  EMPLOYED POPULATION 16 YEARS AND OVER'

# Retrieve list of occupation variables: entries of the industry-by-occupation
# table whose label has no further '!!' level below 'Estimate!!Total!!'
occupation = list()
for var in variables.keys():
    # .get() yields None for variables without a concept, which never equals
    # the target concept, so those variables are skipped.
    if variables[var].get('concept') == occupation_concept and '!!' not in variables[var]['label'].replace('Estimate!!Total!!', ''):
        occupation.append(var)
        print("{: <170} {}".format(variables[var]['label'], var))
print(len(occupation))



Estimate!!Total!!Construction                                                                                                                                              C24050_003E
Estimate!!Total!!Agriculture, forestry, fishing and hunting, and mining                                                                                                    C24050_002E
Estimate!!Total!!Wholesale trade                                                                                                                                           C24050_005E
Estimate!!Total!!Manufacturing                                                                                                                                             C24050_004E
Estimate!!Total!!Transportation and warehousing, and utilities                                                                                                             C24050_007E
Estimate!!Total!!Retail trade                                                        

In [92]:
# Numeric suffixes of the table C24050 variables of interest, expanded to the
# full zero-padded variable IDs (e.g. 1 -> 'C24050_001E')
occupation = ["C24050_{:03.0f}E".format(i) for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 29, 43, 57, 85, 71]]
print(occupation)

# For each year from 2005 through 2019, print the label of every selected ID
# present in that year's catalogue, plus the per-year hit count.
# NOTE: the accumulator keeps the name `education` used throughout the
# notebook, although in this cell it holds occupation variable IDs.
education = []
for year in range(2005, 2020):
    print(year)
    # Retrieve the variable catalogue for this year
    url = '/acs/acs1/variables.json'
    response = get_request("{}{}".format(year, url))
    variables = response['variables']
    count = 0
    for var, info in variables.items():
        if var not in occupation:
            continue
        education.append(var)
        print("{: <115} {}".format(info['label'],  var))
        count += 1
    print(count)

# Total number of (year, variable) hits
print(len(education))

['C24050_001E', 'C24050_002E', 'C24050_003E', 'C24050_004E', 'C24050_005E', 'C24050_006E', 'C24050_007E', 'C24050_008E', 'C24050_009E', 'C24050_010E', 'C24050_011E', 'C24050_012E', 'C24050_013E', 'C24050_014E', 'C24050_015E', 'C24050_085E', 'C24050_029E', 'C24050_043E', 'C24050_071E', 'C24050_057E']
2005
Estimate!!Total                                                                                                     C24050_001E
Estimate!!Total!!Construction                                                                                       C24050_003E
Estimate!!Total!!Agriculture, forestry, fishing and hunting, and mining                                             C24050_002E
Estimate!!Total!!Wholesale trade                                                                                    C24050_005E
Estimate!!Total!!Manufacturing                                                                                      C24050_004E
Estimate!!Total!!Transportation and warehousing, and u