In [65]:
import requests
import pandas as pd
import numpy as np
import json
import regex as re
pd.set_option('display.max_columns', 500)

In [2]:
def get_request(endpoint, parameters=None):
    """Send a GET request to the Census data API and return the decoded JSON body.

    Parameters
    ----------
    endpoint : str
        Path (optionally including a query string) appended to
        ``https://api.census.gov/data/``.
    parameters : dict, optional
        Query parameters forwarded to ``requests.get``. Defaults to none.

    Returns
    -------
    The response body parsed with ``json.loads`` (a dict or a list, depending
    on what the endpoint returns).

    Raises
    ------
    requests.HTTPError
        If the response status code is anything other than 200.
    """
    api_url = 'https://api.census.gov/data/'
    url = api_url + endpoint
    response = requests.get(url, params=parameters)
    if response.status_code != 200:
        # Fail here with the server's message instead of letting json.loads
        # choke on (or silently accept) an error body further down.
        raise requests.HTTPError(
            "Request to {} failed. Error code {}:{}".format(url, response.status_code, response.text),
            response=response,
        )

    return json.loads(response.text)

In [168]:
# record possibilities for variables
# var_file = open('variable_possibilites.txt', 'w')

In [370]:
## Build one lookup of variable code -> metadata (the "get" parameters for the api).
## The subject-table variables are loaded first, then the data-profile variables
## are merged into the same dict.
variables = {}
for url in ('/acs/acs1/subject/variables.json', '/acs/acs1/profile/variables.json'):
    response = get_request(str(2019) + url)
    variables.update(response['variables'])

In [372]:
# Retrieve the list of educational attainment variables by age.
# The pattern is compiled once, outside the loop. It accepts lower-cased labels
# that start with the percent-estimate prefix, mention the "18 to 24" or
# "25 years" population, and have at least one further "!!" level after it.
edu_re = re.compile('^estimate!!percent!!age by educational attainment!!.+(18 to 24|25 years).+!!')  #regex for educational attainment

education = list()
for var, meta in variables.items():
    if edu_re.match(meta['label'].lower()):
        education.append(var)
        print(meta['label'])
print(len(education))

Estimate!!Percent!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Less than 9th grade
Estimate!!Percent!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!9th to 12th grade, no diploma
Estimate!!Percent!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years!!Bachelor's degree or higher
Estimate!!Percent!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years!!High school graduate (includes equivalency)
Estimate!!Percent!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years!!Some college or associate's degree
Estimate!!Percent!!AGE BY EDUCATIONAL ATTAINMENT!!Population 18 to 24 years!!Less than high school graduate
Estimate!!Percent!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!High school graduate (includes equivalency)
Estimate!!Percent!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Bachelor's degree or higher
Estimate!!Percent!!AGE BY EDUCATIONAL ATTAINMENT!!Population 25 years and over!!Graduate or professional degree


In [373]:
# Show the variable codes that were collected in `education`
print(education)

['S1501_C02_007E', 'S1501_C02_008E', 'S1501_C02_005E', 'S1501_C02_003E', 'S1501_C02_004E', 'S1501_C02_002E', 'S1501_C02_009E', 'S1501_C02_015E', 'S1501_C02_013E', 'S1501_C02_014E', 'S1501_C02_011E', 'S1501_C02_012E', 'S1501_C02_010E']


In [374]:
# Retrieve list of occupation variables by sex and occupation category.
# A variable is kept when all of the following hold:
#   * its lower-cased label is the percent male/female total line, or one
#     category directly beneath it whose label ends with ':'
#   * its concept contains the occupation-by-sex concept below
#   * its label has no ':!!', i.e. it is not nested under a category
occupation_re = re.compile('^estimate!!percent (male|female)!!civilian employed population 16 years and over($|!!.+:$)')
cat_str = 'OCCUPATION BY SEX FOR THE CIVILIAN EMPLOYED POPULATION 16 YEARS AND OVER'
cat_lower = cat_str.lower()

occupation = list()
for var, meta in variables.items():
    label = meta['label']
    # The concept is only looked up once the label matched (same order as before).
    if occupation_re.match(label.lower()) and (cat_lower in meta['concept'].lower()) and (':!!' not in label):
        occupation.append(var)
        print(label)
print(len(occupation))

Estimate!!Percent Female!!Civilian employed population 16 years and over!!Natural resources, construction, and maintenance occupations:
Estimate!!Percent Female!!Civilian employed population 16 years and over!!Sales and office occupations:
Estimate!!Percent Female!!Civilian employed population 16 years and over!!Production, transportation, and material moving occupations:
Estimate!!Percent Female!!Civilian employed population 16 years and over!!Service occupations:
Estimate!!Percent Female!!Civilian employed population 16 years and over
Estimate!!Percent Female!!Civilian employed population 16 years and over!!Management, business, science, and arts occupations:
Estimate!!Percent Male!!Civilian employed population 16 years and over
Estimate!!Percent Male!!Civilian employed population 16 years and over!!Management, business, science, and arts occupations:
Estimate!!Percent Male!!Civilian employed population 16 years and over!!Production, transportation, and material moving occupations:
E

In [375]:
# Retrieve list of employment status variables.
# Both patterns are anchored at the start of the string by `match`:
# the lower-cased label must begin with the employment-status prefix and the
# lower-cased concept must begin with 'selected economic characteristics'.
emp_re = re.compile('^percent!!employment status!!population')
concept_re = re.compile('selected economic characteristics')

employment = list()
for var, meta in variables.items():
    # The concept is only looked up once the label matched (same order as before).
    if emp_re.match(meta['label'].lower()) and concept_re.match(meta['concept'].lower()):
        employment.append(var)
        print(meta['label'])
print(len(employment))

Percent!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Civilian labor force!!Unemployed
Percent!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Armed Forces
Percent!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force
Percent!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Civilian labor force
Percent!!EMPLOYMENT STATUS!!Population 16 years and over!!In labor force!!Civilian labor force!!Employed
Percent!!EMPLOYMENT STATUS!!Population 16 years and over
Percent!!EMPLOYMENT STATUS!!Population 16 years and over!!Not in labor force
7


In [376]:
# Retrieve list of family income variables:
#   * percent of families per income bracket (labels mentioning mean/median excluded)
#   * the mean / median family income estimates
# Patterns are compiled once, outside the loop, under distinct names.
income_pct_re = re.compile("percent!!income and benefits.+!!families!!.+")
income_avg_re = re.compile("estimate!!income and benefits.+!!families!!(mean|median).+")
income_str = 'selected economic characteristics'
income_concept = income_str.lower()

income = list()
for var, meta in variables.items():
    label = meta['label'].lower()
    # The two patterns start with different prefixes ("percent!!" vs
    # "estimate!!"), so a label can satisfy at most one of them.
    is_bracket_share = bool(income_pct_re.match(label)) and ('median' not in label) and ('mean' not in label)
    is_mean_or_median = bool(income_avg_re.match(label))
    # The concept must equal (not merely contain) the economic-characteristics concept.
    if (is_bracket_share or is_mean_or_median) and (income_concept == meta['concept'].lower()):
        income.append(var)
        print(label)

print(len(income))

percent!!income and benefits (in 2019 inflation-adjusted dollars)!!families!!$25,000 to $34,999
percent!!income and benefits (in 2019 inflation-adjusted dollars)!!families!!$150,000 to $199,999
percent!!income and benefits (in 2019 inflation-adjusted dollars)!!families!!$200,000 or more
percent!!income and benefits (in 2019 inflation-adjusted dollars)!!families!!$10,000 to $14,999
percent!!income and benefits (in 2019 inflation-adjusted dollars)!!families!!$75,000 to $99,999
estimate!!income and benefits (in 2019 inflation-adjusted dollars)!!families!!median family income (dollars)
estimate!!income and benefits (in 2019 inflation-adjusted dollars)!!families!!mean family income (dollars)
percent!!income and benefits (in 2019 inflation-adjusted dollars)!!families!!$15,000 to $24,999
percent!!income and benefits (in 2019 inflation-adjusted dollars)!!families!!$100,000 to $149,999
percent!!income and benefits (in 2019 inflation-adjusted dollars)!!families!!$35,000 to $49,999
percent!!incom

In [394]:
def _fetch_acs1_table(year, table, codes):
    """Fetch `codes` from one ACS 1-year table for every metro/micro area.

    Parameters
    ----------
    year : int
        Survey year placed at the front of the endpoint.
    table : str
        Endpoint segment after ``acs/acs1/`` ('subject' or 'profile').
    codes : list of str
        Variable codes to request in addition to NAME.

    Returns
    -------
    pandas.DataFrame
        One row per area, indexed by the geography code column. The first row
        of the API result is used as the column header.
    """
    geography = 'metropolitan statistical area/micropolitan statistical area'
    endpoint = '{}/acs/acs1/{}?get=NAME,{}&for=metropolitan%20statistical%20area/micropolitan%20statistical%20area:*'.format(year, table, ','.join(codes))
    results = get_request(endpoint)
    header, rows = results[0], results[1:]
    return pd.DataFrame(rows, columns=header).set_index(geography)


# NOTE(review): the variable codes were selected from the 2019 metadata but are
# requested for every year below, and relabelled with the 2019 labels at the
# end. Confirm each code means the same thing in the earlier years.
yearly_frames = []
for year in range(2010, 2020):  # 2010-2019 inclusive

    # Subject-table variables (education + occupation); keeps the NAME column
    df_subject = _fetch_acs1_table(year, 'subject', education + occupation)

    # Profile-table variables (income + employment); NAME would be redundant
    df_profile = _fetch_acs1_table(year, 'profile', income + employment).drop(columns='NAME')

    # Combine side by side (aligned on the area code) and tag with the year
    df = pd.concat([df_subject, df_profile], axis=1)
    df.index.names = ['cbsa']     # replace name with a more common term
    df = df.reset_index()
    df['year'] = year
    yearly_frames.append(df)

# Concatenate once instead of growing the frame inside the loop.
# NOTE(review): values are stored exactly as the API returned them; no dtype
# conversion happens here - confirm dtypes before doing numeric analysis.
data = pd.concat(yearly_frames, ignore_index=True)

# Flag metropolitan areas (urban like) so they can be filtered later
data['metropolitan_area'] = data['NAME'].str.contains('Metro Area', regex=False)

# Clean up the area name, then separate city and state for convenience.
# The first group is greedy, so the split happens at the last ", " in the name.
# Names without ", " yield NaN in both columns instead of raising.
area_name = data['NAME'].str.replace('( Metro Area| Micro Area)', '', regex=True)
name_parts = area_name.str.extract('(.+), (.+)')
data['city'] = name_parts[0]
data['state'] = name_parts[1]
data = data.drop(columns='NAME')

# Replace column codes with their labels; non-variable columns keep their name
data.columns = [variables[col]['label'] if (col in variables) else col for col in data.columns]

In [395]:
# Keep only the rows flagged as metropolitan areas (urban like), write them
# to disk, and show the shape of what was written.
metro_rows = data[data['metropolitan_area'] == True]
metro_rows.to_csv('acs_survey.csv', index=False)
metro_rows.shape

(3846, 49)

In [397]:
# Total number of missing cells across all columns of the metropolitan rows
data.loc[data['metropolitan_area'] == True].isna().sum().sum()

0