# Acquire Data - US Census Data via CensusData

**Table of Contents**

1. [Intro](#1.-Intro)
2. [URL](#2.-URL)
3. [Functions](#3.-Functions)

## 1. Intro

We used CensusData, a Python library that wraps the US Census Bureau API endpoints. It can be installed with pip (`pip install CensusData`), and the link in the next section has more information about it.

## 2. URL
https://pypi.org/project/CensusData/

## 3. Functions

In [11]:
import censusdata, pandas as pd
import uszipcode
from uszipcode import SearchEngine

In [8]:
def import_censusdata(year):
    '''Download ACS 5-year estimates for all zip code tabulation areas for one year.

    Uses the CensusData library to request the variables listed below, renames
    them to readable column labels and adds 'zip code' and 'year' columns.

    Parameters:
        year: survey year, passed straight through to censusdata.download.

    Returns:
        A pandas DataFrame with a fresh integer index holding the renamed
        estimate columns followed by 'zip code' and 'year'.
    '''
    # Census variable code -> column label used for EDA.
    # NOTE(review): 'HHI 220K+' and 'While Females 15-17' look like typos for
    # '200K+' and 'White'; they are left untouched because files produced from
    # this frame may already key on these column names - confirm before renaming.
    column_labels = {
        'B02001_001E': 'Total Pop Estimate',
        'B19001_016E': 'HHI 150K-200K',
        'B19001_017E': 'HHI 220K+',
        'B01001_006E': 'Males 15-17',
        'B01001_030E': 'Females 15-17',
        'B01001A_006E': 'White Males 15-17',
        'B01001B_006E': 'Black Males 15-17',
        'B01001I_006E': 'Hispanic Males 15-17',
        'B01001A_021E': 'While Females 15-17',
        'B01001B_021E': 'Black Females 15-17',
        'B01001I_021E': 'Hispanic Females 15-17',
    }
    variable_codes = sorted(column_labels, reverse=True)
    geography = censusdata.censusgeo([('state', '*'), ('zip code tabulation area', '*')])
    estimates = censusdata.download('acs5', year, geography, variable_codes)

    # Move the geography index into an 'index' column so the zip code can be read from it.
    estimates = estimates.rename(columns=column_labels).reset_index()
    # params()[1][1] is the value of the second geography component requested above
    # (presumably the zip code tabulation area code - verify against the CensusData docs).
    estimates['zip code'] = estimates['index'].apply(lambda geo: geo.params()[1][1])
    estimates['year'] = year
    return pd.DataFrame(estimates.drop(columns='index'))


In [9]:
def api_iterator(begin_year, end_year, function):
    '''Call `function` once per year and stack the results into a single DataFrame.

    Parameters:
        begin_year: first year to fetch (inclusive).
        end_year: year to stop at (exclusive, as in range()), so
            api_iterator(2015, 2019, f) fetches 2015 through 2018.
        function: callable that takes a year and returns a pandas DataFrame,
            e.g. import_censusdata.

    Returns:
        A DataFrame with the yearly frames concatenated in year order and a
        fresh 0..n-1 index.

    Raises:
        ValueError: if the year range is empty (begin_year >= end_year).
    '''
    years = range(begin_year, end_year)
    if not years:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # report the actual cause instead.
        raise ValueError(
            f'empty year range: begin_year={begin_year} must be less than end_year={end_year}'
        )
    yearly_frames = [function(year) for year in years]
    # ignore_index=True renumbers the rows, replacing the separate reset_index(drop=True) step.
    return pd.concat(yearly_frames, ignore_index=True)

# api_iterator passes these bounds to range(), so end_year is exclusive: this pulls 2015 through 2018.
census_data = api_iterator(begin_year=2015, end_year=2019, function=import_censusdata)

In [None]:
# Save the consolidated census data to disk (tab-separated, despite the .csv extension)
census_data.to_csv('census raw.csv', sep='\t', index=False)

In [None]:
# Create a helper table with the state, city and zip code for every zip code in the Census data. This table will be used to
# create relationships in the PostgreSQL database where all data will be warehoused.

def locale(data):
    '''Build a zip code -> state / city lookup table from the census data.

    Relies on the uszipcode library's SearchEngine to link each distinct value
    of data['zip code'] with its state and major city.

    Parameters:
        data: DataFrame with a 'zip code' column whose values can be converted with int().

    Returns:
        A DataFrame with one row per distinct zip code (keeping the index labels of
        each zip code's first occurrence in `data`) and the columns 'zip code',
        'state' and 'city'. 'state' and 'city' are None where no match was found.
    '''
    unique_zips = pd.DataFrame(data['zip code'].drop_duplicates())

    # Share one SearchEngine across all lookups (instead of constructing two per
    # zip code) and close it when done; each zip code is looked up only once.
    with SearchEngine() as search:
        matches = [search.by_zipcode(int(code)) for code in unique_zips['zip code']]

    # Some uszipcode versions return None for an unknown zip code; keep such rows
    # with empty state/city rather than failing on the attribute access.
    unique_zips['state'] = [match.state if match is not None else None for match in matches]
    unique_zips['city'] = [match.major_city if match is not None else None for match in matches]
    return unique_zips

# Build the zip code -> state/city table and write it to a csv file to load into the database
local_data = locale(data=census_data)
local_data.to_csv('zip to statecity.csv', index=False)