In [2]:
import re;
import pandas as pd;
import geopandas as gpd;
import numpy as np;
import requests
from census import Census
from us import states

# Directories used by this notebook (all relative to the working directory)
raw_dir='raw_data/'
edits_dir='edits/'
pre_edits_dir='pre_edits_hard_to_count/'
output_dir='output/'

# Legislative chambers analysed below; each gets its own set of output files
chambers = ['congress', 'state_house', 'state_senate']

# To reproduce the data below, you'll need to save your
# Census API key to `census-api-key.txt` in the directory this
# notebook runs from (the path below is relative to the working directory).
# You can obtain a key here: https://api.census.gov/data/key_signup.html
# A context manager is used so the key file is closed right after reading.
with open("census-api-key.txt") as key_file:
    api_key = key_file.read().strip()
c = Census(api_key)

In [3]:
# Calling the Census API on every run is wasteful, so the responses
# are held in this in-memory list instead.
# The list only lives as long as the kernel: this cell has to be
# re-run whenever the notebook window is closed and reopened.
acs_data_cache = list()

# Our Census API call
def get_acs_data(chamber):
    """Fetch ACS 5-year table B05002 estimates for every Texas district of a chamber.

    Parameters
    ----------
    chamber : str
        One of 'congress', 'state_senate' or 'state_house'.

    Returns
    -------
    list of dict
        One dict per district with the keys 'district', 'leg_year', 'pop',
        'native', 'foreign_born', 'not_citizen' (estimates), the matching
        '*_moe' margins of error, and 'not_citizen_perc' /
        'not_citizen_moe_perc' (share of the total population estimate,
        rounded to one decimal place; NaN when that estimate is zero or
        missing).

    Raises
    ------
    ValueError
        If `chamber` is not one of the supported names.
    """
    # chamber -> (Census API geography name, label stored as 'leg_year')
    chamber_geos = {
        'congress': ('congressional district', '115th Congress'),
        'state_senate': ('state legislative district (upper chamber)', '2016'),
        'state_house': ('state legislative district (lower chamber)', '2016'),
    }
    if chamber not in chamber_geos:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise ValueError(
            'Unknown chamber {!r}; expected one of {}'.format(
                chamber, sorted(chamber_geos)
            )
        )
    api_geo, leg_year = chamber_geos[chamber]

    def percent_of_pop(value, total_pop):
        # Guard the division: a zero/missing population estimate has no meaningful share
        if not total_pop:
            return float('nan')
        return round((value / total_pop) * 100, 1)

    results = c.acs5.get(
        [
            # Total population
            'B05002_001E', 'B05002_001M',
            # Native
            'B05002_002E', 'B05002_002M',
            # Foreign born
            'B05002_013E', 'B05002_013M',
            # Not a citizen
            'B05002_021E', 'B05002_021M',
        ],
        geo={'for': '{}:*'.format(api_geo),
             'in': 'state:{}'.format(states.TX.fips)
        })

    rows = []
    for res in results:
        total_pop = res['B05002_001E']
        rows.append({
            # The district identifier is returned under the geography name
            'district': int(res[api_geo]),
            'leg_year': leg_year,
            'pop': total_pop,
            'pop_moe': res['B05002_001M'],
            'native': res['B05002_002E'],
            'native_moe': res['B05002_002M'],
            'foreign_born': res['B05002_013E'],
            'foreign_born_moe': res['B05002_013M'],
            'not_citizen': res['B05002_021E'],
            'not_citizen_moe': res['B05002_021M'],
            'not_citizen_perc': percent_of_pop(res['B05002_021E'], total_pop),
            # The margin of error is expressed against the population estimate too
            'not_citizen_moe_perc': percent_of_pop(res['B05002_021M'], total_pop),
        })
    return rows


# Fetch each chamber once and remember the rows alongside the chamber name
for chamber in chambers:
    chamber_rows = get_acs_data(chamber)
    acs_data_cache.append(dict(chamber=chamber, data=chamber_rows))

def get_acs_data_cache(chamber):
    """Return the cached rows for `chamber`.

    Scans `acs_data_cache` in order and hands back the 'data' value of the
    first entry whose 'chamber' equals the argument. Returns None when no
    entry matches.
    """
    matching_data = (
        entry['data'] for entry in acs_data_cache if entry['chamber'] == chamber
    )
    return next(matching_data, None)
        
# Progress marker: the cache for every chamber has been filled
print("Done creating cached version of Census API data")

Done creating cached version of Census API data


In [10]:
# Analysis for each chamber
# Every pass writes three CSVs into output_dir: per-district estimates,
# per-district margins of error, and a summary of totals.
for chamber in chambers:
    print('Working on the ' + chamber + ' sheet')
    
    # Load the ACS rows for this chamber
    # (served from the in-memory cache, so no new API request is made here)
    # End up with foreign born info for each chamber
    print('Calling Census API')
    census_api_data = get_acs_data_cache(chamber)
    census_cols = list(census_api_data[0].keys())
    df_main = pd.DataFrame(census_api_data, columns=census_cols)
    
    print('Done with Census API')
    
    # Use these files to merge party
    # Congressional districts are pulled from our voter tracker
    if chamber == 'congress':
        df_party = pd.read_csv(raw_dir + 'delegation_tracker_2019_congress.csv')
        # Drop rows whose district is the literal 'FALSE'; copy so the column
        # assignments below write to an independent frame, not a filtered
        # slice (avoids SettingWithCopyWarning)
        df_party = df_party[df_party['district'].apply(str) != 'FALSE'].copy()
        df_party['district'] = df_party['district'].apply(int)
        # Map party names to one-letter codes. Whitespace is stripped first so
        # the mapping does not depend on stray spaces around the names.
        df_party['party'] = (
            df_party['party']
            .str.strip()
            .replace({'Democrat': 'D', 'Republican': 'R'})
        )
    # State legislative districts are pulled from demographics sheet we track
    else:
        df_party = pd.read_csv(raw_dir + 'demographics_tx_lege_2019_' + chamber + '.csv')
        # Lower-case the headers so the 'district' / 'party' lookups below match
        df_party.columns = df_party.columns.str.lower()
        
    # Merge the current dataframe with csv that has party info
    # (merge defaults to an inner join: districts missing from either side drop out)
    df_main = df_main.merge(df_party, on='district')
    if df_main.empty:
        # Nothing matched; stop here instead of writing empty/NaN files
        raise ValueError(
            'No ' + chamber + ' districts matched between the Census data and the party file'
        )
    
    # Select columns
    df_main_select = df_main[[
        'district', 'leg_year',
        'pop', 'native', 'foreign_born',
        'not_citizen', 'not_citizen_perc', 'party'
    ]].sort_values(by='not_citizen_perc', ascending=False)

    df_main_select_moe = df_main[[
        'district', 'leg_year',
        'pop_moe', 'native_moe', 'foreign_born_moe',
        'not_citizen_moe', 'not_citizen_moe_perc'
    ]].sort_values(by='not_citizen_moe_perc', ascending=False)

    # Output percent cols
    df_main_select.to_csv(output_dir + 'tx_' + chamber + '_not_citizen.csv', index=False)
    df_main_select_moe.to_csv(output_dir + 'tx_' + chamber + '_not_citizen_moe.csv', index=False)
    print('Done with districts csv')

    # Summary stats for population in districts
    # (every row of a chamber carries the same leg_year, so the columns are
    # aggregated directly rather than through a one-group groupby)
    sum_all_districts = df_main['pop'].sum()
    median_all_districts = df_main['pop'].median()

    # Summary stats for foreign population in districts
    count_not_citizen = df_main['not_citizen'].count()
    sum_not_citizen = df_main['not_citizen'].sum()
    median_not_citizen = round(df_main['not_citizen'].median(), 1)
    mean_not_citizen = round(df_main['not_citizen'].mean(), 1)

    # Create our final csv with these stats
    # Rows are appended at the next integer label, i.e. len(df_summary)
    df_summary = pd.DataFrame(columns=['calculation', 'value'])
    df_summary.loc[len(df_summary)] = ['count_districts', count_not_citizen]
    df_summary.loc[len(df_summary)] = ['pop_not_citizen', sum_not_citizen]
    df_summary.loc[len(df_summary)] = ['pop_all', sum_all_districts]
    df_summary.loc[len(df_summary)] = ['not_citizen_pop_all_perc', round((sum_not_citizen / sum_all_districts) * 100, 1)]
    df_summary.loc[len(df_summary)] = ['median_pop_all_districts', median_all_districts]
    # Total non-citizen population divided by the median district population
    df_summary.loc[len(df_summary)] = ['not_citizen_pop_in_median_district', round(sum_not_citizen / median_all_districts, 1)]
    
    parties = ['D', 'R']
    for party in parties:
        c_df_main_party = df_main[df_main['party'] == party]
        # A plain column sum yields 0 when a party holds no districts,
        # where indexing into an empty groupby result would raise IndexError
        sum_not_citizen_party = c_df_main_party['not_citizen'].sum()

        df_summary.loc[len(df_summary)] = ['pop_not_citizen_' + party + '_districts', sum_not_citizen_party]
        df_summary.loc[len(df_summary)] = ['not_citizen_pop_in_' + party + '_districts_perc', round((sum_not_citizen_party / sum_not_citizen) * 100, 1)]
    
    # Save our file
    df_summary.to_csv(output_dir + 'tx_' + chamber + '_not_citizen_summary.csv', index=False)
    print('Done with summary csv')
    print('---')

print('Done with everything')
print('---')

Working on the congress sheet
Calling Census API
Done with Census API
Done with districts csv
Done with summary csv
---
Working on the state_house sheet
Calling Census API
Done with Census API
Done with districts csv
Done with summary csv
---
Working on the state_senate sheet
Calling Census API
Done with Census API
Done with districts csv
Done with summary csv
---
Done with everything
---
