In [103]:
import pandas as pd
import numpy as np
import re

# Census Data

In [104]:
census_2015_df = pd.read_csv('data/acs2015_census_tract_data.csv')
census_2015_df['Year'] = 2015
census_2017_df = pd.read_csv('data/acs2017_census_tract_data.csv')
census_2017_df['Year'] = 2017


In [105]:
cols_2015 = set(census_2015_df.columns)
cols_2017 = set(census_2017_df.columns)

cols_2015_not_2017 = cols_2015 - cols_2017
print('Cols in 2015 but not 2017:', cols_2015_not_2017)

cols_2017_not_2015 = cols_2017 - cols_2015
print('Cols in 2017 but not 2015:', cols_2017_not_2015)


Cols in 2015 but not 2017: {'Citizen', 'CensusTract'}
Cols in 2017 but not 2015: {'VotingAgeCitizen', 'TractId'}


In [106]:
# We can drop the Citizen column in 2015 as it is not present in 2017
census_2015_df = census_2015_df.drop('Citizen', axis=1)
# We can drop the VotingAgeCitizen column in 2017 as it is not present in 2015
census_2017_df = census_2017_df.drop('VotingAgeCitizen', axis=1)

# In 2015, the TractId column was name CensusTract, so we can rename it to match the 2017 column name
census_2015_df = census_2015_df.rename(columns={'CensusTract': 'TractId'})

# We can now concatenate the two dataframes
census_df = pd.concat([census_2015_df, census_2017_df])

In [107]:
print(census_df.shape)

(148002, 37)


In [108]:
pattern = re.compile(r'(?<!^)(?=[A-Z])')
census_df.columns = [pattern.sub('_', col).lower() for col in census_df.columns]
census_df.columns

Index(['tract_id', 'state', 'county', 'total_pop', 'men', 'women', 'hispanic',
       'white', 'black', 'native', 'asian', 'pacific', 'income', 'income_err',
       'income_per_cap', 'income_per_cap_err', 'poverty', 'child_poverty',
       'professional', 'service', 'office', 'construction', 'production',
       'drive', 'carpool', 'transit', 'walk', 'other_transp', 'work_at_home',
       'mean_commute', 'employed', 'private_work', 'public_work',
       'self_employed', 'family_work', 'unemployment', 'year'],
      dtype='object')

In [109]:
# Drop columns that are not needed
columns_to_drop = [
    'child_poverty',
    'income_per_cap_err',
    'income_err',
    'child_poverty',
    'public_work',
    'private_work',
    'family_work',
    'self_employed',
]
census_df.drop(columns=columns_to_drop, inplace=True)

In [110]:
census_df.head()

Unnamed: 0,tract_id,state,county,total_pop,men,women,hispanic,white,black,native,...,drive,carpool,transit,walk,other_transp,work_at_home,mean_commute,employed,unemployment,year
0,1001020100,Alabama,Autauga,1948,940,1008,0.9,87.4,7.7,0.3,...,90.2,4.8,0.0,0.5,2.3,2.1,25.0,943,5.4,2015
1,1001020200,Alabama,Autauga,2156,1059,1097,0.8,40.4,53.3,0.0,...,86.3,13.1,0.0,0.0,0.7,0.0,23.4,753,13.3,2015
2,1001020300,Alabama,Autauga,2968,1364,1604,0.0,74.5,18.6,0.5,...,94.8,2.8,0.0,0.0,0.0,2.5,19.6,1373,6.2,2015
3,1001020400,Alabama,Autauga,4423,2172,2251,10.5,82.8,3.7,1.6,...,86.6,9.1,0.0,0.0,2.6,1.6,25.3,1782,10.8,2015
4,1001020500,Alabama,Autauga,10763,4922,5841,0.7,68.5,24.8,0.0,...,88.0,10.5,0.0,0.0,0.6,0.9,24.8,5037,4.2,2015


In [111]:
len(census_df.state.unique())

52

In [112]:
census_df.state.unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming', 'Puerto Rico'],
      dtype=object)

**District of Colombia** is a federal district, not a state, and **Puerto Rico** is a territory. 

## Processing the Data

Split the data into Tract, County, and State.
The Tract table will have all the columns from the original table, but State and County will have only the columns `State` and `County` respectively with each row having a unique `id` column. We already have data for county and state. 

In [113]:
census_df.head()
census_df["tract_id"] = census_df["tract_id"].astype(str)
census_df_final = census_df[['tract_id']]
census_df_final = pd.concat([census_df_final, census_df[['year']]], axis = 1)
county_cw = pd.read_csv("data/county.csv")
census_df_final['tract_id'] = census_df_final['tract_id'].astype(str)
def s_add_zero(s):
    if len(s) == 10:
        return '0' + s
    else:
        return s
census_df_final['tract_id'] = census_df_final['tract_id'].apply(s_add_zero)
census_df_final['state_id'] = census_df_final['tract_id'].str.slice(start = 0, stop = 2)
census_df_final['county_id'] = census_df_final['tract_id'].str.slice(start = 2, stop = 5)

In [114]:
census_df_final['population'] = census_df['total_pop']
census_df_final['employed_population'] = census_df['employed']
census_df_final['poverty_percent'] = (census_df['poverty']/100).round(2)
census_df_final['walkability'] = (1/census_df['mean_commute']).round(2)
census_df_final['avg_income'] = census_df['income']
census_df_final['top_ethnicity'] = census_df[['hispanic', 'white', 'black', 'asian', 'pacific', 'native']].idxmax(axis=1)
census_df_final['top_mode_of_transport'] = census_df[['drive', 'carpool', 'transit', 'walk', 'other_transp']].idxmax(axis=1)
census_df['diversity'] = 0.1*census_df[['men','women']].std(axis = 1)+census_df[['hispanic', 'white', 'black', 'asian', 'pacific', 'native']].std(axis = 1)
census_df['diversity_cat'] = pd.qcut(census_df['diversity'], q=[0, 0.25, 0.75, 1], labels=['high', 'medium', 'low'])
census_df_final['diversity'] = census_df['diversity_cat']
census_df_final['work_from_home'] = pd.qcut(census_df['work_at_home'], q=[0, 0.25, 0.75, 1], labels=['low', 'medium', 'high'])
census_df['job_diversity'] = census_df[['construction', 'production', 'service', 'professional', 'office']].idxmax(axis=1)
census_df['job_diversity_cat'] = pd.qcut(census_df['diversity'], q=[0, 0.25, 0.75, 1], labels=['low', 'medium', 'high'])
census_df_final['job_diversity'] = census_df['job_diversity_cat']

  census_df_final['top_ethnicity'] = census_df[['hispanic', 'white', 'black', 'asian', 'pacific', 'native']].idxmax(axis=1)
  census_df_final['top_mode_of_transport'] = census_df[['drive', 'carpool', 'transit', 'walk', 'other_transp']].idxmax(axis=1)
  census_df['job_diversity'] = census_df[['construction', 'production', 'service', 'professional', 'office']].idxmax(axis=1)


In [115]:
census_df.head(100)

Unnamed: 0,tract_id,state,county,total_pop,men,women,hispanic,white,black,native,...,other_transp,work_at_home,mean_commute,employed,unemployment,year,diversity,diversity_cat,job_diversity,job_diversity_cat
0,1001020100,Alabama,Autauga,1948,940,1008,0.9,87.4,7.7,0.3,...,2.3,2.1,25.0,943,5.4,2015,39.835102,medium,professional,medium
1,1001020200,Alabama,Autauga,2156,1059,1097,0.8,40.4,53.3,0.0,...,0.7,0.0,23.4,753,13.3,2015,26.841812,high,service,low
2,1001020300,Alabama,Autauga,2968,1364,1604,0.0,74.5,18.6,0.5,...,0.0,2.5,19.6,1373,6.2,2015,46.584169,medium,professional,medium
3,1001020400,Alabama,Autauga,4423,2172,2251,10.5,82.8,3.7,1.6,...,2.6,1.6,25.3,1782,10.8,2015,38.333703,medium,professional,medium
4,1001020500,Alabama,Autauga,10763,4922,5841,0.7,68.5,24.8,0.0,...,0.6,0.9,24.8,5037,4.2,2015,92.289083,low,professional,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1015001800,Alabama,Calhoun,6488,2851,3637,4.0,79.7,10.3,0.3,...,1.6,2.5,26.0,2527,10.5,2015,87.042270,low,professional,high
96,1015002000,Alabama,Calhoun,7063,3429,3634,4.0,92.7,1.8,0.5,...,2.0,1.2,25.4,2902,5.8,2015,51.856845,medium,professional,medium
97,1015002101,Alabama,Calhoun,3778,1997,1781,0.9,59.5,37.2,0.0,...,1.6,8.8,20.4,1767,15.2,2015,40.997691,medium,service,medium
98,1015002102,Alabama,Calhoun,3315,1590,1725,3.3,86.8,6.9,0.0,...,0.7,4.5,20.2,1335,12.3,2015,44.175837,medium,professional,medium


In [116]:
census_df.dtypes


tract_id               object
state                  object
county                 object
total_pop               int64
men                     int64
women                   int64
hispanic              float64
white                 float64
black                 float64
native                float64
asian                 float64
pacific               float64
income                float64
income_per_cap        float64
poverty               float64
professional          float64
service               float64
office                float64
construction          float64
production            float64
drive                 float64
carpool               float64
transit               float64
walk                  float64
other_transp          float64
work_at_home          float64
mean_commute          float64
employed                int64
unemployment          float64
year                    int64
diversity             float64
diversity_cat        category
job_diversity          object
job_divers

In [117]:
census_df_final.head()

Unnamed: 0,tract_id,year,state_id,county_id,population,employed_population,poverty_percent,walkability,avg_income,top_ethnicity,top_mode_of_transport,diversity,work_from_home,job_diversity
0,1001020100,2015,1,1,1948,943,0.08,0.04,61838.0,white,drive,medium,medium,medium
1,1001020200,2015,1,1,2156,753,0.26,0.04,32303.0,black,drive,high,low,low
2,1001020300,2015,1,1,2968,1373,0.13,0.05,44922.0,white,drive,medium,medium,medium
3,1001020400,2015,1,1,4423,1782,0.02,0.04,54329.0,white,drive,medium,low,medium
4,1001020500,2015,1,1,10763,5037,0.11,0.04,51965.0,white,drive,low,low,high


In [118]:
census_df_final["tract_id"] = census_df_final["tract_id"].astype(str)
census_df_final["state_id"] = census_df_final["state_id"].astype(str)
census_df_final["county_id"] = census_df_final["state_id"] + census_df_final["county_id"]
len(census_df_final)

148002

In [119]:
county = pd.read_csv("data/county.csv", dtype=str)
# Only keep the rows where county_id is in the county.csv
census_df_final = census_df_final[census_df_final["county_id"].isin(county["county_id"])]
len(census_df_final)

146798

In [120]:
census_df_final.to_csv("data/tract.csv", index=False)