In [1]:
# !pip install censusgeocode

In [3]:
import glob
import json
import requests
import pandas as pd
from pprint import pprint

# Census Examples 

This notebook uses the `censusgeocode` package in Python (which is simply a wrapper around the US Census' official Geocoder API) to get census geographies for list of addresses or lat/longs

- https://pypi.org/project/censusgeocode/

### Step 1 | Grab your data at the address level

In [17]:
df = pd.read_csv('Housing_violations_raw.csv', low_memory=False, usecols=["ViolationID", "ViolationStatus", "Latitude", "Longitude", "NTA", 'ApprovedDate', 'OriginalCertifyByDate',
       'OriginalCorrectByDate', 'CurrentStatusDate', 'CurrentStatus', 'CensusTract'])
df.head(1)

Unnamed: 0,ViolationID,ApprovedDate,OriginalCertifyByDate,OriginalCorrectByDate,CurrentStatus,CurrentStatusDate,ViolationStatus,Latitude,Longitude,CensusTract,NTA
0,13957715,12/24/2020,01/24/2021,01/19/2021,VIOLATION CLOSED,04/16/2021,Close,40.660628,-73.995235,145.0,Sunset Park West


In [32]:
#395, 793, 792
df = df.drop(labels=[769])

In [18]:
for_geocode = df.drop_duplicates(subset=['CensusTract'], keep='first')
for_geocode.head(1)

Unnamed: 0,ViolationID,ApprovedDate,OriginalCertifyByDate,OriginalCorrectByDate,CurrentStatus,CurrentStatusDate,ViolationStatus,Latitude,Longitude,CensusTract,NTA
0,13957715,12/24/2020,01/24/2021,01/19/2021,VIOLATION CLOSED,04/16/2021,Close,40.660628,-73.995235,145.0,Sunset Park West


In [27]:
# for_geocode.loc[392:]

In [11]:
# for_geocode = for_geocode.drop(labels=392)

### Step 2 | Geoode Lat/Long if they're not already present

It already exists in this dataset. Census geocode has a function to go from addresss --> lat/long, but I haven't had time to implement it here. This dataset already has lat/longs. Message me if you're struggling with this step.

### Step 3 | Get Census Geographies

In [36]:
# Code adapted from. Defines a geocode function and then runs it in parallel (for speed)
# https://gis.stackexchange.com/questions/363830/applying-the-censusgeocode-package-to-an-entire-dataframe-of-geocoded-data

import pandas as pd
import censusgeocode as cg
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm

def geocode(lat, lng):
    census = cg.coordinates(lng, lat)['2020 Census Blocks'][0]

    data = dict(geoid=census['GEOID'], 
                state=census['STATE'], 
                county=census['COUNTY'], 
                tract=census['TRACT'], 
                block=census['BLOCK'])
    
    return data


with ThreadPoolExecutor() as tpe:
    latitudes = df['Latitude']
    longitudes = df['Longitude']
    mapped_results = tpe.map(geocode, latitudes, longitudes)
    data = list(tqdm(mapped_results, total=len(df)))

census_geos_df = pd.DataFrame(data)
census_geos_df.head()

  0%|          | 0/745260 [00:00<?, ?it/s]

ConnectionError: ('Connection aborted.', OSError(22, 'Invalid argument'))

In [10]:
census_geos_df.to_csv('geocode_data.csv', index=False)

In [13]:
census_geos_df1 = pd.read_csv('geocode_data.csv')
census_geos_df1.head(1)

Unnamed: 0,geoid,state,county,tract,block
0,360470145002000,36,47,14500,2000


In [14]:
df1 = pd.read_csv('Housing_violations_raw.csv', low_memory=False, usecols=["ViolationID", "ViolationStatus", "Latitude", "Longitude", "NTA", 'ApprovedDate', 'OriginalCertifyByDate',
       'OriginalCorrectByDate', 'CurrentStatusDate', 'CurrentStatus'])
df1.head(2)

Unnamed: 0,ViolationID,ApprovedDate,OriginalCertifyByDate,OriginalCorrectByDate,CurrentStatus,CurrentStatusDate,ViolationStatus,Latitude,Longitude,NTA
0,13957715,12/24/2020,01/24/2021,01/19/2021,VIOLATION CLOSED,04/16/2021,Close,40.660628,-73.995235,Sunset Park West
1,13975317,01/07/2021,02/08/2021,02/03/2021,VIOLATION CLOSED,04/16/2021,Close,40.849519,-73.909293,Mount Hope


In [7]:
df_with_geos = pd.concat(
    [ 
        df1.reset_index(drop=True),
        census_geos_df1.reset_index(drop=True)
    ], 
    axis=1)

df_with_geos.head()

Unnamed: 0,ViolationID,ApprovedDate,OriginalCertifyByDate,OriginalCorrectByDate,CurrentStatus,CurrentStatusDate,ViolationStatus,Latitude,Longitude,NTA,geoid,state,county,tract,block
0,13957715,12/24/2020,01/24/2021,01/19/2021,VIOLATION CLOSED,04/16/2021,Close,40.660628,-73.995235,Sunset Park West,360470100000000.0,36.0,47.0,14500.0,2000.0
1,13975317,01/07/2021,02/08/2021,02/03/2021,VIOLATION CLOSED,04/16/2021,Close,40.849519,-73.909293,Mount Hope,360050200000000.0,36.0,5.0,23301.0,4000.0
2,14018854,02/10/2021,02/24/2021,02/17/2021,NOV SENT OUT,02/11/2021,Open,40.635605,-74.132229,Port Richmond,360850200000000.0,36.0,85.0,20702.0,3005.0
3,13998721,01/26/2021,02/27/2021,02/22/2021,VIOLATION WILL BE REINSPECTED,05/03/2021,Close,40.851277,-73.931018,Washington Heights North,360610300000000.0,36.0,61.0,26900.0,5001.0
4,13998722,01/26/2021,02/27/2021,02/22/2021,VIOLATION WILL BE REINSPECTED,05/03/2021,Close,40.851277,-73.931018,Washington Heights North,360050400000000.0,36.0,5.0,38500.0,4001.0


In [15]:
df_with_geos

Unnamed: 0,ViolationID,ApprovedDate,OriginalCertifyByDate,OriginalCorrectByDate,CurrentStatus,CurrentStatusDate,ViolationStatus,Latitude,Longitude,NTA,geoid,state,county,tract,block
0,13957715,12/24/2020,01/24/2021,01/19/2021,VIOLATION CLOSED,04/16/2021,Close,40.660628,-73.995235,Sunset Park West,3.604701e+14,36.0,47.0,14500.0,2000.0
1,13975317,01/07/2021,02/08/2021,02/03/2021,VIOLATION CLOSED,04/16/2021,Close,40.849519,-73.909293,Mount Hope,3.600502e+14,36.0,5.0,23301.0,4000.0
2,14018854,02/10/2021,02/24/2021,02/17/2021,NOV SENT OUT,02/11/2021,Open,40.635605,-74.132229,Port Richmond,3.608502e+14,36.0,85.0,20702.0,3005.0
3,13998721,01/26/2021,02/27/2021,02/22/2021,VIOLATION WILL BE REINSPECTED,05/03/2021,Close,40.851277,-73.931018,Washington Heights North,3.606103e+14,36.0,61.0,26900.0,5001.0
4,13998722,01/26/2021,02/27/2021,02/22/2021,VIOLATION WILL BE REINSPECTED,05/03/2021,Close,40.851277,-73.931018,Washington Heights North,3.600504e+14,36.0,5.0,38500.0,4001.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745260,15829335,03/21/2023,04/20/2023,04/15/2023,VIOLATION CLOSED,04/11/2023,Close,40.822618,-73.912773,Morrisania-Melrose,,,,,
745261,15829337,03/21/2023,04/20/2023,04/15/2023,VIOLATION CLOSED,04/11/2023,Close,40.845178,-73.882849,East Tremont,,,,,
745262,15837242,03/23/2023,,,VIOLATION OPEN,03/22/2023,Open,40.837791,-73.885348,Crotona Park East,,,,,
745263,15844558,03/23/2023,04/02/2023,03/28/2023,NOV SENT OUT,03/24/2023,Open,40.760304,-73.912392,Astoria,,,,,


# Step 4 | Pick a geographical level and get Census data
Do you want Census data at the state level? county? tract? block?

1. Pick a geographical level.
2. See `census-example.ipynb` if you want to learn how to get Census data at your desired level

# Hope that helps!