In [2]:
import geopandas as gpd
import pandas as pd
from tqdm import tqdm

### Merge MA census tract shapefiles with census data from Social Explorer

In [2]:
# One-off preprocessing, kept commented out on purpose. Only re-run it if
# '../data/ma_tracts_census_data.geojson' (written by the last line of this cell
# and read again further down) is missing.


# census_data = pd.read_csv('../data/census_data.csv') #relevant statistics from social explorer, by census tract
# us_tracts_gdf = gpd.read_file("../data/TRACT_2019_US_SL140__2019-11-18_12-56-55-677/TRACT_2019_US_SL140_Coast_Clipped.shp", crs="EPSG:26986") # all US census tracts
# ma_tracts_gdf = us_tracts_gdf[us_tracts_gdf['STATEFP'] == '25'].sort_values(by="GEOID") # filter out just MA census tracts



# ma_tracts_data_gdf = ma_tracts_gdf.merge(census_data, left_on='GEOID', right_on='FIPS', how='left') #left merge because not all of the census tract geos have census data from social explorer
# ma_tracts_data_gdf.to_file("../data/ma_tracts_census_data.geojson", driver='GeoJSON')

In [3]:
# NOTE(review): this is the same path the final cell of this notebook writes with
# `to_file`, so the file must already exist (from a previous run or a download)
# before this cell can execute on a fresh kernel.
new_data = gpd.read_file('../data/preprocessed_data/station_buffer_census_bgs_separate.geojson')


In [10]:
# Interactive map of the first 500 rows only, to keep the rendered output small.
new_data.head(500).explore()


In [3]:
# Load the tract-level census layer and the stop layer, and put both in EPSG:26986.
# The km² conversions further down divide areas by 1e6, i.e. they assume this CRS
# measures in metres.
census_bg_df = gpd.read_file('../data/ma_tracts_census_data.geojson').to_crs(crs="EPSG:26986")
stops_with_buffer = gpd.read_file('../data/mbta_stops.geojson').to_crs(crs="EPSG:26986")


In [4]:
# Full area of every census polygon in km² (CRS units² / 1e6). The variable is
# named "bg", but the file loaded above is the census-tract layer.
census_bg_df['bg_area_km2'] = census_bg_df.geometry.area / 1e6

In [5]:
# Municipality polygons, reprojected to the same CRS as the census and stop layers.
mbtac_gdf = gpd.read_file("../data/mbta_municipalities.geojson").to_crs(crs="EPSG:26986")

In [6]:
# Every layer that takes part in a spatial operation below (clip, sjoin) must share
# one CRS; the municipality layer is included because it is spatially joined later.
assert census_bg_df.crs == stops_with_buffer.crs == mbtac_gdf.crs, (
    "CRS mismatch: "
    f"census={census_bg_df.crs}, stops={stops_with_buffer.crs}, municipalities={mbtac_gdf.crs}"
)

In [7]:
# Census statistics that get spatially weighted onto each stop geometry.
columns_to_weight = [
    'Total Population',
    'Population Density (Per Sq. Mile)',
    'Total Population:',
    'Total Population: Male',
    'Total Population: Female',
    '% Total Population: Male',
    '% Total Population: Female',
    'Total Population.1',
    'Total Population: Not Hispanic or Latino',
    'Total Population: Not Hispanic or Latino: White Alone',
    'Total Population: Not Hispanic or Latino: Black or African American Alone',
    'Total Population: Not Hispanic or Latino: American Indian and Alaska Native Alone',
    'Total Population: Not Hispanic or Latino: Asian Alone',
    'Total Population: Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander Alone',
    'Total Population: Not Hispanic or Latino: Some Other Race Alone',
    'Total Population: Not Hispanic or Latino: Two or More Races',
    'Total Population: Hispanic or Latino',
    'Total Population: Hispanic or Latino: White Alone',
    'Total Population: Hispanic or Latino: Black or African American Alone',
    'Total Population: Hispanic or Latino: American Indian and Alaska Native Alone',
    'Total Population: Hispanic or Latino: Asian Alone',
    'Total Population: Hispanic or Latino: Native Hawaiian and Other Pacific Islander Alone',
    'Total Population: Hispanic or Latino: Some Other Race Alone',
    'Total Population: Hispanic or Latino: Two or More Races',
    '% Total Population: Not Hispanic or Latino',
    '% Total Population: Not Hispanic or Latino: White Alone',
    '% Total Population: Not Hispanic or Latino: Black or African American Alone',
    '% Total Population: Not Hispanic or Latino: American Indian and Alaska Native Alone',
    '% Total Population: Not Hispanic or Latino: Asian Alone',
    '% Total Population: Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander Alone',
    '% Total Population: Not Hispanic or Latino: Some Other Race Alone',
    '% Total Population: Not Hispanic or Latino: Two or More Races',
    '% Total Population: Hispanic or Latino',
    '% Total Population: Hispanic or Latino: White Alone',
    '% Total Population: Hispanic or Latino: Black or African American Alone',
    '% Total Population: Hispanic or Latino: American Indian and Alaska Native Alone',
    '% Total Population: Hispanic or Latino: Asian Alone',
    '% Total Population: Hispanic or Latino: Native Hawaiian and Other Pacific Islander Alone',
    '% Total Population: Hispanic or Latino: Some Other Race Alone',
    '% Total Population: Hispanic or Latino: Two or More Races',
    'Median Household Income (In 2022 Inflation Adjusted Dollars)',
    'Workers 16 Years and Over:',
    'Workers 16 Years and Over: Car, Truck, or Van',
    'Workers 16 Years and Over: Drove Alone',
    'Workers 16 Years and Over: Carpooled',
    'Workers 16 Years and Over: Public Transportation (Includes Taxicab)',
    'Workers 16 Years and Over: Motorcycle',
    'Workers 16 Years and Over: Bicycle',
    'Workers 16 Years and Over: Walked',
    'Workers 16 Years and Over: Other Means',
    'Workers 16 Years and Over: Worked At Home',
    '% Workers 16 Years and Over: Car, Truck, or Van',
    '% Workers 16 Years and Over: Drove Alone',
    '% Workers 16 Years and Over: Carpooled',
    '% Workers 16 Years and Over: Public Transportation (Includes Taxicab)',
    '% Workers 16 Years and Over: Motorcycle',
    '% Workers 16 Years and Over: Bicycle',
    '% Workers 16 Years and Over: Walked',
    '% Workers 16 Years and Over: Other Means',
    '% Workers 16 Years and Over: Worked At Home',
    'Occupied Housing Units',
    'Occupied Housing Units: No Vehicle Available',
    'Occupied Housing Units: 1 Vehicle Available',
    'Occupied Housing Units: 2 Vehicles Available',
    'Occupied Housing Units: 3 Vehicles Available',
    'Occupied Housing Units: 4 Vehicles Available',
    'Occupied Housing Units: 5 or More Vehicles Available',
    '% Occupied Housing Units: No Vehicle Available',
    '% Occupied Housing Units: 1 Vehicle Available',
    '% Occupied Housing Units: 2 Vehicles Available',
    '% Occupied Housing Units: 3 Vehicles Available',
    '% Occupied Housing Units: 4 Vehicles Available',
    '% Occupied Housing Units: 5 or More Vehicles Available',
]

# Per-unit statistics without a '%' in their name. They are rates/medians, not head
# counts, so scaling them by the share of the tract inside a buffer and summing the
# pieces (what happens to the count columns) yields a meaningless number. They are
# therefore handled like the percentage columns: weighted by the share of the buffer
# each tract covers, which sums to an area-weighted average over the buffer.
intensive_columns = [
    'Population Density (Per Sq. Mile)',
    'Median Household Income (In 2022 Inflation Adjusted Dollars)',
]

# Count columns: multiplied by the fraction of the tract that lies inside the buffer.
columns_to_weight_ints = [
    i for i in columns_to_weight if '%' not in i and i not in intensive_columns
]
# Percentage and other per-unit columns: multiplied by the fraction of the buffer
# covered by the tract. (The name is kept because the weighting loop refers to it.)
columns_to_weight_perc = [
    i for i in columns_to_weight if '%' in i or i in intensive_columns
]
# Output column names: same order as `columns_to_weight`, prefixed with 'weighted_'.
renamed_columns = ['weighted_' + str(i) for i in columns_to_weight]

In [8]:
# Cast every statistic column to float and replace missing values with 0 so the
# weighting arithmetic below never sees NaN or non-numeric dtypes.
census_stats_float = census_bg_df[columns_to_weight].astype(float)
census_bg_df[columns_to_weight] = census_stats_float.fillna(0)

In [9]:
# Spatially weight the census statistics onto every geometry in `stops_with_buffer`.
#
# Inputs (defined in earlier cells): `stops_with_buffer`, `census_bg_df` (with a
# 'bg_area_km2' column), `columns_to_weight`, `columns_to_weight_ints`,
# `columns_to_weight_perc`, `renamed_columns`.
# Outputs:
#   - `station_buffer_census_df`: one row per (stop, clipped census polygon) pair.
#   - `stops_with_buffer[renamed_columns]`: per-stop sums of the weighted values.
#
# Per-stop results are collected in lists and combined once after the loop; growing
# a frame with `pd.concat` (and masking the whole stops frame) on every iteration is
# quadratic in the number of stops.

buffer_crs = stops_with_buffer.crs
rename_map = dict(zip(columns_to_weight, renamed_columns))
percent_columns = [c for c in columns_to_weight if '%' in c]

clipped_frames = []   # clipped census pieces, one frame per stop
station_totals = []   # summed weighted values, one array per stop, in iteration order

for idx, station in tqdm(stops_with_buffer.iterrows(), total=len(stops_with_buffer)):
    # Wrap the current stop geometry in a one-row GeoDataFrame so it can be used as a clip mask
    curr_gdf = gpd.GeoDataFrame({'geometry': [station['geometry']]}, crs=buffer_crs)

    # Clip the census polygons to the stop geometry. Both layers were already
    # reprojected to the same CRS in an earlier cell, so no per-iteration
    # reprojection is needed; `.copy()` gives an independent frame to add columns to.
    curr_census = census_bg_df.clip(curr_gdf).copy()

    # Area of each clipped piece in km² (CRS units² / 1e6)
    curr_census['clipped_area_km2'] = curr_census['geometry'].area / 1e6

    # Fraction of the stop geometry covered by each piece (a 0-1 fraction, despite the name)
    curr_census['station_buffer_area_km2'] = station['geometry'].area / 1e6
    curr_census['percent_of_buffer_area'] = curr_census['clipped_area_km2'] / curr_census['station_buffer_area_km2']

    # Fraction of each census polygon that falls inside the stop geometry
    curr_census['percent_of_census_bg_area'] = curr_census['clipped_area_km2'] / curr_census['bg_area_km2']

    weighted_census = curr_census[columns_to_weight].copy()
    # Count columns: keep the share of the polygon that lies inside the stop geometry,
    # rounded to whole units. (No integer cast: the values are written back into float
    # columns, so a cast would not change what is stored.)
    weighted_census.loc[:, columns_to_weight_ints] = (
        weighted_census[columns_to_weight_ints]
        .mul(curr_census['percent_of_census_bg_area'], axis=0)
        .round(decimals=0)
    )
    # Percentage-style columns: weight by the share of the stop geometry each piece
    # covers, so that summing the pieces gives an area-weighted average.
    weighted_census.loc[:, columns_to_weight_perc] = (
        curr_census[columns_to_weight_perc].mul(curr_census['percent_of_buffer_area'], axis=0)
    )

    # A weighted percentage can never exceed 100
    assert not (weighted_census[percent_columns] > 100).any().any(), (
        f"weighted percentage above 100 for stop_id={station['stop_id']!r}"
    )

    # Attach the weighted values to the clipped pieces under their 'weighted_' names
    curr_census[renamed_columns] = weighted_census.rename(columns=rename_map)

    # Per-stop totals: sum over all pieces (zeros when nothing intersects)
    station_totals.append(curr_census[renamed_columns].sum(axis=0).to_numpy())

    # Tag every piece with the stop it belongs to
    curr_census['stop_id'] = station['stop_id']

    # Stops without any intersecting census polygon contribute no rows
    if not curr_census.empty:
        clipped_frames.append(curr_census)

# Write all per-stop totals in one assignment; rows are in iteration order, which is
# the order of `stops_with_buffer.index`.
stops_with_buffer[renamed_columns] = pd.DataFrame(
    station_totals, index=stops_with_buffer.index, columns=renamed_columns
)

# Combine all clipped pieces once
if clipped_frames:
    station_buffer_census_df = pd.concat(clipped_frames, ignore_index=True)
else:
    station_buffer_census_df = gpd.GeoDataFrame(geometry=[], crs=buffer_crs)


7324it [04:50, 25.21it/s]


In [10]:
# Sanity check: look at the clipped census pieces of stops that spatially join to
# the municipality named 'Brookline'.
brookline_gdf = mbtac_gdf[mbtac_gdf['municipality'] == 'Brookline']
brookline_stations = station_buffer_census_df.sjoin(brookline_gdf)['stop_id'].unique()
is_brookline_stop = station_buffer_census_df['stop_id'].isin(brookline_stations)
station_buffer_census_df[is_brookline_stop].head(5)

Unnamed: 0,STATEFP,COUNTYFP,Geo_FIPS,GEOID,NAME,Geo_QNAME,MTFCC,ALAND,AWATER,INTPTLAT,...,weighted_Occupied Housing Units: 3 Vehicles Available,weighted_Occupied Housing Units: 4 Vehicles Available,weighted_Occupied Housing Units: 5 or More Vehicles Available,weighted_% Occupied Housing Units: No Vehicle Available,weighted_% Occupied Housing Units: 1 Vehicle Available,weighted_% Occupied Housing Units: 2 Vehicles Available,weighted_% Occupied Housing Units: 3 Vehicles Available,weighted_% Occupied Housing Units: 4 Vehicles Available,weighted_% Occupied Housing Units: 5 or More Vehicles Available,stop_id
227,25,21,401100,25021401100,4011.0,Census Tract 4011,G5020,4131254,109096,42.3238534,...,11.0,0.0,0.0,1.591667,5.008833,7.945434,1.515873,0.064505,0.0,1027
228,25,21,400600,25021400600,4006.0,Census Tract 4006,G5020,1226102,553,42.3322718,...,13.0,7.0,0.0,2.423517,7.407197,2.580788,0.310674,0.168873,0.0,1027
229,25,25,502,25025000502,5.02,Census Tract 5.02,G5020,847704,334529,42.3315048,...,0.0,4.0,0.0,18.721278,13.247782,6.303163,0.0,0.138278,0.0,1027
230,25,25,503,25025000503,5.03,Census Tract 5.03,G5020,139345,0,42.3396332,...,8.0,0.0,0.0,3.1516,2.992296,0.703421,0.048964,0.0,0.0,1027
231,25,25,504,25025000504,5.04,Census Tract 5.04,G5020,333893,0,42.3414661,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1027


In [11]:
# Attach the stop attributes (everything except the per-stop weighted totals) to
# each clipped census piece. Both inputs carry a 'geometry' column, so the merge
# produces 'geometry_x' (clipped piece) and 'geometry_y' (stop geometry); only the
# clipped piece is kept, under the name 'geometry'.
stop_attribute_columns = [i for i in stops_with_buffer.columns if i not in renamed_columns]
merged_pieces = pd.merge(
    station_buffer_census_df,
    stops_with_buffer[stop_attribute_columns],
    how='left',
    on='stop_id',
)
merged_pieces['geometry'] = merged_pieces.pop('geometry_x')
del merged_pieces['geometry_y']
station_buffer_census_df_complete = gpd.GeoDataFrame(merged_pieces)

In [12]:
# Visual spot check: map the clipped census pieces of a single stop.
station_buffer_census_df_complete.loc[
    station_buffer_census_df_complete['stop_id'].eq('WR-0329-02')
].explore()

In [13]:
# Persist both results. The driver is given explicitly (as in the commented-out
# preprocessing cell at the top) so the output format does not depend on
# extension-based inference.
#   - ..._bgs_separate:  one row per (stop, clipped census polygon) pair
#   - ..._cumulative:    one row per stop with the summed 'weighted_' columns
station_buffer_census_df_complete.to_file('../data/preprocessed_data/station_buffer_census_bgs_separate.geojson', driver='GeoJSON')
stops_with_buffer.to_file('../data/preprocessed_data/station_buffer_census_cumulative.geojson', driver='GeoJSON')

You can find the above files on GDrive:

- `station_buffer_census_bgs_separate.geojson`: [GDrive Link](https://drive.google.com/file/d/1AAlo7GSMCMSBJOBQTCzS9KA7ZcD2m9uI/view?usp=drive_link)
- `station_buffer_census_cumulative.geojson`: [GDrive Link](https://drive.google.com/file/d/181sMgDXKSQLBNk647Dw_5VApV3WfjGP1/view?usp=drive_link)