In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Projected CRS that every layer in this notebook is reprojected to before any
# geometry math. Areas computed later are divided by 1e6 to obtain km², so this
# CRS is assumed to use metres as its unit — TODO confirm.
CRS_MA = "EPSG:26986"

### Merge MA census tract shapefiles with census data from Social Explorer

In [3]:
# # Only read these in again if you don't have the ma census tracts files
# ## Relevant statistics from social explorer, by census tract
# census_data = pd.read_csv('../data/census_data_by_tract.csv')
# ## All US census tracts
# us_tracts_gdf = gpd.read_file("../data/TRACT_2019_US_SL140__2019-11-18_12-56-55-677/TRACT_2019_US_SL140_Coast_Clipped.shp", crs="EPSG:26986")
# ## Filter out just MA census tracts
# ma_tracts_gdf = us_tracts_gdf[us_tracts_gdf['STATEFP'] == '25'].sort_values(by="GEOID")
# 
# ## Left merge because not all the census tract geos have census data from social explorer
# ma_tracts_data_gdf = ma_tracts_gdf.merge(census_data, left_on='GEOID', right_on='FIPS', how='left')
# ma_tracts_data_gdf.to_file("../data/ma_tracts_census_data.geojson", driver='GeoJSON')

In [4]:
# Load the tract-level census polygons and the buffered stops, reprojecting each
# layer to CRS_MA right after reading so all later geometry math shares one CRS.
census_tract_df = gpd.read_file(
    '../code/static/data/ma_tracts_census_data.geojson'
).to_crs(crs=CRS_MA)
stops_with_buffer = gpd.read_file(
    '../code/static/data/mbta_community_stops_with_buffer.geojson'
).to_crs(crs=CRS_MA)

In [5]:
# from collections import Counter
# def rename_duplicates(old_cols):
#     counts = Counter()
#     new_cols = []
#     for col in old_cols:
#         counts[col] += 1
#         new_cols.append(col if counts[col] == 1 else f"{col}_{counts[col]}")
#     return new_cols
# census_tract_df.columns = rename_duplicates(census_tract_df.columns)
# census_tract_df = census_tract_df.drop(columns=census_tract_df.columns[census_tract_df.columns.str.contains('Total Population: Female_')])
# census_tract_df.to_file('../data/ma_tracts_census_data.geojson')

In [6]:
# Full area of every census tract, converted to km² by dividing by 1e6.
# Used later as the denominator of the "share of the tract inside a buffer".
census_tract_df['tract_area_km2'] = census_tract_df.geometry.area / 1e6

In [7]:
# Municipality polygons, reprojected to CRS_MA like the other layers.
mbtac_gdf = gpd.read_file(
    "../code/static/data/mbta_municipalities.geojson"
).to_crs(crs=CRS_MA)

In [8]:
# Clipping tracts to station buffers only makes sense when both layers share a CRS.
layers_share_crs = census_tract_df.crs == stops_with_buffer.crs
assert layers_share_crs

In [9]:
# Census columns that get area-weighted per station buffer.
#
# The names are kept in an ordered list and de-duplicated with dict.fromkeys
# rather than built from a set: iterating a set of strings follows hash order,
# which changes between kernel runs, so the column order of every derived frame
# (and of the exported GeoJSON files) would not be reproducible.
columns_to_weight = list(dict.fromkeys([
    # Denominator for income categories
    'Households:',
    'Households: Less than $25,000',
    'Households: $25,000 to $49,999',
    'Households: $50,000 to $74,999',
    'Households: $75,000 to $99,999',
    'Households: $100,000 or More',

    # This is the total population of the census tract (count)
    'Total Population',
    # Denominator for age categories beginning with male
    'Total Population: Male',
    'Total Population: Male: Under 18 Years',
    'Total Population: Male: 18 to 34 Years',
    'Total Population: Male: 35 to 64 Years',
    'Total Population: Male: 65 Years and Over',
    # Denominator for age categories beginning with female
    'Total Population: Female',
    'Total Population: Female: Under 18 Years',
    'Total Population: Female: 18 to 34 Years',
    'Total Population: Female: 35 to 64 Years',
    'Total Population: Female: 65 Years and Over',

    # Merging all racial categories within "Hispanic and Latino" so that
    # there aren't an overwhelming number of racial categories
    'Total Population: Hispanic or Latino',
    'Total Population: Not Hispanic or Latino',
    'Total Population: Not Hispanic or Latino: White Alone',
    'Total Population: Not Hispanic or Latino: Black or African American Alone',
    'Total Population: Not Hispanic or Latino: American Indian and Alaska Native Alone',
    'Total Population: Not Hispanic or Latino: Asian Alone',
    'Total Population: Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander Alone',
    'Total Population: Not Hispanic or Latino: Some Other Race Alone',
    'Total Population: Not Hispanic or Latino: Two or More Races',

    # NOTE(review): every column in this list is later multiplied by an area
    # fraction and summed across tracts. That is sound for counts, but a median
    # is not additive, so the resulting "Weighted Median Household Income" is
    # probably not a meaningful income figure — confirm how it is used downstream.
    'Median Household Income (In 2022 Inflation Adjusted Dollars)',

    'Workers 16 Years and Over:',
    'Workers 16 Years and Over: Car, Truck, or Van',
    'Workers 16 Years and Over: Drove Alone',
    'Workers 16 Years and Over: Public Transportation (Includes Taxicab)',
    'Workers 16 Years and Over: Motorcycle',
    'Workers 16 Years and Over: Bicycle',
    'Workers 16 Years and Over: Walked',
    'Workers 16 Years and Over: Other Means',
    'Workers 16 Years and Over: Worked At Home',

    'Occupied Housing Units',
    'Occupied Housing Units: No Vehicle Available',
    'Occupied Housing Units: 1 Vehicle Available',
    'Occupied Housing Units: 2 Vehicles Available',
]))

# Name of the area-weighted counterpart of each column, in the same order.
renamed_columns = [f'Weighted {column}' for column in columns_to_weight]

In [10]:
# Coerce every column that will be area-weighted to float and replace missing
# census values with 0.
weighted_inputs = census_tract_df[columns_to_weight].astype(float)
census_tract_df[columns_to_weight] = weighted_inputs.fillna(0)

In [11]:
# For every station buffer: clip the census tracts to the buffer, scale each
# tract's columns by the share of the tract's area that lies inside the buffer,
# then
#   * write the per-station totals onto `stops_with_buffer` (`renamed_columns`), and
#   * keep the clipped per-tract rows, tagged with the stop name, in
#     `station_buffer_census_df`.
# Assumes `stops_with_buffer` and `census_tract_df` are GeoDataFrames in CRS_MA.

# Clipped tract rows per station; concatenated once after the loop because
# growing a frame with pd.concat on every iteration re-copies all earlier rows.
clipped_tracts_per_station = []

# Loop through each station in 'stops_with_buffer'
for idx, station in tqdm(stops_with_buffer.iterrows(), total=len(stops_with_buffer)):
    # Create a GeoDataFrame for the current station's buffer geometry
    curr_gdf = gpd.GeoDataFrame({'geometry': [station['geometry']]}, crs=CRS_MA)

    # Clip the census tracts to the current station's buffer. The clip result is
    # already in CRS_MA, so no reprojection is needed; copy it so the columns
    # added below are written to an independent frame.
    curr_census = census_tract_df.clip(curr_gdf).copy()

    # Area of each clipped tract piece in square kilometers
    curr_census['clipped_area_km2'] = curr_census['geometry'].area / 1e6

    # Share of the station buffer covered by each clipped piece
    curr_census['station_buffer_area_km2'] = station['geometry'].area / 1e6
    curr_census['percent_of_buffer_area'] = curr_census['clipped_area_km2'] / curr_census['station_buffer_area_km2']

    # Share of each census tract that lies inside the buffer (a 0-1 fraction)
    curr_census['percent_of_census_tract_area'] = curr_census['clipped_area_km2'] / curr_census['tract_area_km2']

    # A clipped piece can never be larger than the tract it came from. The small
    # tolerance only absorbs floating-point noise in the two area computations.
    assert (curr_census['percent_of_census_tract_area'] <= 1 + 1e-6).all()

    # Recompute columns to reflect the spatial weighting with respect to the census tract.
    # Columns that are in absolute values can just be multiplied by the share of
    # the tract that is in the station buffer, then rounded to whole units.
    weighted_census = curr_census[columns_to_weight].copy()
    weighted_census.loc[:, columns_to_weight] = weighted_census[columns_to_weight].mul(curr_census['percent_of_census_tract_area'], axis=0)
    weighted_census.loc[:, columns_to_weight] = weighted_census[columns_to_weight].round(decimals=0).astype(int)

    # Duplicate column names would make the rename/assignment below ambiguous
    assert weighted_census.columns.is_unique

    # Finally, we assign the calculated values to the clipped tracts under the "Weighted ..." names
    curr_census[renamed_columns] = weighted_census.rename(columns=dict(zip(columns_to_weight, renamed_columns)))

    # Sum the weighted values over all clipped tracts and store them on the station's row
    summed_census = curr_census[renamed_columns].sum(axis=0)
    stops_with_buffer.loc[stops_with_buffer.index == idx, renamed_columns] = summed_census[renamed_columns].to_numpy()

    # Tag the clipped tracts with the station they belong to
    curr_census['stop_name'] = station['stop_name']

    clipped_tracts_per_station.append(curr_census)

# Build the combined frame in one pass; fall back to an empty GeoDataFrame when
# there were no stations at all (pd.concat raises on an empty list).
if clipped_tracts_per_station:
    station_buffer_census_df = pd.concat(clipped_tracts_per_station, ignore_index=True)
else:
    station_buffer_census_df = gpd.GeoDataFrame()


164it [00:03, 46.05it/s]


In [12]:
# Render the station buffers as the cell's output for visual inspection.
stops_with_buffer.explore()

In [13]:
# Attach each station's non-weighted attributes to its clipped tract rows,
# matching on stop_name. Both inputs carry a geometry column, so the merge
# yields geometry_x (clipped tract) and geometry_y (station buffer); the clipped
# tract geometry is kept as the active geometry.
station_attribute_columns = [
    col for col in stops_with_buffer.columns if col not in renamed_columns
]
merged_tracts = pd.merge(
    station_buffer_census_df,
    stops_with_buffer[station_attribute_columns],
    how='left',
    on='stop_name',
)
merged_tracts['geometry'] = merged_tracts['geometry_x']
merged_tracts = merged_tracts.drop(columns=['geometry_x', 'geometry_y'])
station_buffer_census_df_complete = gpd.GeoDataFrame(merged_tracts, crs=CRS_MA)

In [14]:
# Spot-check: all clipped tract rows that belong to one station.
station_buffer_census_df_complete.loc[
    station_buffer_census_df_complete['stop_name'].eq('Coolidge Corner')
]

Unnamed: 0,Total Population: Not Hispanic or Latino: Black or African American Alone,Occupied Housing Units,Workers 16 Years and Over: Walked,Total Population: Male: 35 to 64 Years,"Workers 16 Years and Over: Car, Truck, or Van","Households: $25,000 to $49,999","Households: $75,000 to $99,999",Total Population: Not Hispanic or Latino: Some Other Race Alone,Total Population: Female: 18 to 34 Years,Total Population: Male: 65 Years and Over,...,sq_miles,pop2020,area_acres,type,aland20,community,mbta_comm_type,housing_units_2020,min_rf1_cap_req,geometry
176,41.0,1554.0,387.0,583.0,495.0,158.0,135.0,20.0,818.0,163.0,...,6.83,63191.0,4371.28,T,17507686.0,Brookline,subway or light rail,27961,6990.25,"POLYGON ((231006.804 898512.883, 231022.095 89..."
177,16.0,2467.0,533.0,741.0,1105.0,187.0,451.0,0.0,1304.0,305.0,...,6.83,63191.0,4371.28,T,17507686.0,Brookline,subway or light rail,27961,6990.25,"POLYGON ((230969.265 898877.386, 230963.183 89..."
178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.83,63191.0,4371.28,T,17507686.0,Brookline,subway or light rail,27961,6990.25,"POLYGON ((230457.271 899409.873, 230482.213 89..."
179,215.0,1931.0,657.0,553.0,410.0,244.0,162.0,21.0,473.0,166.0,...,6.83,63191.0,4371.28,T,17507686.0,Brookline,subway or light rail,27961,6990.25,"POLYGON ((231578.950 898575.299, 231619.460 89..."
180,54.0,2526.0,954.0,858.0,583.0,190.0,291.0,0.0,1218.0,366.0,...,6.83,63191.0,4371.28,T,17507686.0,Brookline,subway or light rail,27961,6990.25,"POLYGON ((231313.366 899050.412, 231366.316 89..."
181,253.0,2448.0,808.0,568.0,719.0,359.0,219.0,1.0,1365.0,350.0,...,6.83,63191.0,4371.28,T,17507686.0,Brookline,subway or light rail,27961,6990.25,"POLYGON ((231978.633 898956.773, 231927.702 89..."
182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.83,63191.0,4371.28,T,17507686.0,Brookline,subway or light rail,27961,6990.25,"POLYGON ((231596.284 899760.799, 231588.908 89..."
183,20.0,1752.0,404.0,634.0,746.0,214.0,297.0,50.0,1072.0,254.0,...,6.83,63191.0,4371.28,T,17507686.0,Brookline,subway or light rail,27961,6990.25,"POLYGON ((230966.840 899896.402, 231020.809 89..."


In [15]:
# Making sure that the total populations add up: the weighted total must equal
# Hispanic + the sum of the "Not Hispanic or Latino: <race>" sub-categories.
# The absolute value is taken of the *difference*, so that parts exceeding the
# total are caught as well as parts falling short of it.
not_hispanic_race_columns = [
    c for c in station_buffer_census_df_complete
    if 'Weighted Total Population: Not Hispanic or Latino:' in c
]
ethnicity_residual = (
    station_buffer_census_df_complete['Weighted Total Population']
    - (
        station_buffer_census_df_complete['Weighted Total Population: Hispanic or Latino']
        + station_buffer_census_df_complete[not_hispanic_race_columns].sum(axis=1)
    )
)
# Every weighted value was rounded to a whole number on its own, so each term
# (the total, Hispanic, and every race sub-category) can be off by up to 0.5.
ethnicity_tolerance = 0.5 * (len(not_hispanic_race_columns) + 2)
assert (ethnicity_residual.abs() <= ethnicity_tolerance).all()

In [16]:
# stops_with_buffer['Weighted Total Population'] -\
#  (
#          stops_with_buffer['Weighted Total Population: Hispanic or Latino'] +
#          stops_with_buffer[[c for c in stops_with_buffer if 'Weighted Total Population: Not Hispanic or Latino:' in c]].sum(axis=1)
#  )

In [17]:
# Making sure that the total populations add up: weighted total == male + female.
# The absolute value is taken of the *difference* so that both over- and
# under-counts are caught. Three independently rounded values can drift by at
# most 1.5, so the original threshold of 3 is kept.
sex_residual = (
    station_buffer_census_df_complete['Weighted Total Population']
    - (
        station_buffer_census_df_complete['Weighted Total Population: Male']
        + station_buffer_census_df_complete['Weighted Total Population: Female']
    )
)
assert (sex_residual.abs() < 3).all()

In [18]:
# The eight sex-by-age bands must add up to the weighted total population.
age_band_columns = [
    'Weighted Total Population: Male: Under 18 Years',
    'Weighted Total Population: Male: 18 to 34 Years',
    'Weighted Total Population: Male: 35 to 64 Years',
    'Weighted Total Population: Male: 65 Years and Over',
    'Weighted Total Population: Female: Under 18 Years',
    'Weighted Total Population: Female: 18 to 34 Years',
    'Weighted Total Population: Female: 35 to 64 Years',
    'Weighted Total Population: Female: 65 Years and Over',
]
# The absolute value is taken of the *difference* so that both over- and
# under-counts are caught.
age_band_residual = (
    station_buffer_census_df_complete['Weighted Total Population']
    - station_buffer_census_df_complete[age_band_columns].sum(axis=1)
)
# Every weighted value was rounded to a whole number on its own, so the total
# and each of the bands can each be off by up to 0.5.
age_band_tolerance = 0.5 * (len(age_band_columns) + 1)
assert (age_band_residual.abs() <= age_band_tolerance).all()

## Convert the absolute-count columns to shares of their denominator, then rename all columns to lowercase, underscore-separated names (e.g. `percentage_weighted_households_less_than_25000`)

In [19]:
# Each key is a denominator column; the suffixes listed under it, appended to
# the key, give the numerator columns that are divided by that denominator.
_numerator_suffixes = {
    'Weighted Households:': [
        ' Less than $25,000',
        ' $25,000 to $49,999',
        ' $50,000 to $74,999',
        ' $75,000 to $99,999',
        ' $100,000 or More',
    ],
    'Weighted Total Population': [
        ': Male',
        ': Male: Under 18 Years',
        ': Male: 18 to 34 Years',
        ': Male: 35 to 64 Years',
        ': Male: 65 Years and Over',
        ': Female',
        ': Female: Under 18 Years',
        ': Female: 18 to 34 Years',
        ': Female: 35 to 64 Years',
        ': Female: 65 Years and Over',

        ': Hispanic or Latino',
        ': Not Hispanic or Latino',
        ': Not Hispanic or Latino: White Alone',
        ': Not Hispanic or Latino: Black or African American Alone',
        ': Not Hispanic or Latino: American Indian and Alaska Native Alone',
        ': Not Hispanic or Latino: Asian Alone',
        ': Not Hispanic or Latino: Native Hawaiian and Other Pacific Islander Alone',
        ': Not Hispanic or Latino: Some Other Race Alone',
        ': Not Hispanic or Latino: Two or More Races',
    ],
    'Weighted Occupied Housing Units': [
        ': No Vehicle Available',
        ': 1 Vehicle Available',
        ': 2 Vehicles Available',
    ],
    'Weighted Workers 16 Years and Over:': [
        ' Car, Truck, or Van',
        ' Drove Alone',
        ' Public Transportation (Includes Taxicab)',
        ' Motorcycle',
        ' Bicycle',
        ' Walked',
        ' Other Means',
        ' Worked At Home',
    ],
}

# Denominator column -> list of full numerator column names.
percentage_mapping = {
    denominator: [denominator + suffix for suffix in suffixes]
    for denominator, suffixes in _numerator_suffixes.items()
}

In [20]:
# For every denominator, divide each of its numerator columns by it and store
# the result, rounded to two decimals, in a new `percentage_<column>` column.
# Despite the prefix, the stored values are plain ratios (they are not
# multiplied by 100).
for denominator, percentage_columns in percentage_mapping.items():
    renamed_percentage_columns = [f'percentage_{c}' for c in percentage_columns]

    # Force float arrays so the `out` buffer below can hold fractional results
    numerators = stops_with_buffer[percentage_columns].to_numpy(dtype=float)
    # Trailing axis so the per-station denominator broadcasts across all numerator columns
    denominators = stops_with_buffer[denominator].to_numpy(dtype=float)[..., None]

    # Only divide where the denominator is non-zero; everywhere else the
    # pre-filled 0 from `out` is kept. Masking on the denominator (rather than
    # the numerator) is what actually prevents division by zero.
    shares = np.divide(
        numerators,
        denominators,
        out=np.zeros_like(numerators),
        where=denominators != 0,
    )
    stops_with_buffer[renamed_percentage_columns] = shares.round(decimals=2)

In [21]:
# Normalise column names: drop , ( ) $ : characters, turn spaces into
# underscores, and lowercase everything.
_column_name_table = str.maketrans({
    ',': '',
    '(': '',
    ')': '',
    '$': '',
    ':': '',
    ' ': '_',
})
stops_with_buffer.columns = [
    column.translate(_column_name_table).lower()
    for column in stops_with_buffer.columns
]

In [23]:
# Write the per-tract breakdown once, and the per-station frame to both of its
# destinations.
station_buffer_census_df_complete.to_file(
    '../data/preprocessed_data/station_buffer_census_tracts_separate.geojson'
)
for cumulative_path in (
    '../data/preprocessed_data/station_buffer_census_cumulative.geojson',
    '../code/static/data/mbta_community_stops_with_buffer_and_census.geojson',
):
    stops_with_buffer.to_file(cumulative_path)