- The code reads a parquet file with building polygons, reprojects it to a metric CRS (EPSG:3857), and computes two fields: 1) `FDR_PART` = perimeter × height and 2) `VOLUME_PART` = area × height.
- Next, processing the target buildings in batches of 100, it builds a 100 m buffer around each target building's polygon and uses a spatial index plus `sjoin(..., predicate="within")` to find the buildings that lie fully inside that buffer. It then computes three attributes per target building:
  1) `building_density_100` - the count of matched buildings (within the buffer),
  2) `FDR` - the target's `FDR_PART` summed once per matched building, divided by 100 × 100 (target-weighted FDR),
  3) `BUILT_UP_VOLUME` - the target's `VOLUME_PART` summed once per matched building, divided by 100 × 100.
- Lastly, it concatenates all batches, reprojects back to EPSG:4326, and saves the output parquet file.

In [5]:
import pandas as pd
import geopandas as gpd
from shapely import wkb, wkt
from tqdm import tqdm

# --- Load data ---
# `path` is the input table; `output_file` is where the enriched table is
# written at the end of the notebook.
path = "churu_buildings_with_population.parquet"
output_file = "churu_buildings_with_morphology.parquet"

# Plain pandas read: the `geometry` column arrives as raw WKB bytes and is
# decoded into shapely geometries in a later cell.
df = pd.read_parquet(path)
df.head()


Unnamed: 0,perimeter_in_meters,building_faces,bf_source,confidence_left,geometry,longitude,latitude,id,area_in_meters,height_mean,...,bvnvb,Shape_Leng,Shape_Area,Ward_sqkm,Percentage,Per_inter,calc_area_m2,inhabitants_whole_churu,inhabitants_with_integer_estimate,inhabitants_with_integer_informal
0,6.881646,4,google,0.7285,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,74.974198,28.307729,74.9741978462364:28.307729275278096,0.000136,2.166667,...,47.0,3027.168581,201005.4,0.201,,0.995,201166.29,0.149795,1.0,1.0
1,7.007497,4,google,0.7928,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,74.951391,28.307306,74.9513908477859:28.30730566614702,0.000141,0.0,...,1.0,8789.786444,2305656.0,2.307,4.725,8.626,2307499.98,0.155496,1.0,1.0
2,6.90984,4,google,0.7027,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,74.962459,28.299556,74.96245903512192:28.2995564344419,0.000149,4.222222,...,10.0,1470.395664,64811.62,0.065,,98.462,64863.48,0.163704,1.0,1.0
3,7.235648,4,google,0.7013,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,74.963972,28.297278,74.96397245063798:28.297277583031956,0.000154,6.9,...,17.0,1810.881476,125811.1,0.126,,2.381,125911.76,0.339073,1.0,1.0
4,7.054727,4,google,0.7199,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x05\x00...,74.970836,28.314113,74.97083611088743:28.314113464876772,0.000156,2.166667,...,51.0,1511.571591,59415.58,0.059,,1.695,59463.12,0.171375,1.0,1.0


In [6]:
# --- Build the GeoDataFrame and the per-building contributions ---
# `geometry` is stored as WKB bytes in the parquet file, so decode it before
# handing it to geopandas. The source coordinates are lon/lat (EPSG:4326).
gdf = gpd.GeoDataFrame(df, geometry=df['geometry'].apply(wkb.loads), crs="EPSG:4326")

# NOTE(review): EPSG:3857 (Web Mercator) stretches lengths by about
# 1/cos(latitude). At the ~28.3 deg N latitudes shown in the sample rows, 100 map
# units is roughly 88 m on the ground. If true metres matter for the buffers
# built later, consider a local UTM CRS (e.g. `gdf.estimate_utm_crs()`).
gdf = gdf.to_crs("EPSG:3857")  # reproject to metric crs

# Per-building parts, computed as vectorized column products (same values as a
# row-wise apply, without the per-row Python overhead). They are inserted at
# position 0 in this order so the final column order is VOLUME_PART, FDR_PART, ...
# NOTE(review): the sample rows show `area_in_meters` around 1e-4 next to
# perimeters near 7 m, so its unit may not be square metres - confirm before
# interpreting VOLUME_PART.
gdf.insert(0, column='FDR_PART', value=gdf["perimeter_in_meters"] * gdf["height"])
gdf.insert(0, column='VOLUME_PART', value=gdf["area_in_meters"] * gdf["height"])

# Independent copies: `target_gdf` holds the buildings to annotate, `all_gdf`
# is the pool of buildings searched for neighbours.
target_gdf = gdf.copy()
all_gdf = gdf.copy()
print(gdf.columns)

Index(['VOLUME_PART', 'FDR_PART', 'perimeter_in_meters', 'building_faces',
       'bf_source', 'confidence_left', 'geometry', 'longitude', 'latitude',
       'id', 'area_in_meters', 'height_mean', 'height_median', 'height_max',
       'height', 'floors', 'gfa_in_meters', 'urban_split', 'ghsl_smod',
       'elevation', 'building_density_50', 'building_density_100',
       'building_density_250', 'building_density_500',
       'building_perimeter_in_meters_new', 'perimeter_to_area_ratio',
       'normalized_perimeter_to_area_ratio', 'centroid', 'radius_m',
       'num_vertices', 'classification_source', 'osm_type', 'centroid_x',
       'centroid_y', 'nearest_road_type_1', 'distance_to_1',
       'nearest_road_type_2', 'distance_to_2', 'nearest_road_type_3',
       'distance_to_3', 'nearest_road_type_4', 'distance_to_4',
       'road_density_for_4_fixed', 'road_density_for_5_fixed', 'SQN', 'faces',
       'prediction', 'confidence_settlement_clasification',
       'settlement_clasificatio

In [7]:
# --- Build spatial index once ---
sindex = all_gdf.sindex

# --- Set distances ---
# NOTE: the FDR and BUILT_UP_VOLUME column names do not include the distance,
# so with more than one entry here the last distance would overwrite the others.
distances = [100]

# --- Batch processing ---
batch_size = 100  # process 100 target buildings at a time

# Guard against stale kernel state: this cell ends by replacing `target_gdf`
# with an EPSG:4326 frame, so running it a second time without rebuilding
# `target_gdf` would buffer by `dist` degrees instead of `dist` map units.
if target_gdf.crs != all_gdf.crs:
    raise ValueError(
        "target_gdf and all_gdf must share the same projected CRS; "
        "re-run the cell that builds target_gdf before running this one."
    )
# Results are written back with `.loc` on index labels, which needs unique labels.
if not target_gdf.index.is_unique:
    raise ValueError("target_gdf must have a unique index.")

results = []

for start in tqdm(range(0, len(target_gdf), batch_size), desc="Processing batches"):
    # iloc clips at the end of the frame, so the last batch may be shorter.
    target_batch = target_gdf.iloc[start:start + batch_size].copy()

    for dist in distances:
        density_col = f"building_density_{dist}"

        # Defaults for targets that end up with no matched building.
        target_batch[density_col] = 0
        target_batch["FDR"] = 0.0
        target_batch["BUILT_UP_VOLUME"] = 0.0

        # create buffers for the batch (same rows and index, geometry replaced)
        buffer_batch = target_batch.copy()
        buffer_batch["geometry"] = buffer_batch.geometry.buffer(dist)

        # pre-filter all_gdf: collect the positions of every building whose
        # bounding box intersects the bounding box of at least one buffer
        candidate_positions = set()
        for geom in buffer_batch.geometry:
            candidate_positions.update(sindex.intersection(geom.bounds))
        # sorted() keeps the candidate order deterministic across runs
        candidates = all_gdf.iloc[sorted(candidate_positions)]

        # sjoin adds its own `index_right` column; drop any pre-existing one
        buffer_batch = buffer_batch.drop(columns=["index_right"], errors="ignore")
        candidates = candidates.drop(columns=["index_right"], errors="ignore")

        # exact spatial join: keep (candidate, buffer) pairs where the candidate
        # building lies fully within the buffer. `index_right` is the index
        # label of the target whose buffer matched.
        joined = gpd.sjoin(candidates, buffer_batch, how="inner", predicate="within")
        by_target = joined.groupby("index_right")

        counts = by_target.size()
        # NOTE(review): the `*_right` columns come from `buffer_batch`, i.e. the
        # target building itself, so each sum is the target's own value repeated
        # once per matched building (the "target-weighted" behaviour). If the
        # neighbours' values were intended, the `*_left` columns hold them -
        # confirm before changing.
        area_norm = dist * dist
        fdr = by_target["FDR_PART_right"].sum() / area_norm
        volume = by_target["VOLUME_PART_right"].sum() / area_norm

        # write each aggregate back using its own index
        target_batch.loc[counts.index, density_col] = counts.values
        target_batch.loc[fdr.index, "FDR"] = fdr.values
        target_batch.loc[volume.index, "BUILT_UP_VOLUME"] = volume.values

    results.append(target_batch)

# --- Concatenate all batches ---
target_gdf = pd.concat(results)
target_gdf = target_gdf.to_crs("EPSG:4326")
target_gdf.to_parquet(output_file)
print(target_gdf.head())


Processing batches: 100%|██████████| 14812/14812 [15:25<00:00, 16.01it/s]


   VOLUME_PART   FDR_PART  perimeter_in_meters  building_faces bf_source  \
0     0.000612  30.967408             6.881646               4    google   
1     0.000635  31.533737             7.007497               4    google   
2     0.000668  31.094281             6.909840               4    google   
3     0.001154  54.267362             7.235648               4    google   
4     0.000700  31.746273             7.054727               4    google   

   confidence_left                                           geometry  \
0           0.7285  POLYGON ((74.97421 28.30772, 74.97421 28.30774...   
1           0.7928  POLYGON ((74.9514 28.3073, 74.9514 28.30732, 7...   
2           0.7027  POLYGON ((74.96247 28.29955, 74.96247 28.29957...   
3           0.7013  POLYGON ((74.96398 28.29727, 74.96398 28.29729...   
4           0.7199  POLYGON ((74.97085 28.31411, 74.97084 28.31412...   

   longitude   latitude                                    id  ...  \
0  74.974198  28.307729   74.97419