# Non Addressable Building Flagging Process Example

### Introduction

One key component of the data cleaning process is the identification of non-addressable outbuildings (NAOs). An NAO is any building that should not be matched with an address because it is not considered addressable. Examples of NAOs include (but are not limited to) garages, gazebos, water towers, and maintenance sheds.

In [1]:
import datetime
import os
import re
import string
import sys
from pathlib import Path
import fiona
import geopandas as gpd
import numpy as np
import pandas as pd
import swifter
from dotenv import load_dotenv
from numpy.core.numeric import True_
from pyproj import crs
from shapely import geometry
from shapely.geometry import MultiPolygon, Point, Polygon, geo
from math import pi

### Inputs

There are several datasets that are key in identifying NAOs. The base three layers are required for this process to function (address points, building footprints, cadastral parcels).

In the cell below the key layers are loaded. The address points and the cadastral parcels are the already-cleaned versions, because the cleaning script processes them first. The footprints receive basic cleaning and are linked to the parcels here so that they are ready for the shed flagging process.

In [8]:
def reproject(ingdf, output_crs):
    '''Make sure a GeoDataFrame is in the project CRS, modifying it in place.

    ingdf: the GeoDataFrame to check.
    output_crs: EPSG code of the project CRS.

    When ingdf has no CRS the project CRS is assigned to it (set_crs); when it
    has a different CRS it is reprojected (to_crs). When it already matches,
    nothing is done. The same ingdf object is returned in every case.
    '''
    if ingdf.crs is None:
        ingdf.set_crs(epsg=output_crs, inplace=True)
    elif ingdf.crs != f'epsg:{output_crs}':
        ingdf.to_crs(epsg=output_crs, inplace=True)
    return ingdf

def return_smallest_match(ap_matches, parcel_df, unique_id):
    '''Keep one parcel match per record: the match to the smallest parcel.

    ap_matches: records (buildings or address points) that matched more than
        one parcel, one row per record/parcel pair, with a 'link_field' column
        holding the id of the matched parcel.
    parcel_df: parcels, with 'link_field' and 'AREA' columns.
    unique_id: name of the column in ap_matches that identifies a record.

    Returns a new frame holding, for every unique_id value, the single row
    whose parcel has the smallest AREA. Rows keep their original relative
    order and ap_matches itself is left untouched. When two parcels tie on
    AREA, the row that appears first in ap_matches is kept. A record whose
    parcels cannot be found in parcel_df keeps its first row.
    '''
    # Smallest AREA recorded for each parcel id.
    area_by_link = parcel_df.groupby('link_field')['AREA'].min()

    # Work by row position (the fresh RangeIndex of this helper frame) rather
    # than by index label: the index of a spatial-join result can repeat.
    ranking = pd.DataFrame({
        'record': ap_matches[unique_id].to_numpy(),
        'area': ap_matches['link_field'].map(area_by_link).to_numpy(),
    })
    # mergesort is stable, so ties resolve to the earliest row.
    ranking = ranking.sort_values('area', kind='mergesort', na_position='last')
    keep_positions = sorted(ranking.drop_duplicates('record', keep='first').index)
    return ap_matches.iloc[keep_positions]

# Disable pandas' chained-assignment warning for this notebook.
pd.options.mode.chained_assignment = None
load_dotenv(os.path.join(r'C:\projects\point_in_polygon\scripts', 'NB_environments.env'))

proj_crs = os.getenv('PROJ_CRS')
aoi_mask = os.getenv('AOI_MASK')
# The area-of-interest mask is optional; without it the whole footprint layer is read.
aoi_gdf = None
if aoi_mask is not None:
    aoi_gdf = gpd.read_file(aoi_mask)

footprint_lyr = Path(os.getenv('BF_PATH'))
footprint_lyr_name = os.getenv('BF_LYR_NME')

project_gpkg = Path(os.getenv('DATA_GPKG'))

linking_data = gpd.read_file(project_gpkg, layer='parcels_cleaned', driver='GPKG')
addresses = gpd.read_file(project_gpkg, layer='addresses_cleaned', driver='GPKG')

footprint = gpd.read_file(footprint_lyr, layer=footprint_lyr_name, mask=aoi_gdf)

footprint = reproject(footprint, proj_crs)

# A zero-width buffer rebuilds each polygon before areas are measured.
footprint['geometry'] = footprint['geometry'].buffer(0)

print('Cleaning and prepping footprint data')
# footprint = explode(footprint) # Remove multipart polygons convert to single polygons
footprint['bf_area'] = footprint['geometry'].area
# footprint = footprint.loc[footprint.area >= 20.0] # Remove all buildings with an area of less than 20m**2
# Give every footprint a stable id. It is kept as a regular column; the index
# itself stays the default one created by reset_index().
footprint = footprint.reset_index()
footprint.rename(columns={'index': 'bf_index'}, inplace=True)

# Link each footprint to the parcel(s) that contain its centroid.
footprint['centroid_geo'] = footprint['geometry'].centroid
footprint = footprint.set_geometry('centroid_geo')
# NOTE(review): geopandas >= 0.10 spells this keyword `predicate`; `op` was
# later removed. Switch the keyword when the environment is upgraded.
footprint = gpd.sjoin(footprint, linking_data[['link_field', 'geometry']], how='left', op='within')

# A centroid can fall inside several overlapping parcels; for those footprints
# keep only the match to the smallest parcel.
grouped_bf = footprint.groupby('bf_index', dropna=True)['bf_index'].count()
grouped_bf = grouped_bf[grouped_bf > 1].index.tolist()
footprint_plural_sj = footprint[footprint['bf_index'].isin(grouped_bf)]
footprint_singular = footprint[~footprint['bf_index'].isin(grouped_bf)]
footprint_plural_sj = return_smallest_match(footprint_plural_sj, linking_data, 'bf_index')
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
footprint = pd.concat([footprint_singular, footprint_plural_sj])

Cleaning and prepping footprint data


Below is the function that determines whether or not a building is a non-addressable outbuilding. It contains a few different processes that look at different criteria.

In [None]:
def shed_flagging(footprint_gdf, address_gdf, linking_gdf):
    '''
    Flag building footprints that are non-addressable outbuildings (NAOs).

    footprint_gdf: footprints with 'bf_index', 'bf_area', 'link_field' and a
        'geometry' column whose values expose a .length (the perimeter).
    address_gdf: address points with a 'link_field' column.
    linking_gdf: parcels with a 'link_field' column; only parcels listed here
        are examined.

    Two checks are applied:
    1. Any footprint whose compactness (4*pi*area / perimeter**2) is at least
       0.98, i.e. close to a circle, is flagged regardless of size.
    2. For every parcel that holds both footprints and address points, the
       nested find_sheds() picks the buildings that are too small or too
       numerous to each carry an address.

    Returns a new frame with every input footprint plus a 'perimiter' column
    and a boolean 'shed_flag' column, ordered as: unflagged buildings, sheds,
    round sheds. footprint_gdf itself is not modified.
    '''

    def find_sheds(bf_data, ap_count, bf_area_field='bf_area', bf_index_field='bf_index', bp_threshold=20, min_adressable_area=50, max_shed_size=100):
        '''
        Return the bf_index values in one parcel that should be flagged as NAOs.

        bf_data: the footprints of a single parcel.
        ap_count: number of address points in that parcel.
        bp_threshold: above this many buildings the parcel is treated as a
            trailer park and only the tiny-building check is applied.
        min_adressable_area: buildings smaller than this are always sheds.
        max_shed_size: buildings larger than this are never flagged.

        Buildings under min_adressable_area are flagged outright. Of the rest,
        if there are more buildings than address points, the smallest surplus
        buildings are flagged as well.
        '''
        bf_count = len(bf_data)

        # With no addresses, or at most one building, counts cannot single out a shed.
        if ap_count == 0 or bf_count <= 1:
            return []

        is_tiny = bf_data[bf_area_field] < min_adressable_area

        # Sizing is different in trailer parks: home and shed sizes overlap,
        # so only the tiny-building check is reliable there.
        if bf_count > bp_threshold:
            return bf_data.loc[is_tiny, bf_index_field].tolist()

        tiny = bf_data.loc[is_tiny]
        # Complement of the tiny set, so a building exactly at the threshold
        # still takes part in the count comparison.
        candidates = bf_data.loc[~is_tiny]

        # How many more buildings than address points remain. Clamp at zero:
        # a negative value passed to head() would select all but the last rows.
        surplus = max(len(candidates) - ap_count, 0)
        smallest_surplus = candidates.sort_values(bf_area_field, ascending=True).head(surplus)

        sheds = pd.concat([tiny, smallest_surplus], axis=0, join='outer')
        # Anything larger than max_shed_size is unlikely to be a shed.
        sheds = sheds[sheds[bf_area_field] <= max_shed_size]
        return sheds[bf_index_field].tolist()

    # Work on a copy so the caller's frame does not gain helper columns.
    footprint_gdf = footprint_gdf.copy()

    # Start with the round buildings; size does not matter for these.
    footprint_gdf['perimiter'] = footprint_gdf['geometry'].apply(lambda geom: geom.length)
    compactness = (4 * pi * footprint_gdf['bf_area']) / (footprint_gdf['perimiter'] ** 2)
    is_round = compactness >= 0.98
    round_sheds = footprint_gdf[is_round].copy()
    # Using the complement keeps rows whose compactness could not be computed.
    footprint_gdf = footprint_gdf[~is_round].copy()

    # Address points per parcel, and the parcels that may be examined.
    ap_counts = address_gdf.groupby('link_field', dropna=True)['link_field'].count()
    known_links = set(linking_gdf['link_field'].tolist())

    # One pass over the footprints grouped by parcel, instead of re-filtering
    # the whole footprint frame once per parcel.
    shed_indexes = []
    for link, parcel_footprints in footprint_gdf.groupby('link_field', dropna=True):
        if link not in known_links or link not in ap_counts.index:
            continue
        shed_indexes.extend(find_sheds(parcel_footprints, int(ap_counts.loc[link])))

    is_shed = footprint_gdf['bf_index'].isin(shed_indexes)
    shed_gdf = footprint_gdf[is_shed].copy()
    footprint_gdf = footprint_gdf[~is_shed].copy()

    shed_gdf['shed_flag'] = True
    round_sheds['shed_flag'] = True
    footprint_gdf['shed_flag'] = False
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    return pd.concat([footprint_gdf, shed_gdf, round_sheds])

# Run the NAO flagging; the returned footprints carry a 'shed_flag' column
# that is True for buildings flagged as non-addressable outbuildings.
footprint = shed_flagging(footprint, addresses, linking_data)