## 3_filter_and_extract_buildings_from_VIDA_S2_Partitions
### An auxiliary notebook that filters buildings belonging to states within countries (such as the State of Maharashtra in India)
### Additionally it computes basic statistics about buildings based on their footprints

### Initial configuration
#### To start working with this notebook, you need to provide the necessary credentials and settings
#### Below is a configuration template. Prepare it outside of this notebook, then copy and paste the entire content between the triple quotes into the next cell's input field
#### Please make sure OSM provides a boundary polygon for the specified country name!
    """
    {
    "COS_ENDPOINT_URL": "s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "COUNTRY_NAME": "Maharashtra",
    "GRID_STORAGE_BUCKET": "xxx",
    "VIDA_PARQUET_BUCKET": "parquets",
    "UTILS_BUCKET": "utils bucket name",
    "VIDA_S2_PARTITIONS": "VIDA_S2_PARTITIONS bucket",
    "VIDA_COUNTRIES_BUILDINGS": "vida-countries-buildings"
    }
    """


In [6]:
# Read notebook configuration
import getpass
import json

# Prompt for the prepared JSON configuration. getpass hides the pasted text so
# the API key is not echoed into the notebook output.
config_str = getpass.getpass('Enter your prepared config: ')
try:
    config = json.loads(config_str)
except json.JSONDecodeError as err:
    # Re-raise with a hint; the message deliberately omits the pasted text
    # because it contains credentials.
    raise ValueError(
        'The pasted configuration is not valid JSON: keys and values must be '
        'separated by ":" and the last entry must not end with a comma '
        f'({err.msg} at line {err.lineno}, column {err.colno})'
    ) from err

# Warn early about keys that later cells of this notebook read from `config`,
# instead of failing with a bare KeyError several cells further down.
_missing_config_keys = [
    key
    for key in (
        "COS_ENDPOINT_URL",
        "COS_APIKEY",
        "UTILS_BUCKET",
        "VIDA_S2_PARTITIONS",
        "VIDA_COUNTRIES_BUILDINGS",
    )
    if key not in config
]
if _missing_config_keys:
    print('Warning: config is missing keys used by this notebook:',
          ', '.join(_missing_config_keys))


In [12]:
# Import necessary libraries
import pyarrow.parquet as pq
import geopandas as gpd
import pandas as pd
import numpy as np
import shapely
import time
import threading
import datetime 
import jaydebeapi as jdbc
import jpype
import os
from tqdm import tqdm
from botocore.client import Config
import ibm_boto3
import io
import time
import pickle

from pyproj import Geod

# Shared geodesic calculator on the WGS84 ellipsoid; calculations() uses it
# to derive building areas and perimeters from lon/lat geometries.
geod = Geod(ellps="WGS84")

In [10]:
# Create the IBM Cloud Object Storage (S3-compatible) client that the rest of
# the notebook uses to list, download and upload objects.

# NOTE(review): botocore-style clients are expected to reject an endpoint
# without a URL scheme ("Invalid endpoint"); the configuration template shows
# a bare host name, so default to https when no scheme is given.
_cos_endpoint_url = config["COS_ENDPOINT_URL"]
if "://" not in _cos_endpoint_url:
    _cos_endpoint_url = "https://" + _cos_endpoint_url

cos_client = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=config["COS_APIKEY"],
    # Honour the auth endpoint from the configuration; fall back to the
    # previously hard-coded public IAM endpoint when it is not provided.
    ibm_auth_endpoint=config.get("COS_AUTH_ENDPOINT_URL",
                                 "https://iam.cloud.ibm.com/oidc/token"),
    config=Config(signature_version='oauth'),
    endpoint_url=_cos_endpoint_url,
)


# Download the helper files this notebook depends on from the utils bucket
# into the working directory, then import the grid generator from them.
response = cos_client.list_objects_v2(Bucket=config["UTILS_BUCKET"])

utils_to_download = ['regions_S2_ids.pkl', 'filtering_grid_generator.py', 'india_state.geojson']

try:
    downloaded_utils = set()
    # 'Contents' is absent from the response when the bucket is empty.
    for obj in response.get('Contents', []):
        name = obj['Key']
        if name not in utils_to_download:
            continue
        streaming_body_1 = cos_client.get_object(Bucket=config["UTILS_BUCKET"], Key=name)['Body']
        print("Copying to localStorage :  " + name)
        # Write the payload in one binary write; the files are small helpers.
        with open(name, 'wb') as file:
            file.write(streaming_body_1.read())
        downloaded_utils.add(name)

    missing_utils = [name for name in utils_to_download if name not in downloaded_utils]
    if missing_utils:
        print('Utils not found in bucket: ', missing_utils)

    # Imported here because the module only exists locally after the download.
    from filtering_grid_generator import GridGenerator

    print('External utils succesfully imported')
except Exception as e:
    # Best-effort cell: report the problem and let the user decide how to proceed.
    print('Error occured: ', e)


Copying to localStorage :  filtering_grid_generator.py
Copying to localStorage :  india_state.geojson
Copying to localStorage :  regions_S2_ids.pkl
External utils succesfully imported


### Download all the S2 partitions


In [None]:
# List every object in the S2 partitions bucket. A single list_objects_v2 call
# returns at most 1000 keys, so page through the listing to avoid silently
# skipping partitions.
S2_partitions_bucket_response = []
for page in cos_client.get_paginator('list_objects_v2').paginate(Bucket=config["VIDA_S2_PARTITIONS"]):
    S2_partitions_bucket_response.extend(page.get('Contents', []))
S2_partitions_bucket_objects = [i['Key'] for i in S2_partitions_bucket_response]

# Local folder receiving the raw S2 partition parquet files.
parquets_folder = 'S2_partitions'
os.makedirs(parquets_folder, exist_ok=True)

# Local folder receiving the per-region filtered parquet files (written by a later cell).
filtered_parquets_folder = 'states_filteded_parquets'
os.makedirs(filtered_parquets_folder, exist_ok=True)

for f in tqdm(S2_partitions_bucket_objects, total=len(S2_partitions_bucket_objects)):
    # Two attempts per file with a short pause in between; a file that fails
    # twice is reported and skipped so the remaining downloads still run.
    for attempt in (1, 2):
        try:
            cos_client.download_file(Bucket=config["VIDA_S2_PARTITIONS"], Key=f, Filename=f'{parquets_folder}/{f}')
            break
        except Exception as e:
            if attempt == 1:
                print(f'While downloading exception occurred: {e}')
                time.sleep(5)
            else:
                print(f'While second downloading attempt exception occurred: {e}')

In [None]:
india_states_df = gpd.read_file('india_state.geojson')


def _state_geometry(state_name):
    """Return the geometry of the first row whose NAME_1 equals state_name."""
    matching_rows = india_states_df[india_states_df.NAME_1 == state_name]
    return matching_rows.geometry.iloc[0]


# Each region maps to the list of polygons that make it up. Where `.geoms[-1]`
# is used, only the last part of a multi-part state geometry is kept
# (presumably the mainland part - verify against the geojson).
regions_polygons = {
    'Madhya Pradesh': [_state_geometry('Madhya Pradesh')],
    'South-India': [
        _state_geometry(state).geoms[-1] for state in ('Tamil Nadu', 'Kerala')
    ],
    'East-India': [
        *(_state_geometry(state) for state in ('Jharkhand', 'Nagaland', 'Mizoram')),
        _state_geometry('Assam').geoms[-1],
    ],
}

regions_polygons

{'Madhya Pradesh': [<POLYGON ((78.365 26.869, 78.367 26.863, 78.37 26.858, 78.375 26.847, 78.381...>],
 'South-India': [<POLYGON ((80.076 13.527, 80.076 13.526, 80.079 13.529, 80.087 13.527, 80.08...>,
  <POLYGON ((74.996 12.788, 75 12.783, 75.004 12.786, 75.005 12.785, 75.004 12...>],
 'East-India': [<POLYGON ((87.6 25.315, 87.607 25.311, 87.614 25.316, 87.623 25.311, 87.626 ...>,
  <POLYGON ((95.214 26.937, 95.217 26.934, 95.226 26.934, 95.229 26.931, 95.23...>,
  <POLYGON ((92.801 24.419, 92.804 24.419, 92.807 24.42, 92.809 24.419, 92.809...>,
  <POLYGON ((95.952 27.942, 95.952 27.939, 95.952 27.937, 95.958 27.937, 95.95...>]}

In [16]:
# Load the mapping region name -> S2 partition ids downloaded from the utils bucket.
# NOTE: pickle can execute arbitrary code while loading; only use a trusted file.
with open('regions_S2_ids.pkl', 'rb') as regions_S2_ids_file:
    regions_S2_ids = pickle.load(regions_S2_ids_file)

In [None]:
def _tile_membership_mask(df, tiles, desc):
    """Return a boolean array marking rows of ``df`` whose centroid lies in any tile.

    Args:
        df: DataFrame with numeric ``longitude`` and ``latitude`` columns.
        tiles: Sized iterable of objects exposing ``bounds`` as
            ``(min_lon, min_lat, max_lon, max_lat)``.
        desc: Label shown on the tqdm progress bar.

    Returns:
        A numpy bool array with one entry per row of ``df``. A row that falls
        on an edge shared by two tiles is still marked only once.
    """
    selected = np.zeros(len(df), dtype=bool)
    if len(df) == 0:
        return selected

    lon = df['longitude'].to_numpy()
    lat = df['latitude'].to_numpy()
    lon_lo, lon_hi = lon.min(), lon.max()
    lat_lo, lat_hi = lat.min(), lat.max()

    for tile in tqdm(tiles, total=len(tiles), desc=desc):
        min_lon, min_lat, max_lon, max_lat = tile.bounds
        # Cheap rejection: a tile that does not overlap the bounding box of
        # the partition cannot contain any of its buildings.
        if max_lon < lon_lo or min_lon > lon_hi or max_lat < lat_lo or min_lat > lat_hi:
            continue
        selected |= (
            (lon >= min_lon) & (lon <= max_lon) &
            (lat >= min_lat) & (lat <= max_lat)
        )
    return selected


grids = GridGenerator()

# For every region polygon and every S2 partition listed for the region, keep
# the buildings whose centroid lies inside the polygon and save them as one
# parquet file per (region, polygon index, S2 id).
for region_name, region_polygons in regions_polygons.items():

    print('\033[96mProcessing region', region_name)

    region_S2_ids = regions_S2_ids[region_name]

    for pidx, region_polygon in enumerate(region_polygons):

        print('\033[93m    processing pidx', pidx)

        # NOTE(review): presumably the first grid holds tiles lying fully inside
        # the polygon and the second tiles lying fully outside it, so only
        # buildings in neither grid need the exact point-in-polygon test -
        # confirm against filtering_grid_generator.py.
        inside_polygon_grid, outside_polygon_grid = grids.generate_grids(region_polygon, n=150)

        for s2_idx, S2_id in enumerate(region_S2_ids):

            try:
                file_name = f'India_{S2_id}.parquet'

                print(f'\033[93m        processing file_name {file_name} idx {s2_idx} of {len(region_S2_ids)}')

                df = gpd.read_parquet(os.path.join(parquets_folder, file_name))

                # Compute each centroid once and reuse it for both coordinates.
                centroids = [g.centroid for g in df['geometry']]
                df['longitude'] = [c.x for c in centroids]
                df['latitude'] = [c.y for c in centroids]

                df['id'] = df['longitude'].astype(str) + ':' + df['latitude'].astype(str)
                print(f'\033[93m        Amount of buildings in S2 partition {len(df)}')

                # Step 1: fast bounding-box classification against both grids.
                in_tiles_mask = _tile_membership_mask(
                    df, inside_polygon_grid, 'Filtering buildings inside polygon')
                out_tiles_mask = _tile_membership_mask(
                    df, outside_polygon_grid, 'Filtering buildings outside polygon')

                buildings_in_tiles_df = df[in_tiles_mask]
                buildings_out_tiles_found = int(out_tiles_mask.sum())

                print(f'\033[93m        Buildings found in polygon tiles {len(buildings_in_tiles_df)} out tiles {buildings_out_tiles_found} remaining {len(df) - len(buildings_in_tiles_df) - buildings_out_tiles_found}')

                # Step 2: buildings matched by neither grid get the exact test.
                # This runs whenever such buildings exist, including partitions
                # with no inside-tile or no outside-tile hits. Work on a copy so
                # the helper column is not written into a view of `df`.
                remaining_buildings_df = df[~(in_tiles_mask | out_tiles_mask)].copy()

                print(f'\033[93m        Remaining amount check {len(remaining_buildings_df)}')

                if len(remaining_buildings_df) > 0:
                    remaining_buildings_df['buildings_in_polygon'] = [
                        region_polygon.contains(shapely.Point(row.longitude, row.latitude))
                        for row in tqdm(remaining_buildings_df.itertuples(),
                                        total=len(remaining_buildings_df),
                                        desc='Filtering buildings')
                    ]
                    remaining_buildings_df = remaining_buildings_df[
                        remaining_buildings_df['buildings_in_polygon']
                    ]

                # Step 3: merge both sources and save if anything was found.
                non_empty_parts = [
                    part for part in (buildings_in_tiles_df, remaining_buildings_df)
                    if len(part) > 0
                ]
                if non_empty_parts:
                    if len(non_empty_parts) == 1:
                        result_df = non_empty_parts[0]
                    else:
                        result_df = pd.concat(non_empty_parts)

                    print(f'\033[93m        Saved amount {len(result_df)}')
                    result_df.to_parquet(f"{filtered_parquets_folder}/{region_name.replace(' ', '_')}_pidx_{pidx}_S2_id_{S2_id}_buildings.parquet")

                print()
            except Exception as e:
                # Best-effort: report the failing partition and continue with the next one.
                print(e)



In [22]:
# Display the region names defined in regions_polygons.
regions_polygons.keys()

dict_keys(['Madhya Pradesh', 'South-India', 'East-India'])

In [1]:
def get_inner_faces(interiors):
    """Count the edges of all interior rings.

    Each ring contributes ``len(ring.coords) - 1`` edges, since a ring repeats
    its first coordinate as its last one.

    Args:
        interiors: Iterable of rings exposing a sized ``coords`` attribute.

    Returns:
        Total number of edges over all rings (0 when there are none).
    """
    return sum(len(ring.coords) - 1 for ring in interiors)

def get_inner_perimeter(interiors):
    """Sum the geodesic perimeters of all interior rings.

    Uses the module-level ``geod`` (the same instance ``calculations`` uses)
    instead of building a new ``Geod`` on every call; this function is invoked
    once per building polygon.

    Args:
        interiors: Iterable of ring geometries accepted by
            ``geod.geometry_area_perimeter``.

    Returns:
        Total perimeter over all rings, in the units returned by ``geod``
        (0 when there are no rings).
    """
    return sum(abs(geod.geometry_area_perimeter(ring)[1]) for ring in interiors)


def calculations(buildings_in_polygon):
    """Filter buildings by source/confidence and add footprint statistics.

    Keeps Google buildings with ``confidence > 0.7`` and all Microsoft
    buildings (rows from any other ``bf_source`` are dropped), Google rows
    first. Then adds three columns computed from ``geometry``:

    * ``perimeter_in_meters`` (column 0): absolute geodesic perimeter; for
      ``Polygon`` geometries the perimeters of the interior rings are added.
    * ``building_faces`` (column 1): number of edges of the exterior and
      interior rings for ``Polygon`` geometries, 0 for any other type.
    * ``area_in_meters`` (last column): absolute geodesic area.

    Args:
        buildings_in_polygon: DataFrame with ``bf_source``, ``confidence``
            and ``geometry`` columns.

    Returns:
        A new DataFrame with the filtered rows and the added columns.
    """
    is_google = (buildings_in_polygon.bf_source == 'google') & (buildings_in_polygon.confidence > 0.7)
    is_microsoft = buildings_in_polygon.bf_source == 'microsoft'
    buildings_in_polygon = pd.concat(
        [buildings_in_polygon[is_google], buildings_in_polygon[is_microsoft]]
    )

    # DataFrame.insert raises ValueError when the column already exists, e.g.
    # when the function is applied to an already processed frame.
    buildings_in_polygon = buildings_in_polygon.drop(
        columns=['perimeter_in_meters', 'building_faces'], errors='ignore'
    )

    areas = []
    perimeters = []
    faces = []
    for g in buildings_in_polygon["geometry"]:
        # One geodesic call yields both area and perimeter.
        area, perimeter = geod.geometry_area_perimeter(g)
        perimeter = abs(perimeter)
        if g.geom_type == "Polygon":
            # NOTE(review): interior-ring perimeters are added on top, which
            # implies the value above covers the exterior ring only - confirm
            # against the pyproj documentation.
            perimeter += get_inner_perimeter(g.interiors)
            faces.append(len(g.exterior.coords) - 1 + get_inner_faces(g.interiors))
        else:
            faces.append(0)
        areas.append(abs(area))
        perimeters.append(perimeter)

    # Plain arrays are assigned positionally, so duplicate index labels left
    # over from earlier concatenations cannot cause misalignment.
    buildings_in_polygon['area_in_meters'] = np.asarray(areas, dtype=float)
    buildings_in_polygon.insert(0, "perimeter_in_meters", np.asarray(perimeters, dtype=float))
    buildings_in_polygon.insert(1, "building_faces", np.asarray(faces, dtype=np.int64))

    return buildings_in_polygon

In [None]:

# Merge the filtered per-partition parquet files of every region into a single
# file, add the footprint statistics and upload the result to the bucket.
filtered_partitions = os.listdir(filtered_parquets_folder)
columns = ['bf_source', 'confidence', 'geometry', 'longitude', 'latitude', 'id']

for region in regions_polygons:

    region_name = region.replace(' ', '_')
    try:
        # Select this region's files into a separate list. Overwriting
        # `filtered_partitions` would leave nothing to match for later regions.
        # Files are named '<region>_pidx_<n>_S2_id_<id>_buildings.parquet'.
        region_partitions = sorted(
            p for p in filtered_partitions if p.startswith(f'{region_name}_pidx_')
        )

        if not region_partitions:
            print(f'Region parquets concatenation error occurred: no filtered partitions found for {region_name}')
            continue

        # Concatenate once instead of growing the frame file by file.
        main_df = pd.concat([
            gpd.read_parquet(os.path.join(filtered_parquets_folder, p), columns=columns)
            for p in region_partitions
        ])

        # The same building can appear in several files; keep one row per id.
        main_df = main_df.drop_duplicates(subset=['id'])

        filename = f'{region_name}_buildings.parquet'

        main_df = calculations(main_df)
        main_df.to_parquet(filename)

        # upload to bucket
        try:
            res = cos_client.upload_file(Filename=filename, Bucket=config["VIDA_COUNTRIES_BUILDINGS"], Key=filename)
        except Exception as e:
            print(type(e).__name__, e)
        else:
            print(f'{filename} succesfully uploaded')

    except Exception as e:
        print(f'Region parquets concatenation error occurred: {e}')