## 9_urban_rural_segregation
### Categorizes buildings based on their location into a fitting urbanization status, uses the pre-processed polygons from 8_rural_urban_json_segregation
### This notebook needs to be executed twice together with 8_rural_urban_json_segregation: first to categorize buildings on an "overview" level, i.e., urban-suburban-rural categories, and a second time to categorize buildings into finer-grained categories using the "detailed" option of SEGREGATION_STYLE

### Initial configuration
#### To start working with this particular notebook, you need to provide the necessary credentials and settings
#### Below is a configuration template. Prepare it outside of this notebook, then copy & paste everything between the triple quotes into the input field of the next cell
    """
    {
    "COS_ENDPOINT_URL": "s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "UTILS_BUCKET": "notebook-utils-bucket",
    "COUNTRY_TABLE": "FEATURES_DB_VIDA_EXTENDED",
    "COUNTRY_NAME": "Kenya",
    "SMOD_BUCKET": "xxx",
    "SEGREGATION_STYLE": "overview",
    "VIDA_COUNTRIES_BUILDINGS": "vida-countries-buildings"
    }
    """


In [10]:
# Read notebook configuration
import getpass
import json

# The configuration is pasted as one JSON string. getpass is used for the
# prompt so the pasted text (which includes the COS API key) is not echoed
# into the notebook output.
config_str = getpass.getpass(prompt='Enter your prepared config: ')

# Parsed settings; later cells look values up by key, e.g. config["COS_APIKEY"].
config = json.loads(config_str)

In [10]:
# Import necessary libraries
import pandas as pd
import geopandas as gpd
import jaydebeapi as jdbc
import jpype
import ibm_boto3
import gc
import io
import os
import shapely
import numpy as np
import rasterio as rio
from botocore.client import Config
from rasterio.plot import show
from tqdm import tqdm
from rasterio.mask import mask
import matplotlib.pyplot as plt
import rioxarray
from skimage import measure as M
import json

In [13]:
# Add new country specific bounding box coordinates if needed
# table_name = config["COUNTRY_TABLE"]
# country = config["COUNTRY_NAME"]
# output_SMOD_polygon_geojson =  country + "_segregated_cleaned.json"

# Bounding box placeholders: all zero unless a country-specific box is enabled below.
min_latitude = max_latitude = 0
min_longitude = max_longitude = 0
# if config["COUNTRY_TABLE"] == 'Kenya':
#     min_latitude = -4.7075268
#     max_latitude = 5.017422
#     min_longitude = 33.9110224
#     max_longitude = 41.8914004

# Defaults that stay in place when SEGREGATION_STYLE is neither "overview" nor "detailed".
segregation, segregation_priorities = {}, []
default_category = db_col_name = ""

# Each category maps to a list of integer class codes
# (presumably GHS-SMOD settlement classes - TODO confirm against notebook 8).
segregation_style = config["SEGREGATION_STYLE"]
if segregation_style == "overview":
    # Coarse three-way split.
    segregation = {
        'URBAN': [22, 23, 30],
        'SUBURBAN': [21],
        'RURAL': [12, 13],
    }
    segregation_priorities = ['URBAN', 'SUBURBAN']
    default_category = 'Rural'
    db_col_name = 'URBAN_SPLIT'
elif segregation_style == "detailed":
    # One category per class code.
    segregation = {
        'URBAN_CENTER': [30],
        'DENSE_URBAN': [23],
        'SEMI_DENSE_URBAN': [22],
        'SUBURBAN_PERI_URBAN': [21],
        'RURAL_CLUSTER': [13],
        'LOW_DENSITY_RURAL': [12],
    }
    segregation_priorities = [
        'URBAN_CENTER',
        'DENSE_URBAN',
        'SEMI_DENSE_URBAN',
        'SUBURBAN_PERI_URBAN',
        'RURAL_CLUSTER',
        'LOW_DENSITY_RURAL',
    ]
    default_category = 'Low Density Rural'
    db_col_name = 'GHSL_SMOD'

# Human-readable label for every category key of both segregation styles.
# NOTE(review): 'Per-urban' looks like a typo for 'Peri-urban'; it is left
# unchanged here because the label is written into the output data.
segregation_names = {
    # overview level
    'URBAN': 'Urban',
    'SUBURBAN': 'Suburban',
    'RURAL': 'Rural',
    # detailed level
    'URBAN_CENTER': 'Urban Center',
    'DENSE_URBAN': 'Dense Urban Cluster',
    'SEMI_DENSE_URBAN': 'Semi-dense Urban Cluster',
    'SUBURBAN_PERI_URBAN': 'Suburban or Per-urban',
    'RURAL_CLUSTER': 'Rural Cluster',
    'LOW_DENSITY_RURAL': 'Low Density Rural',
}

In [7]:
# Create the COS client and copy every object of the utils bucket into the
# working directory (per the original note this is where db2jcc4.jar comes
# from; presumably utils.py as well, since it is imported right after).
cos_client = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=config["COS_APIKEY"],
    config=Config(signature_version='oauth'),
    endpoint_url=config["COS_ENDPOINT_URL"],
)

response = cos_client.list_objects_v2(Bucket=config["UTILS_BUCKET"])

try:
    for name in (entry['Key'] for entry in response['Contents']):
        streaming_body_1 = cos_client.get_object(Bucket=config["UTILS_BUCKET"], Key=name)['Body']
        print("Copying to localStorage :  " + name)
        with io.FileIO(name, 'w') as file:
            # The object is read fully, then written to disk piece by piece.
            file.writelines(io.BytesIO(streaming_body_1.read()))

    # Star import of the helper module fetched above.
    from utils import *
    print('External utils succesfully imported')
except Exception as e:
    # Best effort: any failure is only reported and the notebook continues.
    print('Error occured: ', e)

In [2]:
# Input files per region. The SMOD polygon files come in an "overview" and a
# "detailed" variant per region; the building datasets use their own spelling
# of the region names (underscores instead of hyphens).
_smod_regions = ('Madhya_Pradesh', 'South-India', 'East-India')
_vida_regions = ('East_India', 'Madhya_Pradesh', 'South_India')

smod_overview_json_filenames = [f'{region}_overview.json' for region in _smod_regions]
smod_detailed_json_filenames = [f'{region}_detailed.json' for region in _smod_regions]

vida_datasets = [f'{region}_buildings.parquet' for region in _vida_regions]


In [6]:
# Download all input files from COS into the working directory:
# the SMOD polygon GeoJSON files (overview + detailed) and the building parquet files.

# Number of bytes requested from COS per read; keeps memory use bounded
# regardless of the object size.
DOWNLOAD_CHUNK_BYTES = 8 * 1024 * 1024


def _download_from_cos(bucket, key):
    """Stream the COS object `key` of `bucket` into a local file named `key`.

    The object body is copied in chunks of DOWNLOAD_CHUNK_BYTES instead of being
    read into memory as a whole, and the file is opened through the buffered
    `open(..., 'wb')` so every chunk is written completely.

    Parameters
    ----------
    bucket : str
        Name of the COS bucket.
    key : str
        Object key; also used as the local file name.

    Raises
    ------
    Whatever `cos_client.get_object` or the file write raises; nothing is swallowed.
    """
    streaming_body = cos_client.get_object(Bucket=bucket, Key=key)['Body']
    print("Downloading to local storage :  " + key)
    try:
        with open(key, 'wb') as file:
            # read(n) returns b'' once the body is exhausted, which ends the loop.
            for chunk in iter(lambda: streaming_body.read(DOWNLOAD_CHUNK_BYTES), b''):
                file.write(chunk)
    finally:
        streaming_body.close()


# Fetch the GeoJSON files containing the pre-processed SMOD polygons
for smod_json in smod_overview_json_filenames + smod_detailed_json_filenames:
    _download_from_cos(config["SMOD_BUCKET"], smod_json)

# Fetch the parquet files containing the buildings
for vida_buildings_parquet in vida_datasets:
    _download_from_cos(config["VIDA_COUNTRIES_BUILDINGS"], vida_buildings_parquet)



Madhya_Pradesh_overview.json
South-India_overview.json
East-India_overview.json
Madhya_Pradesh_detailed.json
South-India_detailed.json
East-India_detailed.json


In [7]:
# Sort the three lists in place so that entries at the same position refer to
# the same region (the processing loop pairs them positionally with zip).
for _filenames in (smod_overview_json_filenames, smod_detailed_json_filenames, vida_datasets):
    _filenames.sort()

# Display the sorted lists as the cell output.
smod_overview_json_filenames, smod_detailed_json_filenames, vida_datasets

(['East-India_overview.json',
  'Madhya_Pradesh_overview.json',
  'South-India_overview.json'],
 ['East-India_detailed.json',
  'Madhya_Pradesh_detailed.json',
  'South-India_detailed.json'],
 ['East_India_buildings.parquet',
  'Madhya_Pradesh_buildings.parquet',
  'South_India_buildings.parquet'])

In [None]:
def _label_buildings_by_polygons(buildings, features, label_names, label_column,
                                 default_label, progress_desc):
    """Assign each building the category of the polygon(s) that contain it.

    Parameters
    ----------
    buildings : DataFrame / GeoDataFrame
        Must provide the columns `id`, `longitude` and `latitude`.
    features : list of dict
        GeoJSON features. Each needs `geometry.coordinates[0]` (the ring used to
        build the polygon) and `properties.seg_type` (a key of `label_names`).
    label_names : dict
        Maps `seg_type` keys to the label written into `label_column`.
    label_column : str
        Name of the column that receives the label.
    default_label : str
        Label for buildings that fall into none of the polygons.
    progress_desc : str
        Description shown on the tqdm progress bar.

    Returns
    -------
    A new frame: the rows matched by a polygon (in feature order) followed by
    the unmatched rows carrying `default_label`. A building covered by several
    polygons appears once per polygon, so callers de-duplicate on `id`.
    Row index labels of `buildings` are kept.

    Raises
    ------
    KeyError
        If a feature's `seg_type` is not a key of `label_names`.
    """
    labelled_parts = []

    for feature in tqdm(features, total=len(features), desc=progress_desc):
        label = label_names[feature['properties']['seg_type']]

        # NOTE(review): only the first ring of the geometry is used, so holes and
        # multi-part geometries are ignored - confirm this matches the polygons
        # produced by 8_rural_urban_json_segregation.
        polygon = shapely.Polygon(feature['geometry']['coordinates'][0])
        min_lon, min_lat, max_lon, max_lat = polygon.bounds

        # Cheap bounding-box pre-filter before the exact point-in-polygon test.
        in_bbox = buildings[
            (buildings.longitude >= min_lon) &
            (buildings.longitude <= max_lon) &
            (buildings.latitude >= min_lat) &
            (buildings.latitude <= max_lat)
        ]
        if in_bbox.empty:
            continue

        # Vectorized equivalent of polygon.contains(Point(lon, lat)) per row.
        inside = shapely.contains_xy(
            polygon,
            in_bbox.longitude.to_numpy(),
            in_bbox.latitude.to_numpy(),
        )
        if not inside.any():
            continue

        # assign() returns a new frame, so no write happens on a slice of `buildings`.
        labelled_parts.append(in_bbox[inside].assign(**{label_column: label}))

    if not labelled_parts:
        # No polygon matched any building: everything gets the default label.
        return buildings.assign(**{label_column: default_label})

    labelled = pd.concat(labelled_parts)
    unlabelled = buildings[~buildings['id'].isin(labelled['id'])]
    return pd.concat([labelled, unlabelled.assign(**{label_column: default_label})])


# Process one region per iteration. The lists are sorted here so the
# positional pairing by region also holds when the sorting cell was not run.
for smod_json_overview, smod_json_detailed, parquet_filename in zip(
        sorted(smod_overview_json_filenames),
        sorted(smod_detailed_json_filenames),
        sorted(vida_datasets)):

    print('Processing: ', smod_json_overview, smod_json_detailed, parquet_filename)

    buildings_df = gpd.read_parquet(parquet_filename)
    init_len = len(buildings_df)

    # ---- Overview level: fills the `urban_split` column --------------------
    with open(smod_json_overview) as s_f:
        geojson_overview = json.load(s_f)

    main_df = _label_buildings_by_polygons(
        buildings_df,
        geojson_overview['features'],
        label_names=segregation_names,
        label_column='urban_split',
        default_label='Rural',
        progress_desc='Processing overview level',
    )
    print(f'init len {init_len} out len {len(main_df)}')

    # Overlapping polygons yield repeated ids; the first occurrence is kept.
    print('removing duplicates')
    main_df = main_df.drop_duplicates(subset='id')
    print(f'in len {init_len} out len {len(main_df)}')

    main_df = main_df.reset_index(drop=True)

    # ---- Detailed level: fills the `ghsl_smod` column ----------------------
    with open(smod_json_detailed) as s_f:
        geojson_detailed = json.load(s_f)

    main_df = _label_buildings_by_polygons(
        main_df,
        geojson_detailed['features'],
        label_names=segregation_names,
        label_column='ghsl_smod',
        default_label='Low Density Rural',
        progress_desc='Processing detailed level',
    )
    main_df = main_df.drop_duplicates(subset='id')
    print(f'in len {init_len} out len {len(main_df)}')

    # ---- Store the result locally and upload it to COS ---------------------
    filename = parquet_filename.replace('.parquet', '_GHSL_SMOD.parquet')
    try:
        main_df.to_parquet(filename)

        cos_client.upload_file(Filename=filename, Bucket=config["VIDA_COUNTRIES_BUILDINGS"], Key=filename)
        print(f'File uploaded to the COS {filename}')
    except Exception as e:
        # Best effort: a failed write/upload is reported and the next region is processed.
        print(e)