## 6_SMOD_calculation
### Adds SMOD definition to the labelled data set
### Make sure the detailed level segregation from the 8_rural_urban_json_segregation notebook is present in the data curation bucket
### Make sure the SMOD definitions are exact values from the JSON file mentioned above. Configure the SMOD_mapper if needed

### Initial configuration
#### To start working with this notebook, you need to provide the necessary credentials and settings.
#### Below is a configuration template. Prepare it outside of this notebook, fill in your own values, and then copy & paste everything between the triple quotes into the input field of the next cell.
    """
    {
    "COS_ENDPOINT_URL": "s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "DATA_CURATION_BUCKET": "xxx",
    "SMOD_DEFINITION_JSON": "Kenya_segmentation_2025.json"
    }
    """


In [None]:
# Read notebook configuration
import getpass
import json

# Keys that the cells below read from the configuration.
REQUIRED_CONFIG_KEYS = (
    "COS_ENDPOINT_URL",
    "COS_AUTH_ENDPOINT_URL",
    "COS_APIKEY",
    "DATA_CURATION_BUCKET",
    "SMOD_DEFINITION_JSON",
)

# getpass keeps the pasted configuration (it contains the API key) out of the
# notebook output.
config_str = getpass.getpass('Enter your prepared config: ')
config = json.loads(config_str)

# Fail here, with a clear message, instead of with a bare KeyError several
# cells later.
if not isinstance(config, dict):
    raise TypeError(f"Configuration must be a JSON object, got {type(config).__name__}")

missing_keys = [key for key in REQUIRED_CONFIG_KEYS if key not in config]
if missing_keys:
    raise KeyError(f"Configuration is missing required keys: {missing_keys}")

In [1]:
# Import necessary libraries
import json
import geopandas as gpd
import pandas as pd
import shapely
from tqdm import tqdm
from collections import Counter
from IPython.display import clear_output
from botocore.client import Config
import ibm_boto3
import io


In [None]:
# Init the S3 (IBM Cloud Object Storage) client used to download data from and
# upload data to the curation bucket.

# botocore-style clients expect the endpoint as a full URL (scheme included).
# Default to HTTPS when the configured value is a bare host name.
cos_endpoint_url = config["COS_ENDPOINT_URL"]
if "://" not in cos_endpoint_url:
    cos_endpoint_url = f"https://{cos_endpoint_url}"

cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              ibm_auth_endpoint=config["COS_AUTH_ENDPOINT_URL"],
                              config=Config(signature_version='oauth'),
                              endpoint_url=cos_endpoint_url)

In [None]:
# Object keys / local file names used by this notebook.
# Input: labelled buildings; output: the same data with SMOD columns added.
labelled_data_parquet, labelled_data_SMOD_parquet = (
    'all_labelled_data.parquet',
    'all_labelled_data_SMOD.parquet',
)

# Values taken from the user-supplied configuration.
smod_polygons_json = config["SMOD_DEFINITION_JSON"]  # SMOD polygon definitions
curation_bucket = config["DATA_CURATION_BUCKET"]     # bucket read from and written to

In [None]:
# Fetch the labelled data set and the SMOD polygons
def _download_object(bucket, key):
    """Download object `key` from COS bucket `bucket` into a local file named `key`.

    Args:
        bucket: Name of the bucket to read from.
        key: Object key in the bucket; also used as the local file name.
    """
    print("Downloading to local storage :  " + key)
    # Counterpart of the `upload_file` call used at the end of the notebook:
    # writes the object straight to the local file.
    cos_client.download_file(Bucket=bucket, Key=key, Filename=key)


if isinstance(curation_bucket, str):
    for object_key in (smod_polygons_json, labelled_data_parquet):
        _download_object(curation_bucket, object_key)

In [7]:
# Load the labelled data set from the local parquet file
main_df = gpd.read_parquet(labelled_data_parquet)

In [8]:
# Load the SMOD segmentation polygons from the local JSON file and display them.
# The `seg_type` column holds the class name that SMOD_mapper keys are matched against.
segmentation = gpd.read_file(smod_polygons_json)
segmentation

Unnamed: 0,seg_type,geometry
0,URBAN Centre (City),"POLYGON ((40.70719 4.22738, 40.69723 4.22738, ..."
1,URBAN Centre (City),"POLYGON ((40.73707 4.21741, 40.72711 4.21741, ..."
2,URBAN Centre (City),"POLYGON ((40.19914 4.02814, 40.19416 4.03312, ..."
3,URBAN Centre (City),"POLYGON ((40.33860 4.00822, 40.33362 4.01320, ..."
4,URBAN Centre (City),"POLYGON ((41.85279 3.91856, 41.84283 3.91856, ..."
...,...,...
15468,Very low density rural grids (Mostly uninhabit...,"POLYGON ((39.28266 -4.66845, 39.27768 -4.66347..."
15469,Very low density rural grids (Mostly uninhabit...,"POLYGON ((39.36235 -4.66845, 39.35737 -4.66347..."
15470,Very low density rural grids (Mostly uninhabit...,"POLYGON ((39.38228 -4.66845, 39.37730 -4.66347..."
15471,Very low density rural grids (Mostly uninhabit...,"POLYGON ((39.22289 -4.67841, 39.21293 -4.67369..."


In [9]:
# Change the definition if needed.
# Keys are compared with the `seg_type` column of `segmentation` by exact string
# equality, so they must be spelled exactly as in the segmentation JSON.
# Values are written to the `SMOD_id` column.
SMOD_mapper = {
    # 'LOW_DENSITY_RURAL': 1,   (not listed here: assigned to every row as the default)
    'RURAL_CLUSTER': 2,
    'SUBURBAN_PERI_URBAN': 3,
    'SEMI_DENSE_URBAN': 4,
    'DENSE_URBAN': 5,
    'URBAN_CENTER': 6,
}

# Class names in mapper (= processing) order
SMOD_items = list(SMOD_mapper)


In [10]:
# Every building starts in the default class; the labelling loop overrides it
# for buildings that fall inside a polygon of one of the SMOD_mapper classes.
main_df['SMOD_name'] = 'LOW_DENSITY_RURAL'
main_df['SMOD_id'] = 1

In [11]:
# Label every building with the SMOD class of the polygon that contains its
# (longitude, latitude) point. Classes are applied in SMOD_mapper order, so a
# class listed later overrides an earlier one where polygons overlap.

# Fail early on a mis-configured mapper: a key that is not an exact `seg_type`
# value selects no polygons, which would silently leave the defaults in place.
available_seg_types = set(segmentation.seg_type.dropna().unique())
unmatched_names = [name for name in SMOD_mapper if name not in available_seg_types]
if SMOD_mapper and len(unmatched_names) == len(SMOD_mapper):
    raise ValueError(
        f'None of the SMOD_mapper keys {unmatched_names} match a seg_type in '
        f'{smod_polygons_json}. Available seg_type values: {sorted(available_seg_types)}'
    )
if unmatched_names:
    print(f'Warning: no polygons found for SMOD_mapper keys {unmatched_names}; these classes are skipped')

# Work on positional arrays rather than index labels: selecting rows with
# `main_df.index.isin(...)` would also relabel unrelated rows whenever the
# index of main_df contains repeated labels.
latitudes = main_df.latitude.to_numpy()
longitudes = main_df.longitude.to_numpy()
smod_names = main_df['SMOD_name'].to_numpy(dtype=object, copy=True)
smod_ids = main_df['SMOD_id'].to_numpy(copy=True)

for smod_name, smod_id in SMOD_mapper.items():

    smod_polygons = segmentation[segmentation.seg_type == smod_name].geometry

    for poly in tqdm(smod_polygons, total=len(smod_polygons), desc=f'Processing: {smod_name}'):

        min_lon, min_lat, max_lon, max_lat = poly.bounds

        # Cheap bounding-box pre-filter before the exact point-in-polygon test
        in_bbox = (
            (latitudes >= min_lat) & (latitudes <= max_lat) &
            (longitudes >= min_lon) & (longitudes <= max_lon)
        )
        candidate_rows = in_bbox.nonzero()[0]
        if candidate_rows.size == 0:
            continue

        # Vectorised equivalent of poly.contains(shapely.Point(lon, lat)) per row
        in_polygon = shapely.contains_xy(poly, longitudes[candidate_rows], latitudes[candidate_rows])
        rows_in_polygon = candidate_rows[in_polygon]

        smod_names[rows_in_polygon] = smod_name
        smod_ids[rows_in_polygon] = smod_id

# Write the labels back positionally (row order is unchanged)
main_df['SMOD_name'] = smod_names
main_df['SMOD_id'] = smod_ids


Processing: Low density rural grids cells (Dispersed rural area): 100%|██████████| 9780/9780 [18:43<00:00,  8.70it/s]   
Processing: RURAL cluster (Village): 100%|██████████| 1811/1811 [00:49<00:00, 36.77it/s]
Processing: Suburban or peri-urban cells (Suburb): 100%|██████████| 901/901 [03:20<00:00,  4.50it/s]
Processing: Dense and semi-dense urban cluster (Town): 100%|██████████| 746/746 [00:20<00:00, 36.66it/s]
Processing: URBAN Centre (City): 100%|██████████| 55/55 [00:41<00:00,  1.32it/s]


In [12]:
# Sanity check: rows left without a SMOD class (missing or empty name).
# The displayed frame is expected to be empty.
main_df[main_df.SMOD_name.isna() | (main_df.SMOD_name == '')]

Unnamed: 0,latitude,longitude,vida_confidence,geometry,building_area_in_meters,county,SMOD_name,SMOD_id


In [13]:
# Number of buildings per SMOD class
Counter(main_df.SMOD_name)

Counter({'URBAN Centre (City)': 1393095,
         'Suburban or peri-urban cells (Suburb)': 219182,
         'Low density rural grids cells (Dispersed rural area)': 43758,
         'Dense and semi-dense urban cluster (Town)': 31958,
         'Very low density rural grids (Mostly uninhabited area)': 14246,
         'RURAL cluster (Village)': 4129})

In [14]:
# Keep only the first row for each distinct building geometry, then display the result
main_df = main_df.drop_duplicates(subset=['geometry'], keep='first')
main_df

Unnamed: 0,latitude,longitude,vida_confidence,geometry,building_area_in_meters,county,SMOD_name,SMOD_id
5588,-0.883311,35.014648,0.7856,"POLYGON ((35.01468 -0.88330, 35.01463 -0.88329...",21.586176,Bomet,Dense and semi-dense urban cluster (Town),5
5606,-0.882994,35.014654,0.0000,"POLYGON ((35.01468 -0.88302, 35.01468 -0.88298...",25.149882,Bomet,Dense and semi-dense urban cluster (Town),5
5614,-0.883239,35.014658,0.8072,"POLYGON ((35.01468 -0.88326, 35.01468 -0.88321...",24.440989,Bomet,Suburban or peri-urban cells (Suburb),4
6765,-0.882690,35.015054,0.7530,"POLYGON ((35.01507 -0.88271, 35.01507 -0.88267...",22.086248,Bomet,Suburban or peri-urban cells (Suburb),4
7126,-0.883227,35.015184,0.8424,"POLYGON ((35.01521 -0.88320, 35.01516 -0.88320...",35.753080,Bomet,Suburban or peri-urban cells (Suburb),4
...,...,...,...,...,...,...,...,...
1024172,-1.263098,37.096335,0.8918,"POLYGON ((37.09638 -1.26312, 37.09637 -1.26310...",81.492523,Nairobi,Low density rural grids cells (Dispersed rural...,2
1024188,-1.266434,37.101554,0.8751,"POLYGON ((37.10160 -1.26645, 37.10158 -1.26639...",72.363677,Nairobi,Low density rural grids cells (Dispersed rural...,2
1024189,-1.268861,37.101554,0.8099,"POLYGON ((37.10157 -1.26883, 37.10155 -1.26885...",41.212594,Nairobi,Low density rural grids cells (Dispersed rural...,2
1024217,-1.265807,37.101614,0.8286,"POLYGON ((37.10165 -1.26583, 37.10164 -1.26577...",45.291012,Nairobi,Low density rural grids cells (Dispersed rural...,2


In [17]:
# Write the labelled data set (now with SMOD columns) to a local parquet file
main_df.to_parquet(labelled_data_SMOD_parquet)

# optionally upload file to the bucket
if isinstance(curation_bucket, str):

    try:
        cos_client.upload_file(
            Filename=labelled_data_SMOD_parquet,
            Bucket=curation_bucket,
            Key=labelled_data_SMOD_parquet,
            ExtraArgs={'ContentDisposition': 'attachment'}
        )
    except Exception as e:
        # Best effort: the local parquet file is already written, so the failure
        # is reported rather than raised. \033[91m switches the text to red and
        # \033[0m resets it so that later output is not coloured as well.
        print(f"\033[91mFailed upload file to the bucket {curation_bucket}. Error: {e}\033[0m")
    else:
        print(f'File {labelled_data_SMOD_parquet} successfully uploaded to the COS {curation_bucket} bucket')