## 9_data_labeling
### Splits the curated labelled data set into testing, validation and training subsets
### Review and, if needed, adjust the cell below that defines the non-residential building types: new_nonresidential_types

### Initial configuration
#### To start working with this particular notebook, you need to provide the necessary credentials and settings
#### Below is a configuration template. Prepare it outside of this notebook, then copy and paste everything between the triple quotes into the input field of the next cell
    """
    {
    "COS_ENDPOINT_URL": "s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "DATA_CURATION_BUCKET": "xxx"
    }
    """


In [None]:
# Read notebook configuration
import getpass
import json

# getpass prompts without echoing the input, so the pasted JSON (which contains the COS API key)
# is not shown in the cell output
config_str = getpass.getpass('Enter your prepared config: ')
# Parsed settings; later cells read COS_APIKEY, COS_AUTH_ENDPOINT_URL, COS_ENDPOINT_URL
# and DATA_CURATION_BUCKET from this dict
config = json.loads(config_str)

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import geopandas as gpd
import rasterio as rio
import rasterio.features
import io, os, sys, traceback
import shapely.geometry as G
from datetime import datetime
from tqdm import tqdm
from matplotlib.path import Path
import base64
import matplotlib.pyplot as plt
from shapely.geometry import Polygon, mapping, Point, MultiPolygon
from PIL import Image
from utils import *
from pyproj import Geod
from collections import Counter
import warnings
import plotly.express as px
import shapely
import ibm_boto3
from botocore.client import Config


# Silence FutureWarning messages for the rest of the notebook
warnings.simplefilter(action='ignore', category=FutureWarning)

# Geodesic calculator on the WGS84 ellipsoid; used further below to compute geometry areas
geod = Geod(ellps="WGS84")

In [None]:
# init S3 client in order to upload data to the curation bucket
# (the same client is also used below to download the input parquet file from that bucket)
# NOTE(review): botocore-style clients normally expect endpoint_url to include the scheme
# (e.g. "https://..."); the configuration template at the top of the notebook shows a bare
# host name - confirm which form is required here.
cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              ibm_auth_endpoint=config["COS_AUTH_ENDPOINT_URL"],
                              config=Config(signature_version='oauth'),
                              endpoint_url=config["COS_ENDPOINT_URL"])

In [None]:
# Assign the file names and bucket setting used by the following cells
# Input: labelled data set that is fetched from the curation bucket and read into ML_df
labelled_data_SMOD_heights_sentinel2_parquet = 'all_labelled_data_SMOD_heights_sentinel2.parquet'
# Output: file the data frame is written to after the L1 class and split columns have been added
labelled_data_finished_split = 'all_merged_L1_SMOD_heights_images.parquet'
# Bucket name from the config; the download and upload steps run only when this is a string
curation_bucket = config["DATA_CURATION_BUCKET"]

In [2]:
# Fetch the labelled data set with all info
# When a curation bucket is configured, the parquet file is downloaded from it first;
# otherwise a local file with the same name must already be present.
if isinstance(curation_bucket, str):

    streaming_body = cos_client.get_object(Bucket=curation_bucket, Key=labelled_data_SMOD_heights_sentinel2_parquet)['Body']
    print("Downloading to local storage :  " + labelled_data_SMOD_heights_sentinel2_parquet)
    # Parquet is binary data, so write the payload in a single buffered call instead of
    # splitting it on newline bytes and pushing each fragment through an unbuffered raw file.
    with open(labelled_data_SMOD_heights_sentinel2_parquet, 'wb') as file:
        file.write(streaming_body.read())

ML_df = gpd.read_parquet(labelled_data_SMOD_heights_sentinel2_parquet)
# Last expression of the cell: displays the loaded data frame
ML_df

In [23]:
# Adjustable list of additional 'osm_type' values that are labelled "nonresidential".
# It is appended to the base list `nonresidentialtypes` in the next cell.
# Entries that are commented out are not part of the list; unless they appear in the base
# list, assign_ML_class() labels them "residential".
# Values are matched by exact string comparison, so misspelled or truncated entries are
# presumably copied verbatim from the data - verify before "correcting" them.
new_nonresidential_types = [
    # 'residential',
    # 'house',
    # 'hut',
    # 'shed',
    'construction',
    'cb',
    'garage',
    'detached',
    'place_of_worship',
    'library',
    'fuel',
    'yes',
    'aerodrome',
    'education',
    'religious',
    'agricultural',
    'bank',
    'ruins',
    'hostel',
    'bungalow',
    'no',
    # 'apartments',
    'roof',
    'storage_tank',
    'fire_station',
    'institutional',
    'mall',
    'car',
    'police',
    'plant',
    'quarry',
    'entertainment',
    'carport',
    'greenhouse_horticulture',
    'cinema',
    'terrace',
    'track',
    'dormitory',
    # 'guest_house',
    'gatehouse',
    'pavilion',
    'medical',
    'cabin',
    'theatre',
    'semidetached_house',
    'multipolygon',
    'garages',
    'gate',
    'construction(1)',
    'construction(2)',
    'construction(3)',
    'silo',
    'farmyard',
    'grandstand',
    'tent',
    'container',
    'toilets',
    'bridge',
    'chri',
    'observing tower',
    'foundation',
    'diplomatic',
    'sty',
    'foundaction',
    'parking',
    'CBA_HOUSE',
    'consturuction',
    'gazebo',
    'utility',
    'commercial;residenti',
    'commercia;lresidenti',
    'unkown',
    'open-air',
    'Maya Primary School',
    'swimming pool',
    'collapsed',
    'allotment_house',
    'co operative bank at',
    'abandoned'
    ]

In [24]:
# Base list of 'osm_type' values that are labelled "nonresidential".
# Matching in assign_ML_class() is an exact string comparison.
nonresidentialtypes = [
    'retail', 'office', 'school', 'commercial', 'university', 'chapel', 'industrial', 'service',
    'church', 'hospital', 'government', 'hotel', 'mosque', 'greenhouse', 'hangar', 'farm', 'stadium',
    'transportation', 'warehouse', 'clinic', 'public', 'store', 'kitchen', 'Wasini hostel', 'Mortuary',
    'commercial;yes', 'Petrol station', 'Dispensary', 'Medical Laboratory', 'manufacture', 'supermarket',
    'inn', 'greenhouse_horticult', 'cowshed', 'temple', 'kindergarten', 'barn', 'stable', 'business',
    'train_station', 'restaurant', 'college', 'bakehouse', 'civic', 'farm_auxiliary', 'resturant', 'cathedral',
    'yes;office', 'static_caravan', 'kiosk', 'monastery', 'convent', 'sports_centre'
    ]

# Extend the base list with the adjustable list defined in the previous cell
nonresidentialtypes = nonresidentialtypes + new_nonresidential_types

def assign_ML_class(osm_type):
    """Return the L1 class label for an OSM building type.

    Values contained in the module-level ``nonresidentialtypes`` list give
    "nonresidential"; every other value gives "residential".
    """
    return "nonresidential" if osm_type in nonresidentialtypes else "residential"

# Label every row with its L1 class, derived from the row's 'osm_type' value
ML_df['L1_class'] = ML_df['osm_type'].apply(assign_ML_class)

# Last expression of the cell: displays the number of rows per class
Counter(ML_df.L1_class)

Counter({'nonresidential': 100253, 'residential': 67066})

In [56]:
# assign appropriate ML tag 70% train, 20% validation 10% test
def assign_label(idx):
    """Map a running index to a split name based on its last decimal digit.

    The last character of ``str(idx)`` decides the result: 0-6 gives 'train',
    7-8 gives 'validation' and 9 gives 'test'. Any other last character
    gives ``None``.
    """
    last_char = str(idx)[-1]

    if last_char in '0123456':
        return 'train'
    if last_char in '78':
        return 'validation'
    if last_char == '9':
        return 'test'
    return None


# Split every L1 class 70/20/10 into train/validation/test.
# Rows are ordered by area and labelled by their position inside their own class,
# presumably so that each subset spans the full range of building areas.
data_len = len(ML_df)
# Running row number; also used as the column that is counted in the summary below
ML_df['index_column'] = range(data_len)

# The sort order does not depend on the class, so sort once for all classes
ML_df = ML_df.sort_values('area_in_meters', ascending=True)

# 0-based position of each row within its own L1 class, in area order
position_in_class = ML_df.groupby('L1_class').cumcount()
# Assigned positionally (same row order as ML_df), so the result does not depend on
# the index labels being unique
ML_df['image_ML_type'] = position_in_class.map(assign_label).to_numpy()

# Summary: number of rows per (split, class) and their share of the whole data set
split_result = ML_df[['image_ML_type', 'L1_class', 'index_column']].groupby(['image_ML_type', 'L1_class']).count()
split_result['split in %'] = round(100 * split_result['index_column'] / data_len, 3)
print(split_result)


                              index_column  split in %
image_ML_type L1_class                                
test          nonresidential          4018       3.855
              residential             6405       6.145
train         nonresidential         28129      26.986
              residential            44836      43.015
validation    nonresidential          8036       7.710
              residential            12810      12.290


In [59]:
# Geodesic area of every geometry; abs() drops the sign, which depends on ring orientation.
# NOTE(review): the split cell above already sorts on 'area_in_meters', so this column must
# exist before this point and is overwritten here - confirm that recomputing it after the
# split (rather than before) is intended.
ML_df['area_in_meters'] = ML_df["geometry"].apply(lambda g: abs(geod.geometry_area_perimeter(g)[0]))
# Write the labelled and split data set to a local parquet file
ML_df.to_parquet(labelled_data_finished_split)

# optionally upload file to the bucket
if isinstance(curation_bucket, str):

    try:
        cos_client.upload_file(
            Filename=labelled_data_finished_split,
            Bucket=curation_bucket,
            Key=labelled_data_finished_split,
            ExtraArgs={'ContentDisposition': 'attachment'}
        )
    except Exception as e:
        # The upload is best-effort: report the failure but keep the local file and the notebook running
        print(f"\033[91mFailed upload file to the bucket {curation_bucket}. Error: {e}")
    else:
        print(f'File {labelled_data_finished_split} successfully uploaded to the COS {curation_bucket} bucket')