## 11_building_image_and_metadata_extraction
### Crops building roof images from the Sentinel-2 tiles, uploads them to the COS bucket and stores the corresponding image location in the DB2 database

### Initial configuration
#### To start working with this notebook, you need to provide the necessary credentials and settings.
#### Below is a template of the configuration. Prepare it outside of this notebook, then copy & paste everything between the triple quotes into the input field of the next cell.
    """
    {
    "COS_ENDPOINT_URL": "s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "UTILS_BUCKET": "notebook-utils-bucket",
    "BUCKET_TIFF": "geotiffs",
    "COUNTRY_NAME": "Kenya",
    "DB2_CONNECTION_STRING": "jdbc:db2://65beb513-5d3d-4101-9001-f42e9dc954b3.brt9d04f0cmqeb8u7740.databases.appdomain.cloud:30371/BLUDB:sslConnection=true;useJDBC4ColumnNameAndLabelSemantics=false;db2.jcc.charsetDecoderEncoder=3;",
    "DB2_USERNAME": "xxx",
    "DB2_PASSWORD": "xxx",
    "COUNTRY_TABLE": "FEATURES_DB_VIDA_EXTENDED",
    "MGRS_COMPRESSED_IMAGES_BUCKET": "building-image-compression",
    "AREA_THRESHOLD": 20
    }
    """


In [1]:
# Read notebook configuration
import getpass
import json
import re

# The configuration carries credentials, so it is read through getpass
# instead of a plain input prompt.
config_str = getpass.getpass(prompt='Enter your prepared config: ')
# Parse the pasted JSON text into the `config` dictionary used by the cells below.
config = json.loads(s=config_str)

In [3]:
# Import necessary libraries
import io
from PIL import Image
import ibm_boto3
from botocore.client import Config
import numpy as np
import configparser
import os
from ibm_cloud_sdk_core import ApiException
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
from datetime import datetime
import rasterio
from pyproj import Geod
from shapely.geometry import Polygon, MultiPolygon, mapping, Point
import matplotlib.pyplot as plt

import jaydebeapi as jdbc
import jpype

from pykml import parser
import mgrs
import requests
import fiona

import shapely

from fiona.drvsupport import supported_drivers
# Mark the KML driver as read/write in fiona's driver table
# (presumably required so that gpd.read_file(..., driver='KML') below works - verify).
supported_drivers['KML'] = 'rw'

In [4]:
# initiate the S3 client
cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              ibm_auth_endpoint=config["COS_AUTH_ENDPOINT_URL"],
                              config=Config(signature_version='oauth'),
                              endpoint_url=config["COS_ENDPOINT_URL"])

# Listing of the bucket that holds the helper modules (utils and friends).
response = cos_client.list_objects_v2(Bucket=config["UTILS_BUCKET"])

# download utils module
try:
    from utils import *
    print('External utils succesfully imported')

except Exception as e:
    # Best effort: any failure to import the local copy triggers a fresh
    # download of every object in the utils bucket, followed by a retry.
    print('Desired packages is missing in local env, downloading it...', e)
    # An empty bucket may come back without a 'Contents' entry, so fall back
    # to an empty list instead of failing with a KeyError.
    for obj in response.get('Contents', []):
        name = obj['Key']
        streaming_body_1 = cos_client.get_object(Bucket=config["UTILS_BUCKET"], Key=name)['Body']
        print("Downloading to localStorage :  " + name)
        # Write the payload in one go; splitting binary data into "lines"
        # via BytesIO iteration adds nothing.
        with open(name, 'wb') as file:
            file.write(streaming_body_1.read())
    from utils import *
    print('External utils succesfully imported')
  

External utils succesfully imported


In [5]:
# connect to the IBM DB2 function
def connect_to_db():
    """Start the JVM (if needed) and open a JDBC connection to DB2.

    The DB2 JDBC driver is expected as ``db2jcc4.jar`` in the working
    directory. Connection string and credentials are read from the global
    ``config`` (``DB2_CONNECTION_STRING``, ``DB2_USERNAME``, ``DB2_PASSWORD``).

    Returns:
        The connection object returned by ``jaydebeapi.connect``.
    """
    jar = 'db2jcc4.jar'
    os.environ['CLASSPATH'] = jar

    args = '-Djava.class.path=%s' % jar
    jvm_path = jpype.getDefaultJVMPath()

    # Re-running the cell must not try to start a second JVM; only start it
    # when none is running yet. A start failure is still reported, not raised,
    # so that the connection attempt below produces the definitive error.
    if not jpype.isJVMStarted():
        try:
            jpype.startJVM(jvm_path, args)
        except Exception as e:
            print('startJVM exception: ', e)

    # NOTE(review): the captured cell output points at the line below, which
    # suggests JPype emits a warning for isThreadAttachedToJVM/attachThreadToJVM
    # - confirm against the installed JPype version before replacing them.
    if jpype.isJVMStarted() and not jpype.isThreadAttachedToJVM():
        jpype.attachThreadToJVM()
        jpype.java.lang.Thread.currentThread().setContextClassLoader(jpype.java.lang.ClassLoader.getSystemClassLoader())

    # create JDBC connection
    conn = jdbc.connect(
                'com.ibm.db2.jcc.DB2Driver',
                config['DB2_CONNECTION_STRING'],
                [config["DB2_USERNAME"], config["DB2_PASSWORD"]],
                jar)

    return conn

DB2_connection = connect_to_db()


  if jpype.isJVMStarted() and not jpype.isThreadAttachedToJVM():


In [6]:
# Settings taken from the pasted configuration.
# TYPE_SOURCE_FILTER and DB_NAME are not part of the configuration template at
# the top of this notebook, so they are read as optional (None when absent)
# instead of aborting the cell with a KeyError.
type_source = config.get("TYPE_SOURCE_FILTER")
BUCKET_TIFF = config["BUCKET_TIFF"]
DB_NAME = config.get("DB_NAME")
country = config["COUNTRY_NAME"]
country_table = config["COUNTRY_TABLE"]
# The template spells the key AREA_THRESHOLD; the misspelled AREA_TRESHOLD is
# still accepted so that existing configurations keep working.
threshold = float(
    config["AREA_THRESHOLD"] if "AREA_THRESHOLD" in config else config["AREA_TRESHOLD"]
)  # threshold in square meters.

#### Generate list of coordinates based on grid

In [8]:
def download_and_open_kml(url, destination):
    """Download a KML file to disk and load it with GeoPandas.

    Args:
        url: address the KML file is fetched from.
        destination: local path the downloaded bytes are written to.

    Returns:
        The result of ``gpd.read_file`` on the stored file, or ``None`` when
        the server did not answer with status code 200.
    """
    reply = requests.get(url)

    # Anything other than HTTP 200 is treated as a failed download.
    if reply.status_code != 200:
        print(f"Failed to download file. Status code: {reply.status_code}")
        return None

    # Persist the raw payload first, then read it back from disk.
    with open(destination, 'wb') as kml_file:
        kml_file.write(reply.content)
    print(f"File downloaded successfully to: {destination}")

    return gpd.read_file(destination, driver='KML')

def open_kml_from_url(url):
    """Load a KML file straight from a URL without storing it on disk.

    Args:
        url: address the KML file is fetched from.

    Returns:
        The result of ``gpd.read_file`` on the downloaded content, or ``None``
        when the server did not answer with status code 200.
    """
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        return None

    # Wrap the payload in a file-like object. The class is referenced through
    # the ``io`` module because a bare ``BytesIO`` is never imported here.
    content = io.BytesIO(response.content)

    # Open the KML file using GeoPandas
    return gpd.read_file(content, driver='KML')

def get_lat_lon_bounds(df, tile, MGRS_tiles):
    """Return the latitude/longitude bounding box of one tile.

    Args:
        df: table with a ``Name`` column and a ``geometry`` column whose
            values expose ``.bounds`` as ``(minx, miny, maxx, maxy)``.
        tile: value looked up in the ``Name`` column.
        MGRS_tiles: not used; kept so existing callers keep working.

    Returns:
        A dict with the keys ``min_lat``, ``max_lat``, ``min_lon`` and
        ``max_lon``. All four values are NaN when ``tile`` is not in ``df``.
    """
    # Filter the table that was passed in (not a module-level global) so the
    # function works for any caller-supplied frame.
    tile_df = df[df['Name'] == tile]

    if tile_df.empty:
        # Handle cases where the tile might not exist in the DataFrame
        return {
            'min_lat': np.nan,
            'max_lat': np.nan,
            'min_lon': np.nan,
            'max_lon': np.nan
        }

    # bounds is (minx, miny, maxx, maxy): x maps to longitude, y to latitude.
    minx, miny, maxx, maxy = tile_df.geometry.iloc[0].bounds

    return {
        'min_lat': miny,
        'max_lat': maxy,
        'min_lon': minx,
        'max_lon': maxx
    }

In [9]:
# process kml for desired country
url = 'https://sentinel.esa.int/documents/247904/1955685/S2A_OPER_GIP_TILPAR_MPC__20151209T095117_V20150622T000000_21000101T000000_B00.kml'
destination = 'S2A_OPER.kml'

df_kml = download_and_open_kml(url, destination)

m = mgrs.MGRS()

countries_geoJSON_url = 'https://datahub.io/core/geo-countries/r/countries.geojson'
countries_response = requests.get(countries_geoJSON_url)
# Fail right here on an HTTP error instead of on a confusing JSON parse error.
countries_response.raise_for_status()
countries_geoJSON = json.loads(countries_response.content)

country_geometry = {}
for feature in countries_geoJSON['features']:
    if feature['properties']['ADMIN'] == country:
        country_geometry['geometry'] = feature['geometry']
        break

if 'geometry' not in country_geometry:
    raise ValueError(f"Country '{country}' was not found in {countries_geoJSON_url}")

# shape() understands the GeoJSON layout (rings, holes) for both Polygon and
# MultiPolygon, which the plain geometry constructors do not.
coordinates = shapely.geometry.shape(country_geometry['geometry'])
if coordinates.geom_type == 'MultiPolygon':
    country_polygons = list(coordinates.geoms)
elif coordinates.geom_type == 'Polygon':
    # A single Polygon has no .geoms, so wrap it to share the loop below.
    country_polygons = [coordinates]
else:
    raise ValueError(f"Unsupported geometry type for '{country}': {coordinates.geom_type}")

# collect MGRS tiles for desired country
MGRS_tiles = set()
resolution = 0.05  # grid step in degrees

for polygon in country_polygons:
    # Tiles touched by the border itself; GeoJSON points are (lon, lat)
    # while toMGRS takes (latitude, longitude).
    for xy in polygon.exterior.coords:
        MGRS_tiles.add(m.toMGRS(xy[1], xy[0], MGRSPrecision=0))

    # bounds is (minx, miny, maxx, maxy), i.e. (lon_min, lat_min, lon_max, lat_max)
    lonmin, latmin, lonmax, latmax = polygon.bounds

    # Tiles covered by a regular grid over the polygon's bounding box.
    for lat in np.arange(latmin, latmax, resolution):
        for lon in np.arange(lonmin, lonmax, resolution):
            MGRS_tiles.add(m.toMGRS(round(lat, 4), round(lon, 4), MGRSPrecision=0))

MGRS_tiles = list(MGRS_tiles)

File downloaded successfully to: S2A_OPER.kml


#### Load New Tif Files

In [None]:
# List the tiff files stored in the source bucket.
tif_images_bucket = cos_client.list_objects_v2(Bucket=BUCKET_TIFF)
# An empty bucket may come back without a 'Contents' entry; treat it as "no files".
# NOTE(review): a single list_objects_v2 call may not return every key of a
# large bucket - confirm whether pagination is needed here.
tif_images_objects = tif_images_bucket.get('Contents', [])

tif_images_filenames = [obj['Key'] for obj in tif_images_objects]
tif_images_filenames

In [11]:
# Archives that already exist in the target bucket. The bucket may be empty
# (e.g. on the first run), in which case the listing may carry no 'Contents' entry.
compressed_images_objects = cos_client.list_objects_v2(
    Bucket=config["MGRS_COMPRESSED_IMAGES_BUCKET"]
).get('Contents', [])
# Map each archive name back to the tiff file name it was produced from:
# compressed_images_<prefix>.zip -> <prefix>_cloudless.tif
compressed_images_filenames = [
    obj['Key'].replace('compressed_images_', '').replace('.zip', '_cloudless.tif')
    for obj in compressed_images_objects
]

In [12]:
# Skip tiff files that already have an archive. Filtering (instead of
# list.remove) tolerates archives whose tiff is no longer in the source
# bucket and avoids a scan of the list per excluded name.
already_processed = set(compressed_images_filenames)
tif_images_filenames = [name for name in tif_images_filenames if name not in already_processed]

In [14]:
# Prepare GeoDataframe of buildings from bounding box
def fetch_builings_in_bbox(lon_min, lon_max, lat_min, lat_max):
    """Load the buildings inside a bounding box from DB2 as a GeoDataFrame.

    Selects ``ID`` and ``POLYGON_COORDINATES`` from ``USER1.<country_table>``
    for rows whose LATITUDE/LONGITUDE lie inside the box (inclusive) and whose
    ``AREA_IN_METERS`` exceeds the global ``threshold``.

    Args:
        lon_min, lon_max: longitude limits of the box.
        lat_min, lat_max: latitude limits of the box.

    Returns:
        GeoDataFrame (EPSG:4326) with the columns ``doc_id``, ``geometry``
        (parsed from the WKT in POLYGON_COORDINATES) and
        ``corresponding_tiff`` (initialised to ``'NA'``).
    """
    sql = f"""
        SELECT DISTINCT ID, POLYGON_COORDINATES FROM USER1.{country_table}
        WHERE
            (LATITUDE >= {lat_min}) AND
            (LATITUDE <= {lat_max}) AND
            (LONGITUDE >= {lon_min}) AND
            (LONGITUDE <= {lon_max}) AND
            AREA_IN_METERS > {threshold}
            """
    cursor = DB2_connection.cursor()
    try:
        cursor.execute(sql)
        data = cursor.fetchall()
    finally:
        # Release the statement even when the query fails.
        cursor.close()

    gpd.options.display_precision = 7

    df = pd.DataFrame(data=data, columns=['doc_id', 'polygon_coordinates'])
    df = df.astype({'doc_id': str, 'polygon_coordinates': str})

    # Turn the WKT text into geometries and drop the raw text column.
    df['geometry'] = gpd.GeoSeries.from_wkt(df['polygon_coordinates'])
    df = df.drop(columns=['polygon_coordinates'])

    df = gpd.GeoDataFrame(
        df, geometry=df.geometry, crs="EPSG:4326"
    )

    # Keep only rows that have a geometry.
    df = df.where(~df['geometry'].isna()).dropna()

    # Placeholder that is filled in once the matching tiff file is known.
    df["corresponding_tiff"] = 'NA'

    return df

#### Process, compress and upload to S3 bucket

Define functions for compression, upload to S3 and update building information in DB2:

In [19]:
import base64
import io
import shutil
import time
import pandas as pd
import numpy as np

def match_corresponding_tiff(df, areas_covered_by_tifs, path_to_tif_folder):
    """Fill the ``corresponding_tiff`` column of ``df`` row by row.

    For every row, ``get_path_to_tif`` is called with the row's geometry and
    its result is stored in ``corresponding_tiff``. A failure for one row is
    printed and does not stop the remaining rows.

    Returns:
        The same ``df`` object, modified in place.
    """
    for record in df.itertuples():
        try:
            tif_path = get_path_to_tif(record.geometry, areas_covered_by_tifs, path_to_tif_folder)
            df.at[record.Index, 'corresponding_tiff'] = tif_path
        except Exception as e:
            print(f'Exception occurred {e} for row: {record}')

    return df

# This is an auxiliary function, not used directly - it can be used to obtain the filename of the archive for a given tiff file
def get_compressed_file_name(tif_name):
    """Return the archive file name that belongs to a tiff file.

    Archives are named after the part of the tiff file name in front of the
    first underscore, e.g. ``36MZE_cloudless.tif`` maps to
    ``compressed_images_36MZE.zip``. A leading folder such as ``tiff/`` is
    ignored.

    Args:
        tif_name: file name (optionally with a folder prefix) of the tiff.

    Returns:
        The archive name as a string.
    """
    # Only the file name matters, not the folder it is stored in.
    base_name = tif_name.rsplit('/', 1)[-1]
    prefix = base_name.split('_')[0]
    return f'compressed_images_{prefix}.zip'

def compress_and_upload_images(df, object_name, bucket_name):
    """Upload the cropped images of ``df`` as one CSV object to the bucket.

    Only rows with a non-null ``image_name`` are uploaded. The columns
    ``doc_id``, ``geometry``, ``corresponding_tiff``, ``image_name``,
    ``tiff_name`` and ``image_source_bytes`` are written as CSV (NaN replaced
    by ``'NA'``) and stored under ``object_name`` in ``bucket_name``.
    Afterwards the local ``tiff/`` directory is deleted.

    Errors are printed, not raised, and the function returns ``None``.

    Args:
        df: table of buildings, including the image columns.
        object_name: key of the object to create in the bucket.
        bucket_name: name of the target bucket.
    """
    t2 = time.time()

    try:
        buildings_no_tag_toDB = df.loc[df['image_name'].notnull()]
        # Counting the rows is enough; decoding every image just to count
        # them would only cost time and memory.
        image_count = len(buildings_no_tag_toDB)

        print(f"Number of images were found: {image_count}")
        if image_count != 0:
            print('Start images upload')

            try:
                # replace() returns a new frame, so the caller's data is left
                # untouched and no chained-assignment warning is raised.
                metadata_df = buildings_no_tag_toDB[['doc_id',
                                                     'geometry',
                                                     'corresponding_tiff',
                                                     'image_name',
                                                     'tiff_name',
                                                     'image_source_bytes',
                                                    ]].replace({np.nan: 'NA'})

                # Serialise the table to CSV in memory
                csv_buffer = io.BytesIO()
                metadata_df.to_csv(csv_buffer, index=False)
                csv_buffer.seek(0)

                print(f'Compression time: {time.strftime("%H:%M:%S", time.gmtime(int(time.time() - t2)))}')

                # Upload the CSV payload to the bucket
                cos_client.upload_fileobj(csv_buffer, bucket_name, object_name)

                print(f'Successfully uploaded {object_name} to {bucket_name}')

            except Exception as e:
                print(f'Error processing images: {e}')

        else:
            print('Zero images were found')

    except Exception as e:
        print('Image upload error:', e)

    print('Delete entire tiff/ directory')
    shutil.rmtree('tiff/', ignore_errors=True)

    print(f'Database upload serially: {time.strftime("%H:%M:%S", time.gmtime(int(time.time() - t2)))}')

# This is an auxiliary function, not used directly - it can be used to obtain building roof images
def download_and_decompress_images(bucket_name, object_name):
    """Fetch a stored CSV object from the bucket and load it as a DataFrame.

    Args:
        bucket_name: bucket the object is read from.
        object_name: key of the object.

    Returns:
        The parsed DataFrame with ``'NA'`` placeholders turned back into NaN,
        or ``None`` when anything goes wrong (the error is printed).
    """
    def _elapsed(since):
        # Seconds since `since`, formatted as HH:MM:SS.
        return time.strftime("%H:%M:%S", time.gmtime(int(time.time() - since)))

    try:
        download_started = time.time()
        # Step 1: read the whole object into memory
        payload = cos_client.get_object(Bucket=bucket_name, Key=object_name)['Body'].read()
        print(f'Download time: {_elapsed(download_started)}')

        parse_started = time.time()
        # Step 2: bytes -> text -> DataFrame
        frame = pd.read_csv(io.StringIO(payload.decode()))

        # Step 3: undo the placeholder that was written in place of NaN
        frame = frame.replace({'NA': np.nan})

        print(f'Decompression time: {_elapsed(parse_started)}')
    except Exception as e:
        print(f'Error downloading and decompressing images: {e}')
        return None

    return frame

# Obsolete function from the Cloudant era
'''
def bulk_update_features_db_vida_extended(df, db_name_upload, object_name):

    bulk_updates = []
    path_compressed_file = "cos://eu-de/building-image-compression/"
    
    try:
        buildings_no_tag_toDB = df.loc[df['image_name'].notnull()]
        images = [r for r in buildings_no_tag_toDB.itertuples()]

        print(f"Number if images were found: {len(images)}")
        if len(images) != 0:
            print('Start images upload')

            for idx, osm_row in enumerate(tqdm(images, desc="Images uploaded")):
                try:
                    document = client.get_document(
                        db=db_name_upload,
                        doc_id=osm_row.doc_id
                    ).get_result()

                    document['tiff_file'] = str(osm_row.tiff_name)
                    document['image_url'] = path_compressed_file + object_name
                    
                    # Post the batch of documents to Cloudant
                    bulk_updates.append(document)
                    bulk_docs = BulkDocs(docs=bulk_updates)
                    # Batch update every BATCH_SIZE_DB_UPDATE 
                    
                    if (len(bulk_updates) >= BATCH_SIZE_DB_UPDATE) or (idx == (len(buildings_no_tag_toDB) - 1)):
                        update_document_response = client.post_bulk_docs(
                            db=db_name_upload,
                            bulk_docs=bulk_docs
                        ).get_result()
                        bulk_updates = []

                    time.sleep(0.003)

                except ApiException as ae:
                    if ae.code == 404:
                        # Document not found, continue with the next file
                        print(f"Document with ID {osm_row.doc_id} not found. Skipping...")
                        continue

                    print(f"Operation failed for {osm_row.doc_id}")
                    print(" - status code: " + str(ae.code))
                    print(" - error message: " + ae.message)
                    if "reason" in ae.http_response.json():
                        print(" - reason: " + ae.http_response.json()["reason"])

        else:
            print('Zero images were found')

    except Exception as e:
        print('Image upload error:', e)
'''

def bulk_update_db2(tiff_name, object_name, bbox):
    """Record tiff file and archive location for all buildings of a tile.

    Sets ``TIFF_FILE`` and ``IMAGE_URL`` in ``USER1.<country_table>`` for every
    row whose LATITUDE/LONGITUDE lie inside ``bbox`` (inclusive) and whose
    ``AREA_IN_METERS`` exceeds the global ``threshold``.

    Errors are printed, not raised.

    Args:
        tiff_name: value stored in ``TIFF_FILE``.
        object_name: archive key; stored in ``IMAGE_URL`` as
            ``cos://eu-de/<bucket>/<object_name>``.
        bbox: dict with the keys ``lat_min``, ``lat_max``, ``lon_min`` and
            ``lon_max``.
    """
    # Look the bucket up first: nesting the same quote type inside an
    # f-string is a syntax error before Python 3.12.
    bucket = config["MGRS_COMPRESSED_IMAGES_BUCKET"]
    path_compressed_file = f"cos://eu-de/{bucket}/{object_name}"

    try:
        # The two text values are passed as statement parameters so that file
        # names containing quotes cannot break (or alter) the statement.
        sql = f"""
            UPDATE "USER1"."{country_table}"
              SET
                  "TIFF_FILE" = ?,
                  "IMAGE_URL" = ?
            WHERE
                (LATITUDE >= {bbox['lat_min']}) AND
                (LATITUDE <= {bbox['lat_max']}) AND
                (LONGITUDE >= {bbox['lon_min']}) AND
                (LONGITUDE <= {bbox['lon_max']}) AND
                (AREA_IN_METERS > {threshold})

        """
        parameters = [str(tiff_name), path_compressed_file]
        print(sql)
        print('parameters:', parameters)

        cursor = DB2_connection.cursor()
        try:
            cursor.execute(sql, parameters)
        finally:
            # Release the statement even when the update fails.
            cursor.close()

    except Exception as e:
        print('Image upload error:', e)

Main code to loop through tiff-files, process and upload images:

In [None]:
margin = 0  # number of extra pixels added around each building when cropping
# local folder where the preprocessed samples shall be stored
folder_preprocessed_files = 'samples/'
os.makedirs(os.path.dirname(folder_preprocessed_files), exist_ok=True)

path_to_tif_folder = 'tiff/'

# Start from a clean working directory; ignore_errors already covers the
# "directory does not exist" case.
shutil.rmtree('tiff/', ignore_errors=True)

# Not used in this cell - presumably leftovers from an earlier version;
# kept in case other cells reference them (TODO confirm and remove).
dfs = []
DB_NAME_UPLOAD = country_table
BATCH_SIZE_DB_UPDATE = 2000

bucket_name = config["MGRS_COMPRESSED_IMAGES_BUCKET"]

for tiff_name in tif_images_filenames:  # one tiff file per iteration
    print('Create /tiff directory')
    os.makedirs(os.path.dirname(path_to_tif_folder), exist_ok=True)

    t1 = time.time()

    # Local path of the tiff; also the value expected in 'corresponding_tiff'.
    tif = path_to_tif_folder + tiff_name

    streaming_body = cos_client.get_object(Bucket=BUCKET_TIFF, Key=tiff_name)['Body']

    # Stream the object to disk in chunks instead of buffering it in memory.
    print("Copying to localStorage: " + tif)
    with open(tif, 'wb') as file:
        shutil.copyfileobj(streaming_body, file)

    print(f'Files downloaded, time took: {time.strftime("%H:%M:%S", time.gmtime(int(time.time() - t1)))}')

    areas_covered_by_tifs = create_bounds_dict(path_to_tifs=path_to_tif_folder)
    areas_covered_by_tif = areas_covered_by_tifs[tiff_name]

    lon_min = areas_covered_by_tif['lons_sorted'][0]
    lon_max = areas_covered_by_tif['lons_sorted'][1]

    lat_min = areas_covered_by_tif['lats_sorted'][0]
    lat_max = areas_covered_by_tif['lats_sorted'][1]
    bbox = {
        'lon_min': lon_min,
        'lon_max': lon_max,
        'lat_min': lat_min,
        'lat_max': lat_max
    }
    print(lon_min, lon_max, lat_min, lat_max)

    df = fetch_builings_in_bbox(lon_min, lon_max, lat_min, lat_max)
    df = match_corresponding_tiff(df, areas_covered_by_tifs, path_to_tif_folder)

    # Create the image columns up front. Otherwise they only come into
    # existence with the first cropped image, and a tile without any crop
    # fails with a KeyError when the columns are read further down.
    for column in ('image_name', 'tiff_name', 'image_source_bytes'):
        df[column] = None

    tifs = df.corresponding_tiff.unique().tolist()
    if not tifs:
        print("No tiff file was found, that corresponds with Lon and Lat coordinates in GeoDataFrame")
    else:
        # sanity check: is it a valid tif path
        if tif.endswith('.tif'):
            with rasterio.open(tif) as dataset:
                bands = dataset.read()

                # Only 3-band (R, G, B) files are cropped
                if bands.shape[0] == 3:
                    # Reorder array from 3, height, width to height, width, 3
                    picture_all_bands = np.transpose(bands, (1, 2, 0))

                    # Convert to 8-bit RGB
                    picture = np.clip(picture_all_bands, 0.0, 255.0).astype('uint8')

                    # all rows with buildings located within this tif file
                    buildings_in_tif = df.loc[df['corresponding_tiff'] == tif]

                    for index, row in tqdm(buildings_in_tif.iterrows(), desc='Cropping images', total=len(buildings_in_tif)):
                        pixel_coordinates = get_pixel_coordinates(row.geometry, areas_covered_by_tifs, dataset)

                        if len(pixel_coordinates) == 0:
                            continue

                        # pixel bounding box of the building
                        rowcolminmax = get_min_max_values_of_row_col(pixel_coordinates=pixel_coordinates)

                        label_rgb = picture[rowcolminmax['rowminmax'][0] - margin:rowcolminmax['rowminmax'][1] + margin + 1,
                                            rowcolminmax['colminmax'][0] - margin:rowcolminmax['colminmax'][1] + margin + 1, :]

                        try:
                            # Encode the crop as PNG and keep it base64-encoded in the table
                            im = Image.fromarray(label_rgb.astype("uint8"))
                            rawBytes = io.BytesIO()
                            im.save(rawBytes, "png")
                            rawBytes.seek(0)
                            df.at[index, 'image_name'] = f"{index}.png"
                            df.at[index, 'tiff_name'] = tif.split('/')[1]
                            df.at[index, 'image_source_bytes'] = base64.b64encode(rawBytes.read()).decode('ascii')

                        except Exception as e:
                            print('Image processing error:', e)

        object_name_tif = tiff_name.split('_')[0]
        object_name = f'compressed_images_{object_name_tif}.zip'

        compress_and_upload_images(df, object_name, bucket_name)
        t3 = time.time()
        df = df.loc[df['image_name'].notnull()]

        # Only point the database at the archive when images were produced.
        if df.empty:
            print('No images were cropped for this tiff, database update skipped')
        else:
            bulk_update_db2(tiff_name, object_name, bbox)

        print('total time:', time.time() - t3)

    print('Remove tiff/ directory')
    shutil.rmtree('tiff/', ignore_errors=True)