## 8_Sentinel2_cropping
### Crops the Sentinel-2 roof image of buildings in the curated labelled data set

### Initial configuration
#### To start working with this notebook, you need to provide the necessary credentials and settings.
#### Below is a configuration template. Prepare it outside of this notebook, fill in your values, and then copy & paste everything between the triple quotes into the input field of the next cell.
    """
    {
    "COS_ENDPOINT_URL": "https://s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "BUCKET_TIFF": "kenya-images",
    "DATA_CURATION_BUCKET": "xxx",
    "UTILS_BUCKET": "xxx",
    "CLOUDANT_API_KEY": "xxx",
    "CLOUDANT_URL": "xxx"
    }
    """


In [25]:
# Read notebook configuration
import getpass
import json

# Every key that the cells of this notebook read from `config`. Checking them
# here makes a typo or an omission fail immediately instead of part-way
# through the run.
REQUIRED_CONFIG_KEYS = (
    "COS_ENDPOINT_URL",
    "COS_AUTH_ENDPOINT_URL",
    "COS_APIKEY",
    "BUCKET_TIFF",
    "DATA_CURATION_BUCKET",
    "UTILS_BUCKET",
    "CLOUDANT_API_KEY",
    "CLOUDANT_URL",
)

# getpass is used so the pasted JSON (it contains API keys) is not echoed
# into the notebook output.
config_str = getpass.getpass('Enter your prepared config: ')

try:
    config = json.loads(config_str)
except json.JSONDecodeError as exc:
    # `from None` keeps the traceback short; the message carries only the
    # parser's position information, not the pasted text.
    raise ValueError(f"The pasted configuration is not valid JSON: {exc}") from None

if not isinstance(config, dict):
    raise ValueError("The pasted configuration must be a JSON object ({...}).")

missing_config_keys = [key for key in REQUIRED_CONFIG_KEYS if key not in config]
if missing_config_keys:
    raise KeyError(f"Configuration is missing required keys: {missing_config_keys}")

In [26]:
# Import necessary libraries
# Standard library
import base64
import configparser
import importlib
import io
import os
import random
import shutil
import threading
import time
from collections import Counter
from datetime import datetime

# Third-party
import geopandas as gpd
import ibm_boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
from botocore.client import Config
from ibm_cloud_sdk_core import ApiException
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibmcloudant.cloudant_v1 import CloudantV1, Document, BulkDocs
from PIL import Image
from pyproj import Geod
from shapely.geometry import Polygon, MultiPolygon, mapping, Point
from tqdm import tqdm

In [27]:
# initiate the S3 client
cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              ibm_auth_endpoint=config["COS_AUTH_ENDPOINT_URL"],
                              config=Config(signature_version='oauth'),
                              endpoint_url=config["COS_ENDPOINT_URL"])

# Import the shared `utils` helpers; if they are not available locally,
# download every object of the utils bucket into the working directory first.
# The star import is kept because later cells call helpers from `utils` by
# their bare names.
try:
    from utils import *
    print('External utils succesfully imported')

except Exception as e:
    print('Desired utils package is missing in local env, downloading it...', e)
    # The bucket is only listed when a download is actually needed.
    response = cos_client.list_objects_v2(Bucket=config["UTILS_BUCKET"])
    # An empty bucket has no 'Contents' entry in the response.
    for obj in response.get('Contents', []):
        name = obj['Key']
        if name.endswith('/'):
            # Keys ending in '/' are folder placeholders, not files.
            continue
        target_dir = os.path.dirname(name)
        if target_dir:
            # Keys may contain sub-folders; create them before writing.
            os.makedirs(target_dir, exist_ok=True)
        utils_body = cos_client.get_object(Bucket=config["UTILS_BUCKET"], Key=name)['Body']
        print("Downloading to localStorage :  " + name)
        with open(name, 'wb') as file:
            # Stream the object to disk instead of buffering it in memory.
            shutil.copyfileobj(utils_body, file)
    # The files were created while the interpreter is running, so the import
    # system's directory caches must be refreshed before importing them.
    importlib.invalidate_caches()
    from utils import *
    print('External utils succesfully imported')



# initiate the Cloudant client
authenticator = IAMAuthenticator(config["CLOUDANT_API_KEY"])
client = CloudantV1(authenticator=authenticator)
client.set_service_url(config["CLOUDANT_URL"])

External utils succesfully imported


In [28]:
# Bucket names taken from the notebook configuration
BUCKET_TIFF, curation_bucket = (
    config[key] for key in ("BUCKET_TIFF", "DATA_CURATION_BUCKET")
)

# Parquet file names: the labelled data set that is read, and the
# Sentinel-2 enriched data set that is written at the end of the notebook
labelled_data_SMOD_heights_sentinel2_parquet = 'all_labelled_data_SMOD_heights_sentinel2.parquet'
labelled_data_SMOD_heights_parquet = 'all_labelled_data_SMOD_heights.parquet'

In [29]:
# List every object stored in the TIFF bucket.
# A single list_objects_v2 call returns one page of keys only, so keep
# requesting pages with the continuation token until the listing is complete.
# `.get('Contents', [])` covers an empty bucket, whose response has no
# 'Contents' entry.
tif_images_bucket = cos_client.list_objects_v2(Bucket=BUCKET_TIFF)
tif_images_objects = list(tif_images_bucket.get('Contents', []))

_listing_page = tif_images_bucket
while _listing_page.get('IsTruncated'):
    _listing_page = cos_client.list_objects_v2(
        Bucket=BUCKET_TIFF,
        ContinuationToken=_listing_page['NextContinuationToken'],
    )
    tif_images_objects.extend(_listing_page.get('Contents', []))

tif_images_filenames = [obj['Key'] for obj in tif_images_objects]

In [30]:
# Fetch the labelled data set with all info.
# When a curation bucket is configured the parquet file is downloaded from it;
# otherwise the file is expected to be present in the working directory.
if isinstance(curation_bucket, str):

    streaming_body = cos_client.get_object(Bucket=curation_bucket, Key=labelled_data_SMOD_heights_parquet)['Body']
    print("Downloading to local storage :  " + labelled_data_SMOD_heights_parquet)
    with open(labelled_data_SMOD_heights_parquet, 'wb') as file:
        # Stream the object to disk instead of buffering it in memory.
        shutil.copyfileobj(streaming_body, file)

buildings_df = gpd.read_parquet(labelled_data_SMOD_heights_parquet)

# The cropping and de-duplication cells below rely on these columns; fail here
# with a clear message rather than in the middle of the tile loop.
_missing_columns = {'latitude', 'longitude', 'geometry'} - set(buildings_df.columns)
if _missing_columns:
    raise KeyError(
        f"{labelled_data_SMOD_heights_parquet} is missing required columns: {sorted(_missing_columns)}"
    )

In [None]:
margin = 0  # defines how many pixels we add around the building when preparing the dataset.
# where the preprocessed samples shall be stored
folder_preprocessed_files = 'samples/'
os.makedirs(os.path.dirname(folder_preprocessed_files), exist_ok=True)

path_to_tif_folder = 'tiff/'

# Start from a clean download folder (ignore_errors covers a missing folder).
shutil.rmtree(path_to_tif_folder, ignore_errors=True)


def _download_tiff(tiff_name):
    """Download object `tiff_name` from BUCKET_TIFF into the local tiff folder.

    Returns the local path of the downloaded file.
    """
    local_path = path_to_tif_folder + tiff_name
    streaming_body = cos_client.get_object(Bucket=BUCKET_TIFF, Key=tiff_name)['Body']
    with open(local_path, 'wb') as file:
        print("Copying to localStorage: " + local_path)
        # Stream the raster to disk instead of buffering it in memory.
        shutil.copyfileobj(streaming_body, file)
    return local_path


def _crop_to_base64_png(picture, rowcolminmax, pad):
    """Cut one building out of `picture` and return it as a base64 encoded PNG.

    Parameters:
        picture: uint8 array laid out as (height, width, 3).
        rowcolminmax: mapping with 'rowminmax' and 'colminmax' entries, each
            indexable as [min, max] pixel indices (produced by the utils helper
            get_min_max_values_of_row_col -- exact type not visible here).
        pad: number of extra pixels added on every side of the building.

    Raises:
        ValueError: if the requested window does not overlap the picture.
    """
    row_min, row_max = rowcolminmax['rowminmax'][0], rowcolminmax['rowminmax'][1]
    col_min, col_max = rowcolminmax['colminmax'][0], rowcolminmax['colminmax'][1]

    # Clamp at 0: a negative slice bound would be interpreted as an offset from
    # the END of the array and silently produce a wrong (or empty) crop.
    row_start, row_stop = max(row_min - pad, 0), max(row_max + pad + 1, 0)
    col_start, col_stop = max(col_min - pad, 0), max(col_max + pad + 1, 0)

    label_rgb = picture[row_start:row_stop, col_start:col_stop, :]
    if label_rgb.size == 0:
        raise ValueError('building footprint does not overlap the raster (empty crop)')

    im = Image.fromarray(label_rgb.astype("uint8"))
    rawBytes = io.BytesIO()
    im.save(rawBytes, "png")
    return base64.b64encode(rawBytes.getvalue()).decode('ascii')


def _process_tile(tiff_name):
    """Download one raster and crop every labelled building located inside it.

    Returns a copy of the matching rows of `buildings_df` with the columns
    'corresponding_tiff', 'tiff_name' and 'image_source_bytes' (the last two
    stay empty for buildings whose crop failed), or None when no labelled
    building lies within the raster bounds.
    """
    t1 = time.time()
    tif = _download_tiff(tiff_name)
    print(f'Files downloaded, time took: {time.strftime("%H:%M:%S", time.gmtime(int(time.time() - t1)))}')

    # Bounds come from the utils helper; entries are looked up by object name.
    areas_covered_by_tifs = create_bounds_dict(path_to_tifs=path_to_tif_folder)
    areas_covered_by_tif = areas_covered_by_tifs[tiff_name]

    lon_min, lon_max = areas_covered_by_tif['lons_sorted'][0], areas_covered_by_tif['lons_sorted'][1]
    lat_min, lat_max = areas_covered_by_tif['lats_sorted'][0], areas_covered_by_tif['lats_sorted'][1]
    print(lon_min, lon_max, lat_min, lat_max)

    # Buildings whose latitude/longitude lie inside the raster bounds (inclusive).
    df = buildings_df[
        buildings_df.latitude.between(lat_min, lat_max)
        & buildings_df.longitude.between(lon_min, lon_max)
    ].copy()

    if len(df) == 0:
        print(f"No labelled buildings fall within the bounds of {tiff_name}; skipping.")
        return None

    df["corresponding_tiff"] = 'NA'
    # Create the output columns up front so every tile has the same schema,
    # even when no crop succeeds.
    for column in ('tiff_name', 'image_source_bytes'):
        if column not in df.columns:
            df[column] = None

    # sanity check: is it a valid tif path
    if not tif.endswith('.tif'):
        print(f"{tiff_name} is not a .tif file; its buildings are kept without images.")
        return df

    with rasterio.open(tif) as dataset:
        bands = dataset.read()

        # Only 3-band (R, G, B) rasters are supported.
        if bands.shape[0] != 3:
            print(f"{tiff_name} has {bands.shape[0]} bands instead of 3; its buildings are kept without images.")
            return df

        # Reorder array from (3, height, width) to (height, width, 3) and
        # clip to the 8-bit range.
        picture = np.clip(np.transpose(bands, (1, 2, 0)), 0.0, 255.0).astype('uint8')

        for index, row in tqdm(df.iterrows(), desc='Cropping images', total=len(df)):
            try:
                pixel_coordinates = get_pixel_coordinates(row.geometry, areas_covered_by_tifs, dataset)
                rowcolminmax = get_min_max_values_of_row_col(pixel_coordinates=pixel_coordinates)
                encoded_png = _crop_to_base64_png(picture, rowcolminmax, margin)

                df.at[index, 'tiff_name'] = tiff_name
                df.at[index, 'image_source_bytes'] = encoded_png
            except Exception as e:
                # Best effort: one bad building must not abort the whole tile.
                print('Image processing error:', e)

    return df


dfs = []
for tiff_name in tif_images_filenames:  # iterate through grid system
    print('Create /tiff directory')
    os.makedirs(os.path.dirname(path_to_tif_folder), exist_ok=True)

    try:
        tile_df = _process_tile(tiff_name)
        if tile_df is not None:
            dfs.append(tile_df)
    finally:
        # Remove the downloaded raster even when processing this tile failed.
        print('Remove tiff/ directory')
        shutil.rmtree(path_to_tif_folder, ignore_errors=True)

In [None]:
# Combine the per-tile frames into one data set.
if not dfs:
    raise ValueError("No labelled buildings were matched to any TIFF, so there is nothing to combine.")
main_df = pd.concat(dfs)

# A building can be matched by more than one tile. Keep one row per geometry,
# preferring a row that actually has a cropped image: a plain drop_duplicates
# keeps the first occurrence even when its crop failed and a later one succeeded.
# Positional indexing is used because index labels repeat across tiles; the
# original row order and index labels are preserved.
if 'image_source_bytes' in main_df.columns:
    _missing_image = main_df['image_source_bytes'].isna().to_numpy()
    _order = np.argsort(_missing_image, kind='stable')  # rows with an image first
    _is_first = ~main_df.iloc[_order].duplicated(subset='geometry').to_numpy()
    main_df = main_df.iloc[np.sort(_order[_is_first])]
else:
    main_df = main_df.drop_duplicates(subset='geometry')

# Show a preview only; the frame holds base64 encoded images.
main_df.head()

In [36]:
# Store the enriched data set locally
main_df.to_parquet(labelled_data_SMOD_heights_sentinel2_parquet)

# optionally upload file to the bucket
if isinstance(curation_bucket, str):

    try:
        cos_client.upload_file(
            Filename=labelled_data_SMOD_heights_sentinel2_parquet,
            Bucket=curation_bucket,
            Key=labelled_data_SMOD_heights_sentinel2_parquet,
            ExtraArgs={'ContentDisposition': 'attachment'}
        )

        print(f'File {labelled_data_SMOD_heights_sentinel2_parquet} successfully uploaded to the COS {curation_bucket} bucket')
    except Exception as e:
        # The upload is optional, so a failure is reported instead of raised.
        # \033[91m switches the text to red; \033[0m resets the colour so that
        # later output is not printed in red as well.
        print(f"\033[91mFailed upload file to the bucket {curation_bucket}. Error: {e}\033[0m")