## Initial configuration

#### To start working with this notebook, you need to provide the necessary credentials and settings.
#### Below is a configuration template. Prepare it outside of this notebook, then copy and paste everything between the triple quotes into the input field of the next cell.

    """
    {
    "CLOUDANT_API_KEY": "xxx",
    "CLOUDANT_URL": "xxx",
    "UTILS_BUCKET": "notebook-utils-bucket",
    "BUCKET_TIFF": "xxx",
    "DB_NAME": "xxx",
    "COS_ENDPOINT_URL": "xxx",
    "COS_APIKEY": "xxx",
    "SQL_TABLE_NAME": "",
    "COUNTRY": "",
    "REGION": "xxx",
    "DB2_CONNECTION_STRING": "xxx",
    "DB2_USERNAME": "xxx",
    "DB2_PASSWORD": "xxx"
    }
    """

In [1]:
# Read notebook configuration
import getpass
import json



# Read the configuration through getpass so the pasted credentials are not
# echoed into the notebook output.
config_str = getpass.getpass('Enter your prepared config: ')
# Tolerate a paste that still carries surrounding whitespace or the template's
# triple quotes. The configuration is a JSON object, so it starts with "{" and
# ends with "}" and stripping quote characters from the edges cannot eat into it.
config = json.loads(config_str.strip().strip('"\'').strip())

In [2]:
import jaydebeapi as jdbc
import jpype
import os
import json
import ijson
import pandas as pd
import geopandas as gpd
import shapely
from tqdm import tqdm
from shapely.wkt import dumps
import traceback
import ibm_boto3
from botocore.client import Config

In [18]:
# Name of the table the rows are inserted into. Taken from the notebook
# configuration; falls back to the Kenya table when the configuration leaves
# the entry blank or does not define it.
sql_table_name = config.get("SQL_TABLE_NAME") or 'FEATURES_DB_KENYA'

In [4]:
# S3-compatible Cloud Object Storage client, authenticated with the API key
# and endpoint taken from the notebook configuration.
cos_client = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=config["COS_APIKEY"],
    config=Config(signature_version='oauth'),
    endpoint_url=config["COS_ENDPOINT_URL"],
)

def connect_to_db():
    '''
    Open a JDBC connection to the IBM DB2 database.

    Puts the DB2 driver jar on the class path, starts the JVM if it is not
    running yet, attaches the calling thread to the JVM when necessary and
    connects with the connection string and credentials read from ``config``
    (keys ``DB2_CONNECTION_STRING``, ``DB2_USERNAME``, ``DB2_PASSWORD``).

    Returns:
        The connection object returned by ``jaydebeapi.connect``.
    '''
    jar = 'db2jcc4.jar'
    os.environ['CLASSPATH'] = jar

    # Only one JVM can be started per process, and this function is called
    # again on every reconnect, so skip the start when a JVM is already up.
    if not jpype.isJVMStarted():
        try:
            jpype.startJVM(jpype.getDefaultJVMPath(), '-Djava.class.path=%s' % jar)
        except Exception as e:
            # Best effort, as before: report the problem and let the connect
            # call below surface a failure if the JVM is really unusable.
            print('startJVM exception: ', e)

    # java.lang.Thread.isAttached()/attach() are the replacements JPype names
    # for the deprecated isThreadAttachedToJVM()/attachThreadToJVM().
    if jpype.isJVMStarted() and not jpype.java.lang.Thread.isAttached():
        jpype.java.lang.Thread.attach()
        jpype.java.lang.Thread.currentThread().setContextClassLoader(
            jpype.java.lang.ClassLoader.getSystemClassLoader())

    conn = jdbc.connect(
        'com.ibm.db2.jcc.DB2Driver',
        config['DB2_CONNECTION_STRING'],
        [config["DB2_USERNAME"], config["DB2_PASSWORD"]],
        jar)

    return conn

# Open the database connection and create the cursor that the ingest cells
# further down use to execute their INSERT statements.
conn = connect_to_db()
curs = conn.cursor()

  if jpype.isJVMStarted() and not jpype.isThreadAttachedToJVM():


In [6]:
# Resource-style handle to Cloud Object Storage, needed to enumerate all the
# objects stored in a bucket.
cos_client_resource = ibm_boto3.resource(
    service_name='s3',
    ibm_api_key_id=config["COS_APIKEY"],
    config=Config(signature_version='oauth'),
    endpoint_url=config["COS_ENDPOINT_URL"],
)

# Bucket instance whose objects are listed below
bucket = cos_client_resource.Bucket("height-buildings-bucket-vol2")

# Keys (file names) of every object in the bucket
all_files = [obj.key for obj in bucket.objects.all()]

# Keep the files whose name prefix (the text before the first "_") equals the
# region given in the notebook configuration
target_parquets = [key for key in all_files if key.split('_')[0] == config["REGION"]]

len(target_parquets)

4891

In [14]:
# Local folder the parquet files are downloaded into.
heights_parquets_folder = 'heights_parquets'

# exist_ok=True makes the cell safe to re-run without a separate
# check-then-create step.
os.makedirs(heights_parquets_folder, exist_ok=True)


In [15]:
# Download every selected object into the local folder, using the object key
# as the local file name.
for object_key in tqdm(target_parquets):
    local_path = f'{heights_parquets_folder}/{object_key}'
    cos_client.download_file(
        Bucket='height-buildings-bucket-vol2',
        Key=object_key,
        Filename=local_path,
    )


100%|██████████| 4891/4891 [18:48<00:00,  4.34it/s]  


In [16]:

# os.listdir returns the entries in arbitrary order. Sort them so the
# concatenated frame has the same row order on every run; the de-duplication
# that follows keeps the first occurrence of an id and therefore depends on it.
files = sorted(os.listdir(heights_parquets_folder))
dfs = []

for f in tqdm(files, total=len(files)):

    df_curr = gpd.read_parquet(os.path.join(heights_parquets_folder, f))
    try:
        # Flag rows without a median height, then keep only the rows that have one.
        df_curr['height_isnull'] = df_curr.height_median.isnull()
        # .copy() so the 'region' assignment below writes into an independent
        # frame instead of a slice of the frame just read.
        df_curr = df_curr[~df_curr['height_isnull']].copy()
        # The region is the part of the file name before the first "_".
        df_curr['region'] = f.split('_')[0]
        dfs.append(df_curr)
    except Exception as e:
        # Best effort: report the problem and keep the file's rows as they are.
        print(e)
        dfs.append(df_curr)

df = pd.concat(dfs)

# Release the per-file frames; only the concatenated frame is used from here on.
del dfs
len(df)

  0%|          | 0/4891 [00:00<?, ?it/s]

100%|██████████| 4891/4891 [13:53<00:00,  5.87it/s]


30246892

In [17]:
# Keep the first row seen for every id, then release the frame that still
# contains the duplicates.
buildings_df = df.drop_duplicates(subset=['id'], keep='first')
del df

In [None]:
# Persist the de-duplicated buildings next to the notebook.
buildings_df.to_parquet(path='Kenya_buildings.parquet')

In [None]:
def process_row(row):
    '''
    Convert one buildings row into the list of values for one INSERT row.

    The values are returned in this column order:
        id, latitude, longitude, area_in_meters, polygon_coordinates,
        footprint_source, classification_source, ml_confidence, ml_model,
        height, height_median, height_mean, height_max, tiff_file, image_url,
        classification_type, osm_id, osm_name, osm_type, osm_building,
        osm_other_tags, vida_confidence, urban_split, ghsl_smod, floors,
        gfa_in_meters, perimeter_in_meters, building_faces,
        elec_access_percent, elec_consumption_kwh_month,
        elec_consumption_std_kwh_month

    The id is the string "<longitude>:<latitude>", both rounded to 8 decimals.

    Args:
        row: object exposing the columns above as attributes plus ``geometry``
            (for example a tuple produced by ``DataFrame.itertuples()``).
            ``geometry`` may be WKB bytes, a WKT string or a shapely geometry.

    Returns:
        The list of values, or None when the geometry is missing or of an
        unsupported type, or when any conversion fails (the error and its
        traceback are printed in that case).
    '''
    try:
        geometry = row.geometry
        if geometry is None:
            return None

        if isinstance(geometry, bytes):
            polygon = shapely.from_wkb(geometry)
        elif isinstance(geometry, str):
            polygon = shapely.from_wkt(geometry)
        elif isinstance(geometry, shapely.Geometry):
            # Frames read with geopandas hand out already-parsed geometries.
            polygon = geometry
        else:
            return None

        data = [
            f'{round(float(row.longitude), 8)}:{round(float(row.latitude), 8)}',
            round(float(row.latitude), 8),
            round(float(row.longitude), 8),
            round(row.area_in_meters, 4),
            str(dumps(polygon, rounding_precision=8)),
            row.footprint_source,
            row.classification_source,
            round(row.ml_confidence, 4),
            row.ml_model,
            float(row.height),
            float(row.height_median),
            float(row.height_mean),
            float(row.height_max),
            row.tiff_file,
            row.image_url,
            row.classification_type,
            int(row.osm_id),
            row.osm_name,
            row.osm_type,
            row.osm_building,
            row.osm_other_tags,
            row.vida_confidence,
            row.urban_split,
            row.ghsl_smod,
            row.floors,
            row.gfa_in_meters,
            row.perimeter_in_meters,
            row.building_faces,

            row.elec_access_percent,
            row.elec_consumption_kwh_month,
            row.elec_consumption_std_kwh_month
        ]

        return data

    except Exception as e:
        # A bad row must not stop the ingest: report it and signal the caller
        # with None so the row can be recorded as rejected.
        print(e)
        print(traceback.format_exc())
        return None
    


In [None]:
# Allows skipping if needed: the ingest loop further down stores the
# positional index of the last row it handled here after every batch.
last_idx = 0

In [85]:
# Replace missing values in the columns below: 0 for the numeric ones and an
# empty string for the textual ones.
fill_values = {
    'ml_confidence': 0,
    'ml_model': '',
    'osm_id': 0,
    'osm_name': '',
    'osm_type': '',
    'osm_building': '',
    'osm_other_tags': '',
}

for column_name, fill_value in fill_values.items():
    buildings_df[column_name] = buildings_df[column_name].fillna(fill_value)

In [89]:
# Number of rows sent with one INSERT statement.
BATCH_SIZE = 750
# Rows whose position is below START_IDX are skipped; raise it to resume an
# interrupted ingest.
START_IDX = 0


def _sql_literal(value):
    """Render ``value`` as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that a value such as O'Brien neither
    ends the literal early nor changes the statement.
    """
    return "'" + f"{value}".replace("'", "''") + "'"


excepted_batches = []
df_len = len(buildings_df)
data_batch = []
excepted_rows = []
print('excepted_batches', len(excepted_batches))

for idx, raw_row in enumerate(tqdm(buildings_df.itertuples(), desc='Ingesting items', total=df_len)):

    if idx < START_IDX:
        continue

    row = process_row(raw_row)
    if row is not None:
        data_batch.append(row)
    else:
        # process_row returned None; keep the source row so the rejection can
        # be inspected afterwards.
        excepted_rows.append(raw_row)

    # Send the batch when it is full or when the last row has been reached.
    batch_is_due = len(data_batch) == BATCH_SIZE or idx == df_len - 1
    if not batch_is_due or not data_batch:
        # Either keep filling the batch, or there is nothing to send: an
        # INSERT without any value tuples is not a valid statement.
        continue

    try:
        values = ', '.join(
            '(' + ', '.join(_sql_literal(item) for item in batch_row) + ')'
            for batch_row in data_batch
        )
        stmt = f"""INSERT INTO USER1.{sql_table_name} (
                                            id,
                                            latitude,
                                            longitude,
                                            area_in_meters,
                                            polygon_coordinates,
                                            footprint_source,
                                            classification_source,
                                            ml_confidence,
                                            ml_model,
                                            height,
                                            height_median,
                                            height_mean,
                                            height_max,
                                            tiff_file,
                                            image_url,
                                            classification_type,
                                            osm_id,
                                            osm_name,
                                            osm_type,
                                            osm_building,
                                            osm_other_tags,
                                            vida_confidence,
                                            urban_split,
                                            ghsl_smod,
                                            floors,
                                            gfa_in_meters,
                                            perimeter_in_meters,
                                            building_faces,
                                            elec_access_percent,
                                            elec_consumption_kwh_month,
                                            elec_consumption_std_kwh_month
                                            ) VALUES {values} """

        curs.execute(stmt)

    except Exception as e:
        print('Exception occured', e)
        # Keep the whole batch for the retry cells, then continue with a
        # fresh connection and cursor.
        excepted_batches.append(data_batch)
        conn = connect_to_db()
        curs = conn.cursor()

    data_batch = []
    last_idx = idx

print('excepted_batches', len(excepted_batches))

# Persist the failed batches so they can be retried later.
excepted = {'excepted_batches': excepted_batches}

with open(f"excepted_batches_{sql_table_name}.json", "w") as outfile:
    json.dump(excepted, outfile, default=str)


print('excepted_rows', len(excepted_rows))

# Persist the rejected rows; default=str serialises values json cannot encode.
excepted_rows = {'excepted_rows': excepted_rows}

with open(f"excepted_rows_{sql_table_name}.json", "w") as outfile:
    json.dump(excepted_rows, outfile, default=str)


excepted_batches 0


Ingesting items: 100%|██████████| 31969027/31969027 [9:20:20<00:00, 950.88it/s]   

excepted_batches 0
excepted_rows 0





### The code below is required only if the upload above had some issues

In [None]:
# Reload the batches written by the ingest cell; the context manager closes
# the file handle once the JSON has been read.
with open(f"excepted_batches_{sql_table_name}.json") as infile:
    excepted_batches = json.load(infile)
print(len(excepted_batches['excepted_batches']))


In [None]:
# try to upload one by one in excepted batches

# Rows that still fail when inserted on their own
exceptions = []
for idx, batch in enumerate(excepted_batches['excepted_batches']):
    for row in tqdm(batch, desc=f'Ingesing data from {idx} batch', total=len(batch)):
        try:
            # Quote every value as a SQL string literal; embedded single
            # quotes are doubled so they cannot end the literal early.
            row_value = ", ".join("'" + f"{i}".replace("'", "''") + "'" for i in row)
            row_value = f'({row_value})'

            stmt = f"""INSERT INTO USER1.{sql_table_name} (
                                        id,
                                        latitude,
                                        longitude,
                                        area_in_meters,
                                        polygon_coordinates,
                                        footprint_source,
                                        classification_source,
                                        ml_confidence,
                                        ml_model,
                                        height,
                                        height_median,
                                        height_mean,
                                        height_max,
                                        tiff_file,
                                        image_url,
                                        classification_type,
                                        osm_id,
                                        osm_name,
                                        osm_type,
                                        osm_building,
                                        osm_other_tags,
                                        vida_confidence,
                                        urban_split,
                                        ghsl_smod,
                                        floors,
                                        gfa_in_meters,
                                        perimeter_in_meters,
                                        building_faces,
                                        elec_access_percent,
                                        elec_consumption_kwh_month,
                                        elec_consumption_std_kwh_month
                                        ) VALUES {row_value} """

            curs.execute(stmt)

        except Exception as e:
            # Report the failure and keep the row for the next retry cell.
            print('Exception occured', e)
            exceptions.append(row)


In [None]:
# try to upload one by one in excepted rows from batches above

for exception in exceptions:
    try:
        # Quote every value as a SQL string literal; embedded single quotes
        # are doubled so they cannot end the literal early.
        row_value = ", ".join("'" + f"{i}".replace("'", "''") + "'" for i in exception)
        row_value = f'({row_value})'

        stmt = f"""INSERT INTO USER1.{sql_table_name} (
                                    id,
                                                latitude,
                                                longitude,
                                                area_in_meters,
                                                polygon_coordinates,
                                                footprint_source,
                                                classification_source,
                                                ml_confidence,
                                                ml_model,
                                                height,
                                                height_median,
                                                height_mean,
                                                height_max,
                                                tiff_file,
                                                image_url,
                                                classification_type,
                                                osm_id,
                                                osm_name,
                                                osm_type,
                                                osm_building,
                                                osm_other_tags,
                                                vida_confidence,
                                                urban_split,
                                                ghsl_smod,
                                                floors,
                                                gfa_in_meters,
                                                perimeter_in_meters,
                                                building_faces,
                                                elec_access_percent,
                                                elec_consumption_kwh_month,
                                                elec_consumption_std_kwh_month
                                ) VALUES {row_value} """

        curs.execute(stmt)
    except Exception as e:
        # Report the failure and move on to the next row.
        print(e)