## 12_inference_main
### Building classification based on the pre-trained model

### Initial configuration
#### To work with this notebook, you need to provide the necessary credentials and settings.
#### Below is a configuration template. Prepare it outside this notebook, fill in your own values, and paste everything between the triple quotes into the input field of the next cell.
    """
    {
    "COS_ENDPOINT_URL": "s3.private.eu-de.cloud-object-storage.appdomain.cloud",
    "COS_AUTH_ENDPOINT_URL": "https://iam.cloud.ibm.com/oidc/token",
    "COS_APIKEY": "xxx",
    "DB2_CONNECTION_STRING": "jdbc:db2://65beb513-5d3d-4101-9001-f42e9dc954b3.brt9d04f0cmqeb8u7740.databases.appdomain.cloud:30371/BLUDB:sslConnection=true;useJDBC4ColumnNameAndLabelSemantics=false;db2.jcc.charsetDecoderEncoder=3;",
    "DB2_USERNAME": "xxx",
    "DB2_PASSWORD": "xxx",
    "UTILS_BUCKET": "notebook-utils-bucket",
    "COUNTRY_TABLE": "FEATURES_DB_MAHARASHTRA",
    "ML_MODELS_BUCKET": "ml-saved-models",
    "ML_MODELS_BUCKET_CRN": "xxx",
    "L2_ML_MODELS_BUCKET": "l2-ml-saved-models",
    "JOB_STATUS_BUCKET": "inferencing-state",
    "MGRS_COMPRESSED_IMAGES_BUCKET": "building-image-compression",
    "INFERECE_JSON_NAME": "inferenced_state_Maharashtra_p1.json",
    "MODEL_NAME": "HybridArchitecture_CFG001_India_Maharashtra_DenseNet121_L1_1I_2N_dt06_27_2024_01_15_17.h5"
    }
    """


In [1]:
# Read notebook configuration
import getpass
import json

# Read the JSON configuration through a hidden prompt so that the API key and
# DB2 credentials it carries are not echoed into the notebook output.
config_str = getpass.getpass('Enter your prepared config: ')
# Parse the pasted JSON text; `config` is the settings dict the later cells index into.
config = json.loads(config_str)

In [None]:
# Install necessary packages
#! pip install ibmcloudant
#! pip install geopandas==0.13.2
#! pip install rasterio==1.3.8
# ! pip install shapely
# ! pip install numpy==1.23.5;
# ! pip install pyproj==3.6.0
#! pip install pathos==0.3.1
#! pip install ibmcloudant==0.4.3
#! pip install ibm-cloud-sdk-core==3.16.7

In [3]:
# import necessary libraries
import base64
import os
import configparser
import sys
from concurrent.futures import ThreadPoolExecutor
import time
import cv2
import ibm_boto3
import tensorflow as tf
from tqdm import tqdm
from PIL import Image

from keras.models import load_model
import numpy as np
from botocore.client import Config
import io
from ibm_cloud_sdk_core import ApiException
from datetime import datetime
from ibmcloudant.cloudant_v1 import CloudantV1, BulkDocs
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import sklearn.metrics as SKM

import jaydebeapi as jdbc
import jpype
import threading
import requests
from collections import Counter
# import shapely
# from pyproj import Geod
# geod = Geod(ellps="WGS84")


# CPU count as reported by the OS (not referenced again in the visible cells).
cpu_count = os.cpu_count()
# assign configuration variables
# sys.path.append('.')

# NOTE(review): BUCKET_MODEL is read from "ML_MODELS_BUCKET", but the model is
# downloaded below from "L2_ML_MODELS_BUCKET" — confirm which bucket is intended.
BUCKET_MODEL = config["ML_MODELS_BUCKET"]
MODEL_NAME = config["MODEL_NAME"]

# Batch sizes / limits. None of these five names is referenced again in the
# visible part of the notebook (the inference cell uses its own `batch_size`
# and a literal (124, 124) resize).
BATCH_INFERENCE = 10_000
BATCH_SIZE_ML_CLASSIFIER = 128
BATCH_SIZE_DB_UPDATE = 500
LIMIT = 1_000
img_size = (124, 124)

# Low-level Cloud Object Storage client, authenticated with the API key from
# the config; used for get_object / list_objects_v2 / upload_file calls.
cos_client = ibm_boto3.client(service_name='s3',
                              ibm_api_key_id=config["COS_APIKEY"],
                              ibm_auth_endpoint=config["COS_AUTH_ENDPOINT_URL"],
                              config=Config(signature_version='oauth'),
                              endpoint_url=config["COS_ENDPOINT_URL"])

# Resource-style handle to the same service; additionally passes the service
# instance id taken from "ML_MODELS_BUCKET_CRN". Used below for Bucket.download_file.
cos_client_resource = ibm_boto3.resource(service_name='s3',
    ibm_api_key_id=config["COS_APIKEY"],
    ibm_service_instance_id=config["ML_MODELS_BUCKET_CRN"],
    ibm_auth_endpoint=config["COS_AUTH_ENDPOINT_URL"],
    config=Config(signature_version='oauth'),
    endpoint_url=config["COS_ENDPOINT_URL"]
)

# Listing of the utils bucket; only consumed in the fallback branch below.
response = cos_client.list_objects_v2(Bucket=config["UTILS_BUCKET"])
# download utils module
try:
    # NOTE(review): a star import placed after the constants above can silently
    # rebind any of them if `utils` defines the same names — verify.
    from utils import *
    print('External utils succesfully imported')
    
except Exception as e:
    # Any import failure triggers the fallback: every object in the utils
    # bucket is written to the working directory under its key, then the
    # import is retried (a second failure propagates).
    print('Desired packages is missing in local env, downloading it...', e)
    for obj in response['Contents']:
        name = obj['Key']
        streaming_body_1 = cos_client.get_object(Bucket=config["UTILS_BUCKET"], Key=name)['Body']
        print("Downloading to localStorage :  " + name)
        with io.FileIO(name, 'w') as file:
            # Iterating a BytesIO yields the payload line by line; each piece
            # is appended to the local file as raw bytes.
            for i in io.BytesIO(streaming_body_1.read()):
                file.write(i)
    from utils import *
    print('External utils succesfully imported')


l2ml_models_bucket = cos_client_resource.Bucket(config["L2_ML_MODELS_BUCKET"])

# Download the model object to a local file with the same name as its key.
l2ml_models_bucket.download_file(MODEL_NAME, MODEL_NAME)
print('Model downloaded: ', MODEL_NAME)

# time measuring
def format_timedelta(td):
    """Render a ``datetime.timedelta`` as 'H hours MM minutes SS seconds'.

    Days are folded into the hour count and microseconds are ignored.
    """
    whole_seconds = td.days * 86400 + td.seconds
    whole_minutes, secs = divmod(whole_seconds, 60)
    hrs, mins = divmod(whole_minutes, 60)
    return '{:d} hours {:02d} minutes {:02d} seconds'.format(hrs, mins, secs)


# Wall-clock start markers for the run (both names are kept as defined originally).
startTime1 = datetime.now()
startTime = datetime.now()

print("INFERENCE SCRIPT START")

# Stop with an exception when the model file is missing. Calling quit() inside
# a notebook asks the kernel itself to exit, which hides the reason for the
# failure; raising keeps the kernel alive and still halts "Run All".
if not os.path.isfile(MODEL_NAME):
    raise FileNotFoundError(
        f"ERROR: Can not find pre-trained model, Aborting .... ({MODEL_NAME})"
    )

# Load the Keras model from the local file downloaded earlier.
cnn_mod = load_model(MODEL_NAME)

print('MODEL loaded')

External utils succesfully imported
Model downloaded:  HybridArchitecture_CFG001_India_Maharashtra_DenseNet121_L1_1I_2N_dt06_27_2024_01_15_17.h5
INFERENCE SCRIPT START
MODEL loaded


In [4]:
def connect_to_db():
    """Open a JDBC connection to DB2 through JPype / jaydebeapi.

    Puts the DB2 driver jar on the class path, starts the JVM if it is not
    running yet, attaches the calling thread to the JVM when it is not
    attached, and connects with the connection string and credentials taken
    from ``config``.

    Returns:
        The connection object returned by ``jdbc.connect``.
    """
    jar = 'db2jcc4.jar'
#     os.environ['JAVA_HOME'] = '/usr/libexec/java_home'
    os.environ['CLASSPATH'] = jar

    # This function is also called to re-connect after a failed update, and
    # the JVM must not be started a second time in the same process, so the
    # start is skipped when a JVM is already up instead of relying on the
    # exception raised by a repeated startJVM call.
    if not jpype.isJVMStarted():
        args = '-Djava.class.path=%s' % jar
        jvm_path = jpype.getDefaultJVMPath()
        try:
            jpype.startJVM(jvm_path, args)
        except Exception as e:
            # Best effort, as before: report the problem and let
            # jdbc.connect below surface the real failure.
            print('startJVM exception: ', e)

    # A thread that did not start the JVM has to be attached before it can
    # call into Java; it also gets the system class loader as its context
    # class loader.
    if jpype.isJVMStarted() and not jpype.isThreadAttachedToJVM():
        jpype.attachThreadToJVM()
        jpype.java.lang.Thread.currentThread().setContextClassLoader(
            jpype.java.lang.ClassLoader.getSystemClassLoader())

    conn = jdbc.connect(
        'com.ibm.db2.jcc.DB2Driver',
        config['DB2_CONNECTION_STRING'],
        [config["DB2_USERNAME"], config["DB2_PASSWORD"]],
        jar)

    return conn


# Module-level connection shared by the cells below.
DB2_connection = connect_to_db()

  if jpype.isJVMStarted() and not jpype.isThreadAttachedToJVM():


In [5]:
# One list_objects_v2 call against the compressed-images bucket; an empty
# bucket simply yields an empty listing.
response = cos_client.list_objects_v2(Bucket=config['MGRS_COMPRESSED_IMAGES_BUCKET'])
objects = response.get("Contents", [])
# MGRS_tiles_to_process = [obj['Key'] for obj in objects]

# Object key -> object size, scaled from bytes by 1e6.
MGRS_tiles = dict(
    (entry['Key'], entry['Size'] / 1_000_000)
    for entry in objects
)


In [6]:
# Order the tiles from the smallest to the largest object as (key, size) pairs.
# The cell rebinds MGRS_tiles from a dict to a list, so a second run used to
# fail on `.items()`; going through dict() accepts both the original mapping
# and the already-sorted list of pairs, which makes the cell safe to re-run.
MGRS_tiles = sorted(dict(MGRS_tiles).items(), key=lambda item: item[1])
MGRS_tiles

[('compressed_images_43QHA.zip', 0.0036),
 ('compressed_images_43QFE.zip', 0.06688),
 ('compressed_images_44QMF.zip', 3.079516),
 ('compressed_images_43QBV.zip', 4.511823),
 ('compressed_images_44QMG.zip', 18.904504),
 ('compressed_images_44QKG.zip', 36.747604),
 ('compressed_images_43QEU.zip', 37.258969),
 ('compressed_images_43QHB.zip', 42.385128),
 ('compressed_images_43QCD.zip', 54.649278),
 ('compressed_images_44QKJ.zip', 88.867104),
 ('compressed_images_43QFD.zip', 154.722814),
 ('compressed_images_43QHC.zip', 205.5549),
 ('compressed_images_44QKH.zip', 216.034441),
 ('compressed_images_43QFC.zip', 241.214881),
 ('compressed_images_43QEB.zip', 262.349007),
 ('compressed_images_43QEA.zip', 275.13706),
 ('compressed_images_43QCB.zip', 310.062676),
 ('compressed_images_43QEV.zip', 324.124237),
 ('compressed_images_43QCC.zip', 337.31677)]

In [8]:
# Split the ordered tile sequence into two interleaved work lists:
# p1 takes the 2nd, 4th, 6th, ... entries and p2 takes the 1st, 3rd, 5th, ...
part = 0
MGRS_tiles_to_process_p1 = list(MGRS_tiles)[1::2]
MGRS_tiles_to_process_p2 = list(MGRS_tiles)[0::2]
        

[('compressed_images_43QHA.zip', 0.0036),
 ('compressed_images_43QFE.zip', 0.06688),
 ('compressed_images_44QMF.zip', 3.079516),
 ('compressed_images_43QBV.zip', 4.511823),
 ('compressed_images_44QMG.zip', 18.904504),
 ('compressed_images_44QKG.zip', 36.747604),
 ('compressed_images_43QEU.zip', 37.258969),
 ('compressed_images_43QHB.zip', 42.385128),
 ('compressed_images_43QCD.zip', 54.649278),
 ('compressed_images_44QKJ.zip', 88.867104),
 ('compressed_images_43QFD.zip', 154.722814),
 ('compressed_images_43QHC.zip', 205.5549),
 ('compressed_images_44QKH.zip', 216.034441),
 ('compressed_images_43QFC.zip', 241.214881),
 ('compressed_images_43QEB.zip', 262.349007),
 ('compressed_images_43QEA.zip', 275.13706),
 ('compressed_images_43QCB.zip', 310.062676),
 ('compressed_images_43QEV.zip', 324.124237),
 ('compressed_images_43QCC.zip', 337.31677)]

In [13]:
def download_and_decompress_images(bucket_name, object_name):
    """Fetch one object from Cloud Object Storage and parse it as a CSV table.

    The object body is read fully into memory, decoded to text and handed to
    ``pd.read_csv``; cells equal to the string 'NA' are then replaced by NaN.

    Args:
        bucket_name: name of the bucket holding the object.
        object_name: key of the object to fetch.

    Returns:
        The parsed DataFrame, or None when any step raised (the error is
        printed instead of being propagated).
    """
    try:
        # Step 1: pull the raw bytes of the object.
        payload = cos_client.get_object(Bucket=bucket_name, Key=object_name)['Body'].read()

        # Step 2: bytes -> text -> DataFrame.
        frame = pd.read_csv(io.StringIO(payload.decode()))

        # Step 3: turn the 'NA' placeholder into real missing values.
        return frame.replace({'NA': np.nan})

    except Exception as e:
        print(f'Error downloading and decompressing images: {e}')
        return None

In [14]:
def update_DB2_item(row: dict, cursor):
    '''
    Write one classification result to the country table in DB2.

    The row matching LATITUDE / LONGITUDE gets its ML_MODEL, ML_CONFIDENCE and
    CLASSIFICATION_TYPE columns set from `row`, and CLASSIFICATION_SOURCE set
    to the constant 'classification_model'.

    row: dict = {
                "LATITUDE": "",
                "LONGITUDE": "",
                "ML_MODEL": "",
                "ML_CONFIDENCE": "",
                "CLASSIFICATION_TYPE": ""
                }
    cursor: DB-API cursor whose execute() accepts qmark ("?") parameters.

    Values are bound as parameters instead of being formatted into the SQL
    text, so quotes or other special characters inside a value cannot change
    the statement.
    '''
    # An identifier cannot be bound as a parameter; doubling embedded double
    # quotes keeps the configured table name inside its delimited identifier.
    table_name = str(config["COUNTRY_TABLE"]).replace('"', '""')

    sql = f"""
        UPDATE "USER1"."{table_name}"
            SET
              "ML_MODEL" = ?,
              "ML_CONFIDENCE" = ?,
              "CLASSIFICATION_SOURCE" = 'classification_model',
              "CLASSIFICATION_TYPE" = ?
            WHERE
                (LATITUDE = ?) AND
                (LONGITUDE = ?)
        """

    # Every value is sent in its textual form, the same text that used to be
    # pasted into the statement.
    # NOTE(review): assumes the driver converts the text to the column types
    # of LATITUDE / LONGITUDE / ML_CONFIDENCE — confirm against the table schema.
    params = (
        str(row['ML_MODEL']),
        str(row['ML_CONFIDENCE']),
        str(row['CLASSIFICATION_TYPE']),
        str(row['LATITUDE']),
        str(row['LONGITUDE']),
    )

    cursor.execute(sql, params)

In [15]:
# Progress record that the inference cell updates after every batch and
# uploads through log_state_to_bucket.
inferenced_state = dict(
    inferenced_count=0,
    current_file="",
    total_inferenced_count=0,
)

In [16]:
# Settlement-class label -> integer id. Built once at definition time: the
# function below is applied to every row of a tile, so rebuilding the mapping
# on each call was wasted work.
_SMOD_CLASS_IDS = {
    'Very Low Density Rural Grids (Mostly Uninhabited Area)': 1,
    'Low Density Rural Grids Cells (Dispersed Rural Area)': 2,
    'Rural Cluster (Village)': 3,
    'Suburban Or Peri-Urban Cells (Suburb)': 4,
    'Dense And Semi-Dense Urban Cluster (Town)': 5,
    'Urban Centre (City)': 6,
}


def map_smod(smod_name):
    """Translate a settlement-class label into its integer id (1-6).

    Args:
        smod_name: one of the six label strings listed in ``_SMOD_CLASS_IDS``.

    Returns:
        The integer id of the label.

    Raises:
        KeyError: if the label is not one of the six known strings.
    """
    return _SMOD_CLASS_IDS[smod_name]



In [17]:
# File name used for the progress JSON, both as the local file and as the
# object key in the job-status bucket. The config key is spelled
# "INFERECE_JSON_NAME" in the configuration template, so it must stay as is.
json_filename = config["INFERECE_JSON_NAME"]

In [18]:
def log_state_to_bucket(inferenced_count: dict):
    """Save the progress record locally and mirror it to the job-status bucket.

    Args:
        inferenced_count: JSON-serialisable progress dict. It is written to
            the local file named by ``json_filename`` (overwriting it), and
            that file is then uploaded under the same key to the bucket
            configured as "JOB_STATUS_BUCKET".
    """
    with open(json_filename, "w") as state_file:
        state_file.write(json.dumps(inferenced_count))

    upload_args = {
        "Filename": json_filename,
        "Bucket": config["JOB_STATUS_BUCKET"],
        "Key": json_filename,
    }
    cos_client.upload_file(**upload_args)

In [None]:
# Main inference cell: for every tile in the p1 work list, download its CSV,
# run the model on batches of decoded images plus numeric features, write each
# prediction to DB2, and upload a progress record after every batch.
cursor = DB2_connection.cursor()
batch_size = 2000

total_inferenced_count = 0
normalize_area = 20_000
normalize_height = 20  # not used by the visible code; kept for other cells
normalize_smod = 6

# Accumulators for the batch being assembled. doc_ids travels alongside the
# images so each prediction is matched to its source row directly instead of
# through index arithmetic.
images, numeric, doc_ids = [], [], []

for fname in MGRS_tiles_to_process_p1:
    # fname is a (object key, size) pair; fname[0] is the object key.

    print('Processing file', fname)

    df = download_and_decompress_images(config['MGRS_COMPRESSED_IMAGES_BUCKET'], fname[0])
    if df is None:
        # The download helper prints its error and returns None; stop here
        # with a clear message instead of failing on the next line with an
        # unrelated TypeError.
        raise RuntimeError(
            f'Could not download or parse {fname[0]}; see the error printed above'
        )

    print(f'Assign SMODid for {fname}')

    df['SMOD_id'] = df['ghsl_smod'].apply(map_smod)

#     df['near_junction'] = [0 for _ in range(len(df))]

    # doc_id has the form "<longitude>:<latitude>".
    df['longitude'] = df['doc_id'].apply(lambda x: float(x.split(':')[0]))
    df['latitude'] = df['doc_id'].apply(lambda x: float(x.split(':')[1]))

    # NOTE(review): only the last 400,000 rows of every file are processed —
    # confirm this cap is intentional and not a leftover from a resumed run.
    df = df.tail(400_000)
    df = df.reset_index(drop=True)

    # Rows of the current file that went through a flushed batch.
    file_inferenced_count = 0

    for idx, row in enumerate(tqdm(df.itertuples(), desc='Inferencing & updating', total=len(df))):

        image_source_bytes = base64.b64decode(row.image_source_bytes)

        image = Image.open(io.BytesIO(image_source_bytes))
        image = image.resize((124, 124), Image.Resampling.NEAREST)
        image = np.array(image)

        numeric_input = [row.area_in_meters / normalize_area, row.SMOD_id/normalize_smod]

        images.append(image)
        numeric.append(numeric_input)
        doc_ids.append(row.doc_id)

        # Flush on a full batch, or on the last row of the file so a partial
        # batch is never carried over into the next file.
        if (len(images) == batch_size) or (idx == len(df) - 1):

            inputs_batch = [
                np.array(images),
                np.array(numeric)
                ]

            # NOTE(review): `workers` / `use_multiprocessing` are not accepted
            # by every Keras version — confirm against the installed one.
            predictions = cnn_mod.predict(inputs_batch,
                                          batch_size=len(images),
                                          verbose=0,
                                          workers=8,
                                          use_multiprocessing=True,
                                          )

            for doc_id, prediction in zip(doc_ids, predictions):

                lat = doc_id.split(':')[1]
                lon = doc_id.split(':')[0]

                # Take the scalar score once and use it for both the stored
                # confidence and the class decision.
                score = float(prediction[0])

                table_row = {
                        "LATITUDE": lat,
                        "LONGITUDE": lon,
                        "ML_MODEL": MODEL_NAME,
                        "ML_CONFIDENCE": round(score, 5),
                        "CLASSIFICATION_TYPE": "non-res" if score < 0.5 else "res"
                        }

                total_inferenced_count += 1
                try:
                    update_DB2_item(table_row, cursor)

                except Exception as e:

                    print('Connection error, reconnect', e)

                    # Release the old connection if it is still usable, so
                    # repeated reconnects do not pile up open connections.
                    try:
                        DB2_connection.close()
                    except Exception:
                        pass  # the old connection may already be dead

                    DB2_connection = connect_to_db()
                    cursor = DB2_connection.cursor()

                    # Retry the row that failed; previously it was dropped
                    # after the reconnect. A second failure is reported and
                    # the loop moves on, as before.
                    try:
                        update_DB2_item(table_row, cursor)
                    except Exception as retry_error:
                        print(f'Update failed again for {doc_id}, row skipped: {retry_error}')

            file_inferenced_count += len(images)
            print('images infered & uploaded', len(images))
            images, numeric, doc_ids = [], [], []

            # Exact number of rows handled for this file; multiplying the batch
            # count by batch_size over-counted on the final, partial batch.
            inferenced_state['inferenced_count'] = file_inferenced_count
            inferenced_state['current_file'] = fname[0]
            inferenced_state['total_inferenced_count'] = total_inferenced_count

            log_state_to_bucket(inferenced_state)
               