In [1]:
import os
import json
import requests
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
from pdf2image import convert_from_path
from strsimpy.levenshtein import Levenshtein

In [2]:
!pwd

/home/darshansonde/Code/work/2021_tolaram/insureka-ml/ocr/nbs


## Part 1: Evaluate OCR output for one document type

In [3]:
# Document type passed to the OCR service and used to select the annotation template.
FILE_TYPE = 'STNK' # Change this as per your need -> STNK, KTP, SIM, NPWP, ODO
# One image folder per supported document type (paths are relative to the working directory).
STNK_DIR = '../data/stnk/'
KTP_DIR = '../data/ktp/'
SIM_DIR = '../data/sim/'
NPWP_DIR = '../data/npwp/'
ODO_DIR = '../data/odo/'
# Folder whose files are evaluated; keep it in sync with FILE_TYPE.
IMAGE_DIR = STNK_DIR # Change this as per your need -> STNK_DIR, KTP_DIR, SIM_DIR, NPWP_DIR, ODO_DIR
# NOTE(review): PDF_DIR is not referenced in the visible part of this notebook.
PDF_DIR = '../data/PDFs/'
# Root of the OCR API; the endpoint fragments below are appended to it.
BASE_URL = 'http://127.0.0.1:8000/internal/intranet/api/v1/'
OCR_POST = 'document/analysis/'  # upload a document for analysis
OCR_GET = 'document/analysis/{id}/'  # fetch the analysis with the given id
OCR_ANNOTATE = 'document/analysis/{id}/annotate/'  # attach an annotation to an analysis
# Shared Levenshtein instance used for field-level scoring (the identifier's
# spelling is kept because other cells reference it by this name).
LAVENSHTEIN = Levenshtein()
# Annotation payload template for STNK documents; annotate_doc() sends it under
# result['STNK_annotated']. Every value is the placeholder "string" --
# presumably to be replaced with real ground-truth values (TODO confirm).
# NOTE(review): 'sanski' and 'nomor_ragka_nik_vin' look like misspellings of
# 'sanksi' / 'rangka'; the keys are part of the request payload, so confirm
# against the API schema before renaming anything.
__STNK_ANNOTATED__ = {
    "no": "string",
    "nomor_registrasi": "string",
    "nama_pemilik": "string",
    "alamat": "string",
    "berlaku_sd": "string",
    "urut": "string",
    "kohir": "string",
    "nik": "string",
    "bbnkb_pokok": "string",
    "pkb_pokok": "string",
    "swdkllj_pokok": "string",
    "biaya_adm_stnk_pokok": "string",
    "biaya_adm_tnkb_pokok": "string",
    "jumlah_pokok": "string",
    "bbnkb_sanski_administratif": "string",
    "pkb_sanski_administratif": "string",
    "swdkllj_sanski_administratif": "string",
    "biaya_adm_stnk_sanski_administratif": "string",
    "biaya_adm_tnkb_sanski_administratif": "string",
    "jumlah_sanski_administratif": "string",
    "bbnkb_jumlah": "string",
    "pkb_jumlah": "string",
    "swdkllj_jumlah": "string",
    "biaya_adm_stnk_jumlah": "string",
    "biaya_adm_tnkb_jumlah": "string",
    "jumlah_jumlah": "string",
    "ditetapkan_tanggal": "string",
    "penaksir_pajak": "string",
    "merk_type": "string",
    "jenis_model": "string",
    "tahun_pembuatan_perakitan": "string",
    "warna_kb": "string",
    "isi_silinder_hp": "string",
    "nomor_rangka_nik": "string",
    "nomor_mesin": "string",
    "no_bpkb": "string",
    "bahan_bakar": "string",
    "warna_tnkb": "string",
    "kepemilikan_ke": "string",
    "no_registrasilama": "string",
    "kode_njkb": "string",
    "type": "string",
    "jenis": "string",
    "model": "string",
    "tahun_pembuatan": "string",
    "isi_silinder": "string",
    "nomor_ragka_nik_vin": "string",
    "nomor_mesin2": "string",
    "warna2": "string",
    "bahan_bakar2": "string",
    "warna_tnkb2": "string",
    "tahun_registrasi": "string",
    "nomor_bpkb": "string",
    "kode_lokasi": "string",
    "no_urut_pendaftaran": "string",
    "berlaku_sampai": "string"
}

# Annotation payload template for KTP documents; annotate_doc() sends it under
# result['KTP_annotated']. All values are the placeholder "string".
__KTP_ANNOTATED__ = {
    "nik": "string",
    "nama": "string",
    "tgl_lahir": "string",
    "jenis_kelamin": "string",
    "tempat": "string",
    "alamat": "string",
    "rt_rw": "string",
    "kel_desa": "string",
    "kecamatan": "string",
    "agama": "string",
    "status_perkawinan": "string",
    "pekerjaan": "string",
    "kewarganegaraan": "string",
    "berlaku_hingga": "string"
}

# Annotation payload template for SIM documents; annotate_doc() sends it under
# result['SIM_annotated']. All values are the placeholder "string".
# NOTE(review): the identifier is spelled "ANNOTTED"; annotate_doc() refers to
# it by this exact name, so both must be renamed together.
__SIM_ANNOTTED__ = {
    "nama": "string",
    "tgl_lahir": "string",
    "jenis_kelamin": "string",
    "tempat": "string",
    "alamat": "string",
    "kel_desa": "string",
    "kecamatan": "string",
    "agama": "string",
    "pekerjaan": "string",
    "kewarganegaraan": "string",
    "berlaku_hingga": "string"
}

# Annotation payload template for NPWP documents; annotate_doc() sends it under
# result['NPWP_annotated']. Unlike the other templates it carries a sample
# date string and an empty dict for the bounding box rather than only "string".
__NPWP_ANNOTATED__ = {
    "npwp": "string",
    "nama": "string",
    "alamat": "string",
    "kpp": "string",
    "tgl_terdaftar": "2022-01-04",
    "tgl_terdaftar_bbox": {}
}

# Annotation payload template for ODO documents; annotate_doc() sends it under
# result['ODO_annotated']. Note that "value" is an integer, not a string.
__ODO_ANNOTATED__ = {
    "value": 0,
    "scale": "string"
}

In [4]:
def send_doc(file_path=None, file_type=None, poll_interval=1.0, timeout=None):
    """Upload one document image to the OCR service and wait for its analysis.

    The image is POSTed to ``BASE_URL + OCR_POST``; the analysis identified by
    the ``id`` field of the reply is then polled at ``BASE_URL + OCR_GET``
    until its ``is_completed`` field is truthy.

    Args:
        file_path: Path of the image file to upload.
        file_type: Document type sent as the ``type`` form field.
        poll_interval: Seconds to sleep between two polling requests.
        timeout: Maximum number of seconds to wait for completion, or ``None``
            (default) to wait indefinitely.

    Returns:
        The decoded JSON body of the completed analysis.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        TimeoutError: If ``timeout`` elapses before the analysis completes.
    """
    import time  # local import so this cell does not depend on the import cell

    # Close the upload handle as soon as the POST is done instead of leaking it.
    with open(file_path, 'rb') as image_file:
        response = requests.post(
            BASE_URL + OCR_POST,
            files={'image': image_file},
            data={'type': file_type},
        )
    response.raise_for_status()
    url = BASE_URL + OCR_GET.format(id = str(response.json()['id']))

    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        response = requests.get(url)
        response.raise_for_status()
        analysis = response.json()
        if analysis['is_completed']:
            return analysis
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError('OCR analysis did not complete in time: ' + url)
        # Pause between polls so the service is not hammered by a busy loop.
        time.sleep(poll_interval)

def annotate_doc(file_type=None, doc_analysis_id=None):
    """Attach the annotation template for ``file_type`` to an existing analysis.

    Builds a payload of the form
    ``{'result': {'is_annotated': True, '<TYPE>_annotated': <template>}}`` and
    PATCHes it as JSON to ``BASE_URL + OCR_ANNOTATE`` for ``doc_analysis_id``.

    Args:
        file_type: One of 'STNK', 'KTP', 'SIM', 'NPWP', 'ODO'.
        doc_analysis_id: Id of the analysis to annotate.

    Returns:
        The decoded JSON reply, or ``None`` when ``file_type`` is not one of
        the supported types (no request is made in that case).
    """
    # (type name, payload key, template) for every supported document type
    known_types = (
        ('STNK', 'STNK_annotated', __STNK_ANNOTATED__),
        ('KTP', 'KTP_annotated', __KTP_ANNOTATED__),
        ('SIM', 'SIM_annotated', __SIM_ANNOTTED__),
        ('NPWP', 'NPWP_annotated', __NPWP_ANNOTATED__),
        ('ODO', 'ODO_annotated', __ODO_ANNOTATED__),
    )
    selected = next((entry for entry in known_types if file_type == entry[0]), None)
    if selected is None:
        return None

    _, payload_key, template = selected
    payload = {'result': {'is_annotated': True, payload_key: template}}
    endpoint = BASE_URL + OCR_ANNOTATE.format(id = doc_analysis_id)
    reply = requests.patch(
        endpoint,
        data=json.dumps(payload),
        headers={'Content-type': 'application/json'},
    )
    return reply.json()

In [5]:
def evaluate(document_1:dict, document_2:dict):
    """Score two documents field by field using Levenshtein distance.

    Args:
        document_1: Mapping of field name to value (e.g. the annotation).
        document_2: Mapping with exactly the same keys (e.g. the OCR result).

    Returns:
        A dict with
        ``overall_score`` (sum of all field distances),
        ``individual_field_scores_levenshtein`` (distance per field),
        ``most_correct_field_levenshtein`` (fields with the smallest distance),
        ``most_problematic_field_levenshtein`` (fields with the largest distance).
        For two empty documents the score is 0 and all collections are empty.

    Raises:
        AssertionError: If the two documents do not have the same keys.
    """
    assert document_1.keys() == document_2.keys(), 'The dictionaries must have same set of keys'

    def _as_text(value):
        # The distance function needs string operands: a missing field (None)
        # is scored as an empty string, other non-string values (numbers,
        # dicts) by their textual form, instead of aborting the whole document.
        if value is None:
            return ''
        return value if isinstance(value, str) else str(value)

    field_scores = {
        key: LAVENSHTEIN.distance(_as_text(document_1[key]), _as_text(document_2[key]))
        for key in document_1
    }
    score = {
        'overall_score': sum(field_scores.values()),
        'most_problematic_field_levenshtein': [],
        'most_correct_field_levenshtein': [],
        'individual_field_scores_levenshtein': field_scores
    }
    # min()/max() raise on an empty sequence, so only rank when there are fields.
    if field_scores:
        best_distance = min(field_scores.values())
        worst_distance = max(field_scores.values())
        for key, value in field_scores.items():
            if value == best_distance:
                score['most_correct_field_levenshtein'].append(key)
            if value == worst_distance:
                score['most_problematic_field_levenshtein'].append(key)
    return score

def run_evaluation(image_dir, file_type=None):
    """Run OCR, annotation and scoring for every file in ``image_dir``.

    Each file is uploaded with ``send_doc``, annotated with ``annotate_doc``
    and, when an annotation reply is returned, scored with ``evaluate``
    (annotation under ``result[file_type + '_annotated']`` versus OCR output
    under ``result[file_type]``). Processing is best-effort: a file that fails
    is reported on stdout and skipped.

    Args:
        image_dir: Directory containing the document images.
        file_type: Document type forwarded to the OCR and annotation calls.

    Returns:
        A list with one score dict per successfully evaluated file; each one
        additionally carries the annotation reply under ``raw_response``.
    """
    # os.listdir() returns entries in arbitrary order and includes
    # sub-directories: sort for reproducible results and keep regular files only.
    images = sorted(
        name for name in os.listdir(image_dir)
        if os.path.isfile(os.path.join(image_dir, name))
    )
    scores = []
    for image in tqdm(images):
        try:
            send_doc_response = send_doc(file_path=os.path.join(image_dir, image), file_type=file_type)
            response = annotate_doc(file_type=file_type, doc_analysis_id=send_doc_response['id'])
            if response is not None:
                document_true = response['result'][file_type + '_annotated']
                document_pred = send_doc_response['result'][file_type]
                score = evaluate(document_1=document_true, document_2=document_pred)
                score['raw_response'] = response
                scores.append(score)
        except Exception as e:
            # Deliberately best-effort, but say which file failed and why.
            print('{}: {}: {}'.format(image, type(e).__name__, e))
            continue
    return scores

# Evaluate every file in IMAGE_DIR as FILE_TYPE; this issues HTTP requests to
# the OCR service at BASE_URL.
scores = run_evaluation(image_dir=IMAGE_DIR, file_type=FILE_TYPE)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.35s/it]


In [6]:
scores

[{'overall_score': 485,
  'most_problematic_field_levenshtein': ['nomor_ragka_nik_vin'],
  'most_correct_field_levenshtein': ['nik',
   'bbnkb_pokok',
   'pkb_pokok',
   'swdkllj_pokok',
   'biaya_adm_stnk_pokok',
   'biaya_adm_tnkb_pokok',
   'jumlah_pokok',
   'bbnkb_sanski_administratif',
   'pkb_sanski_administratif',
   'swdkllj_sanski_administratif',
   'biaya_adm_stnk_sanski_administratif',
   'biaya_adm_tnkb_sanski_administratif',
   'jumlah_sanski_administratif',
   'bbnkb_jumlah',
   'pkb_jumlah',
   'swdkllj_jumlah',
   'biaya_adm_stnk_jumlah',
   'biaya_adm_tnkb_jumlah',
   'jumlah_jumlah',
   'ditetapkan_tanggal',
   'penaksir_pajak',
   'merk_type',
   'jenis_model',
   'tahun_pembuatan_perakitan',
   'isi_silinder_hp',
   'no_bpkb',
   'bahan_bakar',
   'warna_tnkb',
   'kepemilikan_ke',
   'no_registrasilama',
   'kode_njkb',
   'type',
   'tahun_pembuatan',
   'isi_silinder',
   'nomor_mesin2',
   'warna_tnkb2',
   'tahun_registrasi',
   'nomor_bpkb',
   'kode_lokasi',

In [7]:
def evaluate_docements(scores:list, file_type:str=None):
    """Summarise per-document scores into best and worst documents.

    Args:
        scores: List of score dicts, each with an ``overall_score`` entry.
        file_type: Document type label copied into the summary.

    Returns:
        The string ``'No documents found!'`` when ``scores`` is empty;
        otherwise a dict holding the smallest and largest ``overall_score``,
        the score dicts that reach each of them, ``file_type`` and the
        unmodified ``scores`` list under ``all_scores``.
    """
    if not scores:
        return 'No documents found!'

    totals = [entry['overall_score'] for entry in scores]
    lowest = min(totals)
    highest = max(totals)

    # A lower total distance means the OCR output is closer to the annotation.
    best_documents = [entry for entry, total in zip(scores, totals) if lowest == total]
    worst_documents = [entry for entry, total in zip(scores, totals) if highest == total]

    return {
        'min_value': lowest,
        'max_value': highest,
        'most_correct_document': best_documents,
        'most_problematic_docement': worst_documents,
        'file_type': file_type,
        'all_scores': scores
    }

# Summarise the per-document scores; this is the string 'No documents found!'
# instead of a dict when `scores` is empty.
document_score = evaluate_docements(scores=scores, file_type=FILE_TYPE)

In [8]:
document_score

{'min_value': 485,
 'max_value': 485,
 'most_correct_document': [{'overall_score': 485,
   'most_problematic_field_levenshtein': ['nomor_ragka_nik_vin'],
   'most_correct_field_levenshtein': ['nik',
    'bbnkb_pokok',
    'pkb_pokok',
    'swdkllj_pokok',
    'biaya_adm_stnk_pokok',
    'biaya_adm_tnkb_pokok',
    'jumlah_pokok',
    'bbnkb_sanski_administratif',
    'pkb_sanski_administratif',
    'swdkllj_sanski_administratif',
    'biaya_adm_stnk_sanski_administratif',
    'biaya_adm_tnkb_sanski_administratif',
    'jumlah_sanski_administratif',
    'bbnkb_jumlah',
    'pkb_jumlah',
    'swdkllj_jumlah',
    'biaya_adm_stnk_jumlah',
    'biaya_adm_tnkb_jumlah',
    'jumlah_jumlah',
    'ditetapkan_tanggal',
    'penaksir_pajak',
    'merk_type',
    'jenis_model',
    'tahun_pembuatan_perakitan',
    'isi_silinder_hp',
    'no_bpkb',
    'bahan_bakar',
    'warna_tnkb',
    'kepemilikan_ke',
    'no_registrasilama',
    'kode_njkb',
    'type',
    'tahun_pembuatan',
    'isi_silind