In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path

# `display` and `HTML` are public API of `IPython.display`; importing `display`
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Stretch the notebook's `.container` element to the full browser width.
display(HTML('<style>.container {width:100% !important;}</style>'))


In [2]:
# Make modules under ../src importable from this notebook (the `metrics`
# package imported in a later cell is resolved from there).
sys.path.append("../src")

In [3]:

import os
import json
import requests
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
from pdf2image import convert_from_path
from strsimpy.levenshtein import Levenshtein
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
from metrics import evaluate

In [8]:
# Print the module help for `metrics.evaluate` (lists its functions, data and
# the file it was loaded from).
help(evaluate)

Help on module metrics.evaluate in metrics:

NAME
    metrics.evaluate

FUNCTIONS
    scores(anno_file, inf_file, file_type)

DATA
    cols = {'ktp': ['agama', 'alamat', 'berlaku_hingga', 'jenis_kelamin', ...

FILE
    /home/darshansonde/Code/work/2021_tolaram/insureka-ml/ocr/src/metrics/evaluate.py




In [56]:
# Directory holding the annotated / inference CSV exports read by the cells below.
DATA_DIR=Path("../data/interim/2022-01-06_ocr_annotated")
# List the directory contents via the shell (needs the `tree` command installed).
!tree $DATA_DIR

[01;34m../data/interim/2022-01-06_ocr_annotated[00m
├── DocumentAnalysis-2022-01-30_ktp_annotated.csv
├── DocumentAnalysis-2022-01-30_ktp_cleaned.csv
├── DocumentAnalysis-2022-01-30_ktp.csv
├── DocumentAnalysis-2022-01-30_ktp_inference.csv
├── DocumentAnalysis-2022-01-30_ktp_metrics.csv
├── DocumentAnalysis-2022-01-30_sim_annotated.csv
├── DocumentAnalysis-2022-01-30_sim_cleaned.csv
├── DocumentAnalysis-2022-01-30_sim.csv
├── DocumentAnalysis-2022-01-30_sim_inference.csv
├── DocumentAnalysis-2022-01-30_stnk_annotated.csv
├── DocumentAnalysis-2022-01-30_stnk_cleaned.csv
├── DocumentAnalysis-2022-01-30_stnk.csv
├── DocumentAnalysis-2022-01-30_stnk_inference.csv
└── OCR ANNOTATION TRACKING.xlsx

0 directories, 14 files


In [57]:
# Load the KTP annotation and inference CSVs; missing values become empty
# strings so that every cell can later be handled as text.
dann = pd.read_csv(DATA_DIR / "DocumentAnalysis-2022-01-30_ktp_annotated.csv").fillna('')
dinf = pd.read_csv(DATA_DIR / "DocumentAnalysis-2022-01-30_ktp_inference.csv").fillna('')

In [58]:
# Force the inference `nik` column to str so it can be compared as text by the
# Levenshtein step below (presumably read_csv parsed it as a number -- confirm).
dinf['nik'] = dinf['nik'].astype(str)

In [59]:
# (rows, columns) of the inference frame.
dinf.shape

(75, 22)

In [60]:
# (rows, columns) of the annotation frame; compare with the inference frame above.
dann.shape

(75, 22)

In [61]:
# Join annotations with inference output on `id`. Overlapping column names
# coming from the annotation side get an "_annotated" suffix; the inference
# side keeps the plain name.
df = dann.merge(dinf, on='id', suffixes=("_annotated", ""))
df.shape

(75, 43)

In [62]:
# KTP fields to compare. `apply_levenshtein` below reads each one from the
# merged row both as `<field>` (inference) and `<field>_annotated` (annotation).
fields = ['nik', 'nama', 'tgl_lahir', 'jenis_kelamin', 'tempat', 'alamat', 'rt_rw', 'kel_desa',
            'kecamatan', 'agama', 'status_perkawinan', 'pekerjaan',
            'kewarganegaraan', 'berlaku_hingga']
# Shared edit-distance calculator (strsimpy) used by `apply_levenshtein`.
ldist = Levenshtein()
def apply_levenshtein(x):
    """Add per-field distance and error-rate entries to one merged row.

    For every name in the module-level ``fields`` list the row receives:

    * ``<field>_dist`` -- Levenshtein distance between the inference value
      (``x[field]``) and the annotated value (``x[f"{field}_annotated"]``),
      stored as an int.
    * ``<field>_err`` -- that distance as a percentage of the annotated
      value's length, capped at 100.

    Intended for ``DataFrame.apply(..., axis=1)``; the row is modified and
    returned.
    """
    for name in fields:
        truth = x[f"{name}_annotated"]
        edits = int(ldist.distance(x[name], truth))
        x[f"{name}_dist"] = edits

        if edits == 0:
            # Identical values (including both blank) count as 0% error and
            # never reach the division below.
            pct = 0
        elif len(truth) > 0:
            pct = min(edits * 100 / len(truth), 100)
        else:
            # Annotation is blank but inference is not: treat as fully wrong.
            pct = 100
        x[f"{name}_err"] = pct
    return x

# Score every merged row; this adds a `<field>_dist` and `<field>_err` column
# for each entry of `fields`.
df = df.apply(apply_levenshtein, axis=1)

df.head()

Unnamed: 0,Unnamed: 0_annotated,id,image_annotated,gcs_link_annotated,type_annotated,is_completed_annotated,created_at_annotated,result_annotated,agama_annotated,alamat_annotated,...,agama_dist,agama_err,status_perkawinan_dist,status_perkawinan_err,pekerjaan_dist,pekerjaan_err,kewarganegaraan_dist,kewarganegaraan_err,berlaku_hingga_dist,berlaku_hingga_err
0,0,617,ocr/analysis/KTP27.jpeg,gs://som-datasets/interim/2022-01-06_combined_...,KTP,1,2022-01-09 15:33:47,413,ISLAM,JL LETJEN SUPRAPTO XIV/89,...,0,0.0,11,100.0,14,73.684211,7,100.0,0,0.0
1,1,616,ocr/analysis/KTP11.jpeg,gs://som-datasets/interim/2022-01-06_combined_...,KTP,1,2022-01-09 15:29:18,412,ISLAM,JL. TERUNG NO. 2 A,...,0,0.0,11,100.0,14,82.352941,7,100.0,12,100.0
2,2,615,ocr/analysis/KTP12.jpeg,gs://som-datasets/interim/2022-01-06_combined_...,KTP,1,2022-01-09 15:24:54,411,ISLAM,BTN MANGGA TIGA BLOK I NO21,...,20,100.0,7,100.0,6,60.0,7,100.0,0,0.0
3,3,614,ocr/analysis/KTP13.jpeg,gs://som-datasets/interim/2022-01-06_combined_...,KTP,1,2022-01-09 15:19:37,410,ISLAM,JL.HASAN SAPUTRA IV NO.10,...,0,0.0,11,100.0,12,80.0,7,100.0,0,0.0
4,4,613,ocr/analysis/KTP14.jpeg,gs://som-datasets/interim/2022-01-06_combined_...,KTP,1,2022-01-09 15:05:22,409,ISLAM,KP. BOJONG PETIR,...,2,40.0,11,100.0,9,90.0,7,100.0,0,0.0


In [63]:
# Total edit distance per field, summed over all documents.
df[[f"{f}_dist" for f in fields]].sum()

nik_dist                  278
nama_dist                 245
tgl_lahir_dist            837
jenis_kelamin_dist        611
tempat_dist               274
alamat_dist               706
rt_rw_dist                374
kel_desa_dist             455
kecamatan_dist            173
agama_dist                416
status_perkawinan_dist    540
pekerjaan_dist            735
kewarganegaraan_dist      422
berlaku_hingga_dist       309
dtype: int64

In [64]:
# Grand total edit distance across every field and document.
sum(df[[f"{f}_dist" for f in fields]].sum())

6375

In [65]:
# Mean error percentage per field over all documents.
df[[f"{f}_err" for f in fields]].mean()

nik_err                  23.297939
nama_err                 22.120414
tgl_lahir_err            90.666667
jenis_kelamin_err        93.333333
tempat_err               48.009117
alamat_err               36.988739
rt_rw_err                70.751323
kel_desa_err             62.849832
kecamatan_err            21.631536
agama_err                53.185185
status_perkawinan_err    90.666667
pekerjaan_err            70.747392
kewarganegaraan_err      89.333333
berlaku_hingga_err       30.333333
dtype: float64

In [66]:
# Single headline figure: the unweighted mean of the per-field mean errors.
df[[f"{f}_err" for f in fields]].mean().mean()

57.42248642339847

In [67]:
def scores(anno_file, inf_file, file_type):
    """Score an inference CSV against its annotation CSV, field by field.

    The two files are inner-joined on ``id``; overlapping columns coming from
    the annotation file get an ``_annotated`` suffix while inference columns
    keep their plain name. For every field configured for ``file_type`` two
    columns are added to the merged frame:

    * ``<field>_dist`` -- Levenshtein distance between inference and
      annotation (int).
    * ``<field>_err``  -- the distance as a percentage of the annotated
      value's length, capped at 100; 0 when both values are identical
      (including both blank), 100 when only the annotation is blank.

    Parameters
    ----------
    anno_file : str or path-like
        CSV with the annotated values.
    inf_file : str or path-like
        CSV with the inference values.
    file_type : str
        Document type selecting the compared fields: ``'ktp'``, ``'sim'`` or
        ``'stnk'``.

    Returns
    -------
    pandas.DataFrame
        The merged frame with the ``_dist`` / ``_err`` columns appended.

    Raises
    ------
    KeyError
        If ``file_type`` is not one of the configured document types.
    """
    cols = {'ktp': ['agama',
        'alamat',
        'berlaku_hingga',
        'jenis_kelamin',
        'kecamatan',
        'kel_desa',
        'kewarganegaraan',
        'nama',
        'nik',
        'pekerjaan',
        'rt_rw',
        'status_perkawinan',
        'tempat',
        'tgl_lahir'],
        'sim': [
        'nama',
        'tgl_lahir',
        'jenis_kelamin',
        'tempat',
        'alamat',
        'kel_desa',
        'kecamatan',
        'agama',
        'pekerjaan',
        'kewarganegaraan',
        'berlaku_hingga'
        ],
        'stnk': [
        'no',
        'nomor_registrasi',
        'nama_pemilik',
        'alamat',
        'berlaku_sd',
        'urut',
        'kohir',
        'nik',
        'bbnkb_pokok',
        'pkb_pokok',
        'swdkllj_pokok',
        'biaya_adm_stnk_pokok',
        'biaya_adm_tnkb_pokok',
        'jumlah_pokok',
        'bbnkb_sanski_administratif',
        'pkb_sanski_administratif',
        'swdkllj_sanski_administratif',
        'biaya_adm_stnk_sanski_administratif',
        'biaya_adm_tnkb_sanski_administratif',
        'jumlah_sanski_administratif',
        'bbnkb_jumlah',
        'pkb_jumlah',
        'swdkllj_jumlah',
        'biaya_adm_stnk_jumlah',
        'biaya_adm_tnkb_jumlah',
        'jumlah_jumlah',
        'ditetapkan_tanggal',
        'penaksir_pajak',
        'merk_type',
        'jenis_model',
        'tahun_pembuatan_perakitan',
        'warna_kb',
        'isi_silinder_hp',
        'nomor_rangka_nik',
        'nomor_mesin',
        'no_bpkb',
        'bahan_bakar',
        'warna_tnkb',
        'kepemilikan_ke',
        'no_registrasilama',
        'kode_njkb',
        'type',
        'jenis',
        'model',
        'tahun_pembuatan',
        'isi_silinder',
        'nomor_ragka_nik_vin',
        'nomor_mesin2',
        'warna2',
        'bahan_bakar2',
        'warna_tnkb2',
        'tahun_registrasi',
        'nomor_bpkb',
        'kode_lokasi',
        'no_urut_pendaftaran',
        'berlaku_sampai'
        ]}

    # Fail early with a readable message (still a KeyError, as before).
    if file_type not in cols:
        raise KeyError(
            f"unknown file_type {file_type!r}; expected one of {sorted(cols)}"
        )
    fields = cols[file_type]

    # Read every compared field as text in BOTH files. Letting pandas infer
    # dtypes turns digit-only columns into numbers: a column with blanks
    # becomes float (so a later str() cast appends ".0"), and a numeric value
    # reaching Levenshtein.distance raises TypeError. Other columns (e.g.
    # `id`) keep their inferred dtype.
    as_text = {name: str for name in fields}
    dann = pd.read_csv(anno_file, dtype=as_text).fillna('')
    dinf = pd.read_csv(inf_file, dtype=as_text).fillna('')

    # Kept from the original: the inference `nik` column is always text, even
    # for document types where it is not one of the compared fields.
    if 'nik' in dinf:
        dinf['nik'] = dinf['nik'].astype(str)

    df = pd.merge(dann, dinf, on='id', suffixes=("_annotated", ""))

    ldist = Levenshtein()

    def apply_levenshtein(x):
        """Append the `_dist` / `_err` entries for every field to one row."""
        for field in fields:
            annotated = x[f"{field}_annotated"]
            dist = int(ldist.distance(x[field], annotated))
            x[f"{field}_dist"] = dist

            if dist == 0:
                # Identical values (including both blank): 0% error, and the
                # division below is never reached with a zero length.
                err = 0
            elif len(annotated) > 0:
                err = min(dist * 100 / len(annotated), 100)
            else:
                # Blank annotation but non-blank inference: fully wrong.
                err = 100
            x[f"{field}_err"] = err
        return x

    df = df.apply(apply_levenshtein, axis=1)

    return df

# Score the KTP inference export against its annotations.
res = scores(
    DATA_DIR / "DocumentAnalysis-2022-01-30_ktp_annotated.csv",
    DATA_DIR / "DocumentAnalysis-2022-01-30_ktp_inference.csv",
    'ktp',
)


In [68]:
# Inspect the merged column set: annotation columns, inference columns, then
# the appended `_dist` / `_err` pairs.
res.columns

Index(['Unnamed: 0_annotated', 'id', 'image_annotated', 'gcs_link_annotated',
       'type_annotated', 'is_completed_annotated', 'created_at_annotated',
       'result_annotated', 'agama_annotated', 'alamat_annotated',
       'berlaku_hingga_annotated', 'jenis_kelamin_annotated',
       'kecamatan_annotated', 'kel_desa_annotated',
       'kewarganegaraan_annotated', 'nama_annotated', 'nik_annotated',
       'pekerjaan_annotated', 'rt_rw_annotated', 'status_perkawinan_annotated',
       'tempat_annotated', 'tgl_lahir_annotated', 'Unnamed: 0', 'image',
       'gcs_link', 'type', 'is_completed', 'created_at', 'result', 'agama',
       'alamat', 'berlaku_hingga', 'jenis_kelamin', 'kecamatan', 'kel_desa',
       'kewarganegaraan', 'nama', 'nik', 'pekerjaan', 'rt_rw',
       'status_perkawinan', 'tempat', 'tgl_lahir', 'agama_dist', 'agama_err',
       'alamat_dist', 'alamat_err', 'berlaku_hingga_dist',
       'berlaku_hingga_err', 'jenis_kelamin_dist', 'jenis_kelamin_err',
       'kecamatan_

In [69]:
# Remove the unnamed first column carried over from the annotation CSV
# (presumably a saved index -- confirm). Rebinding with errors="ignore"
# instead of an in-place drop keeps this cell re-runnable: a second run no
# longer raises KeyError because the column is already gone.
res = res.drop(columns="Unnamed: 0_annotated", errors="ignore")

In [70]:
# Persist the scored frame next to the input files, without the row index.
res.to_csv(DATA_DIR/"DocumentAnalysis-2022-01-30_ktp_metrics.csv", index=False)

In [90]:
# Mean error percentage per field.
# NOTE(review): this reads the notebook-level `df` / `fields` from the earlier
# cells, not the `res` frame returned by `scores` -- confirm that is intended.
metrics = df[[f"{f}_err" for f in fields]].mean()

In [101]:
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (label, value) pairs and works on old and new versions alike.
for k, v in metrics.items():
    print(k, v)

nik_err 23.297938662644544
nama_err 22.120414360321483
tgl_lahir_err 90.66666666666667
jenis_kelamin_err 93.33333333333333
tempat_err 48.009116809116804
alamat_err 36.98873871407689
rt_rw_err 70.75132275132275
kel_desa_err 62.84983164983165
kecamatan_err 21.631536099492756
agama_err 53.18518518518519
status_perkawinan_err 90.66666666666667
pekerjaan_err 70.74739236225304
kewarganegaraan_err 89.33333333333333
berlaku_hingga_err 30.333333333333332
