In [27]:
import numpy as np
import cv2
import easyocr
import re
import pandas as pd
import os

In [28]:
def inputImage(img_path):
    """ Fungsi membaca image dan resize"""
    img = cv2.imread(img_path)
    if img is None:
        print(f"Image {img_path} is not found")
        return None
    h, w, _ = img.shape
    resized_img = img
    if w > 1000 and h > 1000:
        #menentukan ukuran baru
        new_h = int(h*0.20)
        new_w = int(w*0.25)

        #resize
        resized_img = cv2.resize(img, (new_w, new_h))
   

    return resized_img

In [29]:
def cropping_img(resized_img):
    """Cropping gambar"""
    #need log, print will do for now
    if resized_img is None:
        print(f"bruh no {resized_img} detected")
        return None
    #converting to hsv
    hsv_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2HSV)
    #thresholding
    hue_upper = 250
    hue_lower = 30 # 
    saturation_upper = 250
    saturation_lower = 10  # rentang toleransi saturasi
    value_upper = 200
    value_lower = 71  # rentang toleransi value
    
    up_thresh = np.array([hue_upper, saturation_upper, value_upper], dtype=np.uint8)
    low_thresh = np.array([hue_lower, saturation_lower, value_lower], dtype=np.uint8)
    thresh = cv2.inRange(hsv_img, low_thresh, up_thresh)

    #finding countour
    cnts, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(cnts) == 0:
        print("No contours found")
        return None

    cnts = sorted(cnts, key=cv2.contourArea, reverse=True)
    largest_contour = cnts[0]

    #menentukan apakah layak cropping
    area = cv2.contourArea(largest_contour)
    if area > 43000:
        x, y, w, h = cv2.boundingRect(largest_contour)
        ROI = resized_img[y:y+h, x:x+w]
        # cv2.drawContours(resized_img, largest_contour, -1, (0,255,0), 2)
    else:
        ROI = resized_img

    # cv2.imshow("Cropped_img", ROI)
    # cv2.waitKey(0)
    # cv2.destroyAllWindows()
    return ROI



In [30]:
def binarization(cropped):
    grayed = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
    #could use medianblur here(?)
    blurred = cv2.GaussianBlur(grayed, (5,5), 0 )
    #could also use adaptiveThresholding
    threshed_img = cv2.adaptiveThreshold(blurred, 255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 21,10)
    # threshed_img = cv2.threshold(grayed, 130, 250, cv2.THRESH_BINARY_INV|cv2.THRESH_OTSU)
    # dilate = cv2.dilate(threshed_img, (4,4), iterations=1)

    return threshed_img

In [31]:
def extraction(binarize):
    """Fungsi ekstraksi gambar KTP menjadi string"""
    reader = easyocr.Reader(["id"])
    text = reader.readtext(binarize, detail = 0)
    text = " ".join(text)
    match = re.search(r'\d+', text)
    if match:
        nik = match.group(0)
        # print(f"NIK ditemukan: {nik}, Panjang: {len(nik)} digits")
        return nik
    else:
      # print("Tidak ada NIK ditemukan")
      return "N/A"
    

#cari digit yang bukan cuman 16 digit. Sesuai pattern regex atau tidak

In [32]:
def calculate_match_percentage(actual_nik, extracted_nik):
    match_count = 0 
    for a, e in zip(actual_nik, extracted_nik):
        if a == e:
            match_count += 1
    return (match_count/len(actual_nik)) * 100 if actual_nik else 0 

In [33]:
def categorize_match(percentage):
    # Categorize match percentage into specific groups
    if percentage == 100:
        return "100% match"
    elif percentage == 0:
        return "0% match"
    else:
        return "Other % match"

In [34]:
img_path = r"<img_folder_path>"
extracted_NIK =[]
actual_NIK = []
match_percentage = []
match_category = []
for name in os.listdir(img_path):
    file_name = os.path.splitext(name)[0]
    actual_NIK.append(file_name)

    input = inputImage(os.path.join(img_path, name))
    cropped = cropping_img(input)
    binarize = binarization(cropped)

    extracted_text = extraction(binarize)
    extracted_NIK.append(extracted_text)

    if extracted_text != "N/A":
        percentage = calculate_match_percentage(file_name, extracted_text)
    else:
        percentage = 0
    match_percentage.append(percentage)
    match_category.append(categorize_match(percentage))
    
df = pd.DataFrame({
    'actual_nik' : actual_NIK,
    'extracted_nik' : extracted_NIK,
    'match_percentage' : match_percentage,
    'match_category': match_category
})





Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster wi

In [35]:
average_percentage = df['match_percentage'].mean()
display(df)
df_100 = df[df['match_category'] == '100% match']
df_0 = df[df['match_category'] == '0% match']
df_other = df[df['match_category'] == 'Other % match']
print(f"\n--- 100% match (Rows: {df_100.shape[0]}) ---")
display(df_100)

print(f"\n--- 0% match (Rows: {df_0.shape[0]}) ---")
display(df_0)
print(f"\n--- Other % match (Rows: {df_other.shape[0]}) ---")
display(df_other)
print(f"\nAverage correct reading percentage is {average_percentage:.2f}%")

Unnamed: 0,actual_nik,extracted_nik,match_percentage,match_category
0,1271116508900002,1271116508900002,100.00,100% match
1,2171104306989003,2171104306989003,100.00,100% match
2,3507055503020006,4,0.00,0% match
3,3508175404990003,3508175404990003,100.00,100% match
4,3509015609010001,3509015609010001,100.00,100% match
...,...,...,...,...
83,5108065611020004,9,0.00,0% match
84,5108067110010002,0,0.00,0% match
85,5108084704010004,5108084704010004,100.00,100% match
86,5108087110990002,31,6.25,Other % match



--- 100% match (Rows: 45) ---


Unnamed: 0,actual_nik,extracted_nik,match_percentage,match_category
0,1271116508900002,1271116508900002,100.0,100% match
1,2171104306989003,2171104306989003,100.0,100% match
3,3508175404990003,3508175404990003,100.0,100% match
4,3509015609010001,3509015609010001,100.0,100% match
5,3509066912980004,3509066912980004,100.0,100% match
7,3509085406000003,3509085406000003,100.0,100% match
11,3509125607010004,3509125607010004,100.0,100% match
14,3509174107010184,3509174107010184,100.0,100% match
16,3509175803990003,3509175803990003,100.0,100% match
18,3509205703000005,3509205703000005,100.0,100% match



--- 0% match (Rows: 22) ---


Unnamed: 0,actual_nik,extracted_nik,match_percentage,match_category
2,3507055503020006,4.0,0.0,0% match
6,3509085205020002,1.0,0.0,0% match
8,3509094207010003,0.0,0.0,0% match
12,3509134501950003,4.0,0.0,0% match
15,3509175010880018,,0.0,0% match
22,3509236712020002,27.0,0.0,0% match
26,3509264207990003,72.0,0.0,0% match
29,3509295402980003,174.0,0.0,0% match
52,3510145709000004,5.0,0.0,0% match
54,3510166506960007,,0.0,0% match



--- Other % match (Rows: 21) ---


Unnamed: 0,actual_nik,extracted_nik,match_percentage,match_category
9,3509095412960003,15070954126,50.0,Other % match
10,3509106811010002,3509106810100,62.5,Other % match
13,3509166009980003,7266009980003,6.25,Other % match
17,3509196508020009,150787,12.5,Other % match
20,3509234204000006,190941970,18.75,Other % match
23,3509236804000101,35092360040,62.5,Other % match
25,3509246901980003,3507246701980003,87.5,Other % match
28,3509286604980001,350920660478000,81.25,Other % match
30,3509296409010003,509296409010003,12.5,Other % match
31,3509304305020005,350930430,56.25,Other % match



Average correct reading percentage is 61.51%


In [36]:
df_other['len']=df_other["extracted_nik"].apply(lambda x: len(x)==16)
df_other[df_other["len"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_other['len']=df_other["extracted_nik"].apply(lambda x: len(x)==16)


Unnamed: 0,actual_nik,extracted_nik,match_percentage,match_category,len
25,3509246901980003,3507246701980003,87.5,Other % match,True
37,3510026102010001,9510026102010001,93.75,Other % match,True
65,3514116312980001,3524116312980001,93.75,Other % match,True
75,5102075505020002,5202075505020002,93.75,Other % match,True
