#### Setup

In [1]:
import os
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'

from paddleocr  import PaddleOCR, PaddleOCRVL, PPStructureV3
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import cv2
import re


  from .autonotebook import tqdm as notebook_tqdm
[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


In [2]:
ocr_model = PaddleOCR(
    # 1. Document orientation (rotation of the whole image)
    use_doc_orientation_classify=True,  # don't forget to set this to True
    doc_orientation_classify_model_name='PP-LCNet_x1_0_doc_ori',

    # 2. Document unwarping (flattening crumpled paper)
    use_doc_unwarping=True,             # don't forget to set this to True
    doc_unwarping_model_name='UVDoc',

    # 3. Text detection (finding the boxes)
    text_detection_model_name='PP-OCRv5_server_det',

    # 4. Per-text-line orientation
    use_textline_orientation=True,      # optional; can be False for more speed
    textline_orientation_model_name='PP-LCNet_x1_0_textline_ori',

    # 5. Text recognition (reading the characters)
    text_recognition_model_name='latin_PP-OCRv5_mobile_rec',


    # 6. Detection input size (important for long receipts)
    text_det_limit_side_len=1200,        # match the UI slider 'det_max_side_len'
    text_det_limit_type='max',          # usually the default, 'max'

    # 7. Thresholding (detection fine-tuning)
    text_det_thresh=0.3,                # match the UI slider 'det_db_thresh'
    text_det_box_thresh=0.6,            # match the UI slider 'det_db_box_thresh'
    text_det_unclip_ratio=1.5,          # match the UI slider 'det_db_unclip_ratio'

    # 8. Additional parameters (if needed)
    text_rec_score_thresh=0.5,          # minimum confidence score for an OCR result
    return_word_box=False,              # False for per-line boxes, True for per-word boxes
)


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\igust\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\igust\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\igust\.paddlex\official_models\PP-LCNet_x1_0_textline_ori`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\igust\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('latin_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cac

#### Start here

In [3]:
img_path = os.path.join('..', 'struk', '1.jpeg')

In [4]:
# Run the OCR pipeline on the sample receipt, passing detection/recognition
# settings explicitly for this call.
hasil = ocr_model.predict(
    img_path, 

    # Unclip ratio: back to the standard value (1.5 or 1.6).
    # 2.0 is too large for densely printed thermal receipts.
    text_det_unclip_ratio=1.5,

    # Box threshold: raised slightly (0.5 or 0.6).
    # 0.4 is too sensitive and picks up paper shadows as junk text boxes;
    # 0.6 makes the boxes more selective and tidy.
    text_det_box_thresh=0.6,

    # Detection threshold: default (0.3)
    text_det_thresh=0.3,
    
    # Settings considered mandatory for receipts
    use_textline_orientation=True,
    use_doc_orientation_classify=True,
    text_rec_score_thresh=0.0,          # keep at 0.0 so that every recognised text is returned
)

In [5]:
hasil

[{'input_path': '..\\struk\\1.jpeg',
  'page_index': None,
  'doc_preprocessor_res': {'input_path': None,
   'page_index': None,
   'input_img': array([[[17, ..., 19],
           ...,
           [ 5, ...,  8]],
   
          ...,
   
          [[ 6, ...,  6],
           ...,
           [ 6, ...,  6]]], shape=(1280, 960, 3), dtype=uint8),
   'model_settings': {'use_doc_orientation_classify': True,
    'use_doc_unwarping': True},
   'angle': 0,
   'rot_img': array([[[17, ..., 19],
           ...,
           [ 5, ...,  8]],
   
          ...,
   
          [[ 6, ...,  6],
           ...,
           [ 6, ...,  6]]], shape=(1280, 960, 3), dtype=uint8),
   'output_img': array([[[32, ..., 35],
           ...,
           [27, ..., 19]],
   
          ...,
   
          [[10, ..., 11],
           ...,
           [15, ..., 11]]], shape=(1280, 960, 3), dtype=uint8)},
  'dt_polys': [array([[329, 215],
          ...,
          [329, 271]], shape=(4, 2), dtype=int16),
   array([[107, 296],
         

In [35]:
# Take the first prediction result and pull out the recognised texts, their
# confidence scores and the detection polygons (empty list when a key is absent).
data_dict = hasil[0]

list_teks = data_dict.get('rec_texts', [])
list_skor = data_dict.get('rec_scores', [])
list_box  = data_dict.get('dt_polys', [])

In [None]:
list_teks

In [37]:
# Export every prediction result to the "output" directory as JSON.
for res in hasil:
    # Image export is currently disabled; only the JSON export runs.
    #res.save_to_img("output")
    res.save_to_json("output")

#### A/B Testing With and Without Image Pre-Processing

In [None]:
# --- 2. PRE-PROCESSING (METHOD B) ---
def safe_preprocess(img):
    """Denoise, upscale 2x and mildly sharpen an image before OCR.

    Args:
        img: Colour image array accepted by
            ``cv2.fastNlMeansDenoisingColored`` (the notebook passes the
            result of ``cv2.imread``).

    Returns:
        The processed image, with width and height doubled.
    """
    src_h, src_w = img.shape[:2]
    upscale = 2.0
    target_size = (int(src_w * upscale), int(src_h * upscale))

    # Remove noise first so that upscaling does not amplify it.
    denoised = cv2.fastNlMeansDenoisingColored(img, None, 10, 10, 7, 21)
    enlarged = cv2.resize(denoised, target_size, interpolation=cv2.INTER_CUBIC)

    # 3x3 sharpening kernel: centre weight 5, the four direct neighbours -1.
    sharpen_kernel = np.array([[0, -1, 0], [-1, 5,-1], [0, -1, 0]])
    return cv2.filter2D(enlarged, -1, sharpen_kernel)

# --- 3. PREDICTION FUNCTION ---
def run_prediction(image_input):
    """Run the module-level ``ocr_model`` on an image and flatten its results.

    Args:
        image_input: Either a file path (``str``), which is loaded with
            ``cv2.imread``, or an already loaded image array.

    Returns:
        Tuple ``(image, boxes, texts, scores)`` where ``image`` is the array
        that was sent to the model and the three lists concatenate the
        ``dt_polys``, ``rec_texts`` and ``rec_scores`` entries of every result.

    Raises:
        FileNotFoundError: If a path was given and ``cv2.imread`` could not
            read it (it returns ``None`` instead of raising).
    """
    if isinstance(image_input, str):
        image = cv2.imread(image_input)
        if image is None:
            # Fail here with a clear message instead of an AttributeError on
            # ``image.shape`` a few lines below.
            raise FileNotFoundError(f"Cannot read image file: {image_input}")
    else:
        image = image_input

    # Dynamic side limit: keep it above the image's longest side (never below
    # 960) so that coordinates for the upscaled Method B image are not shrunk.
    h, w = image.shape[:2]
    limit_side = max(960, max(h, w) + 100)

    prediction_generator = ocr_model.predict(
        input=image,
        text_det_limit_side_len=limit_side,  # important: follow the image size
        text_det_thresh=0.3,
        text_det_box_thresh=0.4,
        text_det_unclip_ratio=2.0,
        use_textline_orientation=True,
        use_doc_orientation_classify=True,
        text_rec_score_thresh=0.0,  # keep every recognised text
    )

    boxes = []
    texts = []
    scores = []

    # Materialise the results and merge them into flat lists.
    for res in list(prediction_generator):
        if 'dt_polys' in res:
            boxes.extend(res['dt_polys'])
        if 'rec_texts' in res:
            texts.extend(res['rec_texts'])
        if 'rec_scores' in res:
            scores.extend(res['rec_scores'])

    return image, boxes, texts, scores

# --- 4. OCR OVERLAY VISUALISATION ---
def draw_ocr_visualization(img, boxes, texts, scores, title):
    """Draw detection polygons and their recognised text on a copy of ``img``.

    Args:
        img: Image array to annotate; it is copied, not modified.
        boxes: Sequence of polygons, each a sequence of ``[x, y]`` points.
        texts: Recognised strings, index-aligned with ``boxes``.
        scores: Confidence scores. Accepted for interface compatibility but
            not drawn, so a shorter ``scores`` list no longer causes an
            IndexError.
        title: Unused; kept for interface compatibility.

    Returns:
        The annotated copy of ``img``.
    """
    viz_img = img.copy()
    w = viz_img.shape[1]

    # Scale the font with the image width so labels stay legible on the
    # upscaled Method B image; never go below 0.4.
    font_scale = max(0.4, w / 1000.0 * 0.6)
    # int(font_scale * 2) is 0 for font scales below 0.5; keep at least 1 px.
    thickness = max(1, int(font_scale * 2))
    font = cv2.FONT_HERSHEY_SIMPLEX
    color = (0, 255, 0)  # green

    # zip() stops at the shorter list, like the original index safety check.
    for box, text in zip(boxes, texts):
        # 1. Polygon outline
        pts = np.array(box, np.int32).reshape((-1, 1, 2))
        cv2.polylines(viz_img, [pts], True, color, 2)

        # 2. Filled background so the label is readable
        text_label = f"{text}"
        (text_w, text_h), _ = cv2.getTextSize(text_label, font, font_scale, thickness)

        # Anchor at the polygon's first point; push the label down when the
        # box touches the top edge so it is not drawn outside the image.
        x = int(box[0][0])
        y = max(int(box[0][1]), text_h + 5)

        cv2.rectangle(viz_img, (x, y - text_h - 5), (x + text_w, y), color, -1)

        # 3. Black text on the green background
        cv2.putText(viz_img, text_label, (x, y - 5), font, font_scale, (0, 0, 0), thickness)

    return viz_img

# --- 5. EXECUTE A/B TEST ---
def run_visual_ab_test(image_path):
    """Compare OCR on the raw image (A) and on the pre-processed image (B).

    Loads the image once, runs ``run_prediction`` on the raw pixels and on the
    output of ``safe_preprocess``, and shows both annotated images side by
    side with matplotlib.

    Args:
        image_path: Path of the receipt image to test.

    Returns:
        None. Prints an error and returns early when the image cannot be read.
    """
    print(f"Memproses: {image_path}...")

    # Validate the input before any prediction runs; previously this check
    # came after Method A had already tried to process the path.
    raw = cv2.imread(image_path)
    if raw is None:
        print(f"Error: Gambar tidak ditemukan di path {image_path}")
        return

    # --- METHOD A: RAW ---
    print("1. Running Method A (Original)...")
    img_a, boxes_a, texts_a, scores_a = run_prediction(raw)
    viz_a = draw_ocr_visualization(img_a, boxes_a, texts_a, scores_a, "Metode A (Raw)")

    # --- METHOD B: ENHANCED ---
    print("2. Running Method B (Upscaled + Sharpen)...")
    img_enhanced = safe_preprocess(raw)
    img_b, boxes_b, texts_b, scores_b = run_prediction(img_enhanced)
    viz_b = draw_ocr_visualization(img_b, boxes_b, texts_b, scores_b, "Metode B (Enhanced)")

    # --- SHOW RESULTS ---
    panels = [
        (viz_a, f"METODE A (RAW)\n{len(boxes_a)} Teks Terdeteksi"),
        (viz_b, f"METODE B (UPSCALED)\n{len(boxes_b)} Teks Terdeteksi (Cek Ejaan!)"),
    ]

    plt.figure(figsize=(20, 15))  # large canvas
    for position, (viz, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.title(heading, fontsize=15)
        # OpenCV images are BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(viz, cv2.COLOR_BGR2RGB))
        plt.axis('off')

    plt.tight_layout()
    plt.show()

# --- USAGE ---
# Replace with the path of your hardest / blurriest receipt
img_path = os.path.join('..', 'struk', '1.jpeg')
run_visual_ab_test(img_path)

#### Visualisasi box detection

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def visualize_detection_boxes(image_path, ocr_result):
    """Show the image with only the OCR detection polygons drawn on it.

    Args:
        image_path (str): Path to the original image file.
        ocr_result (dict): One result mapping from the OCR output
            (e.g. ``hasil[0]``); its ``'dt_polys'`` entry is drawn.

    Returns:
        None. The figure is displayed with ``plt.show()``; a message is
        printed and nothing is drawn when the image cannot be read or there
        are no boxes.
    """
    bgr_image = cv2.imread(image_path)
    if bgr_image is None:
        print(f"Error: Gambar tidak ditemukan di path {image_path}")
        return

    # OpenCV loads BGR; convert once so matplotlib shows the right colours.
    image_viz = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    # .get() keeps this safe when the key is missing.
    boxes = ocr_result.get('dt_polys', [])
    box_count = len(boxes)
    print(f"Info: Ditemukan {box_count} kotak deteksi.")

    if not boxes:
        print("Tidak ada kotak deteksi untuk divisualisasikan.")
        return

    red_rgb = (255, 0, 0)  # the image is already RGB, so this is red
    for polygon in boxes:
        # cv2.polylines expects int32 points shaped (n_points, 1, 2).
        outline = np.array(polygon, np.int32).reshape((-1, 1, 2))
        cv2.polylines(image_viz, [outline], isClosed=True, color=red_rgb, thickness=2)

    plt.figure(figsize=(12, 16))  # large plot so the detail is visible
    plt.imshow(image_viz)
    plt.axis('off')
    plt.title(f"Visualisasi Deteksi: {box_count} Bounding Boxes")
    plt.show()

# --- CONTOH PENGGUNAAN ---
# Misalkan 'hasil' adalah list output dari PaddleOCR
# Dan struk Anda ada di 'struk/1.jpeg'

# Pastikan Anda menggunakan hasil[0] karena output PaddleOCR adalah list
# visualize_detection_boxes('struk/1.jpeg', hasil[0])

In [None]:
# NOTE: visualize_detection_boxes() displays the figure itself and never
# returns a value, so `gambar_box` is always None and the last line shows nothing.
gambar_box = visualize_detection_boxes(img_path, hasil[0])
gambar_box

#### Reconstruct Line Formatting

In [None]:
def reconstruct_lines_v2(ocr_result, y_threshold=15):
    """Merge an OCR result mapping into reading-order text lines.

    Boxes are sorted by vertical centre and chained into the same line while
    the centre of the next box is within ``y_threshold`` of the previous one;
    each line is then ordered left to right and joined with spaces.

    Args:
        ocr_result: Mapping with ``'dt_polys'`` (four-point polygons) and
            ``'rec_texts'`` (strings), index-aligned.
        y_threshold: Maximum vertical-centre distance between consecutive
            boxes of one line.

    Returns:
        List of line strings; ``[]`` when there are no boxes or the two
        lists differ in length (a warning is printed in that case).
    """
    polys = ocr_result.get('dt_polys', [])
    words = ocr_result.get('rec_texts', [])

    if len(polys) != len(words):
        print("Warning: Jumlah kotak dan teks tidak sama.")
        return []
    if not polys:
        return []

    # Point 0 is treated as top-left and point 2 as bottom-right.
    entries = [
        {
            'text': word,
            'center_y': (poly[0][1] + poly[2][1]) / 2,
            'min_x': poly[0][0],
        }
        for poly, word in zip(polys, words)
    ]
    entries.sort(key=lambda entry: entry['center_y'])

    # Chain clustering: compare each box with the last box of the open row.
    rows = [[entries[0]]]
    for entry in entries[1:]:
        previous = rows[-1][-1]
        if abs(entry['center_y'] - previous['center_y']) <= y_threshold:
            rows[-1].append(entry)
        else:
            rows.append([entry])

    return [
        " ".join(entry['text'] for entry in sorted(row, key=lambda entry: entry['min_x']))
        for row in rows
    ]

In [None]:
# 1. Call the line-merging function
clean_lines = reconstruct_lines_v2(hasil[0], y_threshold=15)

# 2. Print the result
print("--- HASIL STRUK RAPI ---")
for line in clean_lines:
    print(line)

In [None]:
def reconstruct_lines_robust(ocr_result, y_threshold=15):
    """Merge OCR output into text lines, tolerating box/text count mismatch.

    Accepts either a result mapping or a list whose first element is one.
    When ``'dt_polys'`` and ``'rec_texts'`` differ in length, both are cut to
    the shorter length (an info message is printed) instead of giving up.

    Args:
        ocr_result: Result mapping, or a list of them (only the first is used).
        y_threshold: Maximum vertical-centre distance between consecutive
            boxes of one line.

    Returns:
        List of line strings; ``[]`` for an empty list, an unsupported input
        type, or no usable boxes.
    """
    # 1. Normalise the input to a single mapping.
    if isinstance(ocr_result, list):
        if len(ocr_result) == 0:
            return []
        payload = ocr_result[0]
    elif isinstance(ocr_result, dict):
        payload = ocr_result
    else:
        return []

    boxes = payload.get('dt_polys', [])
    texts = payload.get('rec_texts', [])

    # 2. Salvage what can be paired when the counts disagree.
    if len(boxes) != len(texts):
        min_len = min(len(boxes), len(texts))
        print(f"[Info] Mismatch terdeteksi! Boxes: {len(boxes)}, Texts: {len(texts)}.")
        print(f"       Mengambil {min_len} data pertama yang valid...")
        boxes = boxes[:min_len]
        texts = texts[:min_len]

    if not boxes:
        return []

    # 3. Geometry per box: vertical centre from points 0 and 2, left x from point 0.
    entries = []
    for poly, word in zip(boxes, texts):
        top_y, bottom_y = poly[0][1], poly[2][1]
        entries.append({
            'text': word,
            'center_y': (top_y + bottom_y) / 2,
            'min_x': poly[0][0],
        })

    # 4. Chain clustering over boxes sorted top to bottom.
    entries.sort(key=lambda entry: entry['center_y'])
    rows = []
    for entry in entries:
        if rows and abs(entry['center_y'] - rows[-1][-1]['center_y']) <= y_threshold:
            rows[-1].append(entry)
        else:
            rows.append([entry])

    # 5. Left-to-right order inside each row, joined with spaces.
    merged = []
    for row in rows:
        ordered = sorted(row, key=lambda entry: entry['min_x'])
        merged.append(" ".join(entry['text'] for entry in ordered))
    return merged

In [None]:
# 1. Call the line-merging function
clean_lines = reconstruct_lines_robust(hasil[0], y_threshold=15)

# 2. Print the result
print("--- HASIL STRUK RAPI ---")
for line in clean_lines:
    print(line)

In [None]:
def reconstruct_lines_smart_filter(ocr_result, y_threshold=15, min_score=0.6):
    """Merge OCR output into text lines, dropping low-confidence texts first.

    Args:
        ocr_result (dict | list): Result mapping with ``dt_polys``,
            ``rec_texts`` and ``rec_scores``, or a list whose first element
            is such a mapping.
        y_threshold (int): Maximum vertical-centre distance between
            consecutive boxes of one line.
        min_score (float): Texts with a score below this value are discarded.

    Returns:
        List of line strings; ``[]`` when the input list is empty or no item
        passes the score filter.
    """
    # 1. Normalise the input; an empty list used to raise IndexError here.
    if isinstance(ocr_result, list):
        if not ocr_result:
            return []
        data = ocr_result[0]
    else:
        data = ocr_result

    boxes = data.get('dt_polys', [])
    texts = data.get('rec_texts', [])
    scores = data.get('rec_scores', [])

    # Fail-safe: the three lists should have equal length, but only the
    # common prefix is used in case they do not.
    min_len = min(len(boxes), len(texts), len(scores))

    # 2. Filter by score and compute geometry only for the items that are kept.
    valid_items = []
    for i in range(min_len):
        score = scores[i]
        if score < min_score:
            continue
        box = boxes[i]
        valid_items.append({
            'text': texts[i],
            # Vertical centre from points 0 and 2, left x from point 0.
            'center_y': (box[0][1] + box[2][1]) / 2,
            'min_x': box[0][0],
            'score': score,
        })

    if not valid_items:
        return []

    # 3. Chain clustering over items sorted top to bottom.
    valid_items.sort(key=lambda item: item['center_y'])
    lines = [[valid_items[0]]]
    for item in valid_items[1:]:
        if abs(item['center_y'] - lines[-1][-1]['center_y']) <= y_threshold:
            lines[-1].append(item)
        else:
            lines.append([item])

    # 4. Order each line left to right and join with spaces.
    final_output = []
    for line in lines:
        line.sort(key=lambda item: item['min_x'])
        final_output.append(" ".join(item['text'] for item in line))
    return final_output

In [None]:
# 1. Call the line-merging function (default min_score applies)
clean_lines = reconstruct_lines_smart_filter(hasil[0], y_threshold=15)

# 2. Print the result
print("--- HASIL STRUK RAPI ---")
for line in clean_lines:
    print(line)

#### Final Reconstruction Line

In [None]:
def reconstruct_lines_dynamic(ocr_result, height_ratio=0.45):
    """Merge OCR output into text lines using a height-based line threshold.

    The vertical tolerance is ``height_ratio`` times the average box height
    of this receipt, so it adapts to both tightly and loosely spaced print.

    Args:
        ocr_result: Result mapping with ``dt_polys`` and ``rec_texts``, or a
            list whose first element is such a mapping.
        height_ratio: Fraction of the average box height used as the maximum
            vertical-centre distance between consecutive boxes of one line.

    Returns:
        List of line strings; ``[]`` for an empty list, an unsupported input
        type, or no boxes.
    """
    # 1. Normalise the input to a single mapping.
    if isinstance(ocr_result, list):
        if len(ocr_result) == 0:
            return []
        data_dict = ocr_result[0]
    elif isinstance(ocr_result, dict):
        data_dict = ocr_result
    else:
        return []

    boxes = data_dict.get('dt_polys', [])
    texts = data_dict.get('rec_texts', [])

    # Use only the common prefix when the counts disagree.
    if len(boxes) != len(texts):
        min_len = min(len(boxes), len(texts))
        boxes = boxes[:min_len]
        texts = texts[:min_len]

    if len(boxes) == 0:
        return []

    # 2. Per-box geometry and total height.
    parsed_boxes = []
    total_height = 0.0

    for box, text in zip(boxes, texts):
        # Convert to Python floats first: the polygons can be small-integer
        # NumPy arrays (int16 in this notebook's output), and summing such
        # scalars can wrap around instead of growing.
        y_top = float(box[0][1])
        y_bottom = float(box[2][1])
        height = abs(y_bottom - y_top)
        total_height += height

        parsed_boxes.append({
            'text': text,
            'center_y': (y_top + y_bottom) / 2,
            'min_x': float(box[0][0]),
            'height': height,
        })

    # 3. Dynamic threshold, e.g. with the default ratio an average height of
    # 30 px gives 13.5 px and a dense receipt with 10 px text gives 4.5 px.
    avg_height = total_height / len(parsed_boxes)
    dynamic_threshold = avg_height * height_ratio

    # 4. Chain clustering over boxes sorted top to bottom.
    parsed_boxes.sort(key=lambda b: b['center_y'])
    lines = [[parsed_boxes[0]]]
    for box in parsed_boxes[1:]:
        if abs(box['center_y'] - lines[-1][-1]['center_y']) <= dynamic_threshold:
            lines[-1].append(box)
        else:
            lines.append([box])

    # 5. Order each line left to right and join with spaces.
    final_output = []
    for line in lines:
        line.sort(key=lambda b: b['min_x'])
        final_output.append(" ".join(item['text'] for item in line))
    return final_output

In [None]:
# 1. Call the line-merging function
clean_lines = reconstruct_lines_dynamic(hasil[0])

In [None]:
# 1. Call the line-merging function
clean_lines = reconstruct_lines_dynamic(hasil[0])

# 2. Print the result
print("--- HASIL STRUK RAPI ---")
for line in clean_lines:
   print(line)

In [None]:
print(clean_lines)

#### Parse Information From the Receipts

In [None]:
import re
import json
from datetime import datetime

class MasterReceiptParser:
    """Rule-based parser that turns receipt text lines into structured data.

    Works on a list of strings (one per receipt line) and extracts the
    merchant name, date, time, line items and grand total using regular
    expressions and keyword lists. All rules are heuristics.
    """

    def __init__(self):
        # Price at the end of a line. Accepts grouped thousands ("10.000",
        # "1,234,567") or plain digits of any length ("9000"), optionally
        # followed by a two-digit decimal part (".00" / ",00"). The lookbehind
        # prevents a match from starting in the middle of a digit run, which
        # previously turned "9000" into "000".
        self.price_pattern = re.compile(
            r'(?<!\d)((?:\d{1,3}(?:[.,]\d{3})+|\d+)(?:[.,]\d{2})?)$'
        )
        # Quantity expressions such as "2 x 5000", "12 @ 900", "1 PCS 5000".
        # Group 1 is the raw quantity token, group 2 the unit price (grouped
        # thousands or plain digits of any length).
        self.qty_pattern = re.compile(
            r'(\S+)\s*(?:[xX@]|pcs|PCS)\s*(\d{1,3}(?:[.,]\d{3})+|\d+)'
        )

        # Keywords that mark header lines (not a store name) and footer lines.
        self.header_skip = ["SELAMAT DATANG", "COPY", "STRUK", "JL.", "RAYA", "TELP", "NPWP"]
        self.footer_keywords = ['TOTAL', 'JUMLAH', 'BAYAR', 'TUNAI', 'CASH', 'KEMBALI', 'CHANGE', 'PPN', 'TAX', 'DPP', 'SISA', 'MEMBER', 'BKP']

    # --- 1. TYPO CLEAN-UP (SANITIZATION) ---
    def sanitize_number(self, text):
        """Repair common OCR confusions in a numeric token.

        Args:
            text: Raw token, e.g. ``'l'``, ``'-A'`` or ``'2O'``.

        Returns:
            String containing only digits, dots and commas. ``"1"`` for empty
            input and for the known noise tokens ``-A``, ``A``, ``I``, ``L``.
        """
        if not text:
            return "1"
        text = text.upper().strip()

        # Noise tokens observed for a quantity of one.
        if text in ['-A', 'A', 'I', 'L']:
            return '1'

        # Look-alike letters and symbols mapped to the digit they resemble.
        replacements = {
            'L': '1', 'I': '1', '|': '1', '!': '1',
            'O': '0', 'D': '0', 'Q': '0',
            'S': '5', '$': '5', 'Z': '2', 'B': '8'
        }
        clean_text = "".join(replacements.get(char, char) for char in text)

        # Drop everything except digits, dots and commas.
        return re.sub(r'[^\d.,]', '', clean_text)

    def parse_price(self, price_str):
        """Convert a price string to an integer amount.

        A trailing two-digit decimal part is dropped before the separators
        are removed, so ``'10,000.00'`` gives 10000 rather than 1000000.

        Args:
            price_str: Text such as ``'10.000'``, ``'10,000.00'`` or ``'9000'``.

        Returns:
            The amount as ``int``; 0 for empty input or input without digits.
        """
        if not price_str:
            return 0
        text = re.sub(r'[.,]\d{2}$', '', price_str.strip())
        digits = re.sub(r'[^\d]', '', text)
        return int(digits) if digits else 0

    # --- 2. ARITHMETIC VALIDATION ---
    def validate_qty(self, raw_qty_str, unit_price, total_line_price):
        """Pick the most plausible quantity using qty x unit price = total.

        Args:
            raw_qty_str: Quantity token as read by OCR.
            unit_price: Parsed unit price.
            total_line_price: Parsed line total.

        Returns:
            The sanitized quantity when it matches the total (within 100),
            otherwise 1 when unit price and total match, otherwise
            ``total / unit_price`` when that is a positive whole number,
            otherwise the sanitized quantity (1 if it is not an integer).
        """
        # 1. Parse the sanitized token; anything that is not an integer counts as 1.
        sanitized_qty = self.sanitize_number(raw_qty_str)
        try:
            qty = int(sanitized_qty)
        except ValueError:
            qty = 1

        # 2. Check the arithmetic.
        if unit_price > 0:
            # Small tolerance for rounding.
            if abs(qty * unit_price - total_line_price) < 100:
                return qty

            # Most common case: a single unit.
            if abs(unit_price - total_line_price) < 100:
                return 1

            # Otherwise derive the quantity from the two prices.
            calc_qty = total_line_price / unit_price
            if calc_qty.is_integer() and calc_qty > 0:
                return int(calc_qty)

        return qty  # nothing matched; fall back to the sanitized value

    # --- 3. METADATA EXTRACTION ---
    def extract_metadata(self, lines):
        """Find the store name, transaction date and time.

        Args:
            lines: Receipt text lines, top to bottom.

        Returns:
            Tuple ``(store_name, trx_date, trx_time)``. The store name is the
            first line longer than 3 characters that has no digit and no
            header keyword (``"Unknown Store"`` if none); date and time are
            the first regex matches, or ``None``.
        """
        store_name = "Unknown Store"
        trx_date = None
        trx_time = None

        date_pattern = re.compile(r'(\d{1,2}[-./]\d{1,2}[-./]\d{2,4})')
        time_pattern = re.compile(r'(\d{1,2}[:]\d{2}(?:[:]\d{2})?)')

        header_found = False

        for line in lines:
            line_upper = line.upper()

            # Store name: store names rarely contain digits.
            if not header_found and len(line) > 3:
                is_skip = any(k in line_upper for k in self.header_skip)
                if not is_skip and not re.search(r'\d', line):
                    store_name = line
                    header_found = True

            if not trx_date:
                match = date_pattern.search(line)
                if match:
                    trx_date = match.group(1)

            if not trx_time:
                match = time_pattern.search(line)
                if match:
                    trx_time = match.group(1)

        return store_name, trx_date, trx_time

    # --- 4. ITEM PARSING ENGINE (BUFFER LOGIC) ---
    def parse_items(self, lines):
        """Extract line items and the grand total.

        Lines without a trailing price are buffered as parts of an item name;
        a line ending in a price closes the item. Lines containing a footer
        keyword are skipped and reset the buffer.

        Args:
            lines: Receipt text lines, top to bottom.

        Returns:
            Tuple ``(items, grand_total)`` where ``items`` is a list of dicts
            with keys ``name``, ``qty``, ``price`` and ``total``, and
            ``grand_total`` is the largest amount found on a ``TOTAL`` line
            that does not mention ``ITEM`` (0 if none).
        """
        items = []
        name_buffer = []
        grand_total = 0

        for line in lines:
            line_clean = line.strip()
            line_upper = line_clean.upper()

            # A. Footer lines: pick up the grand total, reset the buffer, skip.
            if any(k in line_upper for k in self.footer_keywords):
                if 'TOTAL' in line_upper and 'ITEM' not in line_upper:
                    price_match = self.price_pattern.search(line_clean)
                    if price_match:
                        val = self.parse_price(price_match.group(1))
                        # Simple rule: keep the largest total seen.
                        if val > grand_total:
                            grand_total = val
                name_buffer = []
                continue

            # B. A price at the right end marks the end of an item.
            price_match = self.price_pattern.search(line_clean)

            if price_match:
                total_line_price = self.parse_price(price_match.group(1))

                # Text left of the price.
                leftover_text = line_clean[:price_match.start()].strip()

                qty = 1
                unit_price = total_line_price

                qty_match = self.qty_pattern.search(leftover_text)

                if qty_match:
                    raw_qty = qty_match.group(1)  # may be '7' or noise like '-A'
                    unit_price = self.parse_price(qty_match.group(2))
                    qty = self.validate_qty(raw_qty, unit_price, total_line_price)

                    # The name comes from the buffered lines plus any text
                    # that precedes the quantity on this same line.
                    name_parts = name_buffer + [leftover_text[:qty_match.start()].strip()]
                    item_name = " ".join(part for part in name_parts if part).strip()
                    if not item_name:
                        item_name = "Unknown Item"  # fallback
                else:
                    # No quantity expression: a plain "ROTI 15.000" line, or
                    # the last name line of a multi-line item.
                    if name_buffer:
                        item_name = (" ".join(name_buffer) + " " + leftover_text).strip()
                    else:
                        item_name = leftover_text

                if len(item_name) > 2 and total_line_price > 0:
                    items.append({
                        "name": item_name,
                        "qty": qty,
                        "price": unit_price,
                        "total": total_line_price
                    })

                name_buffer = []

            else:
                # No price: treat as a fragment of an item name; very short
                # fragments are considered noise.
                if len(line_clean) > 2:
                    name_buffer.append(line_clean)

        return items, grand_total

    # --- MAIN RUN ---
    def process(self, lines):
        """Parse a whole receipt.

        Args:
            lines: Receipt text lines, top to bottom.

        Returns:
            Dict with keys ``merchant``, ``date``, ``time``, ``items``,
            ``total_amount`` and ``item_count``.
        """
        store, date, time = self.extract_metadata(lines)
        items, total = self.parse_items(lines)

        return {
            "merchant": store,
            "date": date,
            "time": time,
            "items": items,
            "total_amount": total,
            "item_count": len(items)
        }

In [None]:
# --- EXAMPLE USAGE ---
# `clean_lines` is the list of receipt line strings produced by
# reconstruct_lines_dynamic in an earlier cell.

parser = MasterReceiptParser()
result_json = parser.process(clean_lines)
print(json.dumps(result_json, indent=4))

In [None]:
def parse_price(text):
    """Convert a price string to an integer amount.

    A trailing two-digit decimal part is removed whether it is written with a
    dot or a comma (``204,000.00`` and ``204.000,00`` both give 204000); all
    remaining non-digit characters are then dropped.

    Args:
        text: Price text; may be empty or ``None``.

    Returns:
        The amount as ``int``; 0 for empty input or input without digits.
    """
    if not text:
        return 0
    # Strip surrounding whitespace so the decimal part really is at the end.
    text = re.sub(r'[.,]\d{2}$', '', text.strip())
    clean = re.sub(r'[^\d]', '', text)
    return int(clean) if clean else 0

def parse_qty(text):
    """Convert a quantity string to a number.

    A comma is treated as the decimal separator (``12,5`` -> 12.5) and unit
    text or other non-numeric characters are removed before parsing.

    Args:
        text: Quantity text such as ``'2'``, ``'12.0 BOS'`` or ``'12,5'``.

    Returns:
        ``int`` when the value is whole, otherwise ``float``; 1 for empty
        input or when no number can be parsed.
    """
    if not text:
        return 1
    numeric = re.sub(r'[^\d.]', '', text.replace(',', '.'))
    try:
        val = float(numeric)
    except ValueError:
        # Nothing numeric left (or several dots): assume a single unit.
        return 1
    return int(val) if val.is_integer() else val

def extract_items_universal(lines):
    """Extract purchased items from receipt text lines.

    Every line is first checked against a header/footer keyword blacklist and
    then tried against two layouts:

    1. Explicit calculation: ``<qty> [unit] x <unit price> <total>``
       (e.g. "12.0 BOS x 17,000.00 204,000.00").
    2. Trailing prices: ``<name> [qty] <unit price> <total>`` at the end of
       the line (e.g. "ROTI 2 9000 18,000").

    A line matching neither layout is buffered and used as (part of) the name
    of the next priced line.

    Args:
        lines: Iterable of receipt lines (strings).

    Returns:
        List of dicts with the keys ``name``, ``qty``, ``price`` and
        ``total``.
    """
    items = []

    # 1. BLACKLIST (header / footer / metadata keywords)
    blacklist = [
        "INDOMARET", "INDOMARCO", "PRISMATAMA", "JL.", "JALAN", "RAYA",
        "KEC", "KECAMATAN", "KOTA", "TELP", "NPWP", "JAKARTA", "DENPASAR", "BALI",
        "STRUK", "COPY", "KASIR", "TANGGAL", "DATE", "TIME",
        "TOTAL", "TUNAI", "KEMBALI", "PPN", "DPP", "HARGA JUAL", "ITEM",
        "LAYANAN", "KONTAK", "GRATIS", "ONGKIR", "HEMAT", "POINT", "CARD", "DEBIT",
        "KODE", "WAKTU", "NO.", "FAKTUR", "APOTEKKU", "SOLUSI", "KESEHATAN",
        "WA", "SMS", "CALL", "CENTER", "WEBSITE", "INSTAGRAM", "CUSTOMER", "ORDER"
    ]

    # Short keywords are matched as whole tokens only: as plain substrings,
    # "WA" would discard "WAFER"/"BAWANG", "KEC" would discard "KECAP" and
    # "KOTA" would discard "TEH KOTAK". Longer keywords keep substring
    # matching so that e.g. "SUBTOTAL" is still caught by "TOTAL".
    long_keywords = [kw for kw in blacklist if len(kw) > 4]
    short_keywords = [kw for kw in blacklist if len(kw) <= 4]
    short_keyword_pattern = re.compile(
        r'(?<![A-Z0-9])(?:'
        + '|'.join(
            # Keywords ending in punctuation ("JL.", "NO.") need no right boundary.
            re.escape(kw) + (r'(?![A-Z0-9])' if kw[-1].isalnum() else '')
            for kw in short_keywords
        )
        + r')'
    )

    # --- PATTERN DEFINITIONS ---

    # Amount token: digits with optional '.'/',' separated groups. Unlike
    # 1-3 digit groups only, this also matches unseparated amounts ("9000"),
    # which would otherwise be cut down to their last three digits ("000").
    amount = r'\d+(?:[.,]\d+)*'

    # PATTERN 1: explicit calculation (high priority)
    # Captures "12.0 BOS x 17,000.00 204,000.00".
    # Groups: (qty) (unit price) (total)
    calc_pattern = re.compile(
        r'(\d+(?:[.,]\d+)?)\s*(?:BOS|PCS|x|X|@)?\s*[xX@]\s*(' + amount + r')\s+(' + amount + r')'
    )

    # PATTERN 2: standard end-of-line prices (medium priority)
    # Captures "... 9000 9,000".
    price_total_pattern = re.compile(r'(' + amount + r')\s+(' + amount + r')$')

    # PATTERN 3: quantity suffix (used together with pattern 2)
    # Captures "ROTI 2" -> qty 2.
    qty_suffix_pattern = re.compile(r'\s+(\d{1,3})\s*(?:@|x|X|PCS|pcs)?$')

    name_buffer = []

    for line in lines:
        line_clean = line.strip()
        line_upper = line_clean.upper()

        # A. Drop header / footer / metadata lines
        is_noise = (
            any(kw in line_upper for kw in long_keywords)
            or short_keyword_pattern.search(line_upper) is not None
        )
        if is_noise:
            if "TOTAL" in line_upper:  # A total line ends the item area: reset the buffer
                name_buffer = []
            continue

        # --- PATTERN 1: EXPLICIT CALCULATION ---
        match_calc = calc_pattern.search(line_clean)

        if match_calc:
            raw_qty, raw_price, raw_total = match_calc.groups()

            qty = parse_qty(raw_qty)
            unit_price = parse_price(raw_price)
            total_price = parse_price(raw_total)

            # Decide the item name: either the buffered previous line(s), or
            # whatever text sits to the left of the calculation on this line.
            full_name = "Unknown Item"
            is_confirmation_line = False

            if name_buffer:
                full_name = " ".join(name_buffer).strip()
                # Remove a leading sequence number ("1 A000..." -> "A000...")
                full_name = re.sub(r'^\d+\s+', '', full_name)
            else:
                leftover = line_clean[:match_calc.start()].strip()
                if leftover:
                    full_name = leftover
                else:
                    # Nameless calculation line, e.g. a POS confirmation row
                    is_confirmation_line = True

            # Anti-duplicate: a nameless calculation whose total equals the
            # previous item's total is treated as a repeat of that item.
            if is_confirmation_line and items and items[-1]['total'] == total_price:
                name_buffer = []
                continue

            # Keep the item only when qty * unit price is within 1000 of the total
            if not is_confirmation_line and unit_price > 0:
                if abs((qty * unit_price) - total_price) < 1000:
                    items.append({
                        "name": full_name,
                        "qty": qty,
                        "price": unit_price,
                        "total": total_price
                    })

            name_buffer = []  # Reset buffer
            continue  # Next line

        # --- PATTERN 2: PRICES AT END OF LINE ---
        match_price = price_total_pattern.search(line_clean)

        if match_price:
            price_str, total_str = match_price.groups()
            unit_price = parse_price(price_str)
            total_price = parse_price(total_str)

            # Text to the left of the two prices
            leftover = line_clean[:match_price.start()].strip()

            # Quantity: look for a short number at the end of the leftover text
            qty = 1
            current_name_part = leftover

            qty_match = qty_suffix_pattern.search(leftover)
            if qty_match:
                qty = int(qty_match.group(1))
                current_name_part = leftover[:qty_match.start()].strip()

            # Assemble the name from buffered lines plus text on this line
            full_name = ""
            if not current_name_part and name_buffer:
                full_name = " ".join(name_buffer)
            elif current_name_part:
                if name_buffer:
                    full_name = (" ".join(name_buffer) + " " + current_name_part).strip()
                else:
                    full_name = current_name_part

            # Remove a leading sequence number
            full_name = re.sub(r'^\d+\s+', '', full_name)

            # Validate and store
            if unit_price > 0 and len(full_name) > 2:
                # Reverse-math correction for a misread quantity: when
                # qty * unit price is off by more than 500, derive the
                # quantity from total / unit price if that is a whole number
                # between 1 and 999.
                if abs((qty * unit_price) - total_price) > 500:
                    calc_qty = total_price / unit_price
                    if calc_qty.is_integer() and 0 < calc_qty < 1000:
                        qty = int(calc_qty)

                items.append({
                    "name": full_name,
                    "qty": qty,
                    "price": unit_price,
                    "total": total_price
                })

            name_buffer = []

        else:
            # === BUFFERING ===
            # Plain text line: potential item name. Lines containing a
            # date/time-like "dd:dd", "dd/dd" or "dd.dd" fragment are skipped.
            if not re.search(r'\d{2}[:/.]\d{2}', line_clean):
                if len(line_clean) > 2:
                    name_buffer.append(line_clean)

    return items

# Run the universal extractor on clean_lines and print the parsed items.
print("\n--- HASIL SAGA ---")
print(extract_items_universal(clean_lines))

In [None]:
import re

class ReceiptParser:
    """Heuristic receipt parser that pairs buffered name lines with price rows.

    A line that ends in a digit is treated as a transaction (price) row; any
    other line is buffered as part of the name of the next transaction.
    """

    def __init__(self):
        # Price-like number: groups of 1-3 digits joined by '.' or ','
        # (e.g. "10.000", "204,000.00", "385").
        self.price_pattern = re.compile(r'((?:\d{1,3}[.,])*\d{1,3})')

    def clean_currency(self, price_str):
        """Convert a price string such as '10.000' into the integer 10000.

        Every non-digit character is dropped. Empty/None input, or a string
        without digits, yields 0.
        """
        if not price_str:
            return 0
        clean = re.sub(r'[^\d]', '', price_str)
        return int(clean) if clean else 0

    def is_numeric_line(self, text):
        """Return True when ``text`` contains a run of at least three digits."""
        return bool(re.search(r'\d{3,}', text))

    def parse_transaction_line(self, text):
        """Extract ``(qty, unit_price, total_price)`` from a transaction line.

        Only whitespace-separated tokens made purely of digits, '.' and ','
        are considered; separators are removed and the token is read as an
        integer.

        * three or more numbers: the last three are taken as qty, unit price
          and total, so numbers inside the product name (sizes, sequence
          numbers) are not mistaken for the quantity. If that triple does not
          satisfy qty * unit == total but the first two numbers do, the
          first two are used instead.
        * two numbers: unit price and total; qty is total / unit when that
          division is exact, otherwise 1.
        * one number: taken as the total, with unit price equal to it and
          qty 1.
        * no numbers: ``(1, 0, 0)``.
        """
        numbers = []
        for token in text.split():
            # Numeric token (may contain '.' or ',' separators)
            if re.match(r'^[\d.,]+$', token):
                digits = re.sub(r'[^\d]', '', token)
                if digits:  # skip tokens such as "." or "," with no digits
                    numbers.append(int(digits))

        qty = 1
        unit_price = 0
        total_price = 0

        if len(numbers) >= 3:
            qty, unit_price, total_price = numbers[-3:]
            # Keep the leading pair when only it is arithmetically consistent
            # with the total.
            if qty * unit_price != total_price and numbers[0] * numbers[1] == total_price:
                qty, unit_price = numbers[0], numbers[1]

        elif len(numbers) == 2:
            # Quantity missing / not read: number 1 = unit price, number 2 = total
            unit_price, total_price = numbers
            if unit_price > 0:
                quotient, remainder = divmod(total_price, unit_price)
                if remainder == 0:
                    qty = quotient

        elif len(numbers) == 1:
            # Only one number read: treat it as the total of a single unit
            total_price = numbers[0]
            unit_price = total_price

        return qty, unit_price, total_price

    def process_receipt(self, lines):
        """Parse receipt lines into a list of item dicts.

        Args:
            lines: Iterable of receipt lines (strings).

        Returns:
            List of dicts with the keys ``name``, ``qty``, ``price`` and
            ``total``, one per line recognised as a transaction row.
        """
        parsed_items = []
        name_buffer = []  # Name fragments collected for multi-line item names

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Step 1: is this a price row? It must contain a price-like
            # number and end in a digit.
            matches = self.price_pattern.findall(line)
            is_price_row = bool(matches) and re.search(r'\d+\s*$', line) is not None

            if is_price_row:
                # --- TRANSACTION LINE ---
                qty, unit, total = self.parse_transaction_line(line)

                # Whatever remains after removing numbers is name text
                text_content = re.sub(r'[\d.,]+', '', line).strip()

                if name_buffer:
                    # Name is on the previous line(s); append leftover text
                    # from the price row only when it is longer than 2 chars.
                    final_name = " ".join(name_buffer)
                    if len(text_content) > 2:
                        final_name += " " + text_content
                    name_buffer = []
                else:
                    # Name is on the same line as the prices
                    final_name = text_content

                parsed_items.append({
                    "name": final_name,
                    "qty": qty,
                    "price": unit,
                    "total": total
                })

            else:
                # --- NAME (TEXT) LINE ---
                # Buffer it for the next price row; skip very short lines and
                # pure separator rules made of '=' or '-'.
                if len(line) > 2 and not re.match(r'^[=\-]+$', line):
                    name_buffer.append(line)

        return parsed_items

In [None]:
# Parse clean_lines with ReceiptParser.
parser = ReceiptParser()
items = parser.process_receipt(clean_lines)

# Print the parsed items as indented JSON.
import json
print(json.dumps(items, indent=2))

In [None]:
import re

# Simulated OCR text output from your image
ocr_text = """
--- HASIL STRUK RAPI ---
SAGA RENON - Listening Space
: 05/12/202516:07
Date
Order Number : POS-051225-46
Customer :6 saga
Sales Type :Normal
User :Saga Coffee Bali
Cashier :Saga Coffee Bali
AMERICANO
ICE 33.000
1x33.000
Total Item 1
33.000
Subtotal
1.650
Service Charge (5%)
3.465
Tax
385
Adjustment
Total 38.500
"""

def parse_receipt(text):
    """Extract items from a receipt whose quantity row follows the item name.

    The item area starts after the last line containing "Cashier" and ends at
    the first line containing "Total Item" or "Subtotal". Inside it, a line
    of the form "<qty>x<price>" (e.g. "1x33.000", "2 x 50.000") closes an
    item; all lines collected since the previous item form its name.

    Args:
        text: The receipt either as one newline-separated string or as an
            iterable of line strings.

    Returns:
        List of dicts with the keys ``name``, ``qty``, ``price`` and
        ``total`` (``qty * price``).
    """
    if not text:
        return []
    # Accept a pre-split list of lines as well as a single string.
    if not isinstance(text, str):
        text = "\n".join(text)

    lines = [line.strip() for line in text.split('\n') if line.strip()]
    items = []

    # 1. Find the boundaries of the item area
    start_index = 0
    end_index = len(lines)

    for i, line in enumerate(lines):
        # Bottom anchor: items stop before "Total Item" or "Subtotal"
        if "Total Item" in line or "Subtotal" in line:
            end_index = i
            break
        # Top anchor: items start after the "Cashier" line
        if "Cashier" in line:
            start_index = i + 1

    item_section = lines[start_index:end_index]

    # 2. Walk the item area; the name sits ABOVE its "<qty>x<price>" row
    temp_item_name = []

    for line in item_section:
        # digits, the letter x, then digits with '.' thousands separators
        qty_price_match = re.search(r'(\d+)\s*x\s*([\d\.]+)', line)
        # The price group may consist of dots only; treat that as no price.
        price_digits = qty_price_match.group(2).replace('.', '') if qty_price_match else ''

        if price_digits:
            qty = int(qty_price_match.group(1))
            price_per_unit = int(price_digits)

            items.append({
                "name": " ".join(temp_item_name),
                "qty": qty,
                "price": price_per_unit,
                "total": qty * price_per_unit
            })

            # Start collecting the name of the next item
            temp_item_name = []
        else:
            # Not a quantity row: part of the item name or a variant line
            # (such as "ICE 33.000"); kept in the name for simplicity.
            temp_item_name.append(line)

    return items

# Run the function
parsed_data = parse_receipt(clean_lines)

# Show the result
import json
print(json.dumps(parsed_data, indent=4))