In [1]:
import os
import json
import csv 
import pandas as pd
from pathlib import Path
from PIL import Image


# Library For OCR
from paddleocr  import PaddleOCR, PaddleOCRVL, PPStructureV3


  from .autonotebook import tqdm as notebook_tqdm
[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


In [2]:
# Init OCR (hanya perlu sekali saja)
ocr_model = PaddleOCR(
   # 1. Dokumen Orientasi (Rotasi Gambar)
    use_doc_orientation_classify=True,  # JANGAN LUPA SET TRUE
    doc_orientation_classify_model_name='PP-LCNet_x1_0_doc_ori',
    
    # 2. Dokumen Unwarping (Pelurusan Kertas Lecek)
    use_doc_unwarping=True,             # JANGAN LUPA SET TRUE
    doc_unwarping_model_name='UVDoc',
    
    # 3. Deteksi Teks (Mencari text dengan Kotak)
    text_detection_model_name='PP-OCRv5_server_det',
    
    # 4. Orientasi Per Baris Teks
    use_textline_orientation=True,      # Opsional, bisa False biar lebih cepat
    textline_orientation_model_name='PP-LCNet_x1_0_textline_ori',
    
    # 5. Pengenalan Teks (Membaca Huruf)
    text_recognition_model_name='latin_PP-OCRv5_mobile_rec',
    
    
    # 3. Deteksi & Ukuran (Penting untuk struk panjang)
    text_det_limit_side_len=1200,        
    text_det_limit_type='max',          
    
    # 4. Thresholding (Fine-tuning deteksi)
    text_det_thresh=0.4,                
    text_det_box_thresh=0.5,            
    text_det_unclip_ratio=2,   
           
    # 5. Parameter Tambahan (Jika diperlukan)
    text_rec_score_thresh=0.5,          # Batas minimum confidence score untuk hasil OCR
    return_word_box=False,              # False jika ingin per baris, True jika per kata
)


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\igust\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\igust\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\igust\.paddlex\official_models\PP-LCNet_x1_0_textline_ori`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\igust\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('latin_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cac

#### LOAD Gambar Struk 

1. Gambar struk yang akan digunakan terdapat pada file "struk"
   
2. semua gambar akan di load secara otomatis dan di lakukan prediksi OCR dengan model yang telah di Inisiasi.

3. Kemudain hasil akan disimpan pada file Output dalam bentuk Json.
   
4. Selanjutnya semua file hasil akan dimuat lagi untuk Porses selanjutnya "Lebeling" 

In [None]:
# File Path 
img_dir = Path("../struk")
output_ocr_dir = Path("output")

In [7]:
# 1. Ambil semua file, jadikan list, dan URUTKAN
all_images = sorted(list(img_dir.glob("*.*")))

# 2. Slice list tersebut mulai dari index 20 (file ke-21) sampai habis
# Filter: Hanya ambil file yang namanya (stem) adalah angka >= 21
target_images = [
    img for img in all_images 
    if img.stem.isdigit() and int(img.stem) >= 23
]

# 2. Sort: Urutkan hasil akhirnya secara numerik (21, 22, 23...)
target_images.sort(key=lambda x: int(x.stem))



print(f"Total gambar ditemukan: {len(target_images)}")
print(f"Akan memproses: {len(target_images)} gambar (mulai dari {target_images[0].name})")

Total gambar ditemukan: 22
Akan memproses: 22 gambar (mulai dari 23.jpg)


In [None]:


# 3. Loop pada list yang sudah dipotong
for img_path in target_images:
    print(f"Processing {img_path.name}")

    try:
        result = ocr_model.predict(str(img_path))

        for res in result:
            res.save_to_json(output_ocr_dir)
            
    except Exception as e:
        print(f"ERROR processing {img_path.name}: {e}")

##### Dataset init

In [8]:
if "dataset_rows" not in globals():
    dataset_rows = []
    print("Dataset initialized")
else:
    print("Dataset already exists, rows:", len(dataset_rows))


Dataset already exists, rows: 0


##### Muat Gambar 

In [None]:
ocr_dir = Path("output")

image_files = sorted(
    [p for p in img_dir.iterdir() if p.suffix.lower() in [".jpg", ".jpeg", ".png"]],
    key=lambda x: int(x.stem)  # urut numerik
)

print("Total struk ditemukan:", len(image_files))


Total struk ditemukan: 44


In [None]:
ocr_dir = Path("json")


Total struk ditemukan: 44


In [None]:
for img_path in image_files:
    num = img_path.stem  # "1", "2", ...
    receipt_id = f"receipt_{int(num):03d}"

    ocr_json_path = ocr_dir / f"{num}_res.json"

    if not ocr_json_path.exists():
        print(f"⚠️ OCR result tidak ditemukan untuk {img_path.name}, skip")
        continue

    # ambil ukuran gambar
    img = Image.open(img_path)
    IMAGE_WIDTH, IMAGE_HEIGHT = img.size

    # load OCR json
    with open(ocr_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    texts = data["rec_texts"]
    scores = data["rec_scores"]
    boxes = data["rec_boxes"]

    before = len(dataset_rows)

    for line_id, (text, score, box) in enumerate(zip(texts, scores, boxes)):
        x_min, y_min, x_max, y_max = box

        dataset_rows.append({
            "receipt_id": receipt_id,
            "line_id": line_id,
            "text": text,
            "ocr_conf": round(score, 4),

            "x_min": x_min,
            "y_min": y_min,
            "x_max": x_max,
            "y_max": y_max,

            "x_center_norm": ((x_min + x_max) / 2) / IMAGE_WIDTH,
            "y_center_norm": ((y_min + y_max) / 2) / IMAGE_HEIGHT,
            "box_width_norm": (x_max - x_min) / IMAGE_WIDTH,

            "has_digit": int(any(c.isdigit() for c in text)),
            "text_len": len(text)
        })

    added = len(dataset_rows) - before
    print(f"✅ {receipt_id} diproses | +{added} baris | total: {len(dataset_rows)}")

✅ receipt_001 diproses | +39 baris | total: 39
✅ receipt_002 diproses | +34 baris | total: 73
✅ receipt_003 diproses | +34 baris | total: 107
✅ receipt_004 diproses | +33 baris | total: 140
✅ receipt_005 diproses | +28 baris | total: 168
✅ receipt_006 diproses | +130 baris | total: 298
✅ receipt_007 diproses | +50 baris | total: 348
✅ receipt_008 diproses | +71 baris | total: 419
✅ receipt_009 diproses | +32 baris | total: 451
✅ receipt_010 diproses | +21 baris | total: 472
✅ receipt_011 diproses | +28 baris | total: 500
✅ receipt_012 diproses | +32 baris | total: 532
✅ receipt_013 diproses | +36 baris | total: 568
✅ receipt_014 diproses | +41 baris | total: 609
✅ receipt_015 diproses | +33 baris | total: 642
✅ receipt_016 diproses | +29 baris | total: 671
✅ receipt_017 diproses | +37 baris | total: 708
✅ receipt_018 diproses | +30 baris | total: 738
✅ receipt_019 diproses | +30 baris | total: 768
✅ receipt_020 diproses | +36 baris | total: 804


In [10]:
for img_path in target_images:
    num = img_path.stem  # "1", "2", ...
    receipt_id = f"receipt_{int(num):03d}"

    ocr_json_path = ocr_dir / f"{num}_res.json"

    if not ocr_json_path.exists():
        print(f"⚠️ OCR result tidak ditemukan untuk {img_path.name}, skip")
        continue

    # ambil ukuran gambar
    img = Image.open(img_path)
    IMAGE_WIDTH, IMAGE_HEIGHT = img.size

    # load OCR json
    with open(ocr_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    texts = data["rec_texts"]
    scores = data["rec_scores"]
    boxes = data["rec_boxes"]

    before = len(dataset_rows)

    for line_id, (text, score, box) in enumerate(zip(texts, scores, boxes)):
        x_min, y_min, x_max, y_max = box

        dataset_rows.append({
            "receipt_id": receipt_id,
            "line_id": line_id,
            "text": text,
            "ocr_conf": round(score, 4),

            "x_min": x_min,
            "y_min": y_min,
            "x_max": x_max,
            "y_max": y_max,

            "x_center_norm": ((x_min + x_max) / 2) / IMAGE_WIDTH,
            "y_center_norm": ((y_min + y_max) / 2) / IMAGE_HEIGHT,
            "box_width_norm": (x_max - x_min) / IMAGE_WIDTH,

            "has_digit": int(any(c.isdigit() for c in text)),
            "text_len": len(text)
        })

    added = len(dataset_rows) - before
    print(f"✅ {receipt_id} diproses | +{added} baris | total: {len(dataset_rows)}")

✅ receipt_023 diproses | +43 baris | total: 43
✅ receipt_024 diproses | +49 baris | total: 92
✅ receipt_025 diproses | +43 baris | total: 135
✅ receipt_026 diproses | +44 baris | total: 179
✅ receipt_027 diproses | +58 baris | total: 237
✅ receipt_028 diproses | +49 baris | total: 286
✅ receipt_029 diproses | +43 baris | total: 329
✅ receipt_030 diproses | +33 baris | total: 362
✅ receipt_031 diproses | +25 baris | total: 387
✅ receipt_032 diproses | +36 baris | total: 423
✅ receipt_033 diproses | +27 baris | total: 450
✅ receipt_034 diproses | +38 baris | total: 488
✅ receipt_035 diproses | +43 baris | total: 531
✅ receipt_036 diproses | +53 baris | total: 584
✅ receipt_037 diproses | +32 baris | total: 616
✅ receipt_038 diproses | +33 baris | total: 649
✅ receipt_039 diproses | +36 baris | total: 685
✅ receipt_040 diproses | +41 baris | total: 726
✅ receipt_041 diproses | +51 baris | total: 777
✅ receipt_042 diproses | +37 baris | total: 814
✅ receipt_043 diproses | +30 baris | total

In [11]:
df = pd.DataFrame(dataset_rows)
df["receipt_id"].nunique()

22

In [12]:
df["receipt_id"].value_counts()


receipt_id
receipt_027    58
receipt_036    53
receipt_041    51
receipt_024    49
receipt_028    49
receipt_026    44
receipt_025    43
receipt_023    43
receipt_035    43
receipt_029    43
receipt_040    41
receipt_034    38
receipt_042    37
receipt_032    36
receipt_039    36
receipt_044    33
receipt_030    33
receipt_038    33
receipt_037    32
receipt_043    30
receipt_033    27
receipt_031    25
Name: count, dtype: int64

In [13]:
df.groupby("receipt_id").head(2)[
    ["receipt_id", "line_id", "text"]
]


Unnamed: 0,receipt_id,line_id,text
0,receipt_023,0,MEGA ARTA
1,receipt_023,1,J7. Tangkuban Perahu
43,receipt_024,0,MEGA ARTA
44,receipt_024,1,J7. Tangkuban Perahu
92,receipt_025,0,MEGA ARTA
93,receipt_025,1,J7. Tangkuban Perahu
135,receipt_026,0,MEGA ARTA
136,receipt_026,1,J1. Tangkuban Perahu
179,receipt_027,0,MEGA ARTA
180,receipt_027,1,J7. Tangkuban Perahu


In [40]:
## Konversi dataset_rows → DataFrame

df = pd.DataFrame(dataset_rows)

print("Total baris:", len(df))
df.head()


Total baris: 877


Unnamed: 0,receipt_id,line_id,text,ocr_conf,x_min,y_min,x_max,y_max,x_center_norm,y_center_norm,box_width_norm,has_digit,text_len
0,receipt_023,0,MEGA ARTA,0.9618,2273,1173,3710,1424,0.488807,0.15913,0.234804,0,9
1,receipt_023,1,J7. Tangkuban Perahu,0.9506,1487,1387,4517,1747,0.490523,0.192034,0.495098,1,20
2,receipt_023,2,Padangsambian - Denpasar,0.9876,1208,1670,4816,1987,0.492157,0.224081,0.589542,0,24
3,receipt_023,3,16/12/2025,0.9998,2560,2576,3894,2860,0.527288,0.333088,0.217974,1,10
4,receipt_023,4,Date,0.9996,742,2608,1409,2867,0.175735,0.335478,0.108987,0,4


In [41]:
if "is_item_line" not in df.columns:
    df["is_item_line"] = pd.NA

print("Belum dilabeli:", df["is_item_line"].isna().sum())


Belum dilabeli: 877


In [42]:
def interactive_labeling_df(df):
    total = len(df)

    for idx in range(total):
        if pd.notna(df.at[idx, "is_item_line"]):
            continue

        row = df.loc[idx]

        print("=" * 70)
        print(f"Index       : {idx}/{total}")
        print(f"Receipt ID  : {row['receipt_id']}")
        print(f"Line ID     : {row['line_id']}")
        print(f"Text        : {row['text']}")
        print(f"Y-pos (norm): {row['y_center_norm']:.2f}")
        print(f"Has digit   : {row['has_digit']}")

        label = input("Label? [1=item | 0=non-item | s=skip | q=quit]: ").strip()

        if label == "1":
            df.at[idx, "is_item_line"] = 1
        elif label == "0":
            df.at[idx, "is_item_line"] = 0
        elif label.lower() == "s":
            continue
        elif label.lower() == "q":
            print("⏸️ Pause labeling")
            break
        else:
            print("Input tidak valid")
            continue


In [43]:
interactive_labeling_df(df)


Index       : 0/877
Receipt ID  : receipt_023
Line ID     : 0
Text        : MEGA ARTA
Y-pos (norm): 0.16
Has digit   : 0
Index       : 1/877
Receipt ID  : receipt_023
Line ID     : 1
Text        : J7. Tangkuban Perahu
Y-pos (norm): 0.19
Has digit   : 1
Index       : 2/877
Receipt ID  : receipt_023
Line ID     : 2
Text        : Padangsambian - Denpasar
Y-pos (norm): 0.22
Has digit   : 0
Index       : 3/877
Receipt ID  : receipt_023
Line ID     : 3
Text        : 16/12/2025
Y-pos (norm): 0.33
Has digit   : 1
Index       : 4/877
Receipt ID  : receipt_023
Line ID     : 4
Text        : Date
Y-pos (norm): 0.34
Has digit   : 0
Index       : 5/877
Receipt ID  : receipt_023
Line ID     : 5
Text        : 17:38:42
Y-pos (norm): 0.34
Has digit   : 1
Index       : 6/877
Receipt ID  : receipt_023
Line ID     : 6
Text        : No.
Y-pos (norm): 0.37
Has digit   : 0
Index       : 7/877
Receipt ID  : receipt_023
Line ID     : 7
Text        : 2512879372
Y-pos (norm): 0.37
Has digit   : 1
Index       : 8/

In [44]:
if "is_item_line" not in df.columns:
    df["is_item_line"] = pd.NA

print("Belum dilabeli:", df["is_item_line"].isna().sum())

Belum dilabeli: 5


In [45]:
df[df["is_item_line"].isna()][
    ["receipt_id", "line_id", "text", "y_center_norm", "has_digit"]
]


Unnamed: 0,receipt_id,line_id,text,y_center_norm,has_digit
228,receipt_027,49,17.500,0.72451,1
263,receipt_028,26,: RP.,0.592157,0
324,receipt_029,38,dibeli tidak dapat,0.789828,0
353,receipt_030,24,Kembali,0.687891,0
588,receipt_037,4,BLOK 0D NO 40 RT DDO RW,0.138971,1


In [46]:
df.loc[df["is_item_line"].isna(), "is_item_line"] = 0


In [47]:
df["is_item_line"].isna().sum()


np.int64(0)

In [48]:
df["is_item_line"].value_counts()


is_item_line
0    683
1    194
Name: count, dtype: int64

In [49]:
df.groupby("receipt_id")["is_item_line"].sum().describe()


count     22
unique    13
top       11
freq       3
Name: is_item_line, dtype: int64

### Run terakhir

In [50]:
data_ocr_csv = os.path.join("output", "ocr_lines2.csv")

df.to_csv(data_ocr_csv, index=False, encoding="utf-8")
print(f"Save to {data_ocr_csv}")

Save to output\ocr_lines2.csv


#### Split Dataset

In [24]:
from sklearn.model_selection import train_test_split

# pastikan dataset sudah dilabel
assert df["is_item_line"].isna().sum() == 0

# ambil daftar receipt unik
receipt_ids = df["receipt_id"].unique()

# split receipt_id, BUKAN baris
train_receipts, test_receipts = train_test_split(
    receipt_ids,
    test_size=0.2,
    random_state=42
)

# buat train & test dataframe
train_df = df[df["receipt_id"].isin(train_receipts)].reset_index(drop=True)
test_df  = df[df["receipt_id"].isin(test_receipts)].reset_index(drop=True)

print("Jumlah receipt train:", train_df["receipt_id"].nunique())
print("Jumlah receipt test :", test_df["receipt_id"].nunique())
print()
print("Jumlah baris train:", len(train_df))
print("Jumlah baris test :", len(test_df))


Jumlah receipt train: 16
Jumlah receipt test : 4

Jumlah baris train: 672
Jumlah baris test : 132


In [25]:
set(train_df["receipt_id"]) & set(test_df["receipt_id"])


set()

In [26]:
print("Train distribution:")
print(train_df["is_item_line"].value_counts(normalize=True))

print("\nTest distribution:")
print(test_df["is_item_line"].value_counts(normalize=True))


Train distribution:
is_item_line
0    0.544643
1    0.455357
Name: proportion, dtype: float64

Test distribution:
is_item_line
0    0.80303
1    0.19697
Name: proportion, dtype: float64


### Tentukan fitur 

- y_center_norm      (posisi vertikal)
- x_center_norm      (posisi horizontal)
- box_width_norm     (lebar relatif)
- text_len           (panjang teks)
- has_digit          (0/1)
- ocr_conf           (confidence OCR)


In [27]:
from sklearn.preprocessing import StandardScaler

FEATURES = [
    "y_center_norm",
    "x_center_norm",
    "box_width_norm",
    "text_len",
    "has_digit",
    "ocr_conf"
]

X_train = train_df[FEATURES]
y_train = train_df["is_item_line"]

X_test = test_df[FEATURES]
y_test = test_df["is_item_line"]


In [28]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)


In [30]:
print(train_df["is_item_line"].dtype)
print(train_df["is_item_line"].unique())


object
[0 1]


In [31]:
train_df["is_item_line"] = train_df["is_item_line"].astype(int)
test_df["is_item_line"]  = test_df["is_item_line"].astype(int)

y_train = train_df["is_item_line"]
y_test  = test_df["is_item_line"]


In [32]:
print(y_train.dtype)
print(y_train.unique())


int64
[0 1]


#### Train model pertama Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",  # penting untuk OCR
    random_state=42
)

model.fit(X_train_scaled, y_train)

print("✅ Model trained")


✅ Model trained


In [34]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_scaled)

print(classification_report(y_test, y_pred, digits=3))


              precision    recall  f1-score   support

           0      0.889     0.679     0.770       106
           1      0.333     0.654     0.442        26

    accuracy                          0.674       132
   macro avg      0.611     0.667     0.606       132
weighted avg      0.779     0.674     0.705       132



In [35]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)


[[72 34]
 [ 9 17]]


In [36]:
from sklearn.metrics import classification_report

y_proba = model.predict_proba(X_test_scaled)[:, 1]

for t in [0.3, 0.35, 0.4]:
    y_pred_t = (y_proba >= t).astype(int)
    print(f"\nThreshold = {t}")
    print(classification_report(y_test, y_pred_t, digits=3))




Threshold = 0.3
              precision    recall  f1-score   support

           0      0.873     0.453     0.596       106
           1      0.247     0.731     0.369        26

    accuracy                          0.508       132
   macro avg      0.560     0.592     0.483       132
weighted avg      0.749     0.508     0.551       132


Threshold = 0.35
              precision    recall  f1-score   support

           0      0.883     0.500     0.639       106
           1      0.264     0.731     0.388        26

    accuracy                          0.545       132
   macro avg      0.574     0.615     0.513       132
weighted avg      0.761     0.545     0.589       132


Threshold = 0.4
              precision    recall  f1-score   support

           0      0.889     0.604     0.719       106
           1      0.300     0.692     0.419        26

    accuracy                          0.621       132
   macro avg      0.594     0.648     0.569       132
weighted avg      0.77

In [38]:
fn = test_df[(y_test == 1) & (y_pred == 0)]
fn[["receipt_id","line_id","text","y_center_norm","has_digit"]].head(10)


Unnamed: 0,receipt_id,line_id,text,y_center_norm,has_digit
16,receipt_001,16,(TABLET),0.558594,0
82,receipt_016,9,Bread Butter Pudding,0.429258,0
84,receipt_016,11,Cream Bruille,0.469093,0
86,receipt_016,13,Choco Croissant,0.510989,0
88,receipt_016,15,Bank Of Chocolat,0.552885,0
110,receipt_018,8,Buku Murah Tiga Serangkai DTP 200,0.352941,1
112,receipt_018,10,"2x20000 40,000",0.392734,1
113,receipt_018,11,BUKU MURAH TIGA SERANGKAI DTP 150,0.429066,1
115,receipt_018,13,"Zx15000 30,000",0.467128,1


In [39]:
fp = test_df[(y_test == 0) & (y_pred == 1)]
fp[["receipt_id","line_id","text","y_center_norm","has_digit"]].head(10)



Unnamed: 0,receipt_id,line_id,text,y_center_norm,has_digit
2,receipt_001,2,JL.,0.265625,0
4,receipt_001,4,KAJA,0.2875,0
5,receipt_001,5,0811 3988 879,0.310937,1
20,receipt_001,20,10.956,0.624609,1
22,receipt_001,22,+44,0.647656,1
24,receipt_001,24,11.000,0.670312,1
27,receipt_001,27,12.000,0.737891,1
30,receipt_001,30,1.000,0.762109,1
32,receipt_001,32,9.909,0.808594,1
34,receipt_001,34,(11.%) :,0.833203,1


In [102]:
ITEM_THRESHOLD = 0.4

test_df["is_item_pred"] = (y_proba >= ITEM_THRESHOLD).astype(int)

candidates = test_df[test_df["is_item_pred"] == 1] \
    .sort_values(["receipt_id", "y_center_norm"]) \
    .reset_index(drop=True)


In [103]:
import re

def is_name_like(text):
    # Ada huruf dan BUKAN pure number
    return bool(re.search(r"[A-Za-z]", text)) and not re.fullmatch(r"[\d.,]+", text.strip())


def is_qty_price(text):
    return bool(re.search(r"\d+\s*[xX@]\s*\d+", text))

def is_price_only(text):
    return bool(re.fullmatch(r"[\d.,]+", text.strip()))

def is_discount(text):
    return "diskon" in text.lower() or "-" in text


In [104]:
def looks_like_item_start(row):
    text = row["text"]
    y = row["y_center_norm"]

    has_alpha = bool(re.search(r"[A-Za-z]", text))
    not_header = y > 0.3

    is_not_obvious_summary = not any(
        k in text.lower()
        for k in ["total", "subtotal", "harga jual", "ppn", "tunai", "kembali"]
    )

    return has_alpha and not_header and is_not_obvious_summary


In [105]:
def looks_like_summary(text):
    keywords = ["total", "subtotal", "ppn", "tunai", "kembali", "anda hemat"]
    return any(k in text.lower() for k in keywords)


In [106]:
grouped_items = []
current_item = None
last_y = None
in_item_section = False

for _, row in candidates.iterrows():
    text = row["text"]
    y = row["y_center_norm"]
    receipt = row["receipt_id"]

    # reset jika receipt baru
    if current_item and current_item["receipt_id"] != receipt:
        grouped_items.append(current_item)
        current_item = None
        last_y = None
        in_item_section = False

    # STOP GATE: berhenti saat summary
    if in_item_section and looks_like_summary(text):
        if current_item:
            grouped_items.append(current_item)
        current_item = None
        in_item_section = False
        last_y = None
        continue

    # START GATE: tunggu masuk area item
    if not in_item_section:
        if looks_like_item_start(row):
            in_item_section = True
        else:
            continue

    # ===== GROUPING DIMULAI =====

    # item baru
    if current_item is None:
        current_item = {
            "receipt_id": receipt,
            "lines": [text],
            "name": text if is_name_like(text) else None,
            "qty_price": None,
            "total": None,
            "discount": None,
            "ys": [y]
        }
        last_y = y
        continue

    # masih item yang sama
    if abs(y - last_y) < 0.05:
        current_item["lines"].append(text)
        current_item["ys"].append(y)

        # 🔑 NAME LOGIC (SATU BLOK SAJA)
        if is_name_like(text):
            if current_item["name"] is None or len(text) > len(current_item["name"]):
                current_item["name"] = text

        elif is_qty_price(text):
            current_item["qty_price"] = text

        elif is_price_only(text):
            current_item["total"] = text

        elif is_discount(text):
            current_item["discount"] = text

    # item baru (jarak jauh)
    else:
        grouped_items.append(current_item)
        current_item = {
            "receipt_id": receipt,
            "lines": [text],
            "name": text if is_name_like(text) else None,
            "qty_price": None,
            "total": None,
            "discount": None,
            "ys": [y]
        }

    last_y = y

# append terakhir
if current_item:
    grouped_items.append(current_item)


In [107]:
def is_valid_item(item):
    has_name = item["name"] is not None
    has_number_line = any(re.search(r"\d", l) for l in item["lines"])
    return has_name and has_number_line

clean_items = [i for i in grouped_items if is_valid_item(i)]


In [108]:
def is_obvious_non_item(item):
    return item["name"] and item["name"].lower().startswith("kode")

clean_items = [
    i for i in clean_items
    if not is_obvious_non_item(i)
]


In [109]:
clean_items

[{'receipt_id': 'receipt_001',
  'lines': ['PROCOLD TAB@144', '12 @913', '10.956', '10.956', '+44', '11.000'],
  'name': 'PROCOLD TAB@144',
  'qty_price': '12 @913',
  'total': '11.000',
  'discount': None,
  'ys': [0.555078125,
   0.57578125,
   0.5796875,
   0.624609375,
   0.64765625,
   0.6703125]},
 {'receipt_id': 'receipt_002',
  'lines': ['NISSIN WeR CHO 11OG',
   '1',
   '9,300',
   '9300',
   'O/TATER THIN SWD 100',
   '1',
   '15,200',
   '15200'],
  'name': 'O/TATER THIN SWD 100',
  'qty_price': None,
  'total': '15200',
  'discount': None,
  'ys': [0.41950596252129474,
   0.42163543441226575,
   0.42163543441226575,
   0.42206132879046,
   0.4608177172061329,
   0.46252129471890974,
   0.46379897785349233,
   0.46422487223168657]},
 {'receipt_id': 'receipt_016',
  'lines': ['Check No : 3059689', '10 May 19 16:32:47'],
  'name': 'Check No : 3059689',
  'qty_price': None,
  'total': None,
  'discount': None,
  'ys': [0.3008241758241758, 0.3482142857142857]},
 {'receipt_id': '

In [110]:
import re

def to_int(num_str):
    if num_str is None:
        return None
    s = num_str.replace(".", "").replace(",", "").strip()
    return int(s) if s.isdigit() else None


In [111]:
def parse_qty_unit(text):
    if text is None:
        return None, None

    t = text.lower().replace(" ", "")

    # format 2x20000 atau 2@20000
    m = re.search(r"(\d{1,2})[x@](\d{3,6})", t)
    if m:
        return int(m.group(1)), int(m.group(2))

    return None, None



In [112]:
def parse_total_from_lines(lines):
    nums = []
    for l in lines:
        found = re.findall(r"\d+[.,]?\d*", l)
        for f in found:
            v = to_int(f)
            if v is not None:
                nums.append(v)

    return max(nums) if nums else None


In [113]:
def is_valid_qty(q):
    return q is not None and 1 <= q <= 100


In [114]:
def parse_item(item):
    qty = None
    unit_price = None

    # 1) coba dari qty_price
    q, u = parse_qty_unit(item.get("qty_price"))
    qty, unit_price = q, u

    # 2) fallback: cari qty/unit dari lines
    if qty is None or unit_price is None:
        for l in item["lines"]:
            q2, u2 = parse_qty_unit(l)
            if q2 is not None:
                qty = q2
            if u2 is not None:
                unit_price = u2

    # 3) total (WAJIB sebelum fallback)
    total = parse_total_from_lines(item["lines"])

    # 4) OCR fallback (INI KRUSIAL)
    if qty is None and total is not None:
        qty = 1

    if unit_price is None and qty and total:
        unit_price = total // qty

    # 5) sanity check qty
    if not is_valid_qty(qty):
        return None

    # 6) validasi sederhana (opsional flag)
    if qty and unit_price and total:
        if abs(qty * unit_price - total) > max(1000, 0.05 * total):
            pass  # bisa ditandai mismatch

    return {
        "receipt_id": item["receipt_id"],
        "name": item["name"],
        "qty": qty,
        "unit_price": unit_price,
        "total": total,
        "raw_lines": item["lines"]
    }


In [115]:
parsed_items = [parse_item(i) for i in clean_items]

parsed_items[:3]


[{'receipt_id': 'receipt_001',
  'name': 'PROCOLD TAB@144',
  'qty': 12,
  'unit_price': 913,
  'total': 11000,
  'raw_lines': ['PROCOLD TAB@144',
   '12 @913',
   '10.956',
   '10.956',
   '+44',
   '11.000']},
 {'receipt_id': 'receipt_002',
  'name': 'O/TATER THIN SWD 100',
  'qty': 1,
  'unit_price': 15200,
  'total': 15200,
  'raw_lines': ['NISSIN WeR CHO 11OG',
   '1',
   '9,300',
   '9300',
   'O/TATER THIN SWD 100',
   '1',
   '15,200',
   '15200']},
 {'receipt_id': 'receipt_016',
  'name': 'Check No : 3059689',
  'qty': 1,
  'unit_price': 3059689,
  'total': 3059689,
  'raw_lines': ['Check No : 3059689', '10 May 19 16:32:47']}]

In [116]:
def looks_like_non_item_name(name):
    keywords = ["check", "disc", "pp", "bkp", "kode"]
    return any(k in name.lower() for k in keywords)


In [117]:
final_items = [
    i for i in parsed_items
    if i["name"]
    and i["total"] is not None
    and not looks_like_non_item_name(i["name"])
]


In [118]:
final_items

[{'receipt_id': 'receipt_001',
  'name': 'PROCOLD TAB@144',
  'qty': 12,
  'unit_price': 913,
  'total': 11000,
  'raw_lines': ['PROCOLD TAB@144',
   '12 @913',
   '10.956',
   '10.956',
   '+44',
   '11.000']},
 {'receipt_id': 'receipt_002',
  'name': 'O/TATER THIN SWD 100',
  'qty': 1,
  'unit_price': 15200,
  'total': 15200,
  'raw_lines': ['NISSIN WeR CHO 11OG',
   '1',
   '9,300',
   '9300',
   'O/TATER THIN SWD 100',
   '1',
   '15,200',
   '15200']},
 {'receipt_id': 'receipt_018',
  'name': '2x20000 40,000',
  'qty': 2,
  'unit_price': 200004,
  'total': 40000,
  'raw_lines': ['2x20000 40,000']}]

In [127]:
def is_name_line(text):
    return (
        any(c.isalpha() for c in text)
        and not re.fullmatch(r'[\d.,]+', text.strip())
        and not text.lower().startswith(('disc', 'pp', 'bkp', 'kode', 'check'))
    )


In [128]:
def split_grouped_item(item):
    sub_items = []
    buffer = []
    current_name = None

    for line in item["lines"]:

        # Jika ketemu nama item baru DAN buffer tidak kosong → simpan item lama
        if is_name_line(line) and buffer:
            sub_items.append({
                "receipt_id": item["receipt_id"],
                "lines": buffer.copy(),
                "name": current_name,
                "qty_price": None,
                "total": None,
                "discount": None,
                "ys": []
            })
            buffer.clear()

        buffer.append(line)

        if is_name_line(line):
            current_name = line

    # append terakhir
    if buffer:
        sub_items.append({
            "receipt_id": item["receipt_id"],
            "lines": buffer.copy(),
            "name": current_name,
            "qty_price": None,
            "total": None,
            "discount": None,
            "ys": []
        })

    return sub_items


In [129]:
split_items = []
for it in clean_items:
    split_items.extend(split_grouped_item(it))

parsed_items = [parse_item(i) for i in split_items]
final_items = [i for i in parsed_items if i is not None]


In [131]:
def looks_like_non_item(name):
    if name is None:
        return True

    blacklist = [
        "check", "disc", "pp", "bkp",
        "total", "tunai", "kembali",
        "date", "may", "cashier"
    ]

    return any(k in name.lower() for k in blacklist)


In [132]:
def valid_unit_price(qty, unit_price, total):
    if unit_price is None:
        return False
    if qty * unit_price != total:
        # toleransi OCR kecil
        return abs(qty * unit_price - total) < 1000
    return True


In [133]:
final_clean_items = [
    i for i in final_items
    if not looks_like_non_item(i["name"])
    and valid_unit_price(i["qty"], i["unit_price"], i["total"])
]


In [134]:
final_clean_items

[{'receipt_id': 'receipt_001',
  'name': 'PROCOLD TAB@144',
  'qty': 12,
  'unit_price': 913,
  'total': 11000,
  'raw_lines': ['PROCOLD TAB@144',
   '12 @913',
   '10.956',
   '10.956',
   '+44',
   '11.000']},
 {'receipt_id': 'receipt_002',
  'name': 'NISSIN WeR CHO 11OG',
  'qty': 1,
  'unit_price': 9300,
  'total': 9300,
  'raw_lines': ['NISSIN WeR CHO 11OG', '1', '9,300', '9300']},
 {'receipt_id': 'receipt_002',
  'name': 'O/TATER THIN SWD 100',
  'qty': 1,
  'unit_price': 15200,
  'total': 15200,
  'raw_lines': ['O/TATER THIN SWD 100', '1', '15,200', '15200']}]