In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Qdrant
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline,  AutoModelForCausalLM, AutoTokenizer

import os
import numpy as np
import traceback
import pytesseract
import easyocr
import cv2
import fitz
import time
from PIL import Image

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_path = "../Data/e-19_Laporan Hasil Audit atas Pengadaan Barang dan Jasa Pekerjaan Pembangunan Revitalisasi Halt.pdf"

In [3]:
# Reading PDF file
try:
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    print(f"✅ Berhasil load {len(docs)} halaman dari {file_path}\n")
except Exception as e:
    print("❌ Gagal load PDF!")
    print(traceback.format_exc())
    exit()

Multiple definitions in dictionary at byte 0x5e5736 for key /Info
Multiple definitions in dictionary at byte 0x5e5743 for key /Info


✅ Berhasil load 17 halaman dari ../Data/e-19_Laporan Hasil Audit atas Pengadaan Barang dan Jasa Pekerjaan Pembangunan Revitalisasi Halt.pdf



In [4]:
reader = easyocr.Reader(['id', 'en'])

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [5]:
# Load PDF dan konversi halaman ke gambar
try:
    doc = fitz.open(file_path)
    print(f"✅ Berhasil load {len(doc)} halaman dari {file_path}\n")
except Exception as e:
    print("❌ Gagal load PDF!")
    print(traceback.format_exc())
    exit()

✅ Berhasil load 17 halaman dari ../Data/e-19_Laporan Hasil Audit atas Pengadaan Barang dan Jasa Pekerjaan Pembangunan Revitalisasi Halt.pdf



In [6]:
# Loop setiap halaman
for i, page in enumerate(doc):
    print(f"\n📃 Halaman {i + 1}")

    try:
        # Render halaman jadi gambar (pixmap)
        pix = page.get_pixmap(dpi=300)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    except Exception as e:
        print("❌ Gagal render halaman ke gambar!")
        print(traceback.format_exc())
        continue

    # OCR pakai pytesseract
    try:
        start_time = time.time()
        text_tess = pytesseract.image_to_string(img, lang='ind')
        tess_time = time.time() - start_time
        print(f"🧠 pytesseract selesai dalam {tess_time:.2f}s")
    except Exception as e:
        print("❌ pytesseract gagal!")
        text_tess = ""

    # OCR pakai easyocr
    try:
        start_time = time.time()
        results = reader.readtext(np.array(img), detail=0, paragraph=True)
        text_easy = "\n".join(results)
        easy_time = time.time() - start_time
        print(f"🧠 easyocr selesai dalam {easy_time:.2f}s")
    except Exception as e:
        print("❌ easyocr gagal!")
        text_easy = ""

    # Preview hasil
    print("\n📌 Hasil OCR (pytesseract):")
    print(text_tess[:500], "...\n")  # tampilkan 500 karakter awal

    print("📌 Hasil OCR (easyocr):")
    print(text_easy[:500], "...\n")

    # Optional: break untuk test 1 halaman dulu
    break


📃 Halaman 1
❌ pytesseract gagal!
🧠 easyocr selesai dalam 34.25s

📌 Hasil OCR (pytesseract):
 ...

📌 Hasil OCR (easyocr):
transiakarta
NOTA DINAS
Kepada Nomor
Direktur Utama
e-19/NOTA-USIDUIPT.TJIII2025
Sifat
Kilat
Lampiran
Hal
Penyampaian Laporan Hasil Audit atas Pengadaan Barang dan Jasa Pekerjaan Pembangunan Revitalisasi Halte BRT Paket € & D
Dengan hormat; Sehubungan dengan Surat Tugas Direktur Utama PT Transportasi Jakarta Nomor 62/TUG DUIPT.TJIPT.TJIII/2024 tanggal 13 Februari 2024 Tentang Audit Atas Pengadaan Barang & Jasa   Pekerjaan   Pembangunan Revitalisasi   Halte BRT Paket C & D, bersama ini kami sampa ...



In [None]:
# Chungking
try:
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    splits = splitter.split_documents(docs)
    print(f"✅ Berhasil split jadi {len(splits)} chunk\n")
except Exception as e:
    print("❌ Gagal split dokumen!")
    print(traceback.format_exc())
    exit()

✅ Berhasil split jadi 0 chunk

