# LIBRARY REQUIREMENTS

In [None]:
# Dependencies for the image classification model (TensorFlow / Keras)

%pip install tensorflow==2.15.1 keras==2.15.0 pillow numpy  
# Dependencies for the docstrange (Nanonets OCR) pipeline and detectron2 segmentation
# NOTE(review): detectron2 is built from source here, before the pinned torch install
# further down — presumably it needs torch to already be present at build time; confirm
# the install order on a fresh environment.
%pip install -q 'git+https://github.com/facebookresearch/detectron2.git'
%pip install opencv-python-headless ultralytics
%pip install docstrange
%pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
%pip install tqdm pyyaml matplotlib scikit-image shapely
%pip install imgaug pycocotools
# Dependencies for PaddleOCR
%pip install paddlepaddle
%pip install paddleocr


In [None]:
!python --version
!python3.10 - <<'PY'
import sys, tensorflow as tf, keras, PIL
print("Python:", sys.version)
print("TensorFlow:", tf.__version__)
print("Keras:", keras.__version__)
print("Pillow:", PIL.__version__)

# CLASSIFICATION MODEL

In [None]:
import os
from keras.models import load_model
from PIL import Image, ImageOps
import numpy as np

# ==== CONFIGURATION ====
# Folder containing the images to classify
IMAGE_FOLDER = r"D:\New folder\Classification\images"
# Path to the Keras model file
MODEL_PATH = r"D:\New folder\Classification\keras_model.h5"
# Path to the labels file (one class name per line, indexed by the model output)
LABEL_PATH = r"D:\New folder\Classification\labels.txt"

# ==== ENVIRONMENT SETUP ====
np.set_printoptions(suppress=True)
model = load_model(MODEL_PATH, compile=False)
# Read the labels with an explicit encoding and close the handle deterministically
with open(LABEL_PATH, "r", encoding="utf-8") as label_file:
    class_names = label_file.readlines()

# ==== COUNTERS ====
bill_count = 0
no_bill_count = 0

# Input resolution the images are resized to before prediction
size = (224, 224)

print("Model & labels loaded successfully!")

# ==== ITERATE OVER EVERY IMAGE ====
# sorted() makes the processing order (and therefore the log) reproducible
for filename in sorted(os.listdir(IMAGE_FOLDER)):
    if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
        continue  # skip files that are not images

    image_path = os.path.join(IMAGE_FOLDER, filename)
    print(f"\n--- Processing: {filename} ---")

    try:
        # Load the image; convert() returns a fully loaded copy, so the file
        # handle can be released immediately instead of leaking once per image
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Resize (center-crop to fit) and scale pixel values to [-1, 1]
        image = ImageOps.fit(image, size, Image.Resampling.LANCZOS)
        image_array = np.asarray(image)
        normalized_image_array = (image_array.astype(np.float32) / 127.5) - 1

        # Predict on a batch of one image
        data = normalized_image_array[np.newaxis, ...]
        prediction = model.predict(data)
        index = np.argmax(prediction)
        class_name = class_names[index].strip()
        confidence_score = prediction[0][index]

        print(f"→ Class: {class_name} | Confidence: {confidence_score:.4f}")

        # ==== HANDLE BY LABEL ====
        # Any label other than the two below is not counted.
        if class_name == "1 Bill":
            # Bill image (intended for the Paddle model)
            bill_count += 1
        elif class_name == "0 Not Bill":
            # Not a bill (intended for the VLM model)
            no_bill_count += 1

    except Exception as e:
        # Best-effort batch run: report the failing file and continue with the next one
        print(f"❌❌❌❌error in:{filename}: {e}")


print("\n============================")
print(f"Tổng số ảnh bill: {bill_count}")
print(f"Tổng số ảnh no-bill: {no_bill_count}")
print(f"Tổng cộng: {bill_count + no_bill_count}")
print("============================")


# NANONETSOCR MODEL

In [None]:
import os, cv2, torch
import numpy as np
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo

#For Docstrange
from docstrange import DocumentExtractor
import random
import glob

def init_segmentor(threshold=0.5, device=None):
    """
    Build a detectron2 ``DefaultPredictor`` from the model-zoo Mask R-CNN
    (R101-FPN, 3x schedule) instance-segmentation config.

    Args:
        threshold: Value assigned to ``MODEL.ROI_HEADS.SCORE_THRESH_TEST``.
        device: Value assigned to ``MODEL.DEVICE``. When falsy, "cuda" is used
            if ``torch.cuda.is_available()`` and "cpu" otherwise.

    Returns:
        The configured ``DefaultPredictor``.
    """
    zoo_entry = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"

    # Resolve the target device up front
    if device:
        target_device = device
    elif torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    config = get_cfg()
    config.merge_from_file(model_zoo.get_config_file(zoo_entry))
    config.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(zoo_entry)
    config.MODEL.ROI_HEADS.SCORE_THRESH_TEST = threshold
    config.MODEL.DEVICE = target_device
    return DefaultPredictor(config)

# Create one module-level predictor so it can be reused by every call below
predictor = init_segmentor()

# ===============================================================
# 🧠 Function chính: xóa object & lưu đè
# ===============================================================
def remove_objects_and_save(img_path, mask_color=(255, 255, 255)):
    """
    Detect objects with the module-level ``predictor``, paint every predicted
    mask with ``mask_color`` and overwrite the original image file.

    Args:
        img_path: Path of the image to read and overwrite.
        mask_color: Color written into the masked pixels (default white).

    Returns:
        None. The function prints a message and returns early when the image
        cannot be read or when no masks are predicted.
    """
    image = cv2.imread(img_path)
    if image is None:
        print(f"[!] Không đọc được ảnh: {img_path}")
        return

    # Run detection and move the results to the CPU
    detections = predictor(image)["instances"].to("cpu")
    if not detections.has("pred_masks"):
        print(f"[INFO] Không phát hiện mask trong ảnh: {img_path}")
        return

    object_masks = detections.pred_masks.numpy()  # indexed as [N, H, W] by the code below
    if len(object_masks) == 0:
        # Nothing was detected, leave the file untouched
        print(f"[INFO] Ảnh không có object cần xóa: {img_path}")
        return

    # Union of all instance masks: a pixel is covered if any mask marks it
    covered = np.any(object_masks, axis=0)
    image[covered] = mask_color

    # Overwrite the source image with the masked version
    cv2.imwrite(img_path, image)
    print(f"[DONE] Đã xóa object và lưu lại: {img_path}")



def auto_split_image(img_path, diff_thresh=40, min_gap=2000):
    """
    Split a tall image into vertical segments at rows where the background changes.

    A per-row signal is built from (a) the change of the mean LAB color between
    consecutive rows and (b) the mean vertical Sobel response, both normalized
    to [0, 1]. Rows where the combined signal exceeds 0.6, is a local maximum
    and lies more than ``min_gap`` rows after the previous cut become cut points.

    The crops are saved to ``Cropped_Images/{img_name}/`` and named
    ``{img_name}-A1.jpg``, ``{img_name}-A2.jpg``, ...

    Args:
        img_path: Path of the image to split.
        diff_thresh: Currently unused; kept so existing callers keep working.
        min_gap: Minimum number of rows between two consecutive cuts.

    Returns:
        The directory the crops were written to, or None if the image could
        not be read.
    """
    # ==== Load the source image ====
    img = cv2.imread(img_path)
    if img is None:
        print(f"[!] Không thể đọc ảnh: {img_path}")
        return
    h, w, _ = img.shape
    print(f"[INFO] Ảnh gốc: {w}x{h}")

    # ==== Blur to suppress the noise introduced by text ====
    blur = cv2.GaussianBlur(img, (15, 15), 0)

    # ==== Row-wise color change in LAB space ====
    lab = cv2.cvtColor(blur, cv2.COLOR_BGR2LAB)
    mean_color = lab.mean(axis=1)  # average over the horizontal axis -> one color per row
    diff = np.abs(np.diff(mean_color, axis=0)).mean(axis=1)

    # ==== Smooth along the rows and normalize ====
    diff = cv2.GaussianBlur(diff.reshape(-1, 1), (1, 21), 0).flatten()
    diff_norm = (diff - diff.min()) / (diff.max() - diff.min() + 1e-8)

    # ==== Vertical Sobel response adds structural information ====
    gray = cv2.cvtColor(blur, cv2.COLOR_BGR2GRAY)
    sobel = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
    edge_strength = np.mean(np.abs(sobel), axis=1)
    edge_strength = (edge_strength - edge_strength.min()) / (edge_strength.max() - edge_strength.min() + 1e-8)

    # ==== Align lengths (np.diff yields one row fewer than the image height) ====
    min_len = min(len(edge_strength), len(diff_norm))
    edge_strength = edge_strength[:min_len]
    diff_norm = diff_norm[:min_len]

    # ==== Combine both signals ====
    signal = 0.7 * diff_norm + 0.3 * edge_strength

    # ==== Locate the boundaries ====
    boundaries = []
    last_cut = 0
    for y in range(1, len(signal) - 1):
        if signal[y] <= 0.6:
            continue
        # Clamp the window start: for y < 3 a negative start index would wrap
        # around, produce an empty slice and make max() raise ValueError.
        window = signal[max(0, y - 3):y + 3]
        if signal[y] == window.max() and y - last_cut > min_gap:
            boundaries.append(y)
            last_cut = y

    if not boundaries:
        # No cut found: the loop below then saves the whole image as a single crop
        print("[!] Không phát hiện được ranh giới rõ ràng, ảnh không được cắt.")

    # ==== Crop and save ====
    base_name = os.path.splitext(os.path.basename(img_path))[0]
    save_dir = f"Cropped_Images/{base_name}"
    os.makedirs(save_dir, exist_ok=True)

    prev_y = 0
    count = 1
    for y in boundaries + [h]:
        crop = img[prev_y:y, :]
        if crop.shape[0] > 50:  # skip crops that are too small
            save_path = os.path.join(save_dir, f"{base_name}-A{count}.jpg")
            cv2.imwrite(save_path, crop)
            print(f"[+] Saved: {save_path} ({crop.shape[1]}x{crop.shape[0]})")
            count += 1
        prev_y = y

    print(f"[DONE] Đã lưu {count-1} ảnh con vào thư mục: {save_dir}")
    return save_dir


# Pool of DocumentExtractor clients; the OCR helpers pick one at random per call.
# API keys are read from the DOCSTRANGE_API_KEYS environment variable (one key, or
# several separated by commas) instead of being committed to the notebook.
# NOTE(review): the key that used to be hardcoded here has been exposed in the
# notebook history and should be rotated.
_docstrange_api_keys = [
    key.strip()
    for key in os.environ.get("DOCSTRANGE_API_KEYS", "").split(",")
    if key.strip()
]
if not _docstrange_api_keys:
    print("[!] DOCSTRANGE_API_KEYS is not set; creating extractors without an API key.")


def _make_extractor(index):
    """Create one extractor, cycling through the configured API keys (None if unset)."""
    api_key = None
    if _docstrange_api_keys:
        api_key = _docstrange_api_keys[index % len(_docstrange_api_keys)]
    return DocumentExtractor(preserve_layout=True, api_key=api_key)


extractor1 = _make_extractor(0)
extractor2 = _make_extractor(1)
extractor3 = _make_extractor(2)
extractor4 = _make_extractor(3)
extractors_list = [extractor1, extractor2, extractor3, extractor4]


def docstrange_ocr_short(img_path, output_path):
    """
    OCR a single image with docstrange and save the text as ``<image name>.txt``.

    The image is first passed through ``remove_objects_and_save``, which
    overwrites the file with detected objects masked out.

    Args:
        img_path: Path of the image to process.
        output_path: Directory the text file is written to (created if missing).
    """
    extractor = random.choice(extractors_list)
    remove_objects_and_save(img_path)
    result = extractor.extract(img_path)

    schema = {
        "text_top_to_bottom": "string\nstring"
    }

    os.makedirs(output_path, exist_ok=True)
    output_text = result.extract_data(json_schema=schema)["structured_data"]["text_top_to_bottom"]

    # Name the text file after the image
    image_stem = os.path.splitext(os.path.basename(img_path))[0]
    text_file = os.path.join(output_path, image_stem + ".txt")

    # Explicit UTF-8, consistent with docstrange_ocr_long: the platform default
    # encoding may not be able to represent the recognized text.
    with open(text_file, mode="w", encoding="utf-8") as f:
        f.write(output_text)

def docstrange_ocr_long(long_img_path, output_path):
    """
    OCR every crop of a long image and concatenate the text into one file.

    Args:
        long_img_path: Folder containing the cropped images of one long image.
        output_path: Directory the combined ``<folder name>.txt`` is written to
            (created if missing).
    """
    import re  # local import: only needed for the natural-sort key below

    extractor = random.choice(extractors_list)
    schema = {"text_top_to_bottom": "string\nstring"}

    # Create the output folder (if it does not exist yet)
    os.makedirs(output_path, exist_ok=True)
    img_name = "/" + os.path.basename(long_img_path) + ".txt"
    output_file = output_path + img_name

    # Collect every supported image in the folder
    img_extensions = ("*.png", "*.jpg", "*.jpeg", "*.webp")
    img_files = []
    for ext in img_extensions:
        img_files.extend(glob.glob(os.path.join(long_img_path, ext)))

    def natural_key(path):
        # Compare digit runs numerically so "-A2" sorts before "-A10"
        # (a plain lexical sort puts "-A10" first and scrambles the text order).
        parts = re.split(r"(\d+)", os.path.basename(path))
        return [int(part) if part.isdigit() else part.lower() for part in parts]

    # OCR each crop in order and write the text into one file. The file is opened
    # once in write mode so that re-running does not append a duplicate copy.
    with open(output_file, mode="w", encoding="utf-8") as f:
        for img_path in sorted(img_files, key=natural_key):
            remove_objects_and_save(img_path)
            result = extractor.extract(img_path)
            output_text = result.extract_data(json_schema=schema)["structured_data"]["text_top_to_bottom"]
            f.write(output_text + "\n")

    print(f"✅ Saved combined OCR text to: {output_file}")

# PADDLE OCR

In [None]:
#### PADDLE OCR 
class PaddleOCRmodel:
    """Wrapper around PaddleOCR that writes the recognized text of an image to a .txt file."""

    def __init__(self, lang="korean"):
        """
        Initializes the PaddleOCR model.
        Args:
            lang (str): The language for OCR (default is "korean").
        """
        # NOTE(review): `PaddleOCR` is not imported in any visible cell — presumably
        # `from paddleocr import PaddleOCR` is expected to be in scope; confirm.
        self.ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang=lang
        )

    @staticmethod
    def _collect_texts(result, min_confidence):
        """Return the recognized strings whose score exceeds ``min_confidence``.

        Supports both result layouts: mapping-style pages exposing parallel
        ``rec_texts`` / ``rec_scores`` sequences, and the nested-list layout
        where every line is ``[box, (text, confidence)]``.
        """
        texts = []
        for page in result:
            if not page:
                continue
            if hasattr(page, "keys"):
                rec_texts = page.get("rec_texts")
                rec_scores = page.get("rec_scores")
                if rec_texts is None:
                    rec_texts = []
                if rec_scores is None:
                    # No scores reported: keep every recognized text
                    texts.extend(rec_texts)
                    continue
                for text, score in zip(rec_texts, rec_scores):
                    if float(score) > min_confidence:
                        texts.append(text)
            else:
                for line in page:
                    if len(line) >= 2:
                        text, confidence = line[1][0], line[1][1]
                        if confidence > min_confidence:
                            texts.append(text)
        return texts

    @staticmethod
    def _write_lines(path, lines):
        """Write ``lines`` to ``path`` as UTF-8, one entry per line (empty file for no lines)."""
        with open(path, 'w', encoding='utf-8') as f:
            for text in lines:
                f.write(f"{text}\n")

    def run_ocr_and_save_results(self, image_file, txt_dir, min_confidence=0.5):
        """
        Runs PaddleOCR on a single image file and saves the results.
        Args:
            image_file (str): Path to the image file to process.
            txt_dir (str): The directory to save the extracted text file.
            min_confidence (float): Results with a score at or below this value
                are dropped (default 0.5).
        Returns:
            str: Path to the created text file, or None if failed.
        """
        if not image_file:
            print("No image file provided for OCR processing.")
            return None

        if not os.path.exists(image_file):
            print(f"Image file does not exist: {image_file}")
            return None

        os.makedirs(txt_dir, exist_ok=True)

        # Output file is named after the image
        base_name = os.path.splitext(os.path.basename(image_file))[0]
        txt_output_path = os.path.join(txt_dir, f"{base_name}.txt")

        try:
            # Read image using cv2 to ensure proper format
            import cv2
            img = cv2.imread(image_file)
            if img is None:
                print(f"Failed to read image: {image_file}")
                return None

            # Apply preprocessing if utils is available
            try:
                import ExcalibURA_OCR.utils as utils
                img = utils.experiment_gamma_correction(img)  # Pass image data, not path
            except ImportError:
                print("Utils not available, using original image")
            except Exception as e:
                print(f"Preprocessing failed, using original image: {e}")
                img = cv2.imread(image_file)  # Reload original

            # Run OCR on the in-memory image via predict()
            result = self.ocr.predict(img)

            if not result or not result[0]:
                print(f"No OCR results for {image_file}")
                # Create empty text file
                self._write_lines(txt_output_path, [])
                return txt_output_path

            # Extract text from OCR results, dropping low-confidence entries
            extracted_texts = self._collect_texts(result, min_confidence)

            # Create text file in the specified directory
            self._write_lines(txt_output_path, extracted_texts)

            print(f"✅ OCR completed for {base_name}")
            return txt_output_path

        except Exception as e:
            print(f"❌ Error processing {image_file}: {str(e)}")
            # Create empty file on error so every input still has an output file
            try:
                self._write_lines(txt_output_path, [])
                return txt_output_path
            except OSError:
                return None



# MAIN
### PaddleOCR is implemented in a .py file, so you have to run that code from .py files, not from the notebook.
## The PaddleOCR code also uses utils.py.

In [None]:

### Main function
def main(
    image_folder_dir="/content/drive/MyDrive/Chung_Innnovation/PHASE2/images_hyecho",
    result_root="/content/drive/MyDrive/Chung_Innnovation/PHASE2/Result_Text",
    long_image_height=5000,
):
    """
    OCR every image of a folder with docstrange and write one text file per image.

    Images whose height is at least ``long_image_height`` are first split with
    ``auto_split_image`` and processed by ``docstrange_ocr_long``; all other
    images go through ``docstrange_ocr_short``. Files that are skipped or fail
    are listed in ``error.txt`` inside the result folder.

    Args:
        image_folder_dir: Folder containing the input images.
        result_root: Parent folder of the results; the text files are written
            to ``<result_root>/<name of image_folder_dir>``.
        long_image_height: Height in pixels from which an image is treated as long.
    """
    # normpath drops a trailing slash, which would otherwise give an empty folder name
    folder_name = os.path.basename(os.path.normpath(image_folder_dir))
    save_dir = os.path.join(result_root, folder_name)
    os.makedirs(save_dir, exist_ok=True)

    error_log_path = os.path.join(save_dir, "error.txt")

    # Supported image extensions
    valid_extensions = (".jpg", ".jpeg", ".png", ".webp")

    error_files = []

    # sorted() makes the processing order (and the error log) reproducible
    for img_path in sorted(glob.glob(os.path.join(image_folder_dir, "*.*"))):
        ext = os.path.splitext(img_path)[1].lower()

        # 1) Skip unsupported files
        if ext not in valid_extensions:
            error_files.append(f"❌ Unsupported file type: {img_path}")
            continue

        # 2) Read the image
        img = cv2.imread(img_path)
        if img is None:
            error_files.append(f"⚠️ Cannot read image: {img_path}")
            continue

        # 3) Route by image height
        height = img.shape[0]

        try:
            if int(height) >= long_image_height:
                long_img_path = auto_split_image(img_path, diff_thresh=40, min_gap=2000)
                docstrange_ocr_long(long_img_path, output_path=save_dir)
            else:
                docstrange_ocr_short(img_path, output_path=save_dir)
        except Exception as e:
            # Best-effort batch run: record the failure and continue with the next image
            error_files.append(f"💥 Error processing {img_path}: {str(e)}")

    # Write the list of skipped / failed files to the log
    if error_files:
        with open(error_log_path, "w", encoding="utf-8") as f:
            f.write("\n".join(error_files))
        print(f"⚠️ Some files skipped or failed. See log: {error_log_path}")
    else:
        print("✅ All images processed successfully!")

