In [1]:
%pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install --upgrade datasets



In [5]:
import json
import os
import datasets 
from datasets import DatasetBuilder, SplitGenerator
# Find for instance the citation on arxiv or on the dataset repo/website
_CITATION = """\
@article{seker2022generalized, title={A generalized framework for recognition of expiration dates on product packages using fully convolutional networks}, author={Seker, Ahmet Cagatay and Ahn, Sang Chul}, journal={Expert Systems with Applications}, pages={117310}, year={2022}, publisher={Elsevier} }
"""

_DESCRIPTION = """\
The dataset for Date detection in the proposed framework aims to provide annotated images that are relevant for training and evaluating models tasked with detecting dates within product labels or similar contexts.
"""

_HOMEPAGE = "https://acseker.github.io/ExpDateWebsite/"

_LICENSE = "https://licenses.nuget.org/AFL-3.0"

_URLs = {
    "products_synth": "https://huggingface.co/datasets/dimun/ExpirationDate/resolve/main/Products-Synth.zip?download=true",
    "products_real": "https://huggingface.co/datasets/dimun/ExpirationDate/resolve/main/Products-Real.zip?download=true",
}


def has_extension(file_path, extensions):
    _, file_extension = os.path.splitext(file_path)
    return file_extension.lower() in extensions


logger = datasets.logging.get_logger(__name__)


class ExpirationDate(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("0.0.1")
    CATEGORIES = ["prod", "date", "due", "code"]
    
    def __init__(self):
        pass
    def _info(self):
        features = datasets.Features(
            {
                "id": datasets.Value("string"),
                "transcriptions": datasets.Sequence(datasets.Value("string")),
                "bboxes_block": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
                "categories": datasets.Sequence(datasets.features.ClassLabel(names=self.CATEGORIES)),
                "image_path": datasets.Value("string"),
                "width": datasets.Value("int32"),
                "height": datasets.Value("int32")
            }
        )

        return datasets.DatasetInfo(
            # Đây là mô tả sẽ xuất hiện trên trang bộ dữ liệu.
            description=_DESCRIPTION,
            # Các tính năng/mục tiêu của tập dữ liệu
            features=features,
            # Trang chủ của tập dữ liệu cho tài liệu
            homepage=_HOMEPAGE,
            # Giấy phép cho tập dữ liệu nếu có
            license=_LICENSE,
            # Trích dẫn cho tập dữ liệu
            citation=_CITATION,
        )

    def _split_generators(self):
        """Returns SplitGenerators."""
        
        # Đường dẫn đến các thư mục đã giải nén
        synth_data_dir = os.path.join(".", "Products-Synth")
        real_data_dir = os.path.join(".", "Products-Real")

        return [
            SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": synth_data_dir,
                    "split": "train",  # Chỉ định rõ ràng cho huấn luyện
                },
            ),
            SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": real_data_dir,
                    "split": "validation",  # Chỉ định rõ ràng cho xác thực
                },
            ),
            SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": real_data_dir,
                    "split": "test",  # Chỉ định rõ ràng cho kiểm tra
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        logger.info(
            f"⏳ Generating examples from = {filepath} to the split {split}")
        ann_file = os.path.join(filepath, split, "annotations.json")

        # lấy json
        with open(ann_file, "r", encoding="utf8") as f:
            features_map = json.load(f)

        img_dir = os.path.join(filepath, split, "images")
        img_listdir = os.listdir(img_dir)

        for guid, filename in enumerate(img_listdir):
            if filename.endswith(".jpg"):
                image_features = features_map[filename]
                image_ann = image_features.get("ann")

                transcriptions = [box.get("transcription", "")
                                  for box in image_ann]
                bboxes_block = [box.get("bbox") for box in image_ann]
                categories = [box.get("cls") if box.get(
                    "cls") in self.CATEGORIES else "date" for box in image_ann]

                # lấy ảnh
                image_path = os.path.join(img_dir, filename)

                yield guid, {
                    "id": filename,
                    "transcriptions": transcriptions,
                    "bboxes_block": bboxes_block,
                    "categories": categories,
                    "image_path": image_path,
                    "width": image_features.get("width"),
                    "height": image_features.get("height"),
                }

# Tạo đối tượng của lớp ExpirationDateDataset
dataset_builder = ExpirationDate()

# Gọi hàm _split_generators
dataset = dataset_builder._split_generators()

# In kết quả
for split in dataset:
    print(f"Split Name: {split.name}, Filepath: {split.gen_kwargs['filepath']}")

Split Name: train, Filepath: .\Products-Synth
Split Name: validation, Filepath: .\Products-Real
Split Name: test, Filepath: .\Products-Real


In [30]:
import easyocr
import cv2
import matplotlib.pyplot as plt

# Khởi tạo EasyOCR
reader = easyocr.Reader(['en'])  # Bạn có thể chỉ định ngôn ngữ khác nếu cần

def detect_text(image_path):
    # Đọc hình ảnh
    img = cv2.imread(image_path)

    # Nhận diện văn bản
    results = reader.readtext(img)

    return results, img
# Thay đổi đường dẫn đến hình ảnh của bạn
image_path = './train/images/img_00006.jpg'
detected_texts, img = detect_text(image_path)

# Vẽ hình chữ nhật quanh văn bản phát hiện
for (bbox, text, prob) in detected_texts:
    (top_left, top_right, bottom_right, bottom_left) = bbox
    top_left = tuple(map(int, top_left))
    bottom_right = tuple(map(int, bottom_right))

    cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)
    cv2.putText(img, text, (top_left[0], top_left[1]-10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

# Hiển thị hình ảnh với văn bản được phát hiện
plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()

  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  model.load_state_dict(torch.load(model_path, map_location=device))


ValueError: Invalid input type. Supporting format = string(file path or url), bytes, numpy array

In [11]:

# Create an instance of the ExpirationDate dataset builder
dataset_builder = ExpirationDate()

# Call the _split_generators method
dataset_splits = dataset_builder._split_generators()

# Print the split information
for split in dataset_splits:
    print(f"Split Name: {split.name}, Filepath: {split.gen_kwargs['filepath']}")

Split Name: train, Filepath: .\Products-Synth
Split Name: validation, Filepath: .\Products-Real
Split Name: test, Filepath: .\Products-Real


In [36]:
import os
import json
import logging

# Khởi tạo logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def generate_examples(filepath, split):
    # Ghi log thông tin
    logger.info(f"⏳ Generating examples from = {filepath} to the split {split}")

    # Lấy đường dẫn đến tệp JSON
    ann_file = os.path.join(filepath, split, "annotations.json")

    # Lấy đường dẫn tuyệt đối
    absolute_path = os.path.abspath(ann_file)

    # In đường dẫn tuyệt đối
    print(f"Absolute path to annotations.json: {absolute_path}")

    # Kiểm tra xem tệp có tồn tại không
    if not os.path.exists(absolute_path):
        logger.error(f"Error: The file does not exist at {absolute_path}")
        return

    logger.info("Data loaded successfully!")
    
    # Đọc dữ liệu JSON
    with open(absolute_path, 'r', encoding='utf-8') as f:
        features_map = json.load(f)

    # Kiểm tra sự tồn tại của thư mục hình ảnh
    img_dir = os.path.join(filepath, split, "images")
    if not os.path.exists(img_dir):
        logger.error(f"Error: The image directory does not exist at {img_dir}")
        return

    img_listdir = os.listdir(img_dir)

    for guid, filename in enumerate(img_listdir):
        if filename.endswith(".jpg"):
            image_features = features_map.get(filename)
            if image_features is None:
                logger.warning(f"Warning: No features found for {filename}")
                continue  # Bỏ qua nếu không có dữ liệu cho ảnh

            image_ann = image_features.get("ann")

            transcriptions = [box.get("transcription", "") for box in image_ann]
            bboxes_block = [box.get("bbox") for box in image_ann]
            categories = [box.get("cls") if box.get("cls") in self.CATEGORIES else "date" for box in image_ann]

            # Lấy đường dẫn ảnh
            image_path = os.path.join(img_dir, filename)

            yield guid, {
                "id": filename,
                "transcriptions": transcriptions,
                "bboxes_block": bboxes_block,
                "categories": categories,
                "image_path": image_path,
                "width": image_features.get("width"),
                "height": image_features.get("height"),
            }

# Ví dụ gọi hàm
filepath = './ExpirationDate/Products-Real/train'  # Ví dụ
split = 'train'  # Ví dụ

for example in generate_examples(filepath, split):
    print(example)

INFO:__main__:⏳ Generating examples from = ./ExpirationDate/Products-Real/train to the split train
ERROR:__main__:Error: The file does not exist at c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\train\annotations.json


Absolute path to annotations.json: c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\train\annotations.json


In [39]:
import os
import json
import easyocr
import cv2

# Khởi tạo EasyOCR
reader = easyocr.Reader(['en'])

def detect_expiration_date(image_path):
    img = cv2.imread(image_path)
    results = reader.readtext(img)
    expiration_dates = []
    
    for (bbox, text, prob) in results:
        if "expiry" in text.lower() or "expiration" in text.lower():
            expiration_dates.append(text)

    return expiration_dates

# Sử dụng đường dẫn tuyệt đối
json_path = os.path.abspath('./ExpirationDate/Products-Real/train/annotations.json')

# Kiểm tra xem tệp có tồn tại không
if not os.path.exists(json_path):
    print(f"Error: The file does not exist at {json_path}")
else:
    # Đọc dữ liệu JSON
    with open(json_path, 'r', encoding='utf-8') as f:
        features_map = json.load(f)

    img_dir = os.path.join(os.path.dirname(json_path), "images")
    for filename, features in features_map.items():
        if features.get("ann"):
            image_path = os.path.join(img_dir, filename)
            
            if not os.path.exists(image_path):
                print(f"Warning: Image file does not exist at {image_path}")
                continue

            expiration_dates = detect_expiration_date(image_path)
            print(f"Detected expiration dates for {filename}: {expiration_dates}")

Error: The file does not exist at c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\annotations.json


In [42]:
import os
import json
import logging
import cv2

# Khởi tạo logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def generate_examples(filepath, split):
    logger.info(f"⏳ Generating examples from = {filepath} to the split {split}")

    ann_file = os.path.join(filepath, split, "annotations.json")
    absolute_path = os.path.abspath(ann_file)
    print(f"Absolute path to annotations.json: {absolute_path}")

    if not os.path.exists(absolute_path):
        logger.error(f"Error: The file does not exist at {absolute_path}")
        return

    logger.info("Data loaded successfully!")

    with open(absolute_path, 'r', encoding='utf-8') as f:
        features_map = json.load(f)

    img_dir = os.path.join(filepath, split, "images")
    if not os.path.exists(img_dir):
        logger.error(f"Error: The image directory does not exist at {img_dir}")
        return

    img_listdir = os.listdir(img_dir)

    for guid, filename in enumerate(img_listdir):
        if filename.endswith(".jpg"):
            image_features = features_map.get(filename)
            if image_features is None:
                logger.warning(f"Warning: No features found for {filename}")
                continue

            image_ann = image_features.get("ann")

            transcriptions = [box.get("transcription", "") for box in image_ann]
            bboxes_block = [box.get("bbox") for box in image_ann]
            categories = [box.get("cls") for box in image_ann]

            image_path = os.path.join(img_dir, filename)

            yield guid, {
                "id": filename,
                "transcriptions": transcriptions,
                "bboxes_block": bboxes_block,
                "categories": categories,
                "image_path": image_path,
                "width": image_features.get("width"),
                "height": image_features.get("height"),
            }

def display_images(filepath, split):
    for guid, example in generate_examples(filepath, split):
        image_path = example["image_path"]
        image = cv2.imread(image_path)

        for idx, annotation in enumerate(example['bboxes_block']):
            bbox = annotation
            cv2.rectangle(image, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)

            if example['transcriptions'][idx]:
                cv2.putText(image, example['transcriptions'][idx], (bbox[0], bbox[1] - 10), 
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        cv2.imshow(example["id"], image)
        cv2.waitKey(0)

    cv2.destroyAllWindows()

# Ví dụ gọi hàm
filepath = './ExpirationDate/Products-Real/train'
split = 'train'

display_images(filepath, split)

INFO:__main__:⏳ Generating examples from = ./ExpirationDate/Products-Real/train to the split train
ERROR:__main__:Error: The file does not exist at c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\train\annotations.json


Absolute path to annotations.json: c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\train\annotations.json


error: OpenCV(4.10.0) D:\a\opencv-python\opencv-python\opencv\modules\highgui\src\window.cpp:1295: error: (-2:Unspecified error) The function is not implemented. Rebuild the library with Windows, GTK+ 2.x or Cocoa support. If you are on Ubuntu or Debian, install libgtk2.0-dev and pkg-config, then re-run cmake or configure script in function 'cvDestroyAllWindows'


In [43]:
import matplotlib.pyplot as plt

def display_images(filepath, split):
    for guid, example in generate_examples(filepath, split):
        image_path = example["image_path"]
        image = cv2.imread(image_path)

        for idx, annotation in enumerate(example['bboxes_block']):
            bbox = annotation
            cv2.rectangle(image, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)

            if example['transcriptions'][idx]:
                cv2.putText(image, example['transcriptions'][idx], (bbox[0], bbox[1] - 10), 
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.title(example["id"])
        plt.axis('off')
        plt.show()

# Ví dụ gọi hàm
filepath = './ExpirationDate/Products-Real/train'
split = 'train'

display_images(filepath, split)

INFO:__main__:⏳ Generating examples from = ./ExpirationDate/Products-Real/train to the split train
ERROR:__main__:Error: The file does not exist at c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\train\annotations.json


Absolute path to annotations.json: c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\train\annotations.json


In [38]:
print(f"Checking for JSON file at: {json_path}")

Checking for JSON file at: ./ExpirationDate/Products-Real/train/annotations.json


In [32]:
import os

# Lấy đường dẫn tuyệt đối
json_path = os.path.abspath('./ExpirationDate/Products-Real/train/annotations.json')
print(json_path)

c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\annotations.json


In [46]:
import os
import cv2
import json
import logging
import matplotlib.pyplot as plt

# Khởi tạo logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def generate_examples(filepath, split):
    logger.info(f"⏳ Generating examples from = {filepath} to the split {split}")

    ann_file = os.path.join(filepath, split, "annotations.json")
    absolute_path = os.path.abspath(ann_file)
    print(f"Absolute path to annotations.json: {absolute_path}")

    if not os.path.exists(absolute_path):
        logger.error(f"Error: The file does not exist at {absolute_path}")
        return

    logger.info("Data loaded successfully!")

    with open(absolute_path, 'r', encoding='utf-8') as f:
        features_map = json.load(f)

    img_dir = os.path.join(filepath, split, "images")
    if not os.path.exists(img_dir):
        logger.error(f"Error: The image directory does not exist at {img_dir}")
        return

    img_listdir = os.listdir(img_dir)

    for guid, filename in enumerate(img_listdir):
        if filename.endswith(".jpg"):
            image_features = features_map.get(filename)
            if image_features is None:
                logger.warning(f"Warning: No features found for {filename}")
                continue

            image_ann = image_features.get("ann")

            transcriptions = [box.get("transcription", "") for box in image_ann]
            bboxes_block = [box.get("bbox") for box in image_ann]
            categories = [box.get("cls") for box in image_ann]

            image_path = os.path.join(img_dir, filename)

            yield guid, {
                "id": filename,
                "transcriptions": transcriptions,
                "bboxes_block": bboxes_block,
                "categories": categories,
                "image_path": image_path,
                "width": image_features.get("width"),
                "height": image_features.get("height"),
            }

def display_single_image(filepath, split, image_id):
    for guid, example in generate_examples(filepath, split):
        print(f"Checking image: {example['id']}")  # In ra tên ảnh đang kiểm tra
        if example["id"] == image_id:
            image_path = example["image_path"]
            image = cv2.imread(image_path)

            for idx, annotation in enumerate(example['bboxes_block']):
                bbox = annotation
                cv2.rectangle(image, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 2)

                if example['transcriptions'][idx]:
                    cv2.putText(image, example['transcriptions'][idx], (bbox[0], bbox[1] - 10), 
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

            plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            plt.title(example["id"])
            plt.axis('off')
            plt.show()
            return

    print(f"Image with ID {image_id} not found.")

# Ví dụ gọi hàm để hiển thị một ảnh cụ thể
filepath = './ExpirationDate/Products-Real/train'
split = 'train'
image_id = 'img_00002.jpg'  # Thay đổi ID ảnh theo nhu cầu

display_single_image(filepath, split, image_id)


INFO:__main__:⏳ Generating examples from = ./ExpirationDate/Products-Real/train to the split train
ERROR:__main__:Error: The file does not exist at c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\train\annotations.json


Absolute path to annotations.json: c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\train\annotations.json
Image with ID img_00002.jpg not found.


In [50]:
import os

# Khởi tạo các biến
base_filepath = os.path.abspath('./ExpirationDate/Products-Real')
split = 'train'
ann_file = os.path.join(base_filepath, split, "annotations.json")
img_dir = os.path.join(base_filepath, split, "images")

print("Annotation file path:", ann_file)
print("Image directory path:", img_dir)

# Đọc tệp JSON
if os.path.exists(ann_file):
    with open(ann_file, 'r', encoding='utf-8') as f:
        features_map = json.load(f)
else:
    print("Annotation file does not exist:", ann_file)

# In danh sách ảnh từ JSON
json_images = features_map.keys() if 'features_map' in locals() else []
print("Images in annotations.json:", json_images)

# In danh sách ảnh từ thư mục
img_listdir = os.listdir(img_dir) if os.path.exists(img_dir) else []
print("Images in directory:", img_listdir)

Annotation file path: c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\annotations.json
Image directory path: c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\images
Annotation file does not exist: c:\Users\kienb\Downloads\Test_ORC\ExpirationDate\ExpirationDate\Products-Real\train\annotations.json
Images in annotations.json: []
Images in directory: []


In [51]:
# Tìm các ảnh có trong JSON nhưng không có trong thư mục
missing_in_directory = [img for img in json_images if img not in img_listdir]
print("Images in annotations.json but missing in directory:", missing_in_directory)

# Tìm các ảnh có trong thư mục nhưng không có trong JSON
missing_in_json = [img for img in img_listdir if img not in json_images]
print("Images in directory but missing in annotations.json:", missing_in_json)

Images in annotations.json but missing in directory: []
Images in directory but missing in annotations.json: []


In [58]:
import json
import os

# Đường dẫn đến tệp JSON
json_file_path = '../ExpirationDate/Products-Real/train/annotations.json'  # Thay đổi đường dẫn cho phù hợp

# Đọc tệp JSON
if os.path.exists(json_file_path):
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Truy cập dữ liệu
    for image_id, image_info in data.items():
        print(f"Image ID: {image_id}")
        print(f"Height: {image_info['height']}")
        print(f"Width: {image_info['width']}")

        for annotation in image_info['ann']:
            print(f"  Class: {annotation['cls']}")
            print(f"  Bounding Box: {annotation['bbox']}")
            if 'transcription' in annotation:
                print(f"  Transcription: {annotation['transcription']}")
else:
    print(f"File does not exist: {json_file_path}")

Image ID: img_00001.jpg
Height: 1008
Width: 756
  Class: date
  Bounding Box: [289, 660, 418, 673]
  Transcription: 2025/01/28
  Class: due
  Bounding Box: [286, 611, 315, 626]
  Class: due
  Bounding Box: [353, 642, 429, 657]
Image ID: img_00002.jpg
Height: 1107
Width: 779
  Class: date
  Bounding Box: [321, 661, 526, 675]
  Transcription: 2021/10/28
  Class: code
  Bounding Box: [378, 694, 469, 707]
Image ID: img_00003.jpg
Height: 1263
Width: 947
  Class: date
  Bounding Box: [285, 767, 519, 784]
  Transcription: 21.05.26
  Class: date
  Bounding Box: [284, 747, 523, 765]
  Transcription: 20.08.27
  Class: due
  Bounding Box: [607, 772, 654, 796]
  Class: prod
  Bounding Box: [612, 733, 660, 757]
  Class: code
  Bounding Box: [527, 767, 550, 785]
Image ID: img_00004.jpg
Height: 839
Width: 1119
  Class: date
  Bounding Box: [551, 393, 758, 409]
  Transcription: 2021.03.23
  Class: due
  Bounding Box: [516, 354, 571, 381]
  Class: due
  Bounding Box: [774, 349, 802, 375]
  Class: code


In [7]:
import json
from PIL import Image, ImageDraw
import os
import matplotlib.pyplot as plt

# Đọc file JSON
with open('../ExpirationDate/Products-Real/train/annotations.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Thư mục chứa ảnh
image_dir = '../ExpirationDate/Products-Real/train/images'
# Kích thước chữ
font_size = 20

# Xử lý từng ảnh trong JSON
for img_file, img_data in data.items():
    image_path = os.path.join(image_dir, img_file)
    try:
        with Image.open(image_path) as img:
            draw = ImageDraw.Draw(img)

            # Duyệt qua các chú thích và vẽ chữ
            for entry in img_data["ann"]:
                if "transcription" in entry:
                    bbox = entry["bbox"]
                    transcription = entry["transcription"]
                    # Vẽ chữ tại tọa độ trong bbox
                    draw.text((bbox[0], bbox[1]), transcription, fill="white")

            # Lưu ảnh đã chỉnh sửa
            modified_image_path = os.path.join(image_dir, f"{img_file}")
            img.save(modified_image_path)
            print(f"Đã xử lý và lưu: {modified_image_path}")

            

    except Exception as e:
        print(f"Lỗi khi xử lý {img_file}: {e}")


Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00001.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00002.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00003.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00004.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00005.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00006.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00007.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00008.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00009.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00010.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00011.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00012.jpg
Đã xử lý và lưu: ../ExpirationDate/Products-Real/train/images\img_00013.jpg
Đã xử lý và 

In [9]:
import json
import os

def process_images(ann_file, image_dir):
    # Đọc file JSON
    with open(ann_file, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    # Xử lý từng ảnh trong JSON
    for img_file, img_data in data.items():
        image_path = os.path.join(image_dir, img_file)
        try:
            # Lấy thông tin từ ảnh
            image_features = img_data
            transcriptions = [entry.get("transcription", "") for entry in image_features.get("ann", [])]
            bboxes_block = [entry.get("bbox") for entry in image_features.get("ann", [])]
            categories = [entry.get("cls", "unknown") for entry in image_features.get("ann", [])]

            # Trả về thông tin ảnh
            yield {
                "id": img_file,
                "transcriptions": transcriptions,
                "bboxes_block": bboxes_block,
                "categories": categories,
                "image_path": image_path,
                "width": image_features.get("width"),
                "height": image_features.get("height"),
            }
        except Exception as e:
            print(f"Lỗi khi xử lý {img_file}: {e}")

# Sử dụng hàm
ann_file = '../ExpirationDate/Products-Real/train/annotations.json'
image_dir = '../ExpirationDate/Products-Real/train/images'

for image_info in process_images(ann_file, image_dir):
    print(image_info)

{'id': 'img_00001.jpg', 'transcriptions': ['2025/01/28', '', ''], 'bboxes_block': [[289, 660, 418, 673], [286, 611, 315, 626], [353, 642, 429, 657]], 'categories': ['date', 'due', 'due'], 'image_path': '../ExpirationDate/Products-Real/train/images\\img_00001.jpg', 'width': 756, 'height': 1008}
{'id': 'img_00002.jpg', 'transcriptions': ['2021/10/28', ''], 'bboxes_block': [[321, 661, 526, 675], [378, 694, 469, 707]], 'categories': ['date', 'code'], 'image_path': '../ExpirationDate/Products-Real/train/images\\img_00002.jpg', 'width': 779, 'height': 1107}
{'id': 'img_00003.jpg', 'transcriptions': ['21.05.26', '20.08.27', '', '', ''], 'bboxes_block': [[285, 767, 519, 784], [284, 747, 523, 765], [607, 772, 654, 796], [612, 733, 660, 757], [527, 767, 550, 785]], 'categories': ['date', 'date', 'due', 'prod', 'code'], 'image_path': '../ExpirationDate/Products-Real/train/images\\img_00003.jpg', 'width': 947, 'height': 1263}
{'id': 'img_00004.jpg', 'transcriptions': ['2021.03.23', '', '', ''], 'b

In [4]:
# Xử lý từng ảnh trong JSON
for img_file, img_data in data.items():
    image_path = os.path.join(image_dir, img_file)
    try:
        with Image.open(image_path) as img:
            draw = ImageDraw.Draw(img)

            # Duyệt qua các chú thích và vẽ chữ
            for entry in img_data["ann"]:
                if "transcription" in entry:
                    bbox = entry["bbox"]
                    transcription = entry["transcription"]
                    # Vẽ chữ tại tọa độ trong bbox
                    draw.text((bbox[0], bbox[1]), transcription, fill="white")

            # Lưu ảnh đã chỉnh sửa
            modified_image_path = os.path.join(image_dir, f"modified_{img_file}")
            img.save(modified_image_path)
            print(f"Đã xử lý và lưu: {modified_image_path}")

            # Hiển thị ảnh đã nhúng
            plt.imshow(img)
            plt.axis('off')  # Tắt trục
            plt.show()


SyntaxError: incomplete input (953046937.py, line 24)

In [2]:
def process_annotations(image_folder, annotations_file):
    # Đọc tệp JSON chứa nhãn
    with open(annotations_file, 'r', encoding='utf-8') as file:
        annotations = json.load(file)

    # Đánh giá mô hình trên tập dữ liệu
    for img_file, img_data in annotations.items():
        image_path = os.path.join(image_folder, img_file)
        if not os.path.exists(image_path):
            print(f"File không tồn tại: {image_path}")
            continue
        
        # Chỉ nhận diện transcription
        for annotation in img_data.get('ann', []):
            transcription = annotation.get('transcription', '')
            if transcription:
                print(f"Hình ảnh: {img_file}, Transcription: {transcription}")

                
image_folder = '../ExpirationDate/Products-Real/train/images'
annotations_file = '../ExpirationDate/Products-Real/train/annotations.json'
process_annotations(image_folder, annotations_file)

Hình ảnh: img_00001.jpg, Transcription: 2025/01/28
Hình ảnh: img_00002.jpg, Transcription: 2021/10/28
Hình ảnh: img_00003.jpg, Transcription: 21.05.26
Hình ảnh: img_00003.jpg, Transcription: 20.08.27
Hình ảnh: img_00004.jpg, Transcription: 2021.03.23
Hình ảnh: img_00005.jpg, Transcription: 23.07.2022
Hình ảnh: img_00006.jpg, Transcription: 22.10.2020
Hình ảnh: img_00006.jpg, Transcription: 22.10.2021
Hình ảnh: img_00007.jpg, Transcription: 04/03/2023
Hình ảnh: img_00008.jpg, Transcription: 29 NOV 2021
Hình ảnh: img_00009.jpg, Transcription: 25.09.2021
Hình ảnh: img_00010.jpg, Transcription: 25 07 2020
Hình ảnh: img_00010.jpg, Transcription: 23 10 2021
Hình ảnh: img_00011.jpg, Transcription: 15-09-2021
Hình ảnh: img_00012.jpg, Transcription: 15.10.2021
Hình ảnh: img_00013.jpg, Transcription: FEB/26/21
Hình ảnh: img_00014.jpg, Transcription: MAR 22 21
Hình ảnh: img_00015.jpg, Transcription: 2021.07.01
Hình ảnh: img_00016.jpg, Transcription: 03/04/23
Hình ảnh: img_00017.jpg, Transcription

In [4]:
from datasets import load_dataset

dataset = load_dataset('../Test_ORC/ExpirationDate/ExpirationDate.py') 

train_data = dataset['train']
validation_data = dataset['validation']
test_data = dataset['test']

FileNotFoundError: Couldn't find a dataset script at c:\Users\kienb\Downloads\Test_ORC\Test_ORC\ExpirationDate\ExpirationDate.py

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import json
import os

In [1]:
class SimpleOCR(nn.Module):
    def __init__(self, num_chars):
        super(SimpleOCR, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, num_chars)

NameError: name 'nn' is not defined