# Paddle OCR을 실행해보자

In [253]:
# This machine has a usable GPU with CUDA 11.8 installed, so install the GPU build of PaddlePaddle.
!pip install paddlepaddle-gpu



In [254]:
# Install the PaddleOCR wheel package (version 2.0.1 or newer).
!pip install "paddleocr>=2.0.1"



# 한번 사용해보도록 하자

실행하기 전에 컴퓨터에 cuDNN이 설치되어 있는지 확인하고, 없다면 아래 명령으로 설치한다.

아래 명령은 CUDA 11.8, Ubuntu 22.04(x86_64) 환경 기준이다.

wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cudnn
sudo apt-get -y install cudnn-cuda-11

In [255]:
import paddle

# Report whether this Paddle build was compiled with CUDA support,
# then how many CUDA devices it can see.
cuda_build = paddle.is_compiled_with_cuda()
print(f"Is compiled with CUDA? {cuda_build}")
gpu_count = paddle.device.cuda.device_count()
print(f"CUDA devices:, {gpu_count}")

Is compiled with CUDA? True
CUDA devices:, 1


In [256]:
# 콘솔에서 실행하려면 아래 명령을 입력한다
# paddleocr --image_dir ./책표지_기타_000001.jpg --lang=korean

In [257]:
from paddleocr import PaddleOCR, draw_ocr

# Build the Korean OCR engine once and reuse it in the following cells.
# use_angle_cls=True loads the text-angle classifier; without it the
# `cls=True` argument passed to `ocr.ocr()` in a later cell is ignored
# (PaddleOCR 2.x only logs a warning and skips classification).
ocr = PaddleOCR(use_angle_cls=True, lang="korean")

[2025/03/17 22:55:51] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/uijong/.paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/uijong/.paddleocr/whl/rec/korean/korean_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 32

In [266]:
# Run the same OCR from code instead of the command line.
# PaddleOCR supports Chinese, English, French, German, Korean and Japanese.
# The language model is selected with the `lang` parameter
# (`ch`, `en`, `fr`, `german`, `korean`, `japan`) when the engine is built.
# The `ocr` engine is built once in an earlier cell, which downloads the model
# and loads it into memory; `cls=True` below only has an effect when that
# engine was created with `use_angle_cls=True`.
from PIL import Image

img_path = 'asdfasdf.jpg'
result = ocr.ocr(img_path, cls=True)

# `result` holds one entry per input image. PaddleOCR 2.x stores None for an
# image in which no text was detected, so fall back to an empty list instead
# of iterating over None.
for page in result or []:
    for line in page or []:
        print(line)

# draw result
# Keep only the detections of the first (and only) image; each detection is
# [box, (text, score)].
result = (result[0] if result else None) or []
image = Image.open(img_path).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]

if result:
    im_show = draw_ocr(image, boxes, txts, scores, font_path='MaruBuri-Regular.ttf')
    im_show = Image.fromarray(im_show)
    im_show.save('result.jpg')
    im_show.show()
else:
    # Nothing to visualise; boxes/txts/scores stay defined as empty lists so
    # the following cells can still run.
    print("No text detected")

[2025/03/17 22:58:36] ppocr DEBUG: dt_boxes num : 2, elapsed : 0.04052305221557617
[2025/03/17 22:58:36] ppocr DEBUG: rec_res num  : 2, elapsed : 0.021736621856689453
[[[221.0, 196.0], [1224.0, 160.0], [1233.0, 416.0], [231.0, 452.0]], ('도리토스', 0.9994586706161499)]
[[[252.0, 1890.0], [1429.0, 1819.0], [1446.0, 2092.0], [269.0, 2163.0]], ('나쵸치즈맛', 0.8946758508682251)]


In [259]:
# Keep only the detections whose text is purely alphanumeric.
# str.isalnum() accepts any Unicode letter or digit (Hangul, Latin letters,
# numbers, ...) and rejects text containing spaces or punctuation.
#
# boxes, txts and scores are filtered together so they stay index-aligned;
# filtering only `boxes` would make a second run of this cell pair boxes
# with the wrong texts.
kept = [
    (box, txt, score)
    for box, txt, score in zip(boxes, txts, scores)
    if txt.isalnum()
]
boxes = [box for box, _, _ in kept]
txts = [txt for _, txt, _ in kept]
scores = [score for _, _, score in kept]

for box in boxes:
    print(f"boxes: {box}")

boxes: [[0.0, 278.0], [163.0, 267.0], [169.0, 352.0], [4.0, 363.0]]


In [260]:
# Upgrade pip itself before installing further packages.
!pip install --upgrade pip



In [261]:
# Install OpenCV (imported as `cv2` in the following cells).
!pip install opencv-python



In [267]:
import cv2
import numpy as np

# 1. Re-read the source image as a grayscale array with OpenCV.
image = cv2.imread('asdfasdf.jpg', cv2.IMREAD_GRAYSCALE)
if image is None:
    # cv2.imread reports failure by returning None rather than raising;
    # stop here instead of failing later on `image.shape`.
    raise FileNotFoundError("cv2.imread could not read 'asdfasdf.jpg'")

# 2. Convert the boxes to a numpy array for the array operations that follow.
if boxes:
    boxes_np_arr = np.array(boxes)

    for box in boxes:
        print(box)
else:
    # Empty array so the cropping loop in the next cell simply does nothing.
    boxes_np_arr = np.array([])
    print("No boxes")

[[221.0, 196.0], [1224.0, 160.0], [1233.0, 416.0], [231.0, 452.0]]
[[252.0, 1890.0], [1429.0, 1819.0], [1446.0, 2092.0], [269.0, 2163.0]]


In [278]:
# 3. Each entry of boxes_np_arr is one detected text bounding box.
# Crop the region inside every bounding box (plus a margin) into its own
# upright image, remove the shading, and keep/save the result.
import os

# cv2.imwrite does not create missing directories, so make sure the output
# folder exists before the loop writes into it.
os.makedirs("./cropped", exist_ok=True)

# Loop-invariant settings, computed once instead of once per box.
offset = 50  # margin in pixels added around each box; the raw detections are too tight
max_y = image.shape[0] - 1  # largest valid row index
max_x = image.shape[1] - 1  # largest valid column index
kernel_size = 150
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))

croped_images = []
for idx, box in enumerate(boxes_np_arr):
    # Order the four corners as [top-left, top-right, bottom-right, bottom-left].
    rect = np.zeros((4, 2), dtype="float32")
    s = box.sum(axis=1)  # x + y of every corner
    diff = np.diff(box, axis=1)  # y - x of every corner
    rect[0] = box[np.argmin(s)]  # top-left: smallest x + y
    rect[2] = box[np.argmax(s)]  # bottom-right: largest x + y
    rect[1] = box[np.argmin(diff)]  # top-right: smallest y - x
    rect[3] = box[np.argmax(diff)]  # bottom-left: largest y - x
    top_left, top_right, bottom_right, bottom_left = rect

    # Push every corner outwards by `offset`, clamped to the image bounds.
    pts = np.array(
        [
            [max(0, top_left[0] - offset), max(0, top_left[1] - offset)],
            [min(top_right[0] + offset, max_x), max(0, top_right[1] - offset)],
            [min(bottom_right[0] + offset, max_x), min(bottom_right[1] + offset, max_y)],
            [max(0, bottom_left[0] - offset), min(bottom_left[1] + offset, max_y)],
        ],
        dtype="float32"
    )

    # Output size: the longer of the two opposite edges in each direction.
    widthA = np.sqrt((pts[2][0] - pts[3][0])**2 + (pts[2][1] - pts[3][1])**2)  # bottom edge
    widthB = np.sqrt((pts[1][0] - pts[0][0])**2 + (pts[1][1] - pts[0][1])**2)  # top edge
    maxWidth = max(int(widthA), int(widthB))

    heightA = np.sqrt((pts[1][0] - pts[2][0])**2 + (pts[1][1] - pts[2][1])**2)  # right edge
    heightB = np.sqrt((pts[0][0] - pts[3][0])**2 + (pts[0][1] - pts[3][1])**2)  # left edge
    maxHeight = max(int(heightA), int(heightB))

    # Corner coordinates of the output image, in the same order as `pts`.
    dst = np.array(
        [
            [0, 0],
            [maxWidth - 1, 0],
            [maxWidth - 1, maxHeight - 1],
            [0, maxHeight - 1]
        ],
        dtype="float32"
    )

    # Perspective transformation: map the padded quadrilateral onto the rectangle.
    M = cv2.getPerspectiveTransform(pts, dst)
    warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))

    # Shading removal: a morphological opening with the large kernel yields
    # the background, which is subtracted from the crop; the difference is
    # then stretched to the full 0-255 range.
    background = cv2.morphologyEx(warped, cv2.MORPH_OPEN, kernel)
    gray_minus_bg = cv2.subtract(warped, background)
    normed = cv2.normalize(gray_minus_bg, None, 0, 255, cv2.NORM_MINMAX)

    croped_images.append(normed)
    cv2.imwrite(f"./cropped/croped_{idx}.png", normed)

In [279]:
# Sanity check: number of crops collected by the previous cell.
print(len(croped_images))

2


In [312]:
# Binarise every cropped image.
# The text may be darker or brighter than its background, so after a global
# Otsu threshold the result is inverted whenever its mean is below 128,
# i.e. the majority of pixels always ends up white.
import os

# cv2.imwrite does not create missing directories, so make sure the output
# folder exists before the loop writes into it.
os.makedirs("./character", exist_ok=True)

# 5x5 structuring element for the closing step; identical for every crop.
kernel = np.ones((5, 5), np.uint8)

for idx, croped_image in enumerate(croped_images):

    # Median blur to suppress isolated noise pixels before thresholding.
    blurred = cv2.medianBlur(croped_image, 3)

    # Adaptive-threshold alternative, kept for reference:
    # binarization = cv2.adaptiveThreshold(
    #     blurred,
    #     maxValue=255,
    #     adaptiveMethod=cv2.ADAPTIVE_THRESH_MEAN_C,
    #     thresholdType=cv2.THRESH_BINARY,  # or THRESH_BINARY_INV
    #     blockSize=21,   # size of the neighbourhood block
    #     C=13     # constant subtracted from the local mean
    # )
    # With THRESH_OTSU the threshold is chosen automatically; the 127 passed
    # here is not used.
    _, binarization = cv2.threshold(blurred, 127, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Invert when the image came out mostly black.
    if np.mean(binarization) < 128:
        binarization = cv2.bitwise_not(binarization)

    # Morphological closing with the 5x5 kernel.
    closed = cv2.morphologyEx(binarization, cv2.MORPH_CLOSE, kernel)

    cv2.imwrite(f"./character/character_{idx}.png", closed)


In [336]:
import os
import re

character_dir = "./character/"

# cv2.imwrite does not create missing directories, so make sure the output
# folder exists before any character image is written.
os.makedirs("./final/", exist_ok=True)


def _natural_key(name):
    """Return a sort key that orders embedded numbers numerically.

    Plain string sorting puts ``character_10.png`` before ``character_2.png``,
    which would make the enumerate index below disagree with the crop index
    in the file name.
    """
    parts = re.split(r"(\d+)", name)
    # re.split with a capturing group alternates text, digits, text, ...
    return [int(part) if pos % 2 else part for pos, part in enumerate(parts)]


# Load all images from the character directory.
binarizationed_images = []
for filename in sorted(os.listdir(character_dir), key=_natural_key):
    if not filename.lower().endswith((".png", ".jpg", ".jpeg")):
        continue

    # Local names differ from the notebook-level `img_path` / `image` so that
    # re-running earlier cells still sees the original values.
    char_path = os.path.join(character_dir, filename)
    # Load as grayscale directly.
    gray = cv2.imread(char_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        # cv2.imread returns None for unreadable files; skip them instead of
        # failing inside cv2.threshold.
        print(f"Skipping unreadable image: {char_path}")
        continue

    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Invert the binarised image before contour detection.
    binarizationed_images.append(cv2.bitwise_not(thresh))

print(f"Loaded {len(binarizationed_images)} images")

for idx, inverted in enumerate(binarizationed_images):
    contours, _ = cv2.findContours(
        inverted,
        cv2.RETR_EXTERNAL,
        cv2.CHAIN_APPROX_SIMPLE
    )

    # Extract one character image per outer contour.
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)

        print(f"Contour: {x}, {y}, {w}, {h}")

        # Cut out the bounding rectangle and invert it back before saving.
        character = cv2.bitwise_not(inverted[y:y+h, x:x+w])
        cv2.imwrite(f"./final/character_{idx}_{x}_{y}_{w}_{h}.png", character)


Loaded 2 images
Contour: 844, 251, 227, 60
Contour: 857, 59, 198, 146
Contour: 571, 50, 245, 246
Contour: 18, 48, 252, 242
Contour: 292, 40, 232, 267
Contour: 808, 238, 210, 50
Contour: 1065, 196, 159, 116
Contour: 726, 58, 48, 271
Contour: 820, 57, 191, 151
Contour: 557, 55, 159, 186
Contour: 1044, 53, 116, 134
Contour: 294, 52, 230, 240
Contour: 44, 44, 233, 273
Contour: 1176, 40, 71, 162
