# Paddle OCR을 실행해보자

In [584]:
# This machine has a usable GPU with CUDA 11.8 installed, so install the GPU build of PaddlePaddle.
# NOTE(review): the default wheel may be built for a different CUDA version than
# 11.8; PaddlePaddle's install guide lists CUDA-specific wheels — confirm.
!pip install paddlepaddle-gpu



In [585]:
# PaddleOCR whl package 설치
!pip install "paddleocr>=2.0.1"



# 한번 사용해보도록 하자

실행하기 전에 컴퓨터에 cuDNN이 설치되어 있는지 확인하자. 설치되어 있지 않다면 아래 명령으로 설치한다.

CUDA 11.8, Ubuntu 22.04(x86_64) 기준:

wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cudnn
sudo apt-get -y install cudnn-cuda-11

In [586]:
# Confirm that the installed Paddle build supports CUDA and count the visible GPUs.
import paddle
print(f"Is compiled with CUDA? {paddle.is_compiled_with_cuda()}")
# NOTE(review): the literal below has a stray comma after the colon, so the
# line is printed as "CUDA devices:, <n>".
print(f"CUDA devices:, {paddle.device.cuda.device_count()}")

Is compiled with CUDA? True
CUDA devices:, 1


In [587]:
# Run the command below in a console to try the PaddleOCR command-line tool.
# NOTE(review): the path starts with "." immediately followed by the file name;
# presumably "./책표지_기타_000001.jpg" was intended — confirm.
# paddleocr --image_dir .책표지_기타_000001.jpg --lang=korean

In [588]:
from paddleocr import PaddleOCR,draw_ocr

# Build the OCR pipeline configured for Korean; the later cells reuse this object.
# NOTE(review): use_angle_cls is not passed here, while a later call passes
# cls=True — presumably the angle classifier is therefore never applied; confirm.
ocr = PaddleOCR(lang="korean")

[2025/03/20 00:06:25] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/uijong/.paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/uijong/.paddleocr/whl/rec/korean/korean_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 32

In [589]:
# Run the same OCR from code instead of the command line, print every detection
# and save/show a visualisation of the result.
from PIL import Image

img_path = 'test_imgs/test14.jpg'
# NOTE(review): cls=True requests the angle classifier; whether it is actually
# applied depends on how `ocr` was constructed — confirm.
result = ocr.ocr(img_path, cls=True)
for res in result:
    # PaddleOCR 2.x returns None in place of the line list for an image in
    # which nothing was detected; iterating over it would raise TypeError.
    if res is None:
        continue
    for line in res:
        print(line)


# draw result
# Only one image was passed in, so keep its line list; fall back to an empty
# list when nothing was detected so the variables below are always defined.
result = result[0] if result and result[0] else []
image = Image.open(img_path).convert('RGB')
# Each line has the form [box_points, (text, confidence)].
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]
if result:
    im_show = draw_ocr(image, boxes, txts, scores, font_path='MaruBuri-Regular.ttf')
    im_show = Image.fromarray(im_show)
    im_show.save('first_ocr_result.jpg')
    im_show.show()
else:
    print("No text detected")

[2025/03/20 00:06:28] ppocr DEBUG: dt_boxes num : 7, elapsed : 0.023177146911621094
[2025/03/20 00:06:28] ppocr DEBUG: rec_res num  : 7, elapsed : 0.030938148498535156
[[[99.0, 139.0], [290.0, 135.0], [291.0, 193.0], [100.0, 197.0]], ('한국축구는', 0.9993413686752319)]
[[[306.0, 136.0], [479.0, 136.0], [479.0, 199.0], [306.0, 199.0]], ('죽었다!!', 0.999909520149231)]
[[[138.0, 207.0], [226.0, 212.0], [225.0, 238.0], [136.0, 233.0]], ('다음카페', 0.9985363483428955)]
[[[232.0, 211.0], [393.0, 217.0], [392.0, 246.0], [231.0, 240.0]], ('"너땐에졌어"', 0.9088563323020935)]


In [590]:
# Keep only the detections whose text is purely alphanumeric according to
# str.isalnum() (Hangul, Latin letters and digits pass; punctuation does not).
#
# boxes, txts and scores are filtered together so that they stay
# index-aligned. Filtering boxes alone leaves txts[i] describing a different
# box, and re-running the cell would then filter with mismatched indices.
kept = [
    (box, txt, score)
    for box, txt, score in zip(boxes, txts, scores)
    if txt.isalnum()
]
boxes = [box for box, _, _ in kept]
txts = [txt for _, txt, _ in kept]
scores = [score for _, _, score in kept]

for box in boxes:
    print(f"boxes: {box}")

boxes: [[99.0, 139.0], [290.0, 135.0], [291.0, 193.0], [100.0, 197.0]]
boxes: [[138.0, 207.0], [226.0, 212.0], [225.0, 238.0], [136.0, 233.0]]


In [591]:
# Upgrade pip itself.
!pip install --upgrade pip



In [592]:
# Install OpenCV (imported as cv2 in the following cells).
!pip install opencv-python



In [593]:
import cv2
import numpy as np

# 1. Reload the source image with OpenCV as a single-channel grayscale array.
source_image_path = 'test_imgs/test14.jpg'
image = cv2.imread(source_image_path, cv2.IMREAD_GRAYSCALE)
# cv2.imread does not raise on a missing or unreadable file, it returns None;
# fail here with a clear message instead of later at image.shape.
if image is None:
    raise FileNotFoundError(f"Could not read image: {source_image_path}")

# 2. Convert the boxes to a NumPy array (N x 4 x 2) for efficient arithmetic.
boxes_np_arr = np.array([])
if boxes:
    boxes_np_arr = np.array(boxes)

    for box in boxes:
        print(box)
else:
    print("No boxes")

[[99.0, 139.0], [290.0, 135.0], [291.0, 193.0], [100.0, 197.0]]
[[138.0, 207.0], [226.0, 212.0], [225.0, 238.0], [136.0, 233.0]]


In [594]:
# 3. Each entry of boxes_np_arr is one bounding box (four corner points).
# Every box is padded, rectified with a perspective warp, background-suppressed
# and kept both in memory (croped_images) and as a PNG in the "cropped" folder.
import os

current_path = os.getcwd()
cropped_dir = os.path.join(current_path, "cropped")
os.makedirs(cropped_dir, exist_ok=True)
croped_images = []


def _edge_length(p, q):
    """Return the Euclidean distance between two 2-D points."""
    return np.sqrt((p[0] - q[0])**2 + (p[1] - q[1])**2)


# The detected regions fit the text very tightly, so every side is grown by
# this many pixels before cropping.
offset = 10

# Structuring element for the shading removal step; it does not depend on the
# box, so it is built once instead of on every iteration.
# NOTE(review): 150 px is larger than the height of the boxes recorded in this
# notebook (roughly 60 px before padding) — confirm that an opening with such a
# large kernel gives the intended background estimate.
kernel_size = 150
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))

for idx, box in enumerate(boxes_np_arr):
    # Order the corners as [top-left, top-right, bottom-right, bottom-left].
    rect = np.zeros((4, 2), dtype="float32")
    s = box.sum(axis=1)  # x + y of each corner
    diff = np.diff(box, axis=1)  # y - x of each corner
    rect[0] = box[np.argmin(s)]  # top-left: smallest x + y
    rect[2] = box[np.argmax(s)]  # bottom-right: largest x + y
    rect[1] = box[np.argmin(diff)]  # top-right: smallest y - x
    rect[3] = box[np.argmax(diff)]  # bottom-left: largest y - x

    # Largest valid pixel coordinates, used to clamp the padded corners.
    max_y = image.shape[0] - 1
    max_x = image.shape[1] - 1

    # Push every corner outwards by `offset`, clamped to the image bounds.
    # top-left
    x1 = max(0, rect[0][0] - offset)
    y1 = max(0, rect[0][1] - offset)
    # top-right
    x2 = min(rect[1][0] + offset, max_x)
    y2 = max(0, rect[1][1] - offset)
    # bottom-right
    x3 = min(rect[2][0] + offset, max_x)
    y3 = min(rect[2][1] + offset, max_y)
    # bottom-left
    x4 = max(0, rect[3][0] - offset)
    y4 = min(rect[3][1] + offset, max_y)
    pts = np.array(
        [
            [x1, y1],
            [x2, y2],
            [x3, y3],
            [x4, y4],
        ],
        dtype="float32",
    )

    # Output size: the longer of the two opposite edges in each direction.
    widthA = _edge_length(pts[2], pts[3])  # bottom edge
    widthB = _edge_length(pts[1], pts[0])  # top edge
    maxWidth = max(int(widthA), int(widthB))

    heightA = _edge_length(pts[1], pts[2])  # right edge
    heightB = _edge_length(pts[0], pts[3])  # left edge
    maxHeight = max(int(heightA), int(heightB))

    # A collapsed box would make warpPerspective fail on an empty output size.
    if maxWidth < 1 or maxHeight < 1:
        print(f"Skipping degenerate box {idx}: {maxWidth}x{maxHeight}")
        continue

    # Destination corners of the rectified crop, in the same order as pts.
    dst = np.array(
        [
            [0, 0],
            [maxWidth - 1, 0],
            [maxWidth - 1, maxHeight - 1],
            [0, maxHeight - 1],
        ],
        dtype="float32",
    )

    # Perspective transformation
    M = cv2.getPerspectiveTransform(pts, dst)
    warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))

    # Shading removal: estimate the background with a morphological opening,
    # subtract it, then stretch the result to the full 0-255 range.
    background = cv2.morphologyEx(warped, cv2.MORPH_OPEN, kernel)
    gray_minus_bg = cv2.subtract(warped, background)
    normed = cv2.normalize(gray_minus_bg, None, 0, 255, cv2.NORM_MINMAX)

    croped_images.append(normed)
    # Written through the same base directory that makedirs created above.
    cv2.imwrite(os.path.join(cropped_dir, f"croped_{idx}.png"), normed)

In [595]:
# Sanity check: number of crops collected in croped_images.
print(len(croped_images))

2


In [596]:
# Binarize every crop and save it in the "preprocessed" folder.
# The text can be dark-on-light or light-on-dark, so the polarity is
# normalised after thresholding: when the binary image is mostly black
# (mean below 128) it is inverted, leaving a predominantly white image.
#
# An adaptive threshold (ADAPTIVE_THRESH_MEAN_C, blockSize=21, C=13) was tried
# earlier as an alternative; the active code uses a global Otsu threshold.
preprocessed_out_dir = os.path.join(current_path, "preprocessed")
os.makedirs(preprocessed_out_dir, exist_ok=True)

# Loop-invariant structuring element.
# NOTE(review): with a 1x1 kernel the dilation below leaves the image
# unchanged — confirm whether a larger kernel was intended.
kernel = np.ones((1, 1), np.uint8)

for idx, croped_image in enumerate(croped_images):

    # Median blur to suppress isolated noise pixels before thresholding.
    blurred = cv2.medianBlur(croped_image, 3)

    # Otsu chooses the threshold automatically; the 127 passed here is ignored
    # when THRESH_OTSU is set.
    _, binarization = cv2.threshold(blurred, 127, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Invert when the result is mostly black so the majority ends up white.
    if np.mean(binarization) < 128:
        binarization = cv2.bitwise_not(binarization)

    opened = cv2.dilate(binarization, kernel, iterations=2)

    # Written through the same base directory that makedirs created above.
    cv2.imwrite(os.path.join(preprocessed_out_dir, f"preprocessed_{idx}.png"), opened)


In [597]:
# Load the preprocessed images, find the outer contours and save each
# contour's bounding box as an individual character image.
character_out_dir = os.path.join(current_path, "character")
os.makedirs(character_out_dir, exist_ok=True)
binarizationed_images = []
preprocessed_dir = current_path + "/preprocessed/"


def _numeric_suffix(name):
    """Return the integer after the last '_' in the file stem, or -1 if there is none."""
    tail = os.path.splitext(name)[0].rsplit("_", 1)[-1]
    return int(tail) if tail.isdecimal() else -1


# Sort by the numeric index in the file name: a plain lexicographic sort puts
# "preprocessed_10" before "preprocessed_2", so idx below would no longer
# match the index of the crop the file came from.
# NOTE(review): files left in this folder by an earlier run are picked up as
# well — clear the folder first if that is not wanted.
preprocessed_names = sorted(
    (name for name in os.listdir(preprocessed_dir) if name.endswith((".png", ".jpg", ".jpeg"))),
    key=lambda name: (_numeric_suffix(name), name),
)

for filename in preprocessed_names:
    preprocessed_path = os.path.join(preprocessed_dir, filename)
    # Load as grayscale directly
    img = cv2.imread(preprocessed_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None for an unreadable file; check before thresholding.
    if img is None:
        print(f"Skipping unreadable image: {preprocessed_path}")
        continue
    _, thresh = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Inverted before findContours, which traces the non-zero (white) pixels.
    binarizationed_images.append(cv2.bitwise_not(thresh))

print(f"Loaded {len(binarizationed_images)} images")

# The loop variable is deliberately not called `image`, so the source image
# held in the notebook-global `image` is not overwritten by this cell.
for idx, binary_image in enumerate(binarizationed_images):
    contours, _ = cv2.findContours(
        binary_image,
        cv2.RETR_EXTERNAL,
        cv2.CHAIN_APPROX_SIMPLE,
    )

    # Extract one image per outer contour.
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)

        print(f"Contour: {x}, {y}, {w}, {h}")

        # Drop regions whose width exceeds 1.4 times their height.
        if w / h > 1.4:
            continue

        # Invert the crop back before saving.
        character = cv2.bitwise_not(binary_image[y:y+h, x:x+w])
        cv2.imwrite(
            os.path.join(character_out_dir, f"character_{idx}_{x}_{y}_{w}_{h}.png"),
            character,
        )


Loaded 2 images
Contour: 166, 48, 31, 19
Contour: 22, 46, 25, 15
Contour: 163, 37, 39, 9
Contour: 0, 21, 15, 57
Contour: 86, 18, 35, 47
Contour: 17, 17, 20, 27
Contour: 123, 12, 37, 54
Contour: 167, 10, 31, 22
Contour: 39, 9, 45, 55
Contour: 96, 8, 15, 8
Contour: 21, 8, 12, 7
Contour: 0, 45, 12, 1
Contour: 54, 14, 10, 16
Contour: 11, 13, 12, 16
Contour: 75, 12, 23, 22
Contour: 66, 12, 8, 22
Contour: 24, 11, 28, 22
Contour: 106, 10, 3, 8


In [598]:
# Run recognition only (no detection) on every extracted character image and
# keep the ones recognised as a single Hangul character with high confidence.
import re


os.makedirs(current_path + "/character_extracted", exist_ok=True)
character_extracted_dir = current_path + "/character_extracted/"
character_dir = current_path + "/character/"

# Matches any character that is not a Hangul syllable or jamo; compiled once
# because it is applied to every file.
non_hangul_pattern = re.compile(r"[^가-힣ㄱ-ㅎㅏ-ㅣ]")

for idx, filename in enumerate(sorted(os.listdir(character_dir))):
    if not filename.endswith((".png", ".jpg", ".jpeg")):
        continue

    img = cv2.imread(f"{character_dir}" + filename, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None for an unreadable file.
    if img is None:
        continue

    # Skip images that are almost entirely white: mean intensity above 200 on
    # the 0-255 scale (for a pure black/white image, roughly 78% white pixels).
    # Checked before OCR so no recognition call is spent on blank crops.
    if np.mean(img) > 200:
        continue

    # Stored under its own name so the detection result kept in the
    # notebook-global `result` is not overwritten by this loop.
    rec_result = ocr.ocr(img, det=False, rec=True, cls=False)

    # Nothing recognised.
    if not rec_result or not rec_result[0] or not rec_result[0][0]:
        continue

    text = rec_result[0][0][0]
    confidence = rec_result[0][0][1]

    # Skip results containing anything other than Hangul.
    if non_hangul_pattern.search(text):
        continue

    if text and len(text) == 1 and confidence > 0.9:
        print(f"{idx}th // Character: {text}, Confidence: {confidence}")
        cv2.imwrite(character_extracted_dir + f"character_{idx}_{confidence:.2f}.jpg", img)


1th // Character: 구, Confidence: 0.9112302660942078
3th // Character: 국, Confidence: 0.9955982565879822
4th // Character: 죽, Confidence: 0.9493946433067322
