# Paddle OCR을 실행해보자

In [390]:
# CUDA-11.8이 설치되어 있는 GPU 사용 가능한 컴퓨터기 때문에 gpu 버전 다운로드
# !pip install paddlepaddle-gpu
# %pip install paddlepaddle-gpu

In [391]:
# PaddleOCR whl package 설치
# !pip install "paddleocr>=2.0.1"
# %pip install paddleocr

# 한번 사용해보도록 하자

실행하기 전에 컴퓨터에 cuDNN이 설치되어 있는지 확인하자. 설치되어 있지 않다면 아래 명령으로 설치한다.

CUDA 11.8, Ubuntu 22.04 (x86_64) 기준:

wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cudnn
sudo apt-get -y install cudnn-cuda-11

In [392]:
import paddle
# Report whether this Paddle build was compiled with CUDA, then how many CUDA devices it sees.
cuda_enabled = paddle.is_compiled_with_cuda()
print(f"Is compiled with CUDA? {cuda_enabled}")
gpu_count = paddle.device.cuda.device_count()
print(f"CUDA devices:, {gpu_count}")

Is compiled with CUDA? True
CUDA devices:, 3


In [393]:
# 콘솔에 아래 명령 입력
# paddleocr --image_dir ./책표지_기타_000001.jpg --lang=korean

In [394]:
from paddleocr import PaddleOCR,draw_ocr

# Build the PaddleOCR pipeline with the Korean language setting; `ocr` is reused by the later cells.
# NOTE(review): a later cell calls ocr.ocr(..., cls=True). The angle classifier is presumably
# only active when use_angle_cls=True is passed here — confirm against the PaddleOCR docs.
ocr = PaddleOCR(lang="korean")

[2025/04/01 13:32:12] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/sslab3/.paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/sslab3/.paddleocr/whl/rec/korean/korean_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 32

In [395]:
# Run the same OCR from code instead of the command line.

image_path = 'inputs/2.png'
# NOTE(review): `ocr` is built without use_angle_cls=True, so cls=True may have no effect — confirm.
result = ocr.ocr(image_path, cls=True)


# draw result
from PIL import Image
# ocr() returns one entry per input image. That entry can be None when no text
# was detected, so fall back to an empty list instead of iterating over None.
result = result[0] or []
image = Image.open(image_path).convert('RGB')
# Each detection is [box, (text, score)].
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]
im_show = draw_ocr(image, boxes, txts, scores, font_path='MaruBuri-Regular.ttf')
im_show = Image.fromarray(im_show)
im_show.save('paddle_ocr_result.png')
im_show.show()

# Keep only the boxes whose recognised text is non-empty.
# (No character-class filtering happens here; the Hangul-only filter is applied in a later cell.)
boxes = [box for box, txt in zip(boxes, txts) if txt]

for box in boxes:
    print(f"boxes: {box}")

[2025/04/01 13:32:16] ppocr DEBUG: dt_boxes num : 70, elapsed : 0.05645275115966797
[2025/04/01 13:32:16] ppocr DEBUG: rec_res num  : 70, elapsed : 0.2253570556640625
boxes: [[282.0, 212.0], [867.0, 212.0], [867.0, 354.0], [282.0, 354.0]]
boxes: [[2991.0, 357.0], [3467.0, 318.0], [3478.0, 460.0], [3002.0, 500.0]]
boxes: [[2610.0, 368.0], [2965.0, 347.0], [2974.0, 498.0], [2619.0, 519.0]]
boxes: [[2028.0, 403.0], [2278.0, 403.0], [2278.0, 525.0], [2028.0, 525.0]]
boxes: [[2303.0, 401.0], [2586.0, 380.0], [2595.0, 498.0], [2312.0, 519.0]]
boxes: [[1401.0, 429.0], [2000.0, 405.0], [2005.0, 536.0], [1406.0, 559.0]]
boxes: [[962.0, 449.0], [1364.0, 434.0], [1368.0, 548.0], [966.0, 563.0]]
boxes: [[197.0, 464.0], [668.0, 455.0], [670.0, 569.0], [199.0, 578.0]]
boxes: [[726.0, 460.0], [927.0, 460.0], [927.0, 553.0], [726.0, 553.0]]
boxes: [[3060.0, 557.0], [3290.0, 557.0], [3290.0, 696.0], [3060.0, 696.0]]
boxes: [[2746.0, 570.0], [3028.0, 541.0], [3044.0, 696.0], [2761.0, 724.0]]
boxes: [[18

In [396]:
# !pip install --upgrade pip
# %pip install --upgrade pip

In [397]:
# !pip install opencv-python
# %pip install opencv-python

In [398]:
import cv2
import numpy as np

# 1. Load the page as a grayscale image with cv2.imread.
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

# 2. Pack the boxes into a NumPy array so the later steps can use array operations.
if boxes:
    boxes_np_arr = np.array(list(map(np.array, boxes)))

    for box in boxes:
        print(box)
else:
    # Nothing to crop: keep an empty array so the downstream loop simply does not run.
    boxes_np_arr = np.array([])
    print("No boxes")

[[282.0, 212.0], [867.0, 212.0], [867.0, 354.0], [282.0, 354.0]]
[[2991.0, 357.0], [3467.0, 318.0], [3478.0, 460.0], [3002.0, 500.0]]
[[2610.0, 368.0], [2965.0, 347.0], [2974.0, 498.0], [2619.0, 519.0]]
[[2028.0, 403.0], [2278.0, 403.0], [2278.0, 525.0], [2028.0, 525.0]]
[[2303.0, 401.0], [2586.0, 380.0], [2595.0, 498.0], [2312.0, 519.0]]
[[1401.0, 429.0], [2000.0, 405.0], [2005.0, 536.0], [1406.0, 559.0]]
[[962.0, 449.0], [1364.0, 434.0], [1368.0, 548.0], [966.0, 563.0]]
[[197.0, 464.0], [668.0, 455.0], [670.0, 569.0], [199.0, 578.0]]
[[726.0, 460.0], [927.0, 460.0], [927.0, 553.0], [726.0, 553.0]]
[[3060.0, 557.0], [3290.0, 557.0], [3290.0, 696.0], [3060.0, 696.0]]
[[2746.0, 570.0], [3028.0, 541.0], [3044.0, 696.0], [2761.0, 724.0]]
[[1865.0, 611.0], [2339.0, 597.0], [2343.0, 744.0], [1870.0, 758.0]]
[[2349.0, 603.0], [2714.0, 589.0], [2718.0, 702.0], [2353.0, 717.0]]
[[1367.0, 639.0], [1826.0, 639.0], [1826.0, 765.0], [1367.0, 765.0]]
[[515.0, 672.0], [1319.0, 646.0], [1324.0, 772.0

Error: no "view" rule for type "image/png" passed its test case
       (for more information, add "--debug=1" on the command line)


In [399]:
# 3. 각 box = bounding box
# bounding box 영역 안에 있는 각 영역을 Crop해서 새로운 이미지로 추출한다
import os
import shutil

# Rectify every detected quadrilateral with a perspective warp and keep each
# result both in memory (`croped_images`) and on disk (cropped/croped_<idx>.png).
current_path = os.getcwd()
cropped_dir = os.path.join(current_path, "cropped")
# Recreate the output directory so crops from a previous run never linger.
shutil.rmtree(cropped_dir, ignore_errors=True)
os.makedirs(cropped_dir, exist_ok=True)
croped_images = []

# The detected regions hug the text very tightly, so every corner is pushed
# outwards by `offset` pixels before cropping. Defined outside the loop because
# it is loop-invariant and the projection cell further down reads it as well.
offset = 5
# Largest valid pixel index along each axis, used to clamp the padded corners.
image_height, image_width = image.shape[:2]
image_height -= 1
image_width -= 1

for idx, box in enumerate(boxes_np_arr):
    # Order the corners as [top-left, top-right, bottom-right, bottom-left].
    rect = np.zeros((4, 2), dtype="float32")
    s = box.sum(axis=1)  # x + y of each corner
    diff = np.diff(box, axis=1)  # y - x of each corner
    rect[0] = box[np.argmin(s)]  # top-left: smallest x + y
    rect[2] = box[np.argmax(s)]  # bottom-right: largest x + y
    rect[1] = box[np.argmin(diff)]  # top-right: smallest y - x
    rect[3] = box[np.argmax(diff)]  # bottom-left: largest y - x

    # Push each corner outwards by `offset`, clamping on the side it moves towards.
    # top-left
    x1 = max(0, rect[0][0] - offset)
    y1 = max(0, rect[0][1] - offset)
    # top-right
    x2 = min(rect[1][0] + offset, image_width)
    y2 = max(0, rect[1][1] - offset)
    # bottom-right
    x3 = min(rect[2][0] + offset, image_width)
    y3 = min(rect[2][1] + offset, image_height)
    # bottom-left
    x4 = max(0, rect[3][0] - offset)
    y4 = min(rect[3][1] + offset, image_height)
    pts = np.array(
        [
            [x1, y1],
            [x2, y2],
            [x3, y3],
            [x4, y4],
        ],
        dtype="float32",
    )

    # Output size: the longer of the two opposite edges in each direction.
    width_bottom = np.sqrt((pts[2][0] - pts[3][0])**2 + (pts[2][1] - pts[3][1])**2)
    width_top = np.sqrt((pts[1][0] - pts[0][0])**2 + (pts[1][1] - pts[0][1])**2)
    max_width = max(int(width_bottom), int(width_top))

    height_right = np.sqrt((pts[1][0] - pts[2][0])**2 + (pts[1][1] - pts[2][1])**2)
    height_left = np.sqrt((pts[0][0] - pts[3][0])**2 + (pts[0][1] - pts[3][1])**2)
    max_height = max(int(height_right), int(height_left))

    # Corner coordinates in the output image, in the same order as `pts`.
    dst = np.array(
        [
            [0, 0],
            [max_width - 1, 0],
            [max_width - 1, max_height - 1],
            [0, max_height - 1],
        ],
        dtype="float32",
    )

    # Perspective transformation
    M = cv2.getPerspectiveTransform(pts, dst)
    warped = cv2.warpPerspective(image, M, (max_width, max_height))

    # # Shading removal via a morphological opening (disabled experiment)
    # kernel_size = 150
    # kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))

    # background = cv2.morphologyEx(warped, cv2.MORPH_OPEN, kernel)
    # gray_minus_bg = cv2.subtract(warped, background)
    # normed = cv2.normalize(gray_minus_bg, None, 0, 255, cv2.NORM_MINMAX)

    croped_images.append(warped)
    # Write into the directory that was recreated above.
    cv2.imwrite(os.path.join(cropped_dir, f"croped_{idx}.png"), warped)

print(f"# of Cropped Images: {len(croped_images)}")

# of Cropped Images: 68


In [400]:

# Recreate an empty "projection" output directory and reset the list that
# collects the character images cut out by the vertical projection.
projection_dir = current_path + "/projection"
shutil.rmtree(projection_dir, ignore_errors=True)
os.makedirs(projection_dir, exist_ok=True)
vertical_projected_images = []

def hough_transform(img):
    """Rotate ``img`` by the median angle of the lines found by a Hough transform.

    Args:
        img: Image array accepted by ``cv2.Canny`` (presumably a single-channel
            8-bit image such as the binarised crops — TODO confirm).

    Returns:
        The rotated image with the same width and height as ``img``. ``img``
        itself is returned unchanged when no line is detected or every
        detected line falls inside the excluded angle band.
    """
    # Edge detection on the image that was passed in, not on a global, so the
    # function works for whatever image the caller supplies.
    edges = cv2.Canny(img, 50, 200)

    # Detect straight lines with the standard Hough transform.
    lines = cv2.HoughLines(edges, 1, np.pi / 180, 100)
    if lines is None:
        return img

    # Collect the angle of every detected line and estimate the skew from the median.
    angles = []
    for line in lines:
        theta = line[0][1]  # line[0] is (rho, theta); rho is not needed here
        deg = np.rad2deg(theta)
        # Lines whose theta lies in (80, 100) degrees are skipped.
        # NOTE(review): in OpenCV's (rho, theta) form theta is the angle of the
        # line's normal, so this band corresponds to near-horizontal lines.
        # Confirm that skipping these (rather than near-vertical ones) is intended.
        if 80 < deg < 100:
            continue
        # Fold each angle into (-90, 90] BEFORE taking the median. Otherwise a mix of
        # angles just above 0 and just below 180 (the same small tilt in opposite
        # directions) would produce a median near 90 and a bogus quarter-turn rotation.
        angles.append(deg - 180 if deg > 90 else deg)

    if not angles:
        return img

    rotate_angle = float(np.median(angles))

    # Rotate about the image centre, keeping the original canvas size.
    (h, w) = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, rotate_angle, 1.0)
    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated
    
# Split every cropped text region into character candidates with a vertical
# projection profile: columns with (almost) no ink separate the characters.

# A column counts as a gap when its summed intensity is at or below this value.
# Foreground pixels are 255 after binarisation, so this allows at most 9 of them.
threshold_value = 2500

# enumerate() already yields the crops in index order, so no extra sorting is needed.
# The loop variable is not called `image`, so the page-level `image` stays intact.
for idx, crop in enumerate(croped_images):
    print("#" * 50)
    # Inverse binarisation: pixels at or below 128 become 255, so dark glyphs turn white.
    bin_img = cv2.threshold(crop, 128, 255, cv2.THRESH_BINARY_INV)[1]

    # Hough-transform deskew (currently disabled)
    # hough = hough_transform(bin_img)

    vertical_sum = np.sum(bin_img, axis=0)  # sum of the pixel values in each column
    print(f"vertical_sum: {vertical_sum}")

    columns = bin_img.shape[1]  # number of columns

    start_col = 0
    # Iterate one position past the last column. That extra step acts as a closing
    # gap, so a run of ink that reaches the right edge is emitted instead of dropped.
    for col in range(columns + 1):
        if col == columns or vertical_sum[col] <= threshold_value:
            # Columns [start_col, col) are treated as one character candidate.
            if col - start_col > 5:  # skip runs that are too narrow
                # Pad the slice by `offset` columns on both sides, clamped to the crop.
                # `offset` is the padding value defined in the cropping cell.
                char_img = bin_img[:, max(0, start_col - offset):min(col + offset, columns)]
                vertical_projected_images.append(char_img)
                # Saved re-inverted, i.e. dark glyph on a light background.
                cv2.imwrite(f"./projection/projection_{idx}_{start_col}_{col}.png", cv2.bitwise_not(char_img))
            start_col = col + 1

print(len(vertical_projected_images))


##################################################
vertical_sum: [    0     0     0     0     0     0     0     0     0     0     0     0
  1020  1275  1785  1785  2040  2040  2040  2040  2040  2040  2040  2040
  2040  2040  2040  2040  2040  2040  2040  2040  2040  2040  2040  4845
  7905  9945 10455 10965 11220 11730 10965  8670  6885  6375  6120  8925
 10455 10710 11730 12240 13005 13260 11985 11730 11220 10710 10710 14535
 14790 15300 14535 14790 13515 13515 12240 11730 10455 10455 10200  9945
  9945  9945 10200 10455  9690  9945  9945  9945  9690  9690  9180  8925
  9180  9945 10455  9945  9690  9435  7905  6630  5100  2550  1530  1530
  1785  1785  1785  1785  1530  1530  1785  1785  1785  1785  1785  1785
  1785  1785  1785  1785  1785  2040  2040  2040  1530  1530  1275  1020
     0     0     0     0     0     0     0     0     0     0     0   765
  1275  1785  3060  3570  3825  3825  4080  3825  3825  3825  3825  3570
  3570  3315  3570  3570  4590  6375  7905  8925  8925  943

##################################################
vertical_sum: [    0     0     0     0     0     0     0     0     0  1530  2040  2295
  2550  2550  2805  2805  2805  2805  3060  3315  3315  3315  3570  3570
  3825  3825  4080  4590  5100  5865  6630  7395  7905  7395  7395  7140
  6885  6630  6630  6375  5610  4590  3825  3060  3315  3315  3570  3570
  3825  4080  4080  4080  4080  4590  5355  5865  6630  7650  8415  7905
  7650  7140  6630  6120  5355  4335  2295  2295  2040  2295  2040  1785
  1785  1530   765     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0 11220 18360 18870 19125 19125
 18870 18105  6120  2295  2040  2040  2040  2040  2040  2040  2040  2040
  2040  2040  2040  2040  2040  2040  1785  1785  1530  1275   765     0
     0     0     0     0  1275  1530  1530  1530  1530  1785  1785  1785
  1785  1785  1785  2805  3825  4845  6120  7395  8160  8670 10200 12750
 13515 13770 13770 13260 13005 11730 10200  9945  9690  943

In [401]:
# Keep only the projected character images that are at least 20 px in both
# height and width; blur them, invert them and store them in best_size/.
best_size_root = current_path + "/best_size"
shutil.rmtree(best_size_root, ignore_errors=True)
os.makedirs(best_size_root, exist_ok=True)
best_size_images = []
for idx, image in enumerate(vertical_projected_images):
    rows, cols = image.shape[:2]

    if rows >= 20 and cols >= 20:
        # Gaussian blur with a 5x5 kernel
        image = cv2.GaussianBlur(image, (5, 5), 0)

        # Invert once and reuse the same array for the file and the in-memory list.
        inverted = cv2.bitwise_not(image)
        cv2.imwrite(f"./best_size/character_{idx}.png", inverted)
        best_size_images.append(inverted)


In [402]:
import re

# Run recognition-only OCR on every image in best_size/ and keep those that are
# recognised as exactly one Hangul syllable with a confidence above 0.9.
final_root = current_path + "/final"
shutil.rmtree(final_root, ignore_errors=True)
os.makedirs(final_root, exist_ok=True)
save_dir = current_path + "/final/"
best_size_dir = current_path + "/best_size/"
result_cnt = 0
result_images = []
# Matches any character outside the precomposed Hangul syllable range.
non_hangul = re.compile(r"[^가-힣]")
# idx is the position in the lexicographically sorted directory listing,
# not the number embedded in the file name.
for idx, filename in enumerate(sorted(os.listdir(best_size_dir))):
    if not filename.endswith((".png", ".jpg", ".jpeg")):
        continue

    img = cv2.imread(best_size_dir + filename, cv2.IMREAD_GRAYSCALE)
    result = ocr.ocr(img, det=False, rec=True, cls=False)

    # Skip images for which nothing was recognised.
    if not (result and result[0] and result[0][0]):
        continue

    text = result[0][0][0]
    confidence = result[0][0][1]

    # Skip results that contain anything other than Hangul syllables.
    if non_hangul.search(text):
        continue

    if len(text) == 1 and confidence > 0.9:
        result_cnt += 1
        print(f"#### {idx}th // Character: {text}, Confidence: {confidence}")
        result_images.append((img, text))


# Files are named after the recognised character, so a later image of the same
# character replaces an earlier one. NOTE(review): confirm this is intended.
for image, text in result_images:
    cv2.imwrite(save_dir + f"{text}.png", image)


#### 0th // Character: 동, Confidence: 0.9869202375411987
#### 1th // Character: 호, Confidence: 0.9914605617523193
#### 3th // Character: 고, Confidence: 0.9890404343605042
#### 5th // Character: 초, Confidence: 0.9959445595741272
#### 6th // Character: 강, Confidence: 0.9996229410171509
#### 9th // Character: 들, Confidence: 0.9974265694618225
#### 17th // Character: 호, Confidence: 0.9991931319236755
#### 23th // Character: 도, Confidence: 0.9996401071548462
#### 29th // Character: 같, Confidence: 0.9993897676467896
#### 30th // Character: 은, Confidence: 0.9823402166366577
#### 31th // Character: 존, Confidence: 0.9995673298835754
#### 33th // Character: 호, Confidence: 0.9980819225311279
#### 39th // Character: 설, Confidence: 0.9995643496513367
#### 57th // Character: 루, Confidence: 0.9025371670722961
#### 58th // Character: 토, Confidence: 0.9982927441596985
#### 60th // Character: 는, Confidence: 0.9998307228088379
#### 61th // Character: 루, Confidence: 0.9789921641349792
#### 62th // Charact