In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Hugging Face 라이브러리 설치
!pip install huggingface_hub ultralytics
!pip install doclayout-yolo
!git clone https://github.com/opendatalab/DocLayout-YOLO.git
# 필요한 라이브러리 설치
!apt-get update
!apt-get install -y poppler-utils
!pip install pdf2image
!pip install torch transformers==4.40.0 accelerate
!pip install pytesseract
!pip install opencv-python
!pip install pillow
!pip install easyocr

Collecting ultralytics
  Downloading ultralytics-8.3.39-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.12-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.39-py3-none-any.whl (896 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m896.9/896.9 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.12-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.39 ultralytics-thop-2.0.12
Collecting doclayout-yolo
  Downloading doclayout_yolo-0.0.2-py3-none-any.whl.metadata (8.9 kB)
Collecting thop>=0.1.1 (from doclayout-yolo)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl.metadata (2.7 kB)
Downloading doclayout_yolo-0.0.2-py3-none-any.whl (708 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m708.2/708.2 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloadi

In [3]:
from huggingface_hub import hf_hub_download
from ultralytics import YOLO
import numpy as np
import pandas as pd
import cv2
import json
from doclayout_yolo import YOLOv10
import os
import uuid
from pdf2image import convert_from_path
import easyocr
from collections import Counter
from google.colab.patches import cv2_imshow
from tqdm import tqdm
import transformers
import torch

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [4]:
class TableCellExtractor:
    def __init__(self):
        """
        Initialize the TableCellExtractor with necessary components
        """
        # OCR reader initialization
        self.reader = easyocr.Reader(['ko', 'en'])

    def process_image(self, image):
      """
      Process the input image and extract table cells

      Args:
          image: JPG image object (numpy array or file path)

      Returns:
          dict: JSON formatted extraction results
      """
      if isinstance(image, str):
          self.image = cv2.imread(image)
      else:
          self.image = image

      self.result = self.image.copy()

      self.detect_lines()
      self.classify_lines_and_find_intersections()
      self.remove_duplicate_points()

      # 텍스트 추출 및 셀 정보 얻기
      data, extracted_cells = self.extract_text_from_cells()

      # 데이터프레임 생성 및 처리
      df = pd.DataFrame(data)

      # 빈 행/열 제거를 위한 전처리
      # 모든 빈 문자열을 NaN으로 변환
      df = df.replace(r'^\s*$', np.nan, regex=True)
      df = df.replace('', np.nan)

      # 모든 값이 NaN인 행과 열 제거
      df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)

      # 인덱스 리셋
      df = df.reset_index(drop=True)

      # NaN을 다시 빈 문자열로 변환
      df = df.fillna('')

      # 처리된 데이터프레임을 기반으로 셀 정보 업데이트
      processed_cells = []
      for i in range(len(df)):
          for j in range(len(df.columns)):
              # 원본 좌표 찾기
              original_cell = next(
                  (cell for cell in extracted_cells
                  if cell['row'] == i + 1 and cell['col'] == j + 1),
                  None
              )

              if original_cell:
                  processed_cells.append({
                      'row': i + 1,
                      'col': j + 1,
                      'text': df.iloc[i, j],
                      'coordinates': original_cell['coordinates']
                  })

      # 최종 결과를 JSON 형태로 반환
      final_result = {
          'cells': processed_cells,
          'grid_info': {
              'rows': len(df),
              'cols': len(df.columns)
          }
      }

      return final_result

    def detect_lines(self):
        """
        Detect lines in the image using Canny edge detection and Hough transform
        """
        # 1. 엣지 검출 (Canny)
        self.edges = cv2.Canny(self.image, 50, 150, apertureSize=3)

        # 2. Hough 변환 적용하여 선 감지
        self.lines = cv2.HoughLinesP(
            self.edges,
            1,
            np.pi/180,
            threshold=100,
            minLineLength=100,
            maxLineGap=10
        )

        return self.lines

    def classify_lines_and_find_intersections(self):
        """
        Classify lines as horizontal or vertical and find their intersection points
        """
        self.intersection_points = []
        self.horizontal_lines = []
        self.vertical_lines = []

        if self.lines is not None:
            # 선 분류 (수평/수직)
            for line in self.lines:
                x1, y1, x2, y2 = line[0]
                angle = np.abs(np.arctan2(y2 - y1, x2 - x1) * 180.0 / np.pi)

                if angle < 10 or angle > 170:
                    self.horizontal_lines.append(line[0])
                elif 80 < angle < 100:
                    self.vertical_lines.append(line[0])

            # 이미지의 경계에 가상의 테두리 선 추가
            height, width = self.image.shape[:2]
            margin = 10
            self.horizontal_lines.append([margin, margin, width - margin, margin])  # 상단 경계선
            self.horizontal_lines.append([margin, height - margin, width - margin, height - margin])  # 하단 경계선
            self.vertical_lines.append([margin, margin, margin, height - margin])  # 왼쪽 경계선
            self.vertical_lines.append([width - margin, margin, width - margin, height - margin])  # 오른쪽 경계선

            # 교차점 찾기
            self._find_intersection_points()

            # 끝점 처리
            self._process_end_points()

    def _find_intersection_points(self):
        """
        Calculate intersection points between horizontal and vertical lines
        """
        for h_line in self.horizontal_lines:
            for v_line in self.vertical_lines:
                x1, y1, x2, y2 = h_line
                x3, y3, x4, y4 = v_line

                denominator = ((x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4))
                if denominator != 0:
                    t = ((x1 - x3) * (y3 - y4) - (y1 - y3) * (x3 - x4)) / denominator
                    u = -((x1 - x2) * (y1 - y3) - (y1 - y2) * (x1 - x3)) / denominator

                    if 0 <= t <= 1 and 0 <= u <= 1:
                        x = int(x1 + t * (x2 - x1))
                        y = int(y1 + t * (y2 - y1))
                        self.intersection_points.append((x, y))
        # 교차점 정렬
        self.intersection_points = sorted(set(self.intersection_points), key=lambda p: (p[1], p[0]))

    def _process_end_points(self):
        """
        Process end points of lines and combine with intersection points
        """
        # 끝점 수집
        end_points = []
        for line in self.horizontal_lines + self.vertical_lines:
            x1, y1, x2, y2 = line
            end_points.append((x1, y1))
            end_points.append((x2, y2))

        # 최소/최대 좌표 계산
        x_values = [point[0] for point in end_points]
        y_values = [point[1] for point in end_points]

        x_min, x_max = min(x_values), max(x_values)
        y_min, y_max = min(y_values), max(y_values)

        # 필터링된 끝점 선택
        self.filtered_end_points = [
            (x, y) for (x, y) in end_points
            if (x_min <= x <= x_min + 10 or x_max - 10 <= x <= x_max) or
               (y_min <= y <= y_min + 10 or y_max - 10 <= y <= y_max)
        ]

        # 모든 점 결합
        self.all_points = self.intersection_points + self.filtered_end_points

    def remove_duplicate_points(self, distance_threshold=15):
        """
        Remove duplicate points that are within a certain distance threshold

        Args:
            distance_threshold (int): Maximum distance between points to be considered duplicates
        """
        self.unique_points = []
        points_array = np.array(self.all_points)

        # 각 점에 대해 거리를 계산하여 중복 제거
        for point in self.all_points:
            is_unique = True
            for unique_point in self.unique_points:
                distance = np.linalg.norm(np.array(point) - np.array(unique_point))
                if distance <= distance_threshold:
                    is_unique = False
                    break
            if is_unique:
                self.unique_points.append(point)

    def extract_text_from_cells(self, min_height=30, min_width=30):
      """
      Extract text from each cell in the table grid and return with cell coordinates

      Args:
          min_height (int): Minimum height of cell to process
          min_width (int): Minimum width of cell to process

      Returns:
          tuple: (2D list of extracted text, list of cell information)
      """
      # x, y 좌표 분리 및 정렬
      self.x_coords = sorted(list(set([point[0] for point in self.intersection_points])))
      self.y_coords = sorted(list(set([point[1] for point in self.intersection_points])))

      # 격자 구간별 텍스트 추출
      data = []
      extracted_cells = []

      for i in range(len(self.y_coords) - 1):
          row = []
          for j in range(len(self.x_coords) - 1):
              # 격자 영역 좌표 계산
              top_left_x = self.x_coords[j]
              top_left_y = self.y_coords[i]
              bottom_right_x = self.x_coords[j + 1]
              bottom_right_y = self.y_coords[i + 1]

              # 격자 영역 잘라내기
              tile = self.image[top_left_y:bottom_right_y, top_left_x:bottom_right_x]

              # 셀 정보 생성
              cell_info = {
                  'row': i + 1,
                  'col': j + 1,
                  'coordinates': {
                      'top_left': (top_left_x, top_left_y),
                      'bottom_right': (bottom_right_x, bottom_right_y)
                  }
              }

              # 너무 작은 이미지는 빈 텍스트로 처리
              if tile.shape[0] < min_height or tile.shape[1] < min_width:
                  row.append("")
                  cell_info['text'] = ""
                  extracted_cells.append(cell_info)
                  continue

              # EasyOCR로 텍스트 추출
              text_result = self.reader.readtext(tile, detail=0)
              text = "\n".join(text_result).strip()
              row.append(text)
              cell_info['text'] = text
              extracted_cells.append(cell_info)

          data.append(row)

      return data, extracted_cells

In [5]:
import os
import cv2
import json
import numpy as np
import easyocr
from tqdm import tqdm

def extract_text_from_cells(cells_data):
    """Extract text content from cells data"""
    extracted_text = []
    for cell in cells_data:
        if 'text' in cell:
            extracted_text.append(cell['text'])
    return ' '.join(extracted_text)

def process_images_in_directory(input_dir, output_dir):
    """
    Process all images in a directory and extract text.

    Args:
        input_dir (str): Directory containing the images.
        output_dir (str): Directory to save the JSON file with extracted text.
    """
    if not os.path.exists(input_dir):
        raise ValueError(f"Input directory does not exist: {input_dir}")

    results = []

    # Initialize TableCellExtractor once to reuse the EasyOCR reader
    processor = TableCellExtractor()

    # Iterate through all images in the directory
    for file_name in tqdm(os.listdir(input_dir), desc="Processing Images"):
        if file_name.lower().endswith((".jpg", ".jpeg", ".png", ".bmp", ".tiff")):
            image_path = os.path.join(input_dir, file_name)

            # Read image
            image = cv2.imread(image_path)
            if image is None:
                print(f"Failed to read image: {image_path}")
                continue

            # Process image and extract text
            try:
                # Process the image using TableCellExtractor
                result = processor.process_image(image)

                if result and 'cells' in result and 'grid_info' in result:
                    # Extract text from cells
                    extracted_text = extract_text_from_cells(result['cells'])

                    json_data = {
                        "data_id": file_name,
                        "제목": file_name,  # 임시로 파일명 사용
                        "유형": "표",
                        "내용": extracted_text,
                    }
                    results.append(json_data)
                else:
                    print(f"No valid table data found in {file_name}")

            except Exception as e:
                print(f"Error processing {file_name}: {e}")

    # Save results as JSON
    save_results_as_json(results, output_dir)

def save_results_as_json(results, output_dir):
    """Save results to a JSON file."""
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # JSON save path
    results_json_path = os.path.join(output_dir, "result_table.json")

    # Save to JSON
    with open(results_json_path, "w", encoding="utf-8") as json_file:
        json.dump(results, json_file, ensure_ascii=False, indent=4)
    print(f"Extracted text saved to: {results_json_path}")

def process_single_result(result, file_name):
    """
    Process a single result dictionary and convert to desired format.

    Args:
        result (dict): Dictionary containing cells and grid_info
        file_name (str): Name of the file being processed
    """
    extracted_text = extract_text_from_cells(result['cells'])

    json_data = {
        "data_id": file_name,
        "제목": file_name,
        "유형": "표",
        "내용": extracted_text,
    }

    return json_data

In [None]:
# 여러 이미지 처리할 경우
input_dir = "/content/drive/MyDrive/cv project/기본표/기본표jpg"
output_dir = "/content/drive/MyDrive/cv project/기본표/학습데이터"
process_images_in_directory(input_dir, output_dir)



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.1% Complete

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s*$', np.nan, regex=True)
  df = df.replace(r'^\s

Extracted text saved to: /content/drive/MyDrive/cv project/기본표/학습데이터/result_table.json


In [None]:
# Load the JSON data from the file
file_path = '/content/outputs/result_table.json'

with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Process the JSON to remove the content of the "제목" key and completely remove the "cells" key
for entry in data:
    if "제목" in entry:
        entry["제목"] = ""
    if "cells" in entry:
        del entry["cells"]
    if "grid" in entry:
        del entry["grid"]
    entry["요약"] = ""  # Add an empty "요약" key

# Save the modified JSON back to a file
output_file_path = '/content/outputs/result_table_modified.json'
with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(data, file, ensure_ascii=False, indent=4)

In [None]:
import os
import json

# Paths to the necessary files and folders
result_table_path = '/content/drive/MyDrive/cv project/기본표/학습데이터/result_table.json'
folder_path = '/content/drive/MyDrive/cv project/기본표/기본표json'
output_path = '/content/drive/MyDrive/cv project/기본표/학습데이터/result_table_with_summary.json'

try:
    # Load the result_table.json
    with open(result_table_path, 'r', encoding='utf-8') as result_file:
        result_table = json.load(result_file)

    # Iterate over each entry in result_table
    for entry in result_table:
        data_id = entry.get("data_id", "")
        if not data_id:
            continue

        # Extract the matching file name (remove .jpg)
        base_name = data_id.rsplit(".", 1)[0]
        target_json_path = os.path.join(folder_path, f"{base_name}.json")

        # Check if the corresponding JSON file exists
        if os.path.exists(target_json_path):
            # Load the corresponding JSON file
            with open(target_json_path, 'r', encoding='utf-8') as json_file:
                target_data = json.load(json_file)

            # Extract the value of "table_data.text_explanation"
            table_data = target_data.get("table_data", {})
            text_explanation = table_data.get("table_data.text_explanation", "")

            # Add the text_explanation value to the 요약 key in the current entry
            entry["요약"] = text_explanation
        else:
            # If the file does not exist, set 요약 as an empty string
            entry["요약"] = ""

    # Save the updated result_table.json with the 요약 key added
    with open(output_path, 'w', encoding='utf-8') as output_file:
        json.dump(result_table, output_file, ensure_ascii=False, indent=4)

    print(f"Updated result_table.json has been saved to: {output_path}")

except Exception as e:
    print(f"An error occurred: {e}")

Updated result_table.json has been saved to: /content/drive/MyDrive/cv project/기본표/학습데이터/result_table_with_summary.json
