# Imports
## Related modules
> such as numpy and pandas

In [None]:
# === System and File Management ===
import os
import logging
import gc
import psutil

# === Data Handling and Visualization ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# === Image Processing ===
import cv2
from PIL import Image as PILImage  # PIL is often used for RGB image handling

# === Optical Character Recognition (OCR) ===
import pytesseract                 # Tesseract-based OCR
from pix2text import Pix2Text     # Transformer-based OCR model (Pix2Text)


## Self-defined modules
> for config and YOLO packages
> 
> OCR packages may be included later for further use


In [None]:
# self defined modules
from project_function import config, convert_images, vision_crop    # for basic settings, PDF pages output, yolo computing


# Define a logging function to record the system's working state

In [None]:
# === Logging Setup ===
def setup_logging():
    """
    Set up logging to record runtime messages into 'record.log'.

    Attaches an append-mode, UTF-8 file handler to the root logger and sets
    the root logger level to INFO. Each record is formatted as
    "<timestamp> - <level name> - <message>".

    The function is idempotent: if the root logger already has a file handler
    that writes to the same 'record.log', no second handler is attached.
    Without this guard, re-running the notebook cell would add one more
    handler per call and every message would be written several times.
    """
    log_file = "record.log"

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # FileHandler stores the absolute path of its target in `baseFilename`,
    # so compare against the absolute form of our own target.
    target = os.path.abspath(log_file)
    for existing in logger.handlers:
        if isinstance(existing, logging.FileHandler) and existing.baseFilename == target:
            return

    handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
    handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logger.addHandler(handler)


# system configuration

In [None]:
# === Start the file logger before anything else is reported ===
setup_logging()

logging.info("---------- Setting PDF and CSV paths ----------")

# === Path settings, all taken from config.py ===
# Input: location of the source PDFs
pdf_path = config.PDF_PATH
# Output: folder where PDF pages are converted to images
pdf_image_folder = config.PDF_IMAGE_PATH
# Output: folder that receives the cropped TEM images
tem_image_folder = config.TEM_IMAGE_PATH
# Output: folder that receives the description / figure-caption crops
description_image_path = config.DESCRIPTION_PATH
# Output: CSV file used for metadata logging
csv_path = config.CSV_PATH

# === OCR tools (optional, currently disabled) ===
# pytesseract.pytesseract.tesseract_cmd = config.TESSERACT
# p2t = Pix2Text(model_backend="onnx", device="cuda")  # Initialize Pix2Text OCR using GPU

logging.info("Finished setting paths.")

# Main processing loop

In [None]:
# === Batch settings ===
buffer = []             # CSV rows waiting to be appended to csv_path
buffer_threshold = 100  # flush the buffer once it holds at least this many rows

SKIP_THROUGH_INDEX = 30000  # files whose index is <= this value are skipped
LAST_INDEX = 40000          # last file index processed in this run (inclusive)
CSV_COLUMNS = ['parent_image', 'sub_image', 'TEM_type']
NON_TEM_LABELS = ('None', 'SEM')  # classifier labels that are not kept


def _flush_rows(rows):
    """Append the buffered rows to the CSV at ``csv_path`` and empty the buffer."""
    if not rows:
        return
    # Pin the column order so appended rows always line up with the header.
    pd.DataFrame(rows, columns=CSV_COLUMNS).to_csv(csv_path, mode='a', header=False, index=False)
    rows.clear()
    gc.collect()


def _swap_channels(img):
    """Return ``img`` with cv2.COLOR_BGR2RGB applied when its last axis has length 3."""
    if img.shape[-1] == 3:
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img


def _process_pdf(file, rows):
    """
    Crop one PDF into figure, description and TEM sub-images and queue CSV rows.

    Parameters
    ----------
    file : str
        Name of the PDF inside ``pdf_path``.
    rows : list
        Buffer that receives one dict per saved TEM sub-image, with the keys
        'parent_image', 'sub_image' and 'TEM_type'.

    Conversion failures and write failures are logged and skipped; they do not
    raise. All large per-PDF arrays are locals of this function, so they are
    released when it returns.
    """
    try:
        pages = convert_images.convert_pdf_to_image(pdf_path, file, zoom_factor=5)
    except Exception as e:
        logging.error(f"Error converting PDF {file}: {e}")
        return

    if pages is None:
        logging.warning(f"No images extracted from {file}")
        return

    filename = os.path.splitext(file)[0]
    cut_num = 0  # number of figure crops saved so far for this PDF
    for page in pages:
        # Each page exposes a raw byte buffer plus its height/width/channel count.
        page_array = np.frombuffer(page.samples, dtype=np.uint8).reshape(page.height, page.width, page.n)
        crop_images = vision_crop.crop_images(page_array)

        if not crop_images:
            continue

        for crop_image in crop_images:
            cut_image, description_image = vision_crop.image_description(crop_image)
            # Nothing is saved unless both the figure and its description exist,
            # so skip before doing any further cropping or classification.
            if cut_image is None or description_image is None:
                continue

            tem_images = vision_crop.tem_images_crop(cut_image)
            if tem_images is None:
                continue

            # Classify every sub-image exactly once and keep the accepted ones.
            kept = []
            for tem_image in tem_images:
                tem_type = vision_crop.TEM_classifier(tem_image)
                if tem_type not in NON_TEM_LABELS:
                    kept.append((tem_image, tem_type))
            if not kept:
                continue

            cut_image_filename = f"PDF{filename}_Image{cut_num + 1}.png"

            # NOTE(review): cv2.imwrite expects BGR channel order; presumably the
            # page buffers are RGB, so this swap yields BGR - confirm. The TEM
            # sub-images below are written without this swap.
            try:
                cut_image = _swap_channels(cut_image)
                description_image = _swap_channels(description_image)
            except Exception as e:
                logging.warning(f"Skipping {cut_image_filename}: colour conversion failed: {e}")
                continue

            cut_image_path = os.path.join(pdf_image_folder, cut_image_filename)
            description_image_path_full = os.path.join(description_image_path, cut_image_filename)

            # cv2.imwrite reports failure through its return value.
            saved_figure = cv2.imwrite(cut_image_path, cut_image)
            saved_description = cv2.imwrite(description_image_path_full, description_image)
            if not (saved_figure and saved_description):
                logging.error(
                    f"Failed to write {cut_image_filename} "
                    f"(figure saved: {saved_figure}, description saved: {saved_description})"
                )
                continue

            logging.info(f"Saved {cut_image_filename}")

            tem_num = 0
            for tem_image, tem_type in kept:
                tem_image_filename = f"PDF{filename}_Image{cut_num + 1}_{tem_num + 1}.png"
                tem_image_path_full = os.path.join(tem_image_folder, tem_image_filename)

                if not cv2.imwrite(tem_image_path_full, tem_image):
                    # Do not record a CSV row for an image that is not on disk.
                    logging.error(f"Failed to write {tem_image_filename}")
                    continue

                rows.append({
                    'parent_image': cut_image_filename,
                    'sub_image': tem_image_filename,
                    'TEM_type': tem_type
                })
                tem_num += 1

            cut_num += 1


# === Read PDF file list ===
# NOTE(review): os.listdir returns entries in arbitrary order, so the index
# window below is only reproducible while the directory listing stays stable.
# Sorting would make it deterministic but would change which files fall into
# the window relative to earlier runs, so the order is left as is.
pdf_files = os.listdir(pdf_path)

# Create CSV if not exist
if not os.path.isfile(csv_path):
    df = pd.DataFrame(columns=CSV_COLUMNS)
    df.to_csv(csv_path, index=False)

try:
    for i, file in enumerate(pdf_files):
        if i <= SKIP_THROUGH_INDEX:
            print (f'skip {i}th files')
            continue
        # Check the upper bound before any work so that a failed conversion of
        # the last file in the window cannot let the loop run past it.
        if i > LAST_INDEX:
            break
        logging.info(f"Processing {i}th file: {file}")

        _process_pdf(file, buffer)

        # === Write CSV in chunks ===
        if len(buffer) >= buffer_threshold:
            _flush_rows(buffer)

        # === Free memory per PDF ===
        gc.collect()

        if i % 500 == 0:
            mem = psutil.Process().memory_info().rss / 1024**2
            logging.info(f"[Info] Memory used at file {i}: {mem:.2f} MB")
finally:
    # === Write remaining buffer (also when the loop is interrupted) ===
    _flush_rows(buffer)

