# OCR Text Extraction

Extracting text from document images using Tesseract OCR to create a text dataset for multi-modal learning.

## Setup and Imports

Add the project root to `sys.path`, then import the modules needed for OCR processing.

In [None]:
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path.
# This must happen before the `src.*` imports below so they can be resolved.
# NOTE(review): assumes the notebook is launched from a directory that sits
# directly under the project root (e.g. `notebooks/`) — confirm.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# These imports intentionally come after the sys.path change above.
import pandas as pd
from src.utils.paths import RAW_DATA_DIR
from src.utils.ocr_utils import run_ocr

## Load Sample Dataset

Load the existing labels CSV from the RVL-CDIP sample directory.

In [None]:
# Directory containing the sampled images and their labels file.
sample_dir = RAW_DATA_DIR / "rvl_cdip_sample"

# Read the labels table; later cells use its "image_file" column to locate images.
labels_path = sample_dir / "labels.csv"
df = pd.read_csv(labels_path)

# Preview the first rows.
df.head()

## Run OCR on All Images

Extract text from all 200 document images using Tesseract. This may take a few minutes.

In [None]:
# OCR every image listed in the labels table, in row order, so that the
# resulting list lines up one-to-one with the rows of `df`.
# Iterating the "image_file" column directly avoids building a full Series
# per row (as `DataFrame.iterrows()` does) when only one field is needed.
texts = [run_ocr(sample_dir / image_file) for image_file in df["image_file"]]

# Sanity check: number of OCR results produced.
len(texts)

## Save OCR Results

Add the extracted text to the dataframe as an `ocr_text` column and save the result to `labels_with_text.csv` in the sample directory.

In [None]:
# Attach the OCR output as a new column of the labels table.
df["ocr_text"] = texts

# Write the enriched table into the sample directory, omitting the index.
output_csv = sample_dir / "labels_with_text.csv"
df.to_csv(output_csv, index=False)

# Preview the first rows, now including the OCR text.
df.head()