## Test

In [None]:
### IMPORTS ###
from pathlib import Path
import pytesseract
from pdf2image import convert_from_path
from pdfminer.high_level import extract_text
import easyocr
from pdf2image import convert_from_path
import numpy as np


### GLOBAL VARIABLES ###
# Paths for input and output directories.
# Path(".").parent is still ".", so both folders resolve relative to the
# current working directory of the kernel.
PATH_INPUT = Path(".").parent.resolve()/'pdf_files'
PATH_OUTPUT = Path(".").parent.resolve()/'extracted_files'
# glob() yields entries in an OS-dependent order; sorting makes the positional
# lookups used below (PDF_FILES[0], PDF_FILES[1]) reproducible across runs.
PDF_FILES = sorted(PATH_INPUT.glob("*.pdf"))

# OCR and text extraction settings
OCR_DPI = 300               # resolution used when rasterising PDF pages
WEAK_CHAR_THRESHOLD = 500   # fewer characters than this => extraction is "weak"
WEAK_LINE_THRESHOLD = 10    # fewer newlines than this => extraction is "weak"



In [None]:

# --- Utility Functions ---

def extract_pdfminer(file_path):
    """Return the text of the PDF at `file_path` as produced by pdfminer's `extract_text`."""
    pdf_text = extract_text(file_path)
    return pdf_text

def extract_ocr(file_path):
    """OCR every page of the PDF at `file_path` and return the combined text.

    Pages are rasterised with `convert_from_path` at `OCR_DPI`, each page image
    is passed to `pytesseract.image_to_string` with the `--psm 1` option, and
    the per-page strings are joined with a newline.
    """
    page_images = convert_from_path(file_path, dpi=OCR_DPI)
    page_texts = []
    for page_image in page_images:
        page_texts.append(pytesseract.image_to_string(page_image, config="--psm 1"))
    return "\n".join(page_texts)

def is_extraction_weak(text):
    """Tell whether `text` looks like a poor extraction.

    Returns True when `text` has fewer than `WEAK_CHAR_THRESHOLD` characters or
    fewer than `WEAK_LINE_THRESHOLD` newline characters; False otherwise.
    """
    # Too short overall: no need to count lines.
    if len(text) < WEAK_CHAR_THRESHOLD:
        return True
    newline_count = text.count("\n")
    return newline_count < WEAK_LINE_THRESHOLD


In [None]:
# Extract the text of the first PDF with pdfminer and save it as
# <pdf name without extension>.txt in PATH_OUTPUT.
try_extract_mine = extract_text(PATH_INPUT / PDF_FILES[0])

# PDF_FILES holds Path objects, which have no .split(); .stem is the file name
# without its final suffix. Keeping it out of nested quotes also lets the
# f-string parse on Python versions before 3.12.
output_name = f"{PDF_FILES[0].stem}.txt"

# open(..., "w") does not create missing directories.
PATH_OUTPUT.mkdir(parents=True, exist_ok=True)
with open(PATH_OUTPUT / output_name, "w", encoding="utf-8") as f:
    f.write(try_extract_mine)

In [65]:
# Display the text returned by pdfminer's extract_text so it can be inspected.
try_extract_mine

'2d Civ. B303096\nCourt of Appeal, Second District, Division 6, California.\nHuy Fong Foods, Inc. v. Underwood Ranches, LP\n66 Cal.App.5th 1112 (Cal. Ct. App. 2021)\n•\n281 Cal. Rptr. 3d 757\nDecided Jul 27, 2021\n2d Civ. No. B303096\n07-27-2021\nHUY FONG FOODS, INC., Plaintiff, Cross-\ndefendant, and Appellant, v. UNDERWOOD\nRANCHES, LP, Defendant, Cross-complainant\nand Appellant; Underwood & Son, LLC, Cross-\nComplainant and Respondent; David Tran, Cross-\ndefendant and Respondent.\nGILBERT, P. J.\nLatham & Watkins, Joshua G. Hamilton, New\nYork, NY, Dixie C. Tauber, Los Angeles, CA,\nRoman \nMartinez, \nCharles \nS. \nDameron,\nWashington, DC, Riley T. Keenan, Washington,\nDC; Pearson, Simon & Warshaw, Sherman Oaks,\nCA, Thomas J. Nolan for Plaintiff, Cross-\ndefendant and Appellant Huy Fong Foods, Inc.,\nand Cross-defendant and Respondent David Tran.\nFerguson Case Orr Paterson, Wendy C. Lascher,\nJohn A. Hribar, Ventura, CA, James Q.\nMcDermott, Ventura, CA, Michael A. Velthoen,\

In [89]:
# English EasyOCR reader, reused by the OCR cells that follow.
reader = easyocr.Reader(['en'])
# Rasterise every page of the second PDF; use the shared OCR_DPI setting
# instead of repeating the literal 300.
pages = convert_from_path(PATH_INPUT / PDF_FILES[1], dpi=OCR_DPI)


In [79]:
# OCR a cropped region of the second rendered page (pages[1]).
img = np.array(pages[1])
x0 = 305    # left crop boundary in pixels: columns before x0 are dropped
y1 = 3150   # bottom crop boundary in pixels: rows from y1 onward are dropped
img_crop = img[:y1, x0:]

# detail=0 returns plain strings; paragraph=True asks EasyOCR to merge
# neighbouring text boxes into paragraphs.
result = reader.readtext(img_crop, detail=0, paragraph=True)

# Join paragraphs with a real newline ("\n"); the previous "/n" was emitted
# literally into both the printed output and the saved file.
ocr_text = "\n".join(result)
print(ocr_text)

# NOTE(review): `pages` is rendered from PDF_FILES[1], so the output is named
# after that file (it was PDF_FILES[0] before) - confirm this is the intent.
# Path objects have no .split(); .stem gives the name without the extension.
PATH_OUTPUT.mkdir(parents=True, exist_ok=True)
with open(PATH_OUTPUT / f"{PDF_FILES[1].stem}.txt", "w", encoding="utf-8") as f:
    f.write(ocr_text)

Huy Fong Foods; Inc: V_ Underwood Ranches, LP 66 Cal.App:' Sth 1112 (Cal: Ct: App. 2021)/nor the first 10 years, the parties executed written reements   specifying the price per pound and )lume to be supplied. Thereafter; the parties dealt ith each other informally with oral agreements. riginally, Huy Fong needed more peppers than nderwood could   supply, SO it  contracted with her farmers as needed./nDue to Tran's   suggestion and encouragement; Underwood invested millions of dollars in acquiring additional acres in Kern County and, to a lesser extent, Ventura County By the end of the 2016 growing season, Underwood had  acquired over 1,800 acres in Kern County It took a year Or more to prepare the ground for growing peppers Many of the leases extended into the   2020's, 2030's, and beyond. By 2016, Huy Fong accounted for approximately 80 percent of Underwood's revenue./nnderwood's pepper sales to Huy Fong grew ong with the success   of Huy Fong's business. raig testified that; by 2005

In [90]:
# OCR the whole third rendered page (pages[2]) and save the paragraphs,
# one per line, as <pdf name without extension>.txt in PATH_OUTPUT.
img = np.array(pages[2])

result = reader.readtext(img, detail=0, paragraph=True)

# Path objects have no .split(); .stem gives the name without the extension.
PATH_OUTPUT.mkdir(parents=True, exist_ok=True)
with open(PATH_OUTPUT / f"{PDF_FILES[1].stem}.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(result))

#### Another level: auto-detect paragraphs

In [58]:
import cv2
import numpy as np
from pdf2image import convert_from_path
import easyocr

reader = easyocr.Reader(['en'])

# Step 1: Load first page of PDF as image
# (PATH was undefined; the input folder constant is PATH_INPUT. The DPI comes
# from the shared OCR_DPI setting rather than a repeated literal.)
pages = convert_from_path(PATH_INPUT / PDF_FILES[1], dpi=OCR_DPI)
img = np.array(pages[0])
gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

# Step 2: Threshold to binary
# THRESH_BINARY_INV: pixels at or below 180 become 255, brighter ones become 0.
_, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)

# Step 3: Morphological operations to merge lines into blocks
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
morph = cv2.dilate(thresh, kernel, iterations=2)

# Step 4: Find contours (RETR_EXTERNAL keeps only the outermost ones)
contours, _ = cv2.findContours(morph, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

# Step 5: Find the largest bounding box that stays clear of the side margins
h, w = gray.shape
candidates = []
for cnt in contours:
    x, y, cw, ch = cv2.boundingRect(cnt)
    # Currently disabled filters:
    #if cw * ch < 1000:  # skip small noise
    #    continue
    #if y < h * 0.1 or y + ch > h * 0.9:  # avoid headers/footers
    #    continue
    if x < w * 0.05 or x + cw > w * 0.95:  # avoid sidebars
        continue
    # Area goes first so that max() below ranks candidates by area.
    candidates.append((cw * ch, x, y, x + cw, y + ch))

if not candidates:
    print("No main block detected.")
else:
    # Choose largest
    _, x1, y1, x2, y2 = max(candidates)
    roi = img[y1:y2, x1:x2]

    # Step 6: OCR only that region
    # With default arguments each result item carries the recognised text at
    # index 1.
    results = reader.readtext(roi)
    text = " ".join([r[1] for r in results])
    print("---- Detected Paragraph Block ----")
    print(text)


---- Detected Paragraph Block ----
Electronically FILED by Superior Court of California, County of Los Angeles 12/12/2023 5;19 PM Miles M, Cooley, (SBN: 206783) David W. Slayton, mcooley@@ftllpcom Executive Officer/Clerk of Court Jacob T. Bolan; (SBN: 329117) By S. Ruiz, Deputy Clerk jbolan@ftllpcom FREEDMAN + TAITELMAN; LLP 1801 Century Park West, Fifth Floor Los Angeles, California 90067 Telephone: (310) 201-0005 Facsimile: (310) 201-0045 Attorneys for Plaintiff Adrian Velasquez Eskupe SUPERIOR COURT OF THE STATE OF CALIFORNIA FOR THE COUNTY OF LOS ANGELES ADRIAN VELASQUEZ, an individual, Case No:: 238tcv30337 COMPLAINT FOR: Plaintiff, ) FRAUD AND DECEIT (INTENTIONAL VS. ) MISREPRESENTA TION) ) MARCO ANTHONY ARCHER, an ) FRAUD AND DECEIT individual; PHORA LLC, a California limited (CONCEALMENT) liability company; PHORA ENTERTAINMENT INC , a terminated BREACH OF COVENANT OF GOOD FAITH AND FAIR DEALING California corporation; WARNER RECORDS NNC. flkla WARNER BROS. BREACH OF IMPLIED-IN-