In [28]:
import cv2
import pytesseract
import os
import numpy as np
import difflib

In [29]:
# Expected location of the Tesseract executable used by pytesseract.
tesseractPath = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.exists(tesseractPath):
    print("Tesseract encontrado:", tesseractPath)
    # pytesseract reads the executable location from `tesseract_cmd`;
    # assigning any other attribute name (e.g. `tesseractCmd`) is silently
    # ignored and OCR keeps using the default command.
    pytesseract.pytesseract.tesseract_cmd = tesseractPath
else:
    # Keep pytesseract's default command so a Tesseract available on PATH
    # can still be used instead of pointing at a file that does not exist.
    print("Caminho incorreto:", tesseractPath)

Tesseract encontrado: C:\Program Files\Tesseract-OCR\tesseract.exe


In [30]:
# Folder holding the images that will be classified.
inputFolder = r"C:\Users\Navarro\Desktop\Faculdade\6 sem\Inteligencia Artificial\projeto 1 bim\AI_Training_Images"

# Destination folders, one per detected watermark position.
watermarkTopFolder = r"C:\Users\Navarro\Desktop\Faculdade\6 sem\Inteligencia Artificial\projeto 1 bim\watermarkTop"
watermarkCenterFolder = r"C:\Users\Navarro\Desktop\Faculdade\6 sem\Inteligencia Artificial\projeto 1 bim\watermarkCenter"
watermarkBottomFolder = r"C:\Users\Navarro\Desktop\Faculdade\6 sem\Inteligencia Artificial\projeto 1 bim\watermarkBottom"

# Create the destinations up front; folders that already exist are left alone.
for outputFolder in (watermarkTopFolder, watermarkBottomFolder, watermarkCenterFolder):
    os.makedirs(outputFolder, exist_ok=True)

In [33]:
def processRegion(region, blockSize=11, offset=2):
    """Binarize a grayscale region with Gaussian adaptive thresholding.

    Args:
        region: Single-channel 8-bit image, as required by
            ``cv2.adaptiveThreshold``.
        blockSize: Side of the pixel neighbourhood used to compute each local
            threshold; OpenCV requires an odd value greater than 1.
        offset: Constant subtracted from the Gaussian-weighted local mean.

    Returns:
        An image of the same size as ``region`` whose pixels are 0 or 255.
    """
    return cv2.adaptiveThreshold(
        region,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        blockSize,
        offset,
    )

def detectWatermarkPosition(image):
    """Guess which horizontal third of ``image`` contains the watermark.

    The image is converted to grayscale and split into three horizontal bands.
    Each band is binarized with ``processRegion`` and OCR'd; the recognized
    text is compared with the expected watermark string. If the best band
    reaches the similarity threshold it is returned. Otherwise the band with
    the most Canny edge pixels is returned, so a position is always reported
    even when no text was recognized.

    Args:
        image: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).

    Returns:
        One of ``"top"``, ``"center"`` or ``"bottom"``.
    """
    grayImage = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    height = grayImage.shape[0]

    # Insertion order (top, center, bottom) matters: max() returns the first
    # key on ties, which keeps "top" preferred over "center" over "bottom".
    regions = {
        "top": grayImage[0:height // 3, :],
        "center": grayImage[height // 3:2 * height // 3, :],
        "bottom": grayImage[2 * height // 3:height, :],
    }

    configStr = "--psm 6 -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyz-.0123456789"
    expectedText = "this-person-does-not-exist.com"
    simThreshold = 0.75

    regionSims = {}
    for regionName, region in regions.items():
        rawText = pytesseract.image_to_string(processRegion(region), config=configStr)
        # split() without arguments drops every whitespace character, so the
        # newlines and form feed in the OCR output are removed as well as
        # spaces and cannot drag the similarity ratio down.
        ocrText = "".join(rawText.lower().split())
        regionSims[regionName] = difflib.SequenceMatcher(None, ocrText, expectedText).ratio()

    bestRegion = max(regionSims, key=regionSims.get)
    if regionSims[bestRegion] >= simThreshold:
        return bestRegion

    # Fallback when OCR is inconclusive: pick the band with the most edge
    # pixels in the un-thresholded grayscale band.
    edgeCounts = {
        regionName: np.count_nonzero(cv2.Canny(region, 50, 150))
        for regionName, region in regions.items()
    }
    return max(edgeCounts, key=edgeCounts.get)

In [34]:
# Write every supported image into the folder that matches its detected
# watermark position.
targetFolders = {
    "top": watermarkTopFolder,
    "center": watermarkCenterFolder,
    "bottom": watermarkBottomFolder,
}

for imageFile in os.listdir(inputFolder):
    if not imageFile.lower().endswith((".png", ".jpg", ".jpeg")):
        continue

    imagePath = os.path.join(inputFolder, imageFile)
    image = cv2.imread(imagePath)
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # decoded; report it so skipped files do not go unnoticed.
        print("Falha ao ler imagem:", imagePath)
        continue

    position = detectWatermarkPosition(image)
    # Any position other than "top"/"center" falls back to the bottom folder.
    targetFolder = targetFolders.get(position, watermarkBottomFolder)
    newPath = os.path.join(targetFolder, imageFile)

    # cv2.imwrite returns False when the file could not be written.
    if not cv2.imwrite(newPath, image):
        print("Falha ao salvar imagem:", newPath)