# In [2]:
import os
import re
import cv2
import bs4
import numpy as np
import pytesseract

# Folder whose .jpg/.png files are processed by the loop below.
header_footer_data = 'header_footer_data'
output_folder = 'output_folder'  # folder where the annotated images are saved
pytesseract.pytesseract.tesseract_cmd = "C:\\Users\\reyhan\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe"


# exist_ok=True creates the folder only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)

# Maximum difference between the y1 of consecutive words (sorted by y1) for
# them to be placed in the same text line.
y_tolerance = 35

# Patterns for the hOCR word attributes, compiled once for all images.
_BBOX_RE = re.compile(r'bbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)')
_ID_NUMBER_RE = re.compile(r'\d+')

for filename in os.listdir(header_footer_data):
    if not filename.endswith(('.jpg', '.png')):
        continue
    filepath = os.path.join(header_footer_data, filename)

    # Load the image first so an unreadable file is skipped before OCR runs
    # (cv2.imread returns None instead of raising).
    img = cv2.imread(filepath)
    if img is None:
        print(f"Skipping unreadable image: {filepath}")
        continue

    # Process the image file: run Tesseract and write its hOCR output.
    extension = os.path.splitext(filename)[1][1:]
    pytesseract.pytesseract.run_tesseract(
        filepath,
        "output_hocr",
        extension=extension,
        lang='tur',
        config="--psm 6 -c tessedit_create_hocr=1 -c hocr_font_info=1",
    )
    # Parse inside a context manager so the hOCR file is always closed.
    with open("output_hocr.hocr", "r", encoding="utf-8") as xml_input:
        soup = bs4.BeautifulSoup(xml_input, 'lxml')
    ocr_words = soup.find_all("span", {"class": "ocrx_word"})

    # Collect one record per recognised word: bounding box, text and id.
    words_structure = []
    for word in ocr_words:
        bbox = _BBOX_RE.search(word.get('title', ''))
        id_numbers = _ID_NUMBER_RE.findall(word.get('id', ''))
        if bbox is None or not id_numbers:
            # Malformed word element: skip it rather than abort the image.
            continue
        x1, y1, x2, y2 = map(int, bbox.groups())
        word_text = word.text.replace("\n", " ").strip()
        # NOTE(review): Tesseract hOCR word ids typically look like
        # "word_<page>_<index>"; the first number would then be the page
        # number for every word, so the last number is used as the
        # reading-order key. Confirm against the actual hOCR output.
        word_id = int(id_numbers[-1])
        words_structure.append({"x1": x1, "y1": y1, "x2": x2, "y2": y2, "text": word_text, "id": word_id})

    # Group the words: walk them top to bottom and start a new group whenever
    # the vertical jump from the previous word exceeds y_tolerance.
    grouped_words = []
    current_group = []
    for word in sorted(words_structure, key=lambda w: w['y1']):
        if current_group and abs(word['y1'] - current_group[-1]['y1']) > y_tolerance:
            grouped_words.append(current_group)
            current_group = []
        current_group.append(word)
    if current_group:
        grouped_words.append(current_group)

    # Turn each group into a line: the union of its word boxes plus the
    # words' text joined in id order.
    lines_structure = []
    for group in grouped_words:
        ordered_words = sorted(group, key=lambda w: w['id'])
        lines_structure.append({
            "x1": min(w['x1'] for w in group),
            "y1": min(w['y1'] for w in group),
            "x2": max(w['x2'] for w in group),
            "y2": max(w['y2'] for w in group),
            "text": " ".join(w['text'] for w in ordered_words),
        })

    # Image dimensions (not used below; kept available for inspection).
    img_height, img_width = img.shape[:2]

    # Find straight line segments in the image.
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray_img, 50, 150)
    lines = cv2.HoughLinesP(edges, 1, np.pi/180, 100, minLineLength=100, maxLineGap=10)

    # Keep the y coordinate of every (nearly) horizontal segment.
    # HoughLinesP yields None when it detects nothing, so guard the loop.
    horizontal_lines_y_coords = []
    if lines is not None:
        for _, seg_y1, _, seg_y2 in lines.reshape(-1, 4):
            if abs(int(seg_y1) - int(seg_y2)) < 10:
                horizontal_lines_y_coords.append(int(seg_y1))

    # Find headers and footers: among the first/last five text lines, those
    # lying above the topmost / below the bottommost horizontal segment,
    # ignoring lines that consist only of digits. Without any horizontal
    # segment there is no reference, so nothing is marked.
    header_lines = []
    footer_lines = []
    if horizontal_lines_y_coords:
        top_rule_y = min(horizontal_lines_y_coords)
        bottom_rule_y = max(horizontal_lines_y_coords)
        header_lines = [
            text_line for text_line in lines_structure[:5]
            if text_line['y1'] < top_rule_y and not text_line['text'].isdigit()
        ]
        # The digit test must look at the candidate footer line itself.
        footer_lines = [
            text_line for text_line in lines_structure[-5:]
            if text_line['y2'] > bottom_rule_y and not text_line['text'].isdigit()
        ]

    # Show headers and footers on the image as red (BGR) rectangles.
    for marked_line in header_lines + footer_lines:
        cv2.rectangle(
            img,
            (marked_line['x1'], marked_line['y1']),
            (marked_line['x2'], marked_line['y2']),
            (0, 0, 255),
            2,
        )

    # Save the image; cv2.imwrite reports failure via its return value.
    output_filepath = os.path.join(output_folder, filename)
    if not cv2.imwrite(output_filepath, img):
        print(f"Could not write annotated image: {output_filepath}")


