# In [1]:
import pytesseract
import cv2
import re
import pandas as pd
import os

# Tell pytesseract where the Tesseract executable lives. This is a hard-coded
# Windows path; it must be adjusted on machines where Tesseract is installed
# elsewhere.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognized text.

    The image is loaded with OpenCV, converted to grayscale and passed to
    ``pytesseract.image_to_string``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text string produced by Tesseract for the image.

    Raises:
        FileNotFoundError: If ``image_path`` does not point to an existing file.
        ValueError: If the file exists but OpenCV cannot decode it as an image.
    """
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # without this check the error would surface later inside cvtColor.
        raise ValueError(f"Could not decode image file: {image_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray)


def parse_report(text):
    """Extract the known report fields from OCR text.

    Each field is looked up as ``<Label> : <value>`` (whitespace around the
    colon is optional, label matching is case-insensitive and anchored at a
    word boundary so that e.g. ``Page: 3`` is not mistaken for ``Age``).

    Args:
        text: Raw text of a report, as produced by OCR.

    Returns:
        A dict with the keys ``"Age"`` (int), ``"Gender"`` (the matched
        ``Male``/``Female`` string as it appears in the text) and the numeric
        measurements ``"Hemoglobin"``, ``"Glucose"``, ``"Cholesterol"``,
        ``"Red Blood Cells"``, ``"White Blood Cells"``, ``"Platelets"``
        (floats). Any field that cannot be found is ``None``.
    """
    # A single well-formed integer or decimal number. Unlike "[0-9.]+" this
    # never captures strings such as "13.5." or "." that float() rejects.
    number = r"(\d+(?:\.\d+)?|\.\d+)"

    def find_text(pattern):
        m = re.search(pattern, text, re.IGNORECASE)
        return m.group(1).strip() if m else None

    def find(label):
        raw = find_text(rf"\b{re.escape(label)}\s*:\s*{number}")
        return float(raw) if raw is not None else None

    age = find_text(r"\bAge\s*:\s*(\d+)")

    return {
        # A missing age yields None like every other field instead of
        # raising TypeError from int(None).
        "Age": int(age) if age is not None else None,
        "Gender": find_text(r"\bGender\s*:\s*(Male|Female)"),
        "Hemoglobin": find("Hemoglobin"),
        "Glucose": find("Glucose"),
        "Cholesterol": find("Cholesterol"),
        "Red Blood Cells": find("Red Blood Cells"),
        "White Blood Cells": find("White Blood Cells"),
        "Platelets": find("Platelets"),
    }


def load_dataset(folder="reports"):
    """Build a DataFrame from every PNG report image in a folder.

    Each file whose name ends in ``.png`` (case-insensitive) is run through
    ``extract_text_from_image`` and ``parse_report``; each parsed dict becomes
    one row.

    Args:
        folder: Directory containing the report images.

    Returns:
        A ``pandas.DataFrame`` with one row per PNG file, in sorted filename
        order. The frame is empty when the folder holds no PNG files.
    """
    rows = [
        parse_report(extract_text_from_image(os.path.join(folder, name)))
        # os.listdir returns names in arbitrary order; sorting makes the row
        # order reproducible across runs and platforms.
        for name in sorted(os.listdir(folder))
        if name.lower().endswith(".png")
    ]
    return pd.DataFrame(rows)
