In [1]:
!apt-get install -y tesseract-ocr
!pip install pytesseract opencv-python




import pytesseract
import cv2
import os
import re
from PIL import Image



Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 122 not upgraded.


In [2]:
def extract_text(image_path):
    # Load image using OpenCV
    img = cv2.imread(image_path)
    
    # Optional: Preprocessing for better OCR accuracy
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    # OCR using pytesseract
    text = pytesseract.image_to_string(thresh)
    
    return text


In [3]:
def parse_lab_tests(text):
    results = []
    
    pattern = r"(?P<test_name>[\w\s\(\)\-\/]+)[\s:]+(?P<test_value>\d+\.?\d*)[\s]*?(?P<unit>[a-zA-Z\/%]*)[\s]*?(?P<range>\d+\.?\d*\s*[-–]\s*\d+\.?\d*)?"
    
    for match in re.finditer(pattern, text):
        try:
            test_name = match.group("test_name").strip()
            test_value = match.group("test_value").strip()
            test_unit = match.group("unit").strip()
            bio_reference_range = match.group("range").strip() if match.group("range") else ""

            # Check out-of-range logic
            lower, upper = None, None
            if bio_reference_range:
                parts = re.split(r"[-–]", bio_reference_range)
                if len(parts) == 2:
                    lower = float(parts[0].strip())
                    upper = float(parts[1].strip())

            out_of_range = False
            if lower is not None and upper is not None:
                val = float(test_value)
                out_of_range = val < lower or val > upper

            results.append({
                "test_name": test_name,
                "test_value": test_value,
                "bio_reference_range": bio_reference_range,
                "test_unit": test_unit,
                "lab_test_out_of_range": out_of_range
            })
        except Exception:
            continue

    return {
        "is_success": True,
        "data": results
    }


In [4]:
image_dir = "/kaggle/input/lab-reports/lbmaske"
image_files = [os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

sample_image = image_files[0]
text = extract_text(sample_image)
parsed_results = parse_lab_tests(text)

from pprint import pprint
pprint(parsed_results)


{'data': [{'bio_reference_range': '',
           'lab_test_out_of_range': False,
           'test_name': 'EA) OF',
           'test_unit': '',
           'test_value': '7.90'},
          {'bio_reference_range': '',
           'lab_test_out_of_range': False,
           'test_name': 'aol\nMale',
           'test_unit': '',
           'test_value': '12.0'},
          {'bio_reference_range': '',
           'lab_test_out_of_range': False,
           'test_name': '-',
           'test_unit': '',
           'test_value': '16.0'},
          {'bio_reference_range': '',
           'lab_test_out_of_range': False,
           'test_name': 'Female',
           'test_unit': '',
           'test_value': '11.5'},
          {'bio_reference_range': '',
           'lab_test_out_of_range': False,
           'test_name': '5\nChild up to 1 Year',
           'test_unit': '',
           'test_value': '12.6'},
          {'bio_reference_range': '',
           'lab_test_out_of_range': False,
           'test_name

In [None]:
!pip install -q fastapi uvicorn python-multipart opencv-python pytesseract


In [5]:
import os
import re
import json
from typing import List
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from PIL import Image
import pytesseract
import cv2
import numpy as np


ModuleNotFoundError: No module named 'fastapi'

In [None]:
def extract_text_from_image(image_bytes) -> str:
    image = Image.open(image_bytes).convert("RGB")
    image_np = np.array(image)
    text = pytesseract.image_to_string(image_np)
    return text


def parse_lab_tests(text: str) -> List[dict]:
    lines = text.split("\n")
    results = []

    pattern = re.compile(
        r"(?P<test_name>[\w\s\(\)\-%]+?)\s*[:\-]?\s*(?P<test_value>\d+\.?\d*)\s*(?P<unit>[^\d\s]+)?\s*(?P<range>\(?\d+\.?\d*\s*[-–]\s*\d+\.?\d*\)?)"
    )

    for line in lines:
        match = pattern.search(line)
        if match:
            test_name = match.group("test_name").strip()
            test_value = float(match.group("test_value"))
            unit = match.group("unit") or ""
            range_text = match.group("range").replace("(", "").replace(")", "").replace("–", "-")
            try:
                lower, upper = map(float, range_text.split("-"))
                out_of_range = not (lower <= test_value <= upper)
                result = {
                    "test_name": test_name.upper(),
                    "test_value": str(test_value),
                    "bio_reference_range": f"{lower}-{upper}",
                    "test_unit": unit,
                    "lab_test_out_of_range": out_of_range
                }
                results.append(result)
            except:
                continue

    return results


In [None]:
app = FastAPI()

# Allow CORS for testing in browser
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/get-lab-tests")
async def get_lab_tests(file: UploadFile = File(...)):
    try:
        contents = await file.read()
        from io import BytesIO
        text = extract_text_from_image(BytesIO(contents))
        data = parse_lab_tests(text)
        return JSONResponse(content={"is_success": True, "data": data})
    except Exception as e:
        return JSONResponse(content={"is_success": False, "error": str(e)})


In [None]:
from pathlib import Path

img_path = Path("/kaggle/input/lab-reports/lbmaske")  # Change if needed
sample_image = next(img_path.glob("*.jpg"))  # or .png

with open(sample_image, "rb") as f:
    text = extract_text_from_image(f)
    result = parse_lab_tests(text)

import pprint
pprint.pprint({"is_success": True, "data": result})
