In [21]:
import sys
import os
import json
import glob
import pandas as pd

In [22]:
# Put the notebook's parent directory on the import path so that packages
# located there can be imported by later cells.
# Guarded so that re-running this cell does not keep appending duplicates of
# the same entry to sys.path.
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [23]:
from ingestion.ocr_parser import extract_text
from extract.extract import smart_parse_and_correct
from validation.validator import standardize_units
from models.model import interpret_results

In [24]:
# Root of the project checkout that contains data/sample/ and validation/.
# Set the V_INTERN_ROOT environment variable to run the notebook against a
# checkout in another location; when it is unset (or empty) the original
# development path is used, so existing runs behave exactly as before.
PROJECT_ROOT = (
    os.environ.get("V_INTERN_ROOT") or "/home/quasar_011/Developer/v_intern"
).rstrip("/")

# Directory that is scanned for report images (trailing slash kept on purpose
# so the value matches the previous hard-coded string).
IMAGE_DIR = f"{PROJECT_ROOT}/data/sample/"
# Path of the reference-range JSON file handed to interpret_results().
REF_RANGES_PATH = f"{PROJECT_ROOT}/validation/reference_ranges.json"

In [25]:
# Collect every *.jpg and then every *.png found directly inside IMAGE_DIR.
# glob makes no promise about the order of matches within a pattern, so the
# processing order further down follows whatever the filesystem returns.
image_paths = [
    path
    for pattern in ("*.jpg", "*.png")
    for path in glob.glob(os.path.join(IMAGE_DIR, pattern))
]

print(f"Found {len(image_paths)} images to process.")

Found 12 images to process.


In [26]:
# Run every image through the pipeline calls below (extract_text ->
# smart_parse_and_correct -> standardize_units -> interpret_results) and
# collect one flat row per reported parameter.
# The list is rebound here, so re-running the cell starts from an empty summary.
all_patients_summary = []

for i, img_path in enumerate(image_paths, 1):
    filename = os.path.basename(img_path)
    print(f"\n[{i}/{len(image_paths)}] Processing: {filename}...")

    try:
        raw_text = extract_text(img_path)
        if not raw_text:
            # Nothing to parse for this image; move on to the next one.
            print("   (Skipping: OCR returned empty text)")
            continue

        extracted_data = smart_parse_and_correct(raw_text)
        standardized_data = standardize_units(extracted_data)
        final_report = interpret_results(standardized_data, REF_RANGES_PATH)

        if final_report:
            # Build this file's rows in a local list first. If any item is
            # missing an expected key, the KeyError is raised before the shared
            # summary is touched, so a file reported as failed never leaves a
            # partial set of rows behind.
            file_rows = []
            for item in final_report:
                file_rows.append({
                    "File": filename,
                    "Parameter": item['Parameter'],
                    "Value": item['Value'],
                    "Unit": item['Unit'],
                    "Status": item['Status'],
                    "Interpretation": item['Note']
                })

            print(f"   -> Found {len(final_report)} parameters.")
            all_patients_summary.extend(file_rows)
        else:
            print("   -> No parameters extracted (Check image quality or format).")

    except Exception as e:
        # Best-effort batch: a failure on one image is reported and the loop
        # continues with the remaining images.
        print(f"   (!) Error processing {filename}: {e}")


[1/12] Processing: sample.png...
   -> Found 14 parameters.

[2/12] Processing: AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pdf_page_7.png...
   -> Found 21 parameters.

[3/12] Processing: AHD-0425-PA-0008061_E-mahendrasinghdischargecard_250427_1114@E.pdf_page_13.png...
   -> Found 11 parameters.

[4/12] Processing: AHD-0425-PA-0008061_E-mahendrasinghdischargecard_250427_1114@E.pdf_page_27.png...
   -> Found 14 parameters.

[5/12] Processing: BLR-0425-PA-0036693_ARVIND REDDY REPALA 0036693_28-04-2025_1120-45_AM@E.pdf_page_24.png...
   -> Found 14 parameters.

[6/12] Processing: BLR-0425-PA-0039320_501848074 Final bill and DS   26042025_27-04-2025_1054-20_AM.pdf_page_10.png...
   -> Found 13 parameters.

[7/12] Processing: BLR-0425-PA-0040880_E-1745741621426SUSHILADEVIds_250427_1347@E.pdf_page_2.png...
   -> Found 21 parameters.

[8/12] Processing: BLR-0425-PA-0041078_D CARD AND REPORT_11zon_27-04-2025_1204-56_PM.pdf_page_8.png...
   -> Found 3 parameters.

[9/12] Processing: BLR-0425-

In [28]:
# Summarise the batch: show every collected row, then only the rows whose
# Status column equals "Abnormal".
if not all_patients_summary:
    print("No data extracted from any images.")
else:
    # Fixed column order used for both tables.
    cols = ["File", "Parameter", "Value", "Unit", "Status", "Interpretation"]
    df = pd.DataFrame(all_patients_summary).loc[:, cols]

    from IPython.display import display

    print("\n--- All Extracted Data ---")
    display(df)

    print("\n--- Abnormal Results Only (High Priority) ---")
    abnormal_df = df.loc[df["Status"].eq("Abnormal")]
    if abnormal_df.empty:
        print("Good news! No abnormalities detected in this batch.")
    else:
        display(abnormal_df)


--- All Extracted Data ---


Unnamed: 0,File,Parameter,Value,Unit,Status,Interpretation
0,sample.png,Hemoglobin,12.1,g/dl,Abnormal,Low (Ref: 13.0-17.0)
1,sample.png,Packed Cell Volume,42.5,%,Normal,Reference range unavailable
2,sample.png,R.B.C,4.9,mill/emm,Normal,Reference range unavailable
3,sample.png,Mean Cell Volume,86.7,fl,Normal,Reference range unavailable
4,sample.png,Mean Cell Hemoglobin( Mch},24.7,g/dl,Abnormal,High (Ref: 13.0-17.0)
...,...,...,...,...,...,...
157,BLR-0425-PA-0041664_ef7835c4565245d6afdb7d905c...,Lymphocytes *,16.2,,Normal,Reference range unavailable
158,BLR-0425-PA-0041664_ef7835c4565245d6afdb7d905c...,Eosinophits 3/,7.0,,Normal,Reference range unavailable
159,BLR-0425-PA-0041664_ef7835c4565245d6afdb7d905c...,Monocytes,5.3,,Normal,Reference range unavailable
160,BLR-0425-PA-0041664_ef7835c4565245d6afdb7d905c...,1G,0.3,,Normal,Reference range unavailable



--- Abnormal Results Only (High Priority) ---


Unnamed: 0,File,Parameter,Value,Unit,Status,Interpretation
0,sample.png,Hemoglobin,12.1,g/dl,Abnormal,Low (Ref: 13.0-17.0)
4,sample.png,Mean Cell Hemoglobin( Mch},24.7,g/dl,Abnormal,High (Ref: 13.0-17.0)
23,AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pd...,Lymphocytes,59.0,%,Abnormal,High (Ref: 20-40)
31,AHD-0425-PA-0007719_E-REPORTS_250427_2032@E.pd...,Lymphocytes,62.3,/ul,Abnormal,High (Ref: 20-40)
46,AHD-0425-PA-0008061_E-mahendrasinghdischargeca...,Hemoglobin,12.1,g/dl,Abnormal,Low (Ref: 13.0-17.0)
50,AHD-0425-PA-0008061_E-mahendrasinghdischargeca...,Mean Cell Hemoglobin( Mch},24.7,g/dl,Abnormal,High (Ref: 13.0-17.0)
69,BLR-0425-PA-0036693_ARVIND REDDY REPALA 003669...,Neutrophils,76.8,%,Abnormal,High (Ref: 40-70)
78,BLR-0425-PA-0039320_501848074 Final bill and D...,Rbc (Red = 5.33 10°,6.0,mil/cumm,Abnormal,High (Ref: 4.5-5.5)
89,BLR-0425-PA-0040880_E-1745741621426SUSHILADEVI...,Lymphocytes,16.0,%,Abnormal,Low (Ref: 20-40)
109,BLR-0425-PA-0041078_D CARD AND REPORT_11zon_27...,Hemoglobin,1.0,g/dl,Abnormal,Low (Ref: 13.0-17.0)
