In [None]:
pip install pdfplumber pytesseract pillow transformers torch scikit-learn numpy pandas

Collecting pdfplumber
  Using cached pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
Collecting pytesseract
  Using cached pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
Downloading pdfplumber-0.11.8-py3-none-any.whl (60 kB)
Downloading pdfminer_six-20251107-py3-none-any.whl (5.6 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m5.6/5.6 MB[0m [31m23.9 MB/s[0m  [33m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pypdfium2-5.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

In [None]:
pip install --upgrade pip setuptools wheel



In [None]:
import pdfplumber
import pytesseract
from PIL import Image
import re

def extract_text_from_pdf(path):
    """Return the embedded text layer of a PDF, or "" when none can be read.

    Every page that yields text contributes that text followed by a newline.
    Extraction is best-effort: if the file cannot be opened or parsed, the
    failure is logged and whatever text was collected so far (possibly
    nothing) is returned, so the caller can fall back to OCR.

    Args:
        path: Location of the file handed to ``pdfplumber.open``.

    Returns:
        str: Concatenated page text; empty when no text could be extracted.
    """
    import logging

    page_chunks = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    page_chunks.append(page_text + "\n")
    except Exception as exc:
        # Deliberately non-fatal, but no longer silent, and KeyboardInterrupt /
        # SystemExit are no longer swallowed as they were by a bare `except`.
        logging.getLogger(__name__).warning(
            "Could not extract text from %r: %s", path, exc
        )
    return "".join(page_chunks)

def ocr_pdf(path):
    """Run Tesseract OCR on a scanned report and return the recognised text.

    Pillow cannot read PDF files, so a path ending in ``.pdf`` is first
    rasterised with pdf2image (which needs poppler installed) and every page
    is OCR'd; the page texts are joined with newlines. Any other path is
    opened directly as an image, exactly as before.

    Args:
        path: Path to a PDF or to an image file Pillow can open.

    Returns:
        str: Text produced by ``pytesseract.image_to_string``.
    """
    if str(path).lower().endswith(".pdf"):
        # Imported here because this cell's import block does not pull in pdf2image.
        from pdf2image import convert_from_path

        pages = convert_from_path(path)
        return "\n".join(pytesseract.image_to_string(page) for page in pages)

    # Context manager so the underlying file handle is released after OCR.
    with Image.open(path) as image:
        return pytesseract.image_to_string(image)

def get_report_text(file_path):
    """Return the text of a report, preferring the embedded text layer.

    The file is first passed to ``extract_text_from_pdf``. When that yields
    only whitespace (or nothing), the result of ``ocr_pdf`` is returned
    instead.
    """
    extracted = extract_text_from_pdf(file_path)
    if extracted.strip():
        return extracted

    # No usable text layer: treat the file as a scan and OCR it.
    return ocr_pdf(file_path)


In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Use the GPU when torch reports one as available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the "bert-base-uncased" tokenizer and encoder; the encoder is moved to
# the selected device.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)
# The model is only used for inference here, so put it in evaluation mode.
bert_model.eval()

def get_bert_embedding(text):
    """Encode ``text`` with the module-level BERT model and return a flat vector.

    The text is tokenised to exactly 256 positions (padded or truncated), run
    through ``bert_model`` without gradient tracking, and the hidden state at
    position 0 of the last layer is returned as a flattened NumPy array.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        last_hidden = bert_model(**model_inputs).last_hidden_state

    # Position 0 is the first token of each sequence (the original code calls
    # this the CLS vector).
    first_token_state = last_hidden[:, 0, :]
    return first_token_state.cpu().numpy().flatten()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
import numpy as np
import pandas as pd

# Lab value name -> regex applied to the lower-cased report text.
# Group 1 is the label as it appears in the report, group 2 the numeric reading.
# The (?<![a-z]) / (?![a-z]) guards stop a label from matching inside a longer
# word (e.g. "ldl" inside "vldl"), and every reading may carry a decimal part.
FEATURES = {
    "Blood_glucose": r"(?<![a-z])(glucose|blood sugar)(?![a-z])[^\d]*(\d+(?:\.\d+)?)",
    "HbA1C": r"(?<![a-z])(hba1c|a1c)(?![a-z])[^\d]*(\d+(?:\.\d+)?)",
    "Systolic_BP": r"(?<![a-z])(systolic)(?![a-z])[^\d]*(\d+(?:\.\d+)?)",
    "Diastolic_BP": r"(?<![a-z])(diastolic)(?![a-z])[^\d]*(\d+(?:\.\d+)?)",
    "LDL": r"(?<![a-z])(ldl)(?![a-z])[^\d]*(\d+(?:\.\d+)?)",
    "HDL": r"(?<![a-z])(hdl)(?![a-z])[^\d]*(\d+(?:\.\d+)?)",
    "Triglycerides": r"(?<![a-z])(triglycerides|tg)(?![a-z])[^\d]*(\d+(?:\.\d+)?)",
    "Haemoglobin": r"(?<![a-z])(hemoglobin|haemoglobin)(?![a-z])[^\d]*(\d+(?:\.\d+)?)",
    "MCV": r"(?<![a-z])(mcv)(?![a-z])[^\d]*(\d+(?:\.\d+)?)"
}

def extract_tabular_features(text):
    """Pull the lab values listed in ``FEATURES`` out of free report text.

    Args:
        text: Report text; matching is case-insensitive because the text is
            lower-cased first. ``None`` is treated as an empty report.

    Returns:
        pandas.DataFrame: A single row with one float column per ``FEATURES``
        key, in ``FEATURES`` order. A value that is not found is ``NaN``.
        No imputation happens here (the former ``fillna(df.mean())`` on a
        one-row frame could never fill anything), so callers must handle
        ``NaN`` themselves.
    """
    text = (text or "").lower()

    row = {}
    for key, pattern in FEATURES.items():
        match = re.search(pattern, text)
        row[key] = float(match.group(2)) if match else np.nan

    return pd.DataFrame([row], columns=list(FEATURES))


In [None]:
def convert_report_to_features(report_path):
    """Turn one report file into a combined numeric feature vector.

    Returns a 3-tuple ``(final_features, tabular_df, text)``:
    the flattened lab-value row followed by the BERT embedding, the one-row
    lab-value DataFrame, and the raw report text they were derived from.
    """
    report_text = get_report_text(report_path)

    # Structured part: regex-extracted lab values, flattened to 1-D.
    lab_table = extract_tabular_features(report_text)
    lab_values = lab_table.values.flatten()

    # Unstructured part: embedding of the whole report text.
    embedding = get_bert_embedding(report_text)

    # Lab values come first, embedding second.
    combined = np.hstack([lab_values, embedding])
    return combined, lab_table, report_text


In [None]:
# Smoke-test the full pipeline on one sample report (an image file in the
# Colab /content directory).
features, table, text = convert_report_to_features("/content/33806e7015fbfcaff211.png")

# Show which lab values the regexes managed to extract.
print("Tabular Features:")
print(table)

# Length of the combined vector (lab values followed by the BERT embedding).
print("Final Feature Vector Shape:", features.shape)


Tabular Features:
   Blood_glucose  HbA1C  Systolic_BP  Diastolic_BP  LDL  HDL  Triglycerides  \
0            NaN    NaN          NaN           NaN  NaN  NaN            NaN   

   Haemoglobin  MCV  
0         15.0  NaN  
Final Feature Vector Shape: (777,)


In [None]:
pip install pytesseract pdf2image pillow

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [None]:
!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr
!pip install pdf2image pytesseract pillow pandas numpy

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 1 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.12 [186 kB]
Fetched 186 kB in 0s (379 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 117528 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.12_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.12) ...
Setting up poppler-utils (22.02.0-2ubuntu0.12) ...
Processing triggers for man-db (2.10.2-1) ...
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly instal

In [None]:
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import numpy as np
import re

# If on Windows, set this:
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Names of the complete-blood-count rows that parse_cbc looks for in the OCR
# text. Each name becomes a DataFrame column with spaces replaced by "_".
CBC_KEYS = [
"Haemoglobin",
"Total Leucocyte Count",
"Neutrophils",
"Lymphocytes",
"Eosinophils",
"Monocytes",
"Basophils",
"Absolute Neutrophils",
"Absolute Lymphocytes",
"Absolute Eosinophils",
"Absolute Monocytes",
"RBC Count",
"MCV",
"MCH",
"MCHC",
"Hct",
"RDW-CV",
"RDW-SD",
"Platelet Count",
"PCT",
"MPV",
"PDW"
]

# Clean helper
def clean_number(text):
    """Convert a noisy OCR token to a float.

    Everything except digits and dots is removed before conversion, so
    ``"4,200"`` becomes ``4200.0`` and ``" 12.5 g/dL "`` becomes ``12.5``.

    Args:
        text: Raw string taken from the OCR output.

    Returns:
        float: The parsed value, or ``numpy.nan`` when nothing numeric is
        left or the remainder is not a valid number (e.g. ``"."`` or
        ``"1.2.3"``), instead of raising ``ValueError``.
    """
    digits = re.sub(r"[^\d.]", "", text.strip())
    try:
        return float(digits)
    except ValueError:
        # Covers the empty string as well as leftovers such as "." or "1.2.3".
        return np.nan


def extract_table_text(pdf_path):
    """OCR the first page of a PDF and return the recognised text.

    Only page 1 is rasterised (at 300 dpi); previously every page was rendered
    and all but the first were discarded. Tesseract runs with ``--psm 6``.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        str: OCR text of the first page, or "" if no page image was produced.
    """
    pages = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=1)
    if not pages:
        # Guard against an IndexError on pages[0].
        return ""
    return pytesseract.image_to_string(pages[0], config="--psm 6")


def parse_cbc(text, keys=None):
    """Extract CBC readings from OCR text.

    Each key is matched as a whole phrase (case-insensitive, any whitespace
    between its words, not embedded in a longer word), and the value is the
    first number that follows the phrase on the same line. Longer keys claim
    their line first, so "Absolute Neutrophils" and "Neutrophils", or "MCHC"
    and "MCH", no longer receive each other's values. Previously only the
    first word of a key was substring-matched and the last matching line won.

    Args:
        text: OCR output, one table row per line.
        keys: Row names to look for; defaults to the module-level CBC_KEYS.

    Returns:
        tuple: ``(df, vec)`` where ``df`` is a one-row DataFrame holding only
        the keys that were found (spaces in names replaced by "_", in the
        order of ``keys``) and ``vec`` is ``df.values`` cast to float.
    """
    if keys is None:
        keys = CBC_KEYS

    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]

    found = {}
    claimed = set()  # indices of lines already assigned to a key

    # Longest names first so the most specific label wins a line.
    for key in sorted(keys, key=len, reverse=True):
        label = re.compile(
            r"(?<![a-z])"
            + r"\s+".join(re.escape(word) for word in key.split())
            + r"(?![a-z])",
            re.IGNORECASE,
        )
        for idx, line in enumerate(lines):
            if idx in claimed:
                continue
            hit = label.search(line)
            if hit is None:
                continue
            # Only look to the right of the label, never at digits before it.
            number = re.search(r"\d+(?:\.\d+)?", line[hit.end():])
            if number:
                found[key] = float(number.group())
                claimed.add(idx)
                break

    # Emit columns in the caller's key order, not in matching order.
    results = {key.replace(" ", "_"): found[key] for key in keys if key in found}

    df = pd.DataFrame([results])
    vec = df.values.astype(float)

    return df, vec


# Driver: OCR one sample PDF, parse the CBC table out of the text and print
# the intermediate and final results.
if __name__ == "__main__":
    pdf = "/content/33806e7015fbfcaff211.pdf"   # sample report to process

    print("üìÑ Reading PDF and applying OCR...")
    text = extract_table_text(pdf)

    # Only the first 400 characters are shown to keep the output short.
    print("\nüîé OCR Text Preview:\n", text[:400])

    df, vec = parse_cbc(text)

    print("\nüìã CBC TABLE")
    print(df)

    print("\nüî¢ FEATURE VECTOR")
    print(vec)
    print("Shape:", vec.shape)


üìÑ Reading PDF and applying OCR...


PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

In [None]:
# Point pytesseract at the tesseract executable explicitly instead of relying
# on PATH lookup.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

In [None]:
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import numpy as np
import re

# Names of the complete-blood-count rows that parse_cbc looks for in the OCR
# text. Each name becomes a DataFrame column with spaces replaced by "_".
# NOTE(review): this repeats the list defined in an earlier cell, presumably so
# that this cell can run on its own; keep the two in sync.
CBC_KEYS = [
"Haemoglobin",
"Total Leucocyte Count",
"Neutrophils",
"Lymphocytes",
"Eosinophils",
"Monocytes",
"Basophils",
"Absolute Neutrophils",
"Absolute Lymphocytes",
"Absolute Eosinophils",
"Absolute Monocytes",
"RBC Count",
"MCV",
"MCH",
"MCHC",
"Hct",
"RDW-CV",
"RDW-SD",
"Platelet Count",
"PCT",
"MPV",
"PDW"
]

def extract_table_text(pdf_path):
    """OCR the first page of a PDF and return the recognised text.

    Only page 1 is rasterised (at 300 dpi); previously every page was rendered
    and all but the first were discarded. Tesseract runs with ``--psm 6``.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        str: OCR text of the first page, or "" if no page image was produced.
    """
    pages = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=1)
    if not pages:
        # Guard against an IndexError on pages[0].
        return ""
    return pytesseract.image_to_string(pages[0], config="--psm 6")

def parse_cbc(text, keys=None):
    """Extract CBC readings from OCR text.

    Each key is matched as a whole phrase (case-insensitive, any whitespace
    between its words, not embedded in a longer word), and the value is the
    first number that follows the phrase on the same line. Longer keys claim
    their line first, so "Absolute Neutrophils" and "Neutrophils", or "MCHC"
    and "MCH", no longer receive each other's values. Previously only the
    first word of a key was substring-matched and the last matching line won.

    Args:
        text: OCR output, one table row per line.
        keys: Row names to look for; defaults to the module-level CBC_KEYS.

    Returns:
        tuple: ``(df, vec)`` where ``df`` is a one-row DataFrame holding only
        the keys that were found (spaces in names replaced by "_", in the
        order of ``keys``) and ``vec`` is ``df.values`` cast to float.
    """
    if keys is None:
        keys = CBC_KEYS

    lines = [ln.strip() for ln in text.split("\n") if ln.strip()]

    found = {}
    claimed = set()  # indices of lines already assigned to a key

    # Longest names first so the most specific label wins a line.
    for key in sorted(keys, key=len, reverse=True):
        label = re.compile(
            r"(?<![a-z])"
            + r"\s+".join(re.escape(word) for word in key.split())
            + r"(?![a-z])",
            re.IGNORECASE,
        )
        for idx, line in enumerate(lines):
            if idx in claimed:
                continue
            hit = label.search(line)
            if hit is None:
                continue
            # Only look to the right of the label, never at digits before it.
            number = re.search(r"\d+(?:\.\d+)?", line[hit.end():])
            if number:
                found[key] = float(number.group())
                claimed.add(idx)
                break

    # Emit columns in the caller's key order, not in matching order.
    results = {key.replace(" ", "_"): found[key] for key in keys if key in found}

    df = pd.DataFrame([results])
    vec = df.values.astype(float)
    return df, vec

# Sample report to process (Colab /content directory).
pdf_path = "/content/1.pdf"

print("üìÑ Reading PDF + OCR...")
text = extract_table_text(pdf_path)
# Dump the complete OCR text for inspection ...
print(text)

# ... followed by its first 400 characters as a short preview.
print("\nüîé OCR Preview:\n", text[:400])

# Parse the CBC rows out of the OCR text.
df, vec = parse_cbc(text)

print("\nüìã CBC TABLE")
print(df)

print("\nüî¢ FEATURE VECTOR")
print(vec)
print("Shape:", vec.shape)


üìÑ Reading PDF + OCR...
$47 - RADHA DIAGNOSTICS tony) Brigg De. Arvind Lat Dr. Vandana Lal
Ma DOD, MO IPATHL Oo
G.T.ROAD, JHUNSI, AWAS VIKAS COLONY, Phare (hot Chil of SPotholegy
YOJNA-2, NR POLICE CHOWKI, ALLAHABAD, Pe ERATION TO IME SUNNDOND CO NA SAADAN UN ARES
uP
Name - Mr. NEERAJ OJHA Collected : 1/8/2017 8:21:00AM
Received : 1/8/2017 8:37:03AM
Lab No. : 242862201 Age: 27 Years Gender: Male Reported : 4/8/2017 6:52:30PM
Alc Status ¬© P RefBy: Or. V.K.PANDEY Report Status - Final
Test Name Results Units Bio. Ref. Interval
COMPLETE BLOOD COUNT (CBC)
Hemoglobin 12000 ki 43.00- 17.00
Packed Cell Volume (PCV) jsr7od%SSCSCSCSC*d' 40.00 50.00
RBC Count 579 mitimm3 450-550
Mev ‚Äî(‚ÄòS‚Ñ¢SCsi*d SD Ci 80,00 - 100.00
fMcH‚Äî‚Äú‚ÄòSC*C‚Äò*dC‚ÄòOWG‚Äî a <se ‚ÄîCiPGSCCCd;2>7.00- 32.00
MCHC jsi900 sgt S¬´ 32.00 - 35.00
Red Cell Distribution Width (ROW) 459000 tt 50-1450
Total Leukocyte Count (TLC) 4200s thoutmm3_‚Äî | 4.00- 10.00
Differential Leucocyte Count (DLC)
Segmented Neutrophils 39.60 