In [58]:
from lib.prismaClient import prisma
import sys, os
import asyncio
import logging
import json
from pprintpp import pprint
from langchain_ollama.llms import OllamaLLM
import re
from enum import Enum
from pydantic import BaseModel
from typing import List, Optional

In [59]:
# Category attached to an OCR text box through `OCResponse.typeOfContent`.
# The `str` mixin makes each member compare equal to its plain string value.
class OCRElementEnumType(str, Enum):
    # Text box whose content matched the numeric pattern when the document was loaded.
    number = "number"
    # Text box whose text was returned by the LLM as a biological test name.
    biologyTest = "biologyTest"
    # NOTE(review): never assigned in the visible cells — presumably reserved for
    # text that is neither a number nor a test; confirm against later cells.
    otherText = "otherText"


# Shape of the JSON stored in `extractedData` for a scanned document, as
# validated with `OCRrawResponse.model_validate(...)` when the document is loaded.
class OCRrawResponse(BaseModel):
    # NOTE(review): presumably the dimensions of the scanned image — order and
    # units are not visible in this notebook; confirm against the OCR producer.
    image_size: List[float]
    # (label, box) pairs. Labels starting with "Text Box ID" are the ones kept;
    # box[0..3] are read as x, y, width and high respectively.
    coordinates: List[tuple[str, List[float]]]
    # Not read by the visible cells.
    text: List[str]


# One OCR text box, enriched step by step by the cells of this notebook.
class OCResponse(BaseModel):
    # Full label as emitted by the OCR, e.g. "Text Box ID 2: 3.76".
    id: str
    # Label with the "Text Box ID n:" prefix and every ":" removed.
    text: str
    # Box geometry, taken from indices 0..3 of the OCR coordinates entry.
    x: float
    y: float
    width: float
    high: float
    # Set to `number` at load time, later to `biologyTest` for LLM-selected labels.
    typeOfContent: Optional[OCRElementEnumType] = None
    # "parameterName" payload of the best vector-store hit, when one is found.
    systemMapping: Optional[str] = None
    notFoundInSystemMapping: bool = False
    # id / text of the number box associated with this test label.
    associatedNumberValueId: Optional[str] = None
    associatedNumberValue: Optional[str] = None

In [60]:
# `id` of the `patientscanneddocument` row that the notebook loads and processes.
doc_id = "c5f0fcd6-0562-4c69-b992-835e9f84e074"

In [61]:
document: List[OCResponse] = []
try:
    await prisma.connect()
    rawDocument = await prisma.patientscanneddocument.find_first_or_raise(
        where={"id": doc_id}
    )
    if rawDocument.extractedData == None:
        raise Exception("No extracted data")
    pattern = r"^[><=]?\s*-?\d+(\.\d+)?$"
    # Parse the JSON data as list of dictionaries
    document_data = OCRrawResponse.model_validate(json.loads(rawDocument.extractedData))
    document = list(
        map(
            lambda x: OCResponse(
                id=x[0],
                text=re.sub(r":", "", re.sub(r"Text Box ID \d+: ?", "", x[0])),
                x=x[1][0],
                y=x[1][1],
                width=x[1][2],
                high=x[1][3],
                typeOfContent=(
                    OCRElementEnumType.number
                    if re.match(pattern, re.sub(r"Text Box ID \d+: ?", "", x[0]))
                    else None
                ),
            ),
            filter(lambda c: c[0].startswith("Text Box ID"), document_data.coordinates),
        )
    )

    await prisma.disconnect()
    pass
except Exception as e:
    await prisma.disconnect()
    logging.error(e)
    raise e

pprint(document)

[
    OCResponse(id='Text Box ID 0: B', text='B', x=1090.0, y=0.0, width=40.0, high=24.0, typeOfContent=None, systemMapping=None, notFoundInSystemMapping=False, associatedNumberValueId=None, associatedNumberValue=None),
    OCResponse(id='Text Box ID 1: lonogramme: Kaliemie (k)', text='lonogramme Kaliemie (k)', x=158.0, y=72.99999237060547, width=445.0, high=48.0, typeOfContent=None, systemMapping=None, notFoundInSystemMapping=False, associatedNumberValueId=None, associatedNumberValue=None),
    OCResponse(id='Text Box ID 2: 3.76', text='3.76', x=976.9999389648438, y=69.0, width=75.00006103515625, high=47.0, typeOfContent=<OCRElementEnumType.number: 'number'>, systemMapping=None, notFoundInSystemMapping=False, associatedNumberValueId=None, associatedNumberValue=None),
    OCResponse(id='Text Box ID 3: Ionogramme: Natremie (Na)', text='Ionogramme Natremie (Na)', x=155.99996948242188, y=123.0, width=487.0000305175781, high=50.0, typeOfContent=None, systemMapping=None, notFoundInSystemMap

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [62]:
# Texts of every element that was not tagged as a number; this is the list
# later passed to the LLM as the request.
documentContent = [
    element.text
    for element in document
    if element.typeOfContent != OCRElementEnumType.number
]
print(documentContent)

['B', 'lonogramme Kaliemie (k)', 'Ionogramme Natremie (Na)', 'Bilan renal', 'Uree sanguine', 'Creatine sanguine', 'H', 'TP (inr) taux de prothrombine', 'TQ (inr) temps de quick', 'INR (Interational Normalizated Ratio)', 'TSH 3eme generation', 'FORMULE', 'Globules Rouges', 'Hemoglobine', 'Hematocrite', 'VGM', 'CCMH', 'TGMH', 'Globules Blancs', 'PN_neutrophiles', 'PNN', 'PN_Eosinophiles', 'PNE', 'PN_Basophiles', 'PNB', 'LYM', 'Lymphocytes', 'Monocytes', 'ABORATOIREDAMALYSESVE', 'MONO', '219BDELVALEKCH', 'Plaquettes', 'FOamaclah Sieiogi']


In [63]:
from pydantic import Field, BaseModel
from typing import List
from langchain_core.output_parsers import PydanticOutputParser


# Structured answer expected from the LLM: the subset of input strings it
# considers to be biological tests.
class FilteredData(BaseModel):
    biologicalTests: List[str] = Field(
        description="List of the filtered biological tests"
    )


# Parser for the model's reply; its `get_format_instructions()` text is injected
# into the prompt, so changes to FilteredData change what is sent to the LLM.
parser = PydanticOutputParser(pydantic_object=FilteredData)

In [64]:
from langchain_ollama.chat_models import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate


# Ollama chat model used to filter the OCR strings.
model = ChatOllama(
    model="gemma2:latest",
    base_url="100.108.14.46",
    keep_alive="10m",
    temperature=0.2,
)

system_prompt = """
you are a helpful agent specialized in analyzing content of biological test OCR. the user will give you an array of strings from an OCR of biological test results written in French. analyze this array keep only the elements referring to a biological test and return another array formed with the filtered elements.
"""


# The human turn carries the stringified OCR list followed by the parser's
# format instructions.
user_prompt = HumanMessagePromptTemplate.from_template(
    "{request}\n{format_instructions}"
)
chat_prompt = ChatPromptTemplate.from_messages(
    [SystemMessage(content=system_prompt), user_prompt]
)
request = chat_prompt.format_prompt(
    request=str(documentContent),
    format_instructions=parser.get_format_instructions(),
).to_messages()

# NOTE(review): the model is built with temperature=0.2 but invoked with
# temperature=0 — confirm which value the installed langchain_ollama applies.
results = model.invoke(request, temperature=0, format='json')
results_values: FilteredData | None = (
    parser.parse(results.content) if results else None  # type: ignore
)
# NOTE(review): raises AttributeError if `results` was falsy and
# `results_values` is therefore None.
results_values.biologicalTests # type: ignore

['lonogramme Kaliemie (k)',
 'Ionogramme Natremie (Na)',
 'Bilan renal',
 'Uree sanguine',
 'Creatine sanguine',
 'TP (inr) taux de prothrombine',
 'TQ (inr) temps de quick',
 'INR (Interational Normalizated Ratio)',
 'TSH 3eme generation',
 'FORMULE',
 'Globules Rouges',
 'Hemoglobine',
 'Hematocrite',
 'VGM',
 'CCMH',
 'TGMH',
 'Globules Blancs',
 'PN_neutrophiles',
 'PNN',
 'PN_Eosinophiles',
 'PNE',
 'PN_Basophiles',
 'PNB',
 'LYM',
 'Lymphocytes',
 'Monocytes',
 'Plaquettes']

In [65]:
# Tag every element whose text the LLM returned as a biological test.
if results_values:
    # Set membership keeps the lookup constant-time per element.
    biological_test_names = set(results_values.biologicalTests)
    for doc in document:
        # Numbers were excluded from the list sent to the LLM, so a match on a
        # number can only be a spurious echo; keep its `number` tag intact.
        if doc.typeOfContent == OCRElementEnumType.number:
            continue
        if doc.text in biological_test_names:
            doc.typeOfContent = OCRElementEnumType.biologyTest

# document

In [66]:
from langchain_huggingface import HuggingFaceEmbeddings
from tqdm import tqdm

import torch

# Embedding model name and the local directory used as its cache folder.
local_model_path = "./huggingFaceModels/paraphrase-multilingual-MiniLM-L12-v2"

# Other candidate models left commented out by the author:
# model_name = "mixedbread-ai/mxbai-embed-large-v1"
# model_name='microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract'
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Use the GPU when one is available; otherwise fall back to CPU so the cell
# still runs on machines without CUDA.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
encode_kwargs = {"normalize_embeddings": False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
    cache_folder=local_model_path,
)

In [67]:
from qdrant_client import QdrantClient

# Open the Qdrant client and record whether the parameter collection exists.
# NOTE(review): `collection_exists` is not checked by the visible cells before
# searching — confirm whether a guard is intended.
url = "100.108.14.46"
client = QdrantClient(url)
collection_exists = client.collection_exists(collection_name="biologyParametersStore")

In [68]:
for key, doc in enumerate(document):
    if doc.typeOfContent == OCRElementEnumType.biologyTest:

        query_vector = await hf.aembed_query(doc.text)

        results = client.search(
            collection_name="biologyParametersStore",
            query_vector=query_vector,
            limit=1,
            score_threshold=0.7,
        )
        formatted_results = [
            {
                "id": result.id,
                "score": result.score,
                "payload": result.payload,
            }
            for result in results
        ]
        if len(formatted_results) > 0:
            document[key].systemMapping = formatted_results[0]["payload"][
                "parameterName"
            ]
            pprint(document[key])

OCResponse(id='Text Box ID 1: lonogramme: Kaliemie (k)', text='lonogramme Kaliemie (k)', x=158.0, y=72.99999237060547, width=445.0, high=48.0, typeOfContent=<OCRElementEnumType.biologyTest: 'biologyTest'>, systemMapping='Potassium (K)', notFoundInSystemMapping=False, associatedNumberValueId=None, associatedNumberValue=None)
OCResponse(id='Text Box ID 3: Ionogramme: Natremie (Na)', text='Ionogramme Natremie (Na)', x=155.99996948242188, y=123.0, width=487.0000305175781, high=50.0, typeOfContent=<OCRElementEnumType.biologyTest: 'biologyTest'>, systemMapping='Sodium (Na)', notFoundInSystemMapping=False, associatedNumberValueId=None, associatedNumberValue=None)
OCResponse(id='Text Box ID 6: Uree sanguine', text='Uree sanguine', x=159.00001525878906, y=281.0, width=248.99998474121094, high=55.0, typeOfContent=<OCRElementEnumType.biologyTest: 'biologyTest'>, systemMapping='Blood Urea Nitrogen (BUN)', notFoundInSystemMapping=False, associatedNumberValueId=None, associatedNumberValue=None)
OCRe

In [69]:
# Every element tagged as a number at load time.
numericValues: List[OCResponse] = [
    element
    for element in document
    if element.typeOfContent == OCRElementEnumType.number
]

# For each biological test, attach the left-most number whose bottom edge lies
# within half the label's height of the label's own bottom edge.
for doc in document:
    if doc.typeOfContent != OCRElementEnumType.biologyTest:
        continue

    y_threshold = doc.high / 2
    relatedNumericalValues = sorted(
        (
            candidate
            for candidate in numericValues
            if abs((doc.y + doc.high) - (candidate.y + candidate.high))
            <= y_threshold
        ),
        key=lambda candidate: candidate.x,
    )
    if not relatedNumericalValues:
        continue

    relatedNumericalValue = relatedNumericalValues[0]
    doc.associatedNumberValue = relatedNumericalValue.text
    doc.associatedNumberValueId = relatedNumericalValue.id

    print(doc.text, doc.associatedNumberValue)

lonogramme Kaliemie (k) 3.76
Ionogramme Natremie (Na) 139.4
Uree sanguine 0.32
Creatine sanguine 7.3
TP (inr) taux de prothrombine 21.3
TQ (inr) temps de quick 34.3
INR (Interational Normalizated Ratio) 5.18
TSH 3eme generation 0.81
Globules Rouges 4.36
Hemoglobine 12.4
Hematocrite 37.8
VGM 86.8
CCMH 32.7
TGMH 28.4
Globules Blancs 4.1
PN_neutrophiles 2.6
PNN 64.0
PN_Eosinophiles 0.08
PNE 02
PN_Basophiles 00
PNB 00
LYM 26.2
Lymphocytes 1.1
Monocytes 0.4
