In [1]:
import json
import pandas as pd

In [2]:
# Load the export. JSON text is UTF-8 by specification, so the encoding is
# given explicitly instead of relying on the platform's locale encoding;
# otherwise accented characters in the documents can be mis-decoded on
# machines whose default encoding is not UTF-8.
with open('data/export_epita.json', encoding='utf-8') as f:
    data_list = json.load(f)  # expected to be a list with one entry per document

# Serialise each document back to a JSON string so that every DataFrame row
# carries one complete document in a single cell.
json_strings = [json.dumps(item) for item in data_list]

# One row per document, in a single 'json_string' column.
df = pd.DataFrame(json_strings, columns=['json_string'])

# Show the first few rows as a sanity check.
df.head()

Unnamed: 0,json_string
0,"{""0"": [{""line"": {""words"": [{""text"": ""cerfa"", ""..."
1,"{""0"": [{""line"": {""words"": [{""text"": ""Madame"", ..."
2,"{""0"": [{""line"": {""words"": [{""text"": ""CARTE"", ""..."
3,"{""0"": [{""line"": {""words"": [{""text"": ""SECURITY""..."
4,"{""0"": [{""line"": {""words"": [{""text"": ""Docteur"",..."


In [3]:
def has_document_type_id(input_dict, document_type_id):
    """Tell whether a nested structure contains a matching ``document_type_id``.

    The structure is searched at any depth for a dictionary entry whose key is
    ``"document_type_id"`` and whose value equals ``document_type_id``.
    Dictionaries and lists are both descended into, including lists nested
    directly inside other lists.

    Args:
        input_dict: The decoded document to search. Anything that is not a
            ``dict`` yields ``False``.
        document_type_id: The value to look for (compared with ``==``).

    Returns:
        ``True`` if a matching entry exists anywhere in the structure,
        ``False`` otherwise.
    """
    if not isinstance(input_dict, dict):
        return False

    # Explicit stack instead of recursion: every dict/list container found is
    # queued, so no nesting shape (dict-in-list, list-in-list, ...) is skipped
    # and deeply nested input cannot hit the interpreter's recursion limit.
    pending = [input_dict]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            for key, value in node.items():
                if key == "document_type_id" and value == document_type_id:
                    return True
                if isinstance(value, (dict, list)):
                    pending.append(value)
        else:
            # node is a list: only containers can hold further keys.
            pending.extend(
                child for child in node if isinstance(child, (dict, list))
            )
    return False

# Keep only the rows whose decoded document contains document_type_id == 8.
has_type_8 = df['json_string'].apply(
    lambda raw_json: has_document_type_id(json.loads(raw_json), 8)
)
df = df[has_type_8]

In [4]:
# Display the filtered frame (rows kept by the document_type_id filter above).
df

Unnamed: 0,json_string
5,"{""0"": [{""line"": {""words"": [{""text"": ""From"", ""w..."
75,"{""0"": [{""line"": {""words"": [{""text"": ""PARTIE"", ..."
99,"{""0"": [{""line"": {""words"": [{""text"": ""LABORATOI..."
117,"{""0"": [{""line"": {""words"": [{""text"": ""LE"", ""wid..."
119,"{""0"": [{""line"": {""words"": [{""text"": ""Mme"", ""wi..."
...,...
2477,"{""0"": [{""line"": {""words"": [{""text"": ""From"", ""w..."
2500,"{""0"": [{""line"": {""words"": [{""text"": ""22.5.2020..."
2530,"{""0"": [{""line"": {""words"": [{""text"": ""Laboratoi..."
2545,"{""0"": [{""line"": {""words"": [{""text"": ""Laboratoi..."


In [5]:
def _parse_numeric_token(token):
    """Return ``token`` as a dot-decimal numeric string, or ``None``.

    Accepts plain digits with at most one ``.`` (returned unchanged) and
    decimal-comma numbers such as ``"13,3"`` (returned as ``"13.3"``).
    """
    if token.replace(".", "", 1).isdigit():
        return token
    # Decimal comma: digits are required on both sides, so a number followed
    # by sentence punctuation ("3,") is not mistaken for a value.
    integer_part, comma, fraction_part = token.partition(",")
    if comma and integer_part.isdigit() and fraction_part.isdigit():
        return f"{integer_part}.{fraction_part}"
    return None


def find_unit_value(entity_name, entities, index, data):
    """Find a value and a unit for ``entity_name`` and record them in ``entities``.

    Lines are scanned from ``data[index]`` to the end of ``data``. The first
    numeric token found becomes the value and the first token containing one
    of the known unit markers becomes the unit; scanning stops once both are
    known. Nothing is recorded unless both were found.

    Args:
        entity_name: Label under which the result is stored.
        entities: Dictionary updated in place. The entry is stored under
            ``entity_name``, or under ``entity_name`` followed by the current
            number of keys when ``entity_name`` is already present.
        index: Position in ``data`` of the line where the search starts.
        data: List of items read as ``item["line"]["words"]``, a list of
            dictionaries each carrying a ``"text"`` string.

    Returns:
        None. The result is written into ``entities`` as
        ``{"value": <str>, "unit": <str>}``; a decimal-comma value is stored
        with a dot (``"13,3"`` -> ``"13.3"``), other tokens are stored as read.
    """
    # Lower-case substrings that mark a token as a unit.
    # TODO: choose the accepted units according to entity_name.
    # NOTE(review): this is a substring test, so any token merely containing
    # "fl" or "pg" (e.g. "pg/l") is accepted as a unit - confirm this is wanted.
    unit_markers = ("fl", "g/dl", "pg")

    value = None
    unit = None

    # Search the starting line and every line after it.
    # NOTE(review): value and unit are searched independently, so they may be
    # taken from two different lines.
    for next_index in range(index, len(data)):
        tokens = [word["text"] for word in data[next_index]["line"]["words"]]

        # First numeric token on the line, if no value has been found yet.
        if value is None:
            for token in tokens:
                parsed = _parse_numeric_token(token)
                if parsed is not None:
                    value = parsed
                    break

        # First unit-looking token on the line, if no unit has been found yet.
        if unit is None:
            for token in tokens:
                lowered = token.lower()
                if any(marker in lowered for marker in unit_markers):
                    unit = token
                    break

        # Stop as soon as both pieces are known.
        if value and unit:
            break

    # Record the pair only when both were found.
    if value and unit:
        if entity_name in entities:
            # Name already used: suffix it with the current number of keys.
            entities[entity_name + str(len(entities.keys()))] = {"value": value, "unit": unit}
        else:
            entities[entity_name] = {"value": value, "unit": unit}

# Collect the relevant entities and their associated value/unit.
# Labels are listed in priority order: at most one label (the first one found
# among the line's lower-cased words) is processed per line.
entity_labels = ("v.g.m", "hémoglobine", "t.c.m.h", "c.c.m.h")

entities = {}
for raw_document in df['json_string']:
    for lines in json.loads(raw_document).values():
        # Only list values are treated as lists of line items.
        if type(lines) is not list:
            continue
        for position, entry in enumerate(lines):
            lowered_words = [word["text"].lower() for word in entry["line"]["words"]]
            matched_label = next(
                (label for label in entity_labels if label in lowered_words),
                None,
            )
            if matched_label is not None:
                find_unit_value(matched_label, entities, position, lines)

# Print the extracted information, one blank-line-separated record per entity.
for entity, info in entities.items():
    print(
        f"{entity}:",
        f"Valeur: {info['value']}",
        f"Unité: {info['unit']}",
        "",
        sep="\n",
    )

hémoglobine:
Valeur: 13.3
Unité: g/dL

v.g.m:
Valeur: 90
Unité: fL

t.c.m.h:
Valeur: 29.5
Unité: pg

c.c.m.h:
Valeur: 33.0
Unité: g/dL

hémoglobine4:
Valeur: 93
Unité: g/dL

hémoglobine5:
Valeur: 91
Unité: pg

v.g.m6:
Valeur: 91
Unité: pg

c.c.m.h7:
Valeur: 69.4
Unité: pg

t.c.m.h8:
Valeur: 69.4
Unité: pg

hémoglobine9:
Valeur: 290
Unité: g/dl

hémoglobine10:
Valeur: 12.0
Unité: g/dL

hémoglobine11:
Valeur: 91
Unité: g/dL

hémoglobine12:
Valeur: 88
Unité: pg

v.g.m13:
Valeur: 88
Unité: pg

c.c.m.h14:
Valeur: 1
Unité: pg

t.c.m.h15:
Valeur: 1
Unité: pg

hémoglobine16:
Valeur: 85
Unité: g/dL

hémoglobine17:
Valeur: 11.5
Unité: g/dL

hémoglobine18:
Valeur: 13.0
Unité: g/dL

c.c.m.h19:
Valeur: 31.0
Unité: g/dL

hémoglobine20:
Valeur: 39
Unité: pg

hémoglobine21:
Valeur: 34
Unité: pg

hémoglobine22:
Valeur: 39
Unité: pg

hémoglobine23:
Valeur: 13.8
Unité: fL

hémoglobine24:
Valeur: 90
Unité: pg/l

v.g.m25:
Valeur: 90
Unité: pg/l

c.c.m.h26:
Valeur: 56.4
Unité: pg/l

t.c.m.h27:
Valeur: 56.4
