In [1]:
import pandas as pd

# Set every listed pandas display option to None so that displayed frames are
# not cut off by the default row / column / cell-width caps (for
# "display.width", None asks pandas to detect the width itself).
UNLIMITED_DISPLAY_OPTIONS = (
    "display.max_rows",
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
)
for option_name in UNLIMITED_DISPLAY_OPTIONS:
    pd.set_option(option_name, None)

In [2]:
import sys
from pathlib import Path

# Make the parent directory importable before importing from `src` below
# (presumably the project-local `src` package lives there - TODO confirm).
sys.path.append('../')

from llama_index.readers.file import MarkdownReader
from src.markdown_element import MarkdownElementNodeParser

# Read the cleaned Markdown export of the product charter with MarkdownReader.
charte_markdown_path = Path("../output/CDP_CHARTE_PRODUITS__llamaParse_cleaned.md")
reader = MarkdownReader()
md_doc = reader.load_data(charte_markdown_path)

In [3]:
import asyncio
import nest_asyncio
import uvloop

# Patch asyncio with nest_asyncio unless the current event loop is a uvloop
# loop, in which case the patch is skipped.
current_loop = asyncio.get_event_loop()
loop_is_uvloop = isinstance(current_loop, uvloop.Loop)
if not loop_is_uvloop:
    nest_asyncio.apply()

In [42]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings

# Build the embedding model and the LLM, then register both as the global
# llama-index defaults through `Settings`.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-4o")
Settings.llm, Settings.embed_model = llm, embed_model

# The element parser receives its own, separately constructed gpt-4o client.
parser_llm = OpenAI(model="gpt-4o")
node_parser = MarkdownElementNodeParser(llm=parser_llm)

In [43]:
import os
import pickle

# Single source of truth for the cache location. The original cell tested
# "./output/charte.pkl" but wrote to and read from "../output/charte.pkl", so
# the cache check never matched the file that was actually produced.
CHARTE_CACHE_PATH = "../output/charte.pkl"

if os.path.exists(CHARTE_CACHE_PATH):
    # Reuse the previously parsed nodes instead of re-running the parser.
    # NOTE: unpickling executes arbitrary code; only load cache files that were
    # produced by this notebook.
    with open(CHARTE_CACHE_PATH, "rb") as cache_file:
        raw_nodes_charte = pickle.load(cache_file)
else:
    # No cache yet: parse the document and persist the result for later runs.
    raw_nodes_charte = node_parser.get_nodes_from_documents(md_doc, show_progress=False)
    with open(CHARTE_CACHE_PATH, "wb") as cache_file:
        pickle.dump(raw_nodes_charte, cache_file)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
2it [00:00, 46345.90it/s]
100%|██████████| 1/1 [00:02<00:00,  2.42s/it]
1it [00:00, 29330.80it/s]
0it [00:00, ?it/s]
1it [00:00, 31300.78it/s]
0it [00:00, ?it/s]
1it [00:00, 35848.75it/s]
0it [00:00, ?it/s]
3it [00:00, 93902.33it/s]
100%|██████████| 1/1 [00:04<00:00,  4.45s/it]
1it [00:00, 45590.26it/s]
0it [00:00, ?it/s]
1it [00:00, 43690.67it/s]
0it [00:00, ?it/s]
1it [00:00, 38479.85it/s]
0it [00:00, ?it/s]
1it [00:00, 47662.55it/s]
0it [00:00, ?it/s]
1it [00:00, 49932.19it/s]
0it [00:00, ?it/s]
1it [00:00, 49932.19it/s]
0it [00:00, ?it/s]
1it [00:00, 55924.05it/s]
0it [00:00, ?it/s]
1it [00:00, 37117.73it/s]
0it [00:00, ?it/s]
1it [00:00, 53092.46it/s]
0it [00:00, ?it/s]
1it [00:00, 59918.63it/s]
0it [00:00, ?it/s]
1it [00:00, 52428.80it/s]
0it [00:00, ?it/s]
1it [00:00, 55924.05it/s]
0it [00:00, ?it/s]
2it [00:00, 68759.08it/s]
100%|██████████| 1/1 [00:03<00:00,  3.15s/it]
2it [00:00, 47662.55it/s]
100%|██████████| 1/1 [00:03<00:00,  3.25s/it]

In [44]:
# Ask the parser for the base nodes plus the mapping that is later passed to
# the recursive retriever as `node_dict`, then unpack the pair.
charte_nodes_and_mappings = node_parser.get_base_nodes_and_mappings(raw_nodes_charte)
base_nodes_charte, node_mappings_charte = charte_nodes_and_mappings

In [74]:
from llama_index.core.schema import IndexNode, TextNode, MetadataMode
# Keep only the TextNode instances and take the second one as the example
# (raises IndexError when fewer than two are present).
text_nodes = [node for node in base_nodes_charte if isinstance(node, TextNode)]
example_index_node = text_nodes[1]

# Show the node content together with all of its metadata.
example_content = example_index_node.get_content(metadata_mode=MetadataMode.ALL)
print(f"\n--------\n{example_content}\n--------\n")


--------
col_schema: Column: Date
Type: Date
Summary: None

Column: Mises à jour
Type: String
Summary: None

This table outlines the updates made to the product charter for Coup de Pâtes, detailing changes and additions from its creation in 2014 to the latest update in December 2023.,
with the following table title:
CHARTE PRODUITS COUP DE PATES,
with the following columns:
- Date: None
- Mises à jour: None
--------



In [48]:
import pandas as pd
from llama_index.core.schema import IndexNode
import ast

# Collect the IndexNode instances among the base nodes.
index_nodes: list[IndexNode] = [
    node for node in base_nodes_charte if isinstance(node, IndexNode)
]

# The third index node's relationship stored under key '3' carries the table
# as a string in its "table_df" metadata entry
# (presumably '3' is NodeRelationship.NEXT - TODO confirm).
related_node = index_nodes[2].relationships['3']
data = related_node.metadata["table_df"]

# The string is a Python literal; evaluate it safely and rebuild the frame.
data_dict = ast.literal_eval(data)
df = pd.DataFrame(data_dict)
df

Unnamed: 0,Additif,Code
0,Tartrazine,E 102
1,Jaune de quinoléine,E 104
2,Sunset Yellow FCF/Jaune orange S,E 110
3,"Azorubine, carmoisine",E 122
4,Amarante,E 123
5,"Ponceau 4R, rouge cochenille A",E 124
6,Erythrosine,E 127
7,Rouge allura AC,E 129
8,"Indigotine, carmin d’indigo",E 132
9,Bleu brillant FCF,E 133


In [49]:
from llama_index.core.schema import IndexNode, TextNode, MetadataMode
# Take the third IndexNode among the base nodes as the example.
index_node_candidates = [
    node for node in base_nodes_charte if isinstance(node, IndexNode)
]
example_index_node = index_node_candidates[2]

# Index node: content including all metadata.
index_node_content = example_index_node.get_content(metadata_mode=MetadataMode.ALL)
print(f"\n--------\n{index_node_content}\n--------\n")

# Index node ID.
print(f"\n--------\nIndex ID: {example_index_node.index_id}\n--------\n")

# Referenced table: the node stored in the mappings under that index ID.
referenced_node = node_mappings_charte[example_index_node.index_id]
print(f"\n--------\n{referenced_node.get_content()}\n--------\n")


--------
col_schema: Column: Additif
Type: string
Summary: None

Column: Code
Type: string
Summary: None

Column: Additif
Type: string
Summary: None

Column: Code
Type: string
Summary: None

This table lists various food additives along with their corresponding codes, highlighting those that have been reported to have potential carcinogenicity or involvement in serious diseases.,
with the following table title:
ANNEXE 1 : Additifs rouges : additifs pour lesquels les rapports scientifiques rapportent une potentielle cancérogénicité ou une implication dans les pathologies lourdes,
with the following columns:
- Additif: None
- Code: None
- Additif: None
- Code: None
--------


--------
Index ID: ffd4ef71-f19e-43bb-bd6d-4e013d846d18
--------


--------
This table lists various food additives along with their corresponding codes, highlighting those that have been reported to have potential carcinogenicity or involvement in serious diseases.,
with the following table title:
ANNEXE 1 : Addit

In [55]:
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import VectorStoreIndex

# Build the top-level vector index over the base nodes, then derive a query
# engine and a retriever from it; both return the 5 most similar nodes.
SIMILARITY_TOP_K = 5
vector_index = VectorStoreIndex(base_nodes_charte)
vector_query_engine = vector_index.as_query_engine(similarity_top_k=SIMILARITY_TOP_K)
vector_retriever = vector_index.as_retriever(similarity_top_k=SIMILARITY_TOP_K)

In [56]:
from llama_index.core.retrievers import RecursiveRetriever

# Retrieval starts at the "vector" retriever; `node_dict` supplies the node
# mappings produced by the parser. verbose=True prints the retrieval trace.
retrievers_by_id = {"vector": vector_retriever}
recursive_retriever = RecursiveRetriever(
    "vector",
    retriever_dict=retrievers_by_id,
    node_dict=node_mappings_charte,
    verbose=True,
)

# Wrap the recursive retriever in a query engine with default arguments.
query_engine = RetrieverQueryEngine.from_args(recursive_retriever)

In [57]:
# Ask for the code and colour classification of one additive and print the
# answer (print() applies str() to the response).
question = "Quel est le code et la classification (rouge, vert, orange) du Chlorure d'étain?"
response = query_engine.query(question)
print(response)

[1;3;34mRetrieving with query id None: Quel est le code et la classification (rouge, vert, orange) du Chlorure d'étain?
[0m[1;3;38;5;200mRetrieved node with id, entering: a9d5552e-9383-4754-9e39-5d408b2744a6
[0m[1;3;34mRetrieving with query id a9d5552e-9383-4754-9e39-5d408b2744a6: Quel est le code et la classification (rouge, vert, orange) du Chlorure d'étain?
[0m[1;3;38;5;200mRetrieved node with id, entering: ffd4ef71-f19e-43bb-bd6d-4e013d846d18
[0m[1;3;34mRetrieving with query id ffd4ef71-f19e-43bb-bd6d-4e013d846d18: Quel est le code et la classification (rouge, vert, orange) du Chlorure d'étain?
[0m[1;3;38;5;200mRetrieved node with id, entering: f8b501cb-c096-4809-9af2-f9afef3d91bf
[0m[1;3;34mRetrieving with query id f8b501cb-c096-4809-9af2-f9afef3d91bf: Quel est le code et la classification (rouge, vert, orange) du Chlorure d'étain?
[0m[1;3;38;5;200mRetrieving text node: Table des matières

1. Exigence recette .................................................. 2  
2.

In [58]:
# Ask for the code and colour classification of one additive and print the
# answer (print() applies str() to the response).
question = "Quel est le code et la classification (rouge, vert, orange) du Bleu patenté V?"
response = query_engine.query(question)
print(response)

[1;3;34mRetrieving with query id None: Quel est le code et la classification (rouge, vert, orange) du Bleu patenté V?
[0m[1;3;38;5;200mRetrieved node with id, entering: a9d5552e-9383-4754-9e39-5d408b2744a6
[0m[1;3;34mRetrieving with query id a9d5552e-9383-4754-9e39-5d408b2744a6: Quel est le code et la classification (rouge, vert, orange) du Bleu patenté V?
[0m[1;3;38;5;200mRetrieving text node: CHARTE PRODUITS COUP DE PATES

**Codification**: CDP_QUA_CHART_01  
**Version**: 5  
**Date d'application**: 13/12/2023
[0m[1;3;38;5;200mRetrieving text node: Table des matières

1. Exigence recette .................................................. 2  
2. Produits soumis à certification ou allégations ........ 3  
   2.1. Produits « sans gluten » ................................................. 3  
   2.2. Produits issus de l’agriculture biologique ............ 3  
3. Exigences générales relatives au fournisseur .......... 4  
4. Exigences relatives aux sites de production ...........

In [60]:
# Ask whether the additive is classified orange, what its type is otherwise,
# and what its E code is; print the answer (print() applies str() to it).
question = "Est ce que la Cire de carnauba est un additif de type orange? Sinon de quel type est il? Quelle est son code E ?"
response = query_engine.query(question)
print(response)

[1;3;34mRetrieving with query id None: Est ce que la Cire de carnauba est un additif de type orange? Sinon de quel type est il? Quelle est son code E ?
[0m[1;3;38;5;200mRetrieved node with id, entering: ffd4ef71-f19e-43bb-bd6d-4e013d846d18
[0m[1;3;34mRetrieving with query id ffd4ef71-f19e-43bb-bd6d-4e013d846d18: Est ce que la Cire de carnauba est un additif de type orange? Sinon de quel type est il? Quelle est son code E ?
[0m[1;3;38;5;200mRetrieving text node: ANNEXE II : Additifs oranges : additifs pour lesquels les rapports scientifiques sont contradictoires
[0m[1;3;38;5;200mRetrieved node with id, entering: f8b501cb-c096-4809-9af2-f9afef3d91bf
[0m[1;3;34mRetrieving with query id f8b501cb-c096-4809-9af2-f9afef3d91bf: Est ce que la Cire de carnauba est un additif de type orange? Sinon de quel type est il? Quelle est son code E ?
[0m[1;3;38;5;200mRetrieving text node: Table des matières

1. Exigence recette .................................................. 2  
2. Produit