In [None]:
## Read PDF files
import nest_asyncio

nest_asyncio.apply()
#GET LLAMA_CLOUD_API_KEY
import os
from llama_parse import LlamaParse

# Read the LlamaCloud key from the environment; os.getenv returns None when the variable is unset.
api_key = os.getenv("LLAMA_CLOUD_API_KEY")

# Only referenced by the commented-out `parsing_instruction` argument below.
parsing_instructions = "This is a documents that contains tables. Extract the tables in the right format. "

parser = LlamaParse(
    api_key= api_key,  # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type="markdown",  # "markdown" and "text" are available
    num_workers=4,  # if multiple files passed, split in `num_workers` API calls
    verbose=True,
    language="fr",  # Optionally you can define a language, default=en
    #parsing_instruction=parsing_instructions,  # Optionally you can define a parsing instruction
)
# Synchronous call: parse the PDF and keep the returned documents for later cells.
documents = parser.load_data("./docs/BTM_paper.pdf")
# Prints the full list of parsed documents (can be very long).
print(documents)


### Parse PDF using pdf2docx

In [None]:
from pdf2docx import Converter

pdf_file = "./docs/rib-chloe.pdf"
# No output path is given to convert(); presumably pdf2docx derives the .docx
# name from the PDF path — confirm against the library documentation.
cv = Converter(pdf_file)
try:
    cv.convert()  # type: ignore
finally:
    # Always release the underlying PDF handle, even if the conversion fails.
    cv.close()

In [None]:
# from docx import Document

# def formatTable(table):
#     formatted_table = []
#     for row in table.rows:
#         list_row = []
#         for cell in row.cells:
#             for paragraph in cell.paragraphs:
#                 list_row.append(paragraph.text)
#         formatted_table.append(tuple(list_row))
#     return tuple(formatted_table)

In [None]:
# document = Document('./docs/CHARTE_QUALITE_PRODUIT2.docx')
# unique_tables = set()
# pruned_tables = []
# for i,table in enumerate(document.tables):
#     formatted_table = formatTable(table)
#     if formatted_table not in unique_tables:
#         pruned_tables.append(formatted_table)
#         unique_tables.add(formatted_table)
#     else:
#         print("Duplicate table found")
#         # Remove the table's associated XML elements
#         table._element.getparent().remove(table._element)

# print(pruned_tables)
#document.save('./docs/MODIFIED_CHARTE_QUALITE.docx')

In [None]:
import mammoth
from markdownify import markdownify as md

# Convert the .docx to HTML with mammoth (the source document is read as bytes).
with open('./docs/CHARTE_QUALITE_PRODUIT2.docx', "rb") as docx_file:
    result = mammoth.convert_to_html(docx_file)

# Persist the generated HTML. `result.value` is text, so the file must be opened
# in text *write* mode ("rb" cannot be written to and fails if the file is missing).
with open('./docs/CHARTE_QUALITE_PRODUIT2.html', "w") as html_file:
    html_file.write(result.value)

# Convert the same HTML to Markdown and save it next to the HTML file.
md_result = md(result.value)
with open('./docs/CHARTE_QUALITE_PRODUIT2.md', "w") as md_file:
    md_file.write(md_result)

print(f"DOC LENGTH : {len(md_result)} characters")



In [None]:
from bs4 import BeautifulSoup

def remove_duplicate_tables(html_file):
    """Parse an HTML file and drop every table that repeats an earlier one.

    Two tables are considered duplicates when their serialized HTML
    (``str(table)``) is identical. The first occurrence, in document order,
    is kept; later occurrences are extracted from the tree.

    Args:
        html_file: Path of the HTML file to read.

    Returns:
        BeautifulSoup: The parsed document with duplicate tables removed.
    """
    with open(html_file, 'r') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    already_seen = set()
    # Tables are visited in document order, so the first copy always survives.
    for candidate in soup.find_all('table'):
        serialized = str(candidate)
        if serialized not in already_seen:
            already_seen.add(serialized)
            continue
        candidate.extract()  # repeated table: detach it from the document
    return soup

def remove_page_status(soup):
    """Remove page markers: every element whose own text contains "Page".

    For each text node containing the (case-sensitive) substring "Page", the
    node's direct parent element is extracted from the tree, together with
    everything else that parent contains.

    Args:
        soup (BeautifulSoup): Parsed document; modified in place.

    Returns:
        BeautifulSoup: The same ``soup`` object, for chaining.
    """
    # `string=` is the current name of the filter; `text=` is its deprecated alias.
    page_elements = soup.find_all(string=lambda text: text is not None and 'Page' in text)

    for page_element in page_elements:
        parent = page_element.find_parent()
        if parent is None or parent is soup:
            # Text sitting directly under the document root: extracting the
            # root would leave the text in place, so drop the string itself.
            page_element.extract()
        else:
            parent.extract()

    return soup

def _first_row_width(table):
    """Return the number of th/td cells in the table's first row, or None if it has no row."""
    first_row = table.find('tr')
    if first_row is None:
        return None
    return len(first_row.find_all(['th', 'td']))


def merge_adjacent_tables(soup):
    """
    Merge adjacent tables with the same number of columns and unnest nested tables in the HTML content.

    Unnesting: every table that contains other tables is replaced by those
    inner tables (kept in document order); the outer table itself, including
    its remaining content, is removed.

    Merging: two tables are merged when the second is the immediate next
    sibling of the first and their first rows have the same number of cells.
    Tables without any row are never merged.

    Args:
    soup (BeautifulSoup): The BeautifulSoup object representing the parsed HTML.

    Returns:
    BeautifulSoup: The modified BeautifulSoup object with merged tables and unnested tables.
    """

    # Unnest nested tables
    for table in soup.find_all('table'):
        nested_tables = table.find_all('table')
        if nested_tables:
            # insert_after() places each element immediately after `table`, so
            # walk the nested tables backwards to preserve document order.
            for nested_table in reversed(nested_tables):
                table.insert_after(nested_table.extract())
            table.extract()  # Remove the outer table

    # Find all tables again after unnesting
    tables = soup.find_all('table')

    # Merge adjacent tables with the same number of columns
    i = 0
    while i < len(tables) - 1:
        current_table = tables[i]
        next_table = tables[i + 1]

        # Identity check: Tag equality compares structure, not position in the tree.
        if current_table.next_sibling is next_table:
            current_table_cols = _first_row_width(current_table)
            if current_table_cols is not None and current_table_cols == _first_row_width(next_table):
                # Move the rows of the next table into the current one
                current_table.extend(next_table.find_all('tr'))
                next_table.extract()
                tables.pop(i + 1)  # Remove the merged table from the list
                # Stay on the same index: the new neighbour may be mergeable too.
                continue

        i += 1

    return soup


In [None]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser

# Structured answer expected from the LLM for one HTML table.
# The Field descriptions below are passed to the output parser built from this model.
class TableValidator(BaseModel):
    # Free-text summary of what the table contains.
    description: str = Field(description="Description of the table content")
    # Why the table is invalid and what should be changed.
    to_change: str = Field(description="Explanation of invalidity and changes to be made to the table content if it is invalid")
    # Validity verdict; process_tables only replaces tables for which this is False.
    is_valid: bool = Field(description="Validation status of the table")
    # Replacement HTML; parsed with BeautifulSoup when the table is replaced.
    correction: str = Field(description="Correction of the input html table")

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Output parser that turns the LLM answer into a TableValidator instance.
# NOTE: this rebinds `parser`, which the first cell bound to the LlamaParse instance.
parser = PydanticOutputParser(pydantic_object= TableValidator)
# Prompt text; `{format_instructions}` and `{table}` are the two template variables.
template = """You are an HTML table cleaning bot. Your task is to analyze the provided HTML content and determine if it is a valid table.

If you determine that the HTML is not a valid table (i.e., the content doesn't fit the structure or semantics of a table), you should modify the HTML by removing the table tags and converting the content into a more appropriate HTML structure, such as paragraphs or lists.
Do not force change if it is readable like this.\n
If everything is correct but a line, modify the line to make it correct, do not delete it. Think about the homogeneity of the table.\n
In description you should provide a brief description of the table and if incorrect, a clear explaination of why it is incorrect and how it may impact the reader comprehension.\n
Note that is_valid cannot be true if at least one element is not correctly formatted.\n
If it looks like a text that could me simply put in a paragraph do it.\n
If you determine that the HTML is a normal table and doesn't require any modifications, you should return the entire original HTML in the output.\n
{format_instructions}\n
Return only the JSON object.
Here is the table:
{table}"""

# `table` is supplied per call; `format_instructions` is filled once from the parser.
prompt = PromptTemplate(
    template=template,
    input_variables=["table"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)
llm = ChatOpenAI(model="gpt-4o", temperature = 0.0)
# Pipeline used by process_tables: prompt -> chat model -> parsed TableValidator.
llm_chain = prompt | llm | parser

In [None]:
import tqdm
def process_tables(soup, max_table_chars=10000):
    """Ask the LLM chain to validate each table and replace the invalid ones.

    Every ``<table>`` whose serialized HTML is shorter than ``max_table_chars``
    is sent to ``llm_chain``. When the returned ``TableValidator`` reports
    ``is_valid`` as False, the table is replaced in the tree by the parsed
    ``correction`` HTML. Larger tables are left untouched.

    Args:
        soup (BeautifulSoup): Parsed document; modified in place.
        max_table_chars (int): Exclusive upper bound on the serialized table
            length for a table to be sent to the LLM. Defaults to 10000, the
            value previously hard-coded.

    Returns:
        BeautifulSoup: The same ``soup`` object, for chaining.
    """
    for table in tqdm.tqdm(soup.find_all('table')):
        table_str = str(table)
        if len(table_str) >= max_table_chars:
            # Too large for a single LLM call: keep the table unchanged.
            continue
        verdict: TableValidator = llm_chain.invoke({"table": table_str})
        if not verdict.is_valid:
            # Swap the original table for the LLM-proposed HTML.
            table.replace_with(BeautifulSoup(verdict.correction, 'html.parser'))

    return soup


In [None]:
# Run the HTML cleaning steps in order on the mammoth-generated file.
# NOTE: `result` is rebound here; from this point it holds the BeautifulSoup tree.
result = remove_duplicate_tables('./docs/CHARTE_QUALITE_PRODUIT2.html')
result = remove_page_status(result)
result = merge_adjacent_tables(result)


In [None]:
result = process_tables(result)

In [None]:
# Convert the cleaned HTML tree to Markdown (this overwrites the earlier `md_result`).
md_result = md(str(result))
# Save the cleaned HTML to file.

with open('./docs/CHARTE_QUALITE_PRODUIT2_Processed.html', "w") as html_file:
    html_file.write(str(result))
# Save the Markdown version to file; later cells read this path back.
with open('./docs/CHARTE_QUALITE_PRODUIT2_Processed.md', "w") as md_file:
    md_file.write(md_result)

### Parse PDF using UnstructuredPDFLoader

In [None]:
from langchain.document_loaders import UnstructuredPDFLoader

# Load the PDF with the "hi_res" strategy (presumably layout-aware and slower — confirm in the loader docs).
loader = UnstructuredPDFLoader("./docs/CHARTE_QUALITE_PRODUIT2.pdf", strategy="hi_res")
# `doc` is the list of loaded documents; later cells index it with doc[0].
doc = loader.load()


In [None]:
from typing import List

def summary(docs):
    """Print a short report about a list of loaded documents.

    Prints the number of documents, the total number of characters across
    their ``page_content``, and the first 100 characters of the first
    document. An empty list prints the two counts and no preview.

    Args:
        docs: Sequence of objects exposing a ``page_content`` string.
    """
    total_chars = sum(len(doc.page_content) for doc in docs)
    print(f"There are {len(docs)} documents")
    print(f"There are {total_chars} characters in the documents")
    if not docs:
        # Nothing to preview; avoids an IndexError on docs[0].
        return
    print()
    # The preview starts on its own line (the newline escape had been lost).
    print(f"Preview : \n {docs[0].page_content[:100]}")
summary(doc)


In [None]:
# Dump the raw extracted text; only the first loaded document (doc[0]) is written.
with open('./docs/CHARTE_QUALITE_PRODUIT2.txt', "w") as txt_file:
    txt_file.write(doc[0].page_content)

### Post Process the loaded content

In [None]:
import re
def remove_headers_footers(ocr_text, min_length=40, min_repetitions=3):
    """Blank out lines that repeat throughout the text (running headers/footers).

    The text is split on newlines. A line is a removal candidate when it is at
    least ``min_length`` characters long and is not a Markdown table separator
    row (``| --- | --- |``). Candidates occurring at least ``min_repetitions``
    times are replaced by empty lines. Only whole lines are affected: the same
    text appearing inside a longer line is left untouched.

    Args:
        ocr_text (str): Newline-separated text to clean.
        min_length (int): Minimum line length for a line to be considered.
        min_repetitions (int): Minimum number of occurrences for removal.

    Returns:
        str: The text with repeated lines emptied. The number of lines is
        preserved and no whitespace is stripped.
    """
    segments = ocr_text.split("\n")
    # Markdown table separator rows repeat legitimately and must be kept.
    separator_row = re.compile(r'^\|( --- \|)*$')

    # Count occurrences of each candidate line.
    occurrences = {}
    for segment in segments:
        if len(segment) >= min_length and separator_row.match(segment) is None:
            occurrences[segment] = occurrences.get(segment, 0) + 1

    segments_to_remove = [
        segment for segment, count in occurrences.items() if count >= min_repetitions
    ]

    # Show what is about to be removed so the thresholds can be tuned.
    print(segments_to_remove)

    # Empty whole lines only; a substring replace would also mangle longer
    # lines that merely contain a repeated segment.
    doomed = set(segments_to_remove)
    return "\n".join("" if segment in doomed else segment for segment in segments)





In [None]:
cleaned_text = remove_headers_footers(md_result, min_length=10, min_repetitions=5)
#cleaned_text = documents[0].text

In [None]:
import re
# Replace every line that starts with "Page" (case-sensitive) by a newline.
cleaned_text = re.sub(r"^Page.*$", "\n", cleaned_text, flags=re.MULTILINE)
# Collapse any run of two or more newlines into exactly two.
cleaned_text =  re.sub(r"\n\n+", "\n\n", cleaned_text)
#cleaned_text =  re.sub(r"---\n","", cleaned_text) #delete the "---" that are present in the text (page jump)
print(f"Cleaned text LENGTH : {len(cleaned_text)} characters")
# Preview of the first 1000 characters.
print(cleaned_text[0:1000])

In [None]:
# Check the type before opening, so a non-string value does not create or
# truncate the output file.
if isinstance(cleaned_text, str):
    with open("./docs/CHARTE_QUALITE_PRODUIT_non_traite.md", "w") as text_file:
        text_file.write(cleaned_text)

In [None]:
# # Ask an llm to rewrite the text in a more readable & understandable way
# from langchain_core.prompts import PromptTemplate
# from langchain_openai import ChatOpenAI

# template = """You are a markdown file reformater. Only touch markdown tags so it is more readable. Note that title, subtitles and annexes must have a '#', '##', .. Do NOT change the content (do not change or add titles) nor the order of the elements, everything must stay in the file. Keep the language of the sources provided (french). Try to think intelligently on the layout.\n Format the following source: {text} \n\n"""

# prompt = PromptTemplate.from_template(template)

# llm = ChatOpenAI(model="gpt-4-turbo", temperature = 0.0)

# llm_chain = prompt | llm


In [None]:
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=5000,
#     chunk_overlap=20,
#     length_function=len,
#     is_separator_regex=True,
#     keep_separator=False,
#     separators=[
        
#         r'\|\n([^|])',
#         "\n\n",
#         "\n",
#         " ",
#     ],
# )
# split_text = text_splitter.split_text(cleaned_text)
# split_text

In [None]:
# import tqdm
# llm_cleaned_text = ""
# for chunk in tqdm.tqdm(split_text):
#     llm_cleaned_chunk = llm_chain.invoke(chunk)
#     if type(llm_cleaned_chunk.content) == str:
#         llm_cleaned_text += llm_cleaned_chunk.content

# with open("./docs/CHARTE_QUALITE_PRODUIT.md", "w") as text_file:
#     if type(llm_cleaned_text) == str:
#         text_file.write(llm_cleaned_text)


### Use LangChain to chunk the data correctly

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
# Read back the processed Markdown written by the HTML cleaning pipeline.
with open("./docs/CHARTE_QUALITE_PRODUIT2_Processed.md", "r") as text_file:
    markdown_text = text_file.read()

# Treat "**" as the level-1 header marker.
# NOTE(review): the splitter appears to expect the marker to be followed by a
# space (as in "# Title"); bold lines such as "**ANNEXE I ...**" may therefore
# never be recognised as headers — confirm by inspecting len(md_header_splits).
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("**", "Header 1")])
md_header_splits = markdown_splitter.split_text(markdown_text)
md_header_splits


In [None]:
len(md_header_splits)

### Llama index Joint Tabular/Semantic QA

In [1]:
import pandas as pd

# Relax pandas display limits so tables are shown without truncation.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

In [2]:
from llama_index.readers.file import MarkdownReader
from pathlib import Path

# Load the processed Markdown file; `charte` is the input to the node parser below.
reader = MarkdownReader()
charte = reader.load_data(Path("./docs/CHARTE_QUALITE_PRODUIT2_Processed.md"))

In [5]:
#from llama_index.core.node_parser import MarkdownElementNodeParser
from markdown_element import MarkdownElementNodeParser

node_parser = MarkdownElementNodeParser()

In [4]:
import asyncio
import nest_asyncio
import uvloop

# Apply nest_asyncio only when the current event loop is not a uvloop loop
# (presumably because nest_asyncio cannot patch uvloop loops — confirm).
if not isinstance(asyncio.get_event_loop(), uvloop.Loop):
    nest_asyncio.apply()

In [3]:
import os
import pickle

# Parse the document into nodes once and cache the result on disk; later runs
# reuse the cache instead of parsing again.
if not os.path.exists("./docs/charte.pkl"):
    raw_nodes_charte = node_parser.get_nodes_from_documents(charte, show_progress= True)
    # Context manager guarantees the cache file is flushed and closed.
    with open("./docs/charte.pkl", "wb") as cache_file:
        pickle.dump(raw_nodes_charte, cache_file)
else:
    # NOTE: pickle.load can execute arbitrary code; only load caches produced
    # by this notebook.
    with open("./docs/charte.pkl", "rb") as cache_file:
        raw_nodes_charte = pickle.load(cache_file)

In [6]:
base_nodes_charte, node_mappings_charte = node_parser.get_base_nodes_and_mappings(
    raw_nodes_charte
)

In [7]:
from llama_index.core.schema import IndexNode
# Pick the sixth IndexNode among the base nodes as an example
# (raises IndexError if fewer than six IndexNodes exist).
example_index_node = [b for b in base_nodes_charte if isinstance(b, IndexNode)][
    5
]

# Index node content, including all metadata
print(
    f"\n--------\n{example_index_node.get_content(metadata_mode='all')}\n--------\n"
)
# Index node ID
print(f"\n--------\nIndex ID: {example_index_node.index_id}\n--------\n")
# Referenced table: the node the index ID maps to
print(
    f"\n--------\n{node_mappings_charte[example_index_node.index_id].get_content()}\n--------\n"
)


--------
col_schema: Column: Additive Name
Type: string
Summary: None

Column: Additive Code
Type: string
Summary: None

List of orange food additives with conflicting scientific reports.,
with the following table title:
Additifs oranges : additifs pour lesquels les rapports scientifiques sont contradictoires,
with the following columns:
- Additive Name: None
- Additive Code: None
--------


--------
Index ID: d3379ac8-ba1c-46ba-ab6b-81c13cfa1841
--------


--------
List of orange food additives with conflicting scientific reports.,
with the following table title:
Additifs oranges : additifs pour lesquels les rapports scientifiques sont contradictoires,
with the following columns:
- Additive Name: None
- Additive Code: None

| Esters lactiques des mono- et diglycérides d’acides gras | E 472b |
|---|---|
| Esters citriques des mono- et diglycérides d’acides gras | E 472c |
| Esters tartriques des mono- et diglycérides d’acides gras | E 472d |
| Esters monoacétyltartriques et diacétylta

### Recursive Retriever

In [8]:
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import VectorStoreIndex

In [13]:
# construct top-level vector index + query engine
# Build the top-level vector index over the base nodes, plus a retriever and a
# baseline query engine that both return the 10 most similar nodes.
vector_index = VectorStoreIndex(base_nodes_charte)
vector_retriever = vector_index.as_retriever(similarity_top_k=10)
vector_query_engine = vector_index.as_query_engine(similarity_top_k=10)

In [14]:
from llama_index.core.retrievers import RecursiveRetriever

# Recursive retriever rooted at the "vector" retriever; `node_dict` supplies the
# mapping from index IDs to the referenced nodes.
recursive_retriever = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": vector_retriever},
    node_dict=node_mappings_charte,
    verbose=True,
)
# NOTE: `query_engine` is rebound later by the document-comparison section.
query_engine = RetrieverQueryEngine.from_args(recursive_retriever)

In [15]:
response = query_engine.query("Est ce que les aromes artificiels sont accépté dans les produits haut de gamme ?")
print(str(response))

[1;3;34mRetrieving with query id None: Est ce que les aromes artificiels sont accépté dans les produits haut de gamme ?
[0m[1;3;38;5;200mRetrieving text node: **ANNEXE I : Additifs rouges : additifs pour lesquels les rapports scientifiques rapportent une potentielle**

**cancérogénicité ou une implication dans les pathologies lourdes**

**ANNEXE II : Additifs oranges : additifs pour lesquels les rapports scientifiques sont contradictoires**

**ANNEXE III : Additifs verts : additifs identifiés à ce jour comme non dangereux pour la santé.**

**ANNEXE IV : Ingrédients controversés**

**7**

**10**

**11**

**12**

**Liste des abréviations**

**AFDIAG** : Association Française Des Intolérants Au Gluten **FCD** : Fédération du Commerce et de la Distribution

**AOECS** : Association of European Coeliac Societies **GFSI**: Global Food Safety Initiative

**COFRAC**: Comité français d'accréditation **ILAC** : International Laboratory Accreditation Cooperation

**DGHM**: Deutschen Gesellschaf

In [23]:
response = query_engine.query("Est ce que la Cire de carnauba est un additif de type orange? Sinon, de quel type s'agit il ? Quelle est son code E ?")
print(str(response))

[1;3;34mRetrieving with query id None: Est ce que la Cire de carnauba est un additif de type orange? Sinon, de quel type s'agit il ? Quelle est son code E ?
[0m[1;3;38;5;200mRetrieved node with id, entering: d3379ac8-ba1c-46ba-ab6b-81c13cfa1841
[0m[1;3;34mRetrieving with query id d3379ac8-ba1c-46ba-ab6b-81c13cfa1841: Est ce que la Cire de carnauba est un additif de type orange? Sinon, de quel type s'agit il ? Quelle est son code E ?
[0m[1;3;38;5;200mRetrieving text node: **ANNEXE II : Additifs oranges : additifs pour lesquels les rapports scientifiques sont**

**contradictoires**
[0m[1;3;38;5;200mRetrieving text node: **ANNEXE I : Additifs rouges : additifs pour lesquels les rapports scientifiques rapportent une potentielle**

**cancérogénicité ou une implication dans les pathologies lourdes**

**ANNEXE II : Additifs oranges : additifs pour lesquels les rapports scientifiques sont contradictoires**

**ANNEXE III : Additifs verts : additifs identifiés à ce jour comme non danger

In [24]:
response = query_engine.query("Est ce que la Maltodextrine est un ingrédient controversé ? et pour quelle raison ?")
print(str(response))

[1;3;34mRetrieving with query id None: Est ce que la Maltodextrine est un ingrédient controversé ? et pour quelle raison ?
[0m[1;3;38;5;200mRetrieving text node: **ANNEXE I : Additifs rouges : additifs pour lesquels les rapports scientifiques rapportent une potentielle**

**cancérogénicité ou une implication dans les pathologies lourdes**

**ANNEXE II : Additifs oranges : additifs pour lesquels les rapports scientifiques sont contradictoires**

**ANNEXE III : Additifs verts : additifs identifiés à ce jour comme non dangereux pour la santé.**

**ANNEXE IV : Ingrédients controversés**

**7**

**10**

**11**

**12**

**Liste des abréviations**

**AFDIAG** : Association Française Des Intolérants Au Gluten **FCD** : Fédération du Commerce et de la Distribution

**AOECS** : Association of European Coeliac Societies **GFSI**: Global Food Safety Initiative

**COFRAC**: Comité français d'accréditation **ILAC** : International Laboratory Accreditation Cooperation

**DGHM**: Deutschen Gesellsc

In [22]:
# compare against the baseline retriever
response = vector_query_engine.query("Est ce que la Cire de carnauba est un additif de type orange? Quelle est son code E ?")
print(str(response))

La Cire de carnauba est un additif de type orange. Son code E n'est pas spécifié dans le provided context information.


## Document comparison using this [llama parse ref](https://github.com/run-llama/llama_parse/blob/main/examples/demo_table_comparisons.ipynb)

In [25]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings

# Models used by the LlamaIndex components in the cells that follow.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
# NOTE: rebinds `llm`, which an earlier cell bound to a LangChain ChatOpenAI model.
llm = OpenAI(model="gpt-4-turbo")

# Register both models on the global Settings object.
Settings.llm = llm
Settings.embed_model = embed_model

In [26]:
from llama_index.core.node_parser import MarkdownElementNodeParser

# NOTE: rebinds `node_parser`, previously created from the local `markdown_element` module.
node_parser = MarkdownElementNodeParser(
    llm=OpenAI(model="gpt-4-turbo"), num_workers=8
)

In [27]:
import pickle
from llama_index.postprocessor.flag_embedding_reranker import (
    FlagEmbeddingReranker,
)

reranker = FlagEmbeddingReranker(
    top_n=10,
    model="BAAI/bge-reranker-large",
)


def create_query_engine_over_doc(docs, nodes_save_path=None):
    """Build a reranked vector query engine over already-loaded documents.

    Args:
        docs: Documents passed to ``node_parser.get_nodes_from_documents``.
            Ignored when a node cache already exists at ``nodes_save_path``.
        nodes_save_path: Optional path of a pickle cache for the parsed nodes.
            If the file exists the nodes are loaded from it; otherwise the
            documents are parsed and, when a path is given, the nodes are
            saved there.

    Returns:
        tuple: ``(query_engine, base_nodes)``.
    """
    if nodes_save_path is not None and os.path.exists(nodes_save_path):
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # cache files produced by this notebook.
        with open(nodes_save_path, "rb") as cache_file:
            raw_nodes = pickle.load(cache_file)
    else:
        raw_nodes = node_parser.get_nodes_from_documents(docs)
        if nodes_save_path is not None:
            # Context manager guarantees the cache is flushed and closed.
            with open(nodes_save_path, "wb") as cache_file:
                pickle.dump(raw_nodes, cache_file)

    base_nodes, objects = node_parser.get_nodes_and_objects(raw_nodes)

    ### Construct Retrievers
    # Top-level vector index over base nodes and objects; the 15 most similar
    # results are passed through the reranker.
    vector_index = VectorStoreIndex(nodes=base_nodes + objects)
    query_engine = vector_index.as_query_engine(
        similarity_top_k=15, node_postprocessors=[reranker]
    )
    return query_engine, base_nodes

ModuleNotFoundError: No module named 'llama_index.postprocessor'

In [None]:
from llama_index.core import Document

# `cleaned_text` is a plain string, while the node parser iterates over
# Document objects; wrap it so it is parsed as a single document instead of
# being iterated character by character.
query_engine, nodes = create_query_engine_over_doc(
   [Document(text=cleaned_text)], nodes_save_path="charte_produit_nodes.pkl"
)

In [None]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine


# Expose the base query engine as a single named tool.
query_engine_tools = [
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="charte_produit",
            # Tool description (French): "Gives information about the product charter to comply with."
            description=("Donne des informations sur la charte produit à respecter."),
        ),
    )
]

# Sub-question engine built on that tool, using the module-level `llm`.
sub_query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    llm=llm,
    use_async=True,
)

In [None]:
response = sub_query_engine.query(
    "Qu'est qu'il y a dans l'annexe IV ? "
)

In [None]:
print(str(response))