In [1]:
import sys
sys.path.append('../')
from megaparse.cdp.utils.query_engine import get_query_engine
from pathlib import Path
import numpy as np

query_engine = get_query_engine(Path("../megaparse/cdp/CDP_QUAL_CHART_01_CHARTE_PRODUITS.md"))

In [2]:
from megaparse.cdp.utils.question_generator import QuestionGenerator

qg = QuestionGenerator()
questions = qg.generate_questions(Path("../megaparse/cdp/exemple_fournisseur_quiche.xlsx"), 'Fiche recette', language_verification= True)


Generating Questions ...
Verifying language and translating questions ...


In [3]:
questions_list = str(questions).split("\n")

In [4]:
test  = questions_list[14]
questions_list

['Voici la liste des questions traduites en français :',
 '',
 '1. La "Farine" est-elle conforme aux exigences ?',
 '2. Les "huiles et graisses végétales non hydrogénées de palme et de colza" sont-elles conformes aux exigences ?',
 '3. L\'"eau" est-elle conforme aux exigences ?',
 '4. Le "sel" est-il conforme aux exigences ?',
 '5. L\'"émulsifiant (E471)" est-il conforme aux exigences ?',
 '6. L\'"acidifiant (E330)" est-il conforme aux exigences ?',
 '7. La "poudre de LAIT entier" est-elle conforme aux exigences ?',
 '8. L\'"émulsifiant (E322 (SOJA))" est-il conforme aux exigences ?',
 '9. Les "stabilisants (E339, E332)" sont-ils conformes aux exigences ?',
 '10. Les "Œufs" sont-ils conformes aux exigences ?',
 '11. La "CREME légère 98.5% (LAIT)" est-elle conforme aux exigences ?',
 '12. L\'"amidon modifié" est-il conforme aux exigences ?',
 '13. L\'"émulsifiant: E472e" est-il conforme aux exigences ?',
 '14. Les "stabilisants: E440, E407" sont-ils conformes aux exigences ?',
 '15. L\'

In [6]:
response = query_engine.query("Based uniquely on the text content, answer the question : Est-ce que 'Stabilisant: E 407' est conforme aux exigences ?")
print(str(response))

{"name": "E 407", "detailed_answer": "E 407, also known as carrageenan, is not listed in the provided annexes. Therefore, it cannot be determined if it is considered safe or potentially harmful based on the given information.", "decision": "To Avoid"}


In [7]:
from typing import List
import pandas as pd
def compare(questions: List[str], category: str, verbose: bool = False):
    analysis = []
    
    for question in tqdm(questions):
        for i in range(3):
            try:
                response = query_engine.query(f"{question[:-1]} pour les produits de la catégorie {category} ?").response
            
            except Exception as e:
                if verbose:
                    print(f"Error with question: {question}")
                    print("Retry ...")
                    continue
            break
            

        analysis.append({
            'decision': response.decision,
            'name': response.name,
            'detailed_answer': response.detailed_answer
        })
     
    return pd.DataFrame(analysis)   

In [8]:
from enum import Enum

class DecisionEnum(str, Enum):
    authorized = 'Authorized'
    to_avoid = 'To Avoid'
    forbidden = 'Forbidden'

In [10]:
from tqdm import tqdm
generated_df = pd.DataFrame()
for i in range(1):
    print(f"Running iteration {i}")
    for category in ['Entree de Gamme','Coeur de Gamme', 'Haut de Gamme']:
        analysis = compare(questions_list, category = category, verbose=True)
        if "name" not in generated_df.columns:
            generated_df["name"] = analysis["name"]
        generated_df[f"{category}.{i}"] = analysis["decision"]

Running iteration 0


100%|██████████| 34/34 [02:12<00:00,  3.91s/it]
  0%|          | 0/34 [00:00<?, ?it/s]

Error with question: Voici la liste des questions traduites en français :
Retry ...


100%|██████████| 34/34 [02:06<00:00,  3.71s/it]
100%|██████████| 34/34 [01:54<00:00,  3.37s/it]


In [199]:
#generated_df.to_csv("../megaparse/cdp/generated_1.csv", index = False)

In [206]:
def print_analysis(df: pd.DataFrame):
    print("En tant que produit de la catégorie : ")
    print("------------------------------------------------\n")

    print("\nProduit Autorisé : ")
    print("---------------------")
    authorized_products = df[df['decision'] == DecisionEnum.authorized]
    for _, row in authorized_products.iterrows():
        print(f"{row['name']} : {row['detailed_answer']}")

    print("\nProduit Toléré : ")
    print("---------------------")
    tolerated_products = df[df['decision'] == DecisionEnum.to_avoid]
    for _, row in tolerated_products.iterrows():
        print(f"{row['name']} : {row['detailed_answer']}")

    print("\nProduit Interdit : ")
    print("---------------------")
    forbidden_products = df[df['decision'] == DecisionEnum.forbidden]
    for _, row in forbidden_products.iterrows():
        print(f"{row['name']} : {row['detailed_answer']}")

In [131]:
print_analysis(pd.DataFrame(analysis))

En tant que produit de la catégorie : 
------------------------------------------------


Produit Autorisé : 
---------------------
Framboise : Les framboises sont des fruits naturels et ne contiennent pas d'additifs, de colorants artificiels, d'arômes artificiels ou d'autres ingrédients interdits ou à éviter selon les exigences de la catégorie Haut de Gamme. Elles ne sont pas soumises à déclaration OGM, ne sont pas traitées par ionisation, et ne contiennent pas de nanoparticules, de glutamates, d'exhausteurs de goût, de matières grasses partiellement hydrogénées, d'acides gras trans non naturellement présents, d'édulcorants de synthèse, de gélatine porcine ou animale, d'huile de palme, d'arômes ou de colorants artificiels, d'additifs rouges ou oranges, de nitrites, de polyphosphates, ou de viande et volaille d'origine hors UE. Par conséquent, les framboises sont conformes aux exigences pour les produits de la catégorie Haut de Gamme.
Sucre : Le sucre est un ingrédient couramment utili

### Fill a table (simple)

In [11]:
cleaned_generated_df = generated_df.dropna()
def transform(x):
    return x.value if hasattr(x, 'value') else x

cleaned_generated_df.loc[:, cleaned_generated_df.columns != 'name'] = cleaned_generated_df.loc[:, cleaned_generated_df.columns != 'name'].applymap(transform)
cleaned_generated_df

  cleaned_generated_df.loc[:, cleaned_generated_df.columns != 'name'] = cleaned_generated_df.loc[:, cleaned_generated_df.columns != 'name'].applymap(transform)


Unnamed: 0,name,Entree de Gamme.0,Coeur de Gamme.0,Haut de Gamme.0
0,Ingrédients soumis à déclaration OGM,Forbidden,Forbidden,Forbidden
1,Ingrédients soumis à déclaration OGM,Forbidden,Authorized,Forbidden
2,Farine,Authorized,Authorized,Authorized
3,huiles et graisses végétales non hydrogénées d...,Authorized,To Avoid,Forbidden
4,eau,Authorized,Authorized,Authorized
5,sel,Authorized,Authorized,Authorized
6,E471,Authorized,To Avoid,To Avoid
7,acidifiant (E330),Authorized,Authorized,Authorized
8,poudre de LAIT entier,Authorized,Authorized,Authorized
9,émulsifiant (E322 (SOJA)),Authorized,Authorized,Authorized


In [38]:
ground_truth_df = pd.read_csv('../megaparse/cdp/ground_truth_quiche.csv')
ground_truth_df

Unnamed: 0,name,Entree de Gamme,Coeur de Gamme,Haut de Gamme
0,Farine,Authorized,Authorized,Authorized
1,huiles et graisses végétales non hydrogénées d...,Forbidden,Forbidden,Forbidden
2,Eau,Authorized,Authorized,Authorized
3,sel,Authorized,Authorized,Authorized
4,E471,To Avoid,To Avoid,To Avoid
5,E330,Authorized,Authorized,Authorized
6,poudre de lait entier,Authorized,Authorized,Authorized
7,E322,Authorized,Authorized,Authorized
8,E339,To Avoid,Forbidden,Forbidden
9,E332,Authorized,Authorized,Authorized


In [39]:
import re

def clean_name(name):
    name = name.lower()
    
    name = re.sub(r'[^a-z0-9\s]', '', name)
    
    name = re.sub(r'\s+', ' ', name)
    
    return name.strip()  

cleaned_generated_df['cleaned_name'] = cleaned_generated_df['name'].apply(clean_name)
ground_truth_df['cleaned_name'] = ground_truth_df['name'].apply(clean_name)

In [40]:
import difflib

# Function to perform fuzzy matching using difflib
def fuzzy_match(name, choices, threshold=0.5):
    matches = difflib.get_close_matches(name, choices, n=1, cutoff=threshold)
    if matches:
        return matches
    else:
        return None

# Apply fuzzy matching to each name in df1 and find the closest match in df2
for name1 in ground_truth_df['cleaned_name']:
    matched_names = fuzzy_match(name1, cleaned_generated_df['cleaned_name'])
    if matched_names:
        print(f"The best match found: {name1} -> {matched_names[0]}")
    else:
        print(f"No match found for: {name1}")

The best match found: farine -> farine
The best match found: huiles et graisses vgtales non hydrognes de palme et de colza -> huiles et graisses vgtales non hydrognes de palme et de colza
The best match found: eau -> eau
The best match found: sel -> sel
The best match found: e471 -> e471
The best match found: e330 -> e330
The best match found: poudre de lait entier -> poudre de lait entier
The best match found: e322 -> e202
The best match found: e339 -> e339
The best match found: e332 -> e339
The best match found: creme legere 985 lait -> creme lgre 985 lait
The best match found: amidon modifi -> amidon modifi
The best match found: e472e -> e472e
The best match found: e440 -> e440
The best match found: e407 -> e471
The best match found: oignon -> oignon 669
The best match found: poivron rouge -> poivron rouge 12
No match found for: tomates concasses
The best match found: poivron vert -> poivron vert 4
The best match found: huile dolive extra vierge -> huile dolive extra vierge
The best

In [41]:
cleaned_generated_df

Unnamed: 0,name,Entree de Gamme.0,Coeur de Gamme.0,Haut de Gamme.0,cleaned_name
0,Ingrédients soumis à déclaration OGM,Forbidden,Forbidden,Forbidden,ingrdients soumis dclaration ogm
1,Ingrédients soumis à déclaration OGM,Forbidden,Authorized,Forbidden,ingrdients soumis dclaration ogm
2,Farine,Authorized,Authorized,Authorized,farine
3,huiles et graisses végétales non hydrogénées d...,Authorized,To Avoid,Forbidden,huiles et graisses vgtales non hydrognes de pa...
4,eau,Authorized,Authorized,Authorized,eau
5,sel,Authorized,Authorized,Authorized,sel
6,E471,Authorized,To Avoid,To Avoid,e471
7,acidifiant (E330),Authorized,Authorized,Authorized,acidifiant e330
8,poudre de LAIT entier,Authorized,Authorized,Authorized,poudre de lait entier
9,émulsifiant (E322 (SOJA)),Authorized,Authorized,Authorized,mulsifiant e322 soja


In [42]:
# Function to perform fuzzy matching using difflib
def fuzzy_match(name, choices, threshold=0.5):
    matches = difflib.get_close_matches(name, choices, n=1, cutoff=threshold)
    if matches:
        return matches[0]  # Return the best match
    else:
        return None

# List to hold matched rows
matched_rows = []

# Apply fuzzy matching to each name in ground_truth_df and find the closest match in cleaned_generated_df
for name1 in ground_truth_df['cleaned_name']:
    matched_name = fuzzy_match(name1, cleaned_generated_df['cleaned_name'])
    if matched_name:
        # Get the corresponding row for the matched name
        matched_row = cleaned_generated_df.loc[cleaned_generated_df['cleaned_name'] == matched_name].copy()
        # Change the name in matched_row to the one from ground_truth_df
        matched_row['cleaned_name'] = name1
        matched_rows.append(matched_row)
    else:
        print("no match found for: ", name1)
        empty_row = pd.DataFrame([[None]*len(cleaned_generated_df.columns)], columns=cleaned_generated_df.columns)
        matched_rows.append(empty_row)


# Create the matched_generated_df DataFrame from the list of matched rows
if matched_rows:
    matched_generated_df = pd.concat(matched_rows, ignore_index=True)
else:
    matched_generated_df = pd.DataFrame(columns=cleaned_generated_df.columns)

# Display the matched_generated_df DataFrame
matched_generated_df

no match found for:  tomates concasses
no match found for:  emmental


Unnamed: 0,name,Entree de Gamme.0,Coeur de Gamme.0,Haut de Gamme.0,cleaned_name
0,Farine,Authorized,Authorized,Authorized,farine
1,huiles et graisses végétales non hydrogénées d...,Authorized,To Avoid,Forbidden,huiles et graisses vgtales non hydrognes de pa...
2,eau,Authorized,Authorized,Authorized,eau
3,sel,Authorized,Authorized,Authorized,sel
4,E471,Authorized,To Avoid,To Avoid,e471
5,E330,Authorized,Authorized,Authorized,e330
6,poudre de LAIT entier,Authorized,Authorized,Authorized,poudre de lait entier
7,E202,Authorized,Authorized,Forbidden,e322
8,E339,To Avoid,To Avoid,Forbidden,e339
9,E339,To Avoid,To Avoid,Forbidden,e332


In [45]:
correct_predictions = {}
for column in matched_generated_df.columns:
    if column == 'name' or column == 'cleaned_name':
        continue
    correct_predictions[column] = (matched_generated_df[column] == ground_truth_df[column[:-2]]).sum()

total_predictions = len(matched_generated_df)

accuracy = {category : correct_predictions[category] / total_predictions for category in correct_predictions.keys()}

for category, acc in accuracy.items():
    print(f'Accuracy for {category}: {acc * 100:.2f}%')

Accuracy for Entree de Gamme.0: 76.47%
Accuracy for Coeur de Gamme.0: 79.41%
Accuracy for Haut de Gamme.0: 76.47%


In [46]:
print("Mean Accuracy on all categories: ", np.mean(list(accuracy.values())))

for gamme in ["Entree de Gamme", "Coeur de Gamme", "Haut de Gamme"]:
    mean_gamme = np.mean([value for key, value in accuracy.items() if gamme in key])
    print(f"Mean Accuracy on {gamme} columns: ", mean_gamme)

Mean Accuracy on all categories:  0.7745098039215685
Mean Accuracy on Entree de Gamme columns:  0.7647058823529411
Mean Accuracy on Coeur de Gamme columns:  0.7941176470588235
Mean Accuracy on Haut de Gamme columns:  0.7647058823529411


In [47]:
errors_count = {"Entree de Gamme": {}, "Coeur de Gamme": {}, "Haut de Gamme": {}}

for category in matched_generated_df.columns:
    if category in ['cleaned_name', 'name']:
        continue
    ground_truth_category = category[:-2]
    errors = matched_generated_df[matched_generated_df[category] != ground_truth_df[ground_truth_category]]
    
    for index, row in errors.iterrows():
        key = ground_truth_df.at[index, 'cleaned_name']  # Using cleaned_name as the key
        if key in errors_count[ground_truth_category]:
            errors_count[ground_truth_category][key] += 1
        else:
            errors_count[ground_truth_category][key] = 1

errors_count

{'Entree de Gamme': {'huiles et graisses vgtales non hydrognes de palme et de colza': 1,
  'e471': 1,
  'e332': 1,
  'e472e': 1,
  'e407': 1,
  'tomates concasses': 1,
  'amidon modifi de pomme de terre': 1,
  'emmental': 1},
 'Coeur de Gamme': {'huiles et graisses vgtales non hydrognes de palme et de colza': 1,
  'e339': 1,
  'e332': 1,
  'amidon modifi': 1,
  'tomates concasses': 1,
  'amidon modifi de pomme de terre': 1,
  'emmental': 1},
 'Haut de Gamme': {'e322': 1,
  'e332': 1,
  'amidon modifi': 1,
  'e472e': 1,
  'tomates concasses': 1,
  'amidon modifi de pomme de terre': 1,
  'e202': 1,
  'emmental': 1}}

In [129]:
print("The predictions were incorrect for the following products:")
for category in ['Entree de Gamme', 'Coeur de Gamme', 'Haut de Gamme']:
    incorrect_predictions = matched_generated_df[matched_generated_df[category] != ground_truth_df[category]]
    print(f"\nIncorrect predictions for {category}:")
    print(incorrect_predictions)


The predictions were incorrect for the following products:

Incorrect predictions for Entree de Gamme:
   cleaned_name Entree de Gamme Coeur de Gamme Haut de Gamme
12        e 466      Authorized       To Avoid    Authorized
31        e 407      Authorized       To Avoid    Authorized

Incorrect predictions for Coeur de Gamme:
      cleaned_name Entree de Gamme Coeur de Gamme Haut de Gamme
10           e 406      Authorized       To Avoid    Authorized
11           e 440      Authorized       To Avoid    Authorized
17    amidon de bl      Authorized       To Avoid    Authorized
24  glatine bovine        To Avoid      Forbidden     Forbidden

Incorrect predictions for Haut de Gamme:
      cleaned_name Entree de Gamme Coeur de Gamme Haut de Gamme
12           e 466      Authorized       To Avoid    Authorized
15           e 202      Authorized     Authorized     Forbidden
24  glatine bovine        To Avoid      Forbidden     Forbidden
31           e 407      Authorized       To Avoid    