# Named Entity Recognotion

Noah Meißner 18.05.2025

This Jupyter Notebook analyse four different Named Entity Recognition possibilities, to see which fits best for this Use Case.

We use:
1. Handlabelling as Ground Truth
2. ChatGPT 4o mini 
3. Gemini -- free version
4. Spacy (train our own model)

In [1]:
from train_ner import train_model, test_model
from load_annotation import load_dataset
from request_gemini import request
from request_gpt import chat_with_openai
import pandas as pd
from DataLoader_Ingredients import DataLoader
from enum import Enum
from prompts import ingredients_extraction
import json
import os
import ast
import re

## Create Dataset

In [2]:
df_whole = pd.read_csv("Data/dataset.csv")
loader_obj = DataLoader(df_whole)
train, test = loader_obj.get_set()

## Analysis

We now analyse different approaches, starting with Gemini. We split the test Data in 20 ingredients per prompt, for less requests. For the Gemini and OpenAI approach we are using the same prompt, to evaluate the system model

In [3]:
class DataType(Enum):
    eval = "Evaluation"
    train = "Training"

In [4]:
def save_response(approach="ModelName",type=DataType, data=[{}]):
    path = f"Data/Ingredients/{type.value}/{approach}.json"
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

In [5]:
def load_response(approach="ModelName", type=DataType):
    path = f"Data/Ingredients/{type.value}/{approach}.json"
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"Datei nicht gefunden: {path}")
        return None
    except json.JSONDecodeError as e:
        print(f"Fehler beim Parsen von JSON: {e}")
        return None

In [6]:
def pack_twenty(data):
    return [data[i:i+20] for i in range(0, len(data), 20)]

In [7]:
prompt = ingredients_extraction.prompt_ingredients

In [8]:
def clean_model_output(str):
    match = re.search(r'\[(.*)\]', str, re.DOTALL)
    if match:
        cleaned_json = "[" + match.group(1).strip() + "]"
        print(cleaned_json)
    fixed_string = re.sub(r'\(([^()]+?)\)', r'[\1]', cleaned_json)
    try:
        parsed_data = ast.literal_eval(fixed_string)
        return parsed_data
    except Exception as e:
        print("Fehler beim Parsen:", e)


### Use Gemini

In [16]:
if not os.path.exists("Data/Ingredients/Evaluation/Gemini.json"):
    packages = pack_twenty(test)
    res = []
    for p in packages:
        try:
            new_prompt = prompt.replace("$DATA$", str(p))
            res.extend(clean_model_output(request(new_prompt)))
        except Exception as e:
            print("Model Error:", e)


In [13]:
save_response("Gemini",DataType.eval,res)

### GPT

In [15]:
if not os.path.exists("Data/Ingredients/Evaluation/OpenAI.json"):
    packages = pack_twenty(test)
    res = []
    for p in packages:
        try:
            new_prompt = prompt.replace("$DATA$", str(p))
            res.extend(clean_model_output(chat_with_openai(new_prompt)))
        except Exception as e:
            print("Model Error:", e)

In [10]:
save_response("OpenAI",DataType.eval,res)

## Evaluation

In [13]:
openai_ann = load_response("OpenAI",DataType.eval)
gemini_ann = load_response("Gemini",DataType.eval)

In [32]:
def make_one_dict(ls):
    dict = {}
    for value in ls:
        key = list(value.keys())[0]
        dict[key] = value[key]
    return dict

In [None]:
dict_gemini[0]

{'250 gr.:schweinerücken': {'entities': [['250', 'Number'],
   ['gr.', 'Units'],
   ['schweinerücken', 'Ingredients']]}}

In [33]:
dict_gemini = make_one_dict(gemini_ann)
dict_open = make_one_dict(openai_ann)

In [None]:
def compare(dict_one, dict_two):
    entity_one = dict_one['entities']
    entity_two = dict_two['entities']
    if len(entity_one) == entity_two:
        ls_entity_one = [value[1] for value in entity_one]
        ls_entity_two = [value[1] for value in entity_two]
        return [ls_entity_one, ls_entity_two]
    return 1

In [54]:
from sklearn.metrics import cohen_kappa_score

def compare(dict_one, dict_two):
    entity_one = dict_one['entities']
    entity_two = dict_two['entities']
    dict_one_map = {token: label for token, label in entity_one}
    dict_two_map = {token: label for token, label in entity_two}
    all_tokens = sorted(set(dict_one_map.keys()) | set(dict_two_map.keys()))
    labels_one = [dict_one_map.get(token, 'O') for token in all_tokens]
    labels_two = [dict_two_map.get(token, 'O') for token in all_tokens]
    return [labels_one, labels_two]

In [55]:
key = '250 gr.:schweinerücken'
dict_one = dict_gemini[key]
dict_two = dict_open[key]
compare(dict_one, dict_two)

[['Number', 'Units', 'Ingredients'], ['Number', 'Units', 'Ingredients']]

In [77]:
ls_one = []
ls_two = []
no_match = 0
ls = []
for key, value in dict_gemini.items():
    try:
        dict_two = {}
        if key in dict_open:
            dict_two = dict_open[key]
        else:
            tokens = [(token, 'O') for token, label in value['entities']]
            dict_two = {'entities': tokens}
        one, two = compare(value, dict_two)
        if len(one) < 1 or len(two) < 1:
            no_match+=1
        ls_one.extend(one)
        ls_two.extend(two)
    except Exception as e:
        print("ajkhsdf")
        ls.append(1)
        print(dict_two)
        print(key)
        print(dict_one)
kappa = cohen_kappa_score(ls_one, ls_two)
print(kappa)

ajkhsdf
{'entities': [['0.5', 'Number'], ['glas', 'Units'], ['preiselbeeren', 'Ingredients']]}
0.5 glas:preiselbeeren
{'entities': [['250', 'Number'], ['gr.', 'Units'], ['schweinerücken', 'Ingredients']]}
ajkhsdf
{'entities': [['1', 'Number'], ['rote', 'Type'], ['paprika', 'Ingredients']]}
1 rote:paprika
{'entities': [['250', 'Number'], ['gr.', 'Units'], ['schweinerücken', 'Ingredients']]}
ajkhsdf
{'entities': [['2', 'Number'], ['stück', 'Units'], ['frühlingszwiebeln', 'Ingredients']]}
2 stück:frühlingszwiebeln
{'entities': [['250', 'Number'], ['gr.', 'Units'], ['schweinerücken', 'Ingredients']]}
ajkhsdf
{'entities': [['300', 'Number'], ['g', 'Units'], ['erdbeeren', 'Ingredients']]}
300 g:erdbeeren
{'entities': [['250', 'Number'], ['gr.', 'Units'], ['schweinerücken', 'Ingredients']]}
ajkhsdf
{'entities': [['250', 'Number'], ['gramm', 'Units'], ['weizenmehl', 'Ingredients']]}
250 gramm:weizenmehl
{'entities': [['250', 'Number'], ['gr.', 'Units'], ['schweinerücken', 'Ingredients']]}
ajkh

In [9]:
if not os.path.exists("Data/Ingredients/Training/Gemini.json"):
    packages = pack_twenty(train)
    res = []
    for p in packages:
        try:
            new_prompt = prompt.replace("$DATA$", str(p))
            res.extend(clean_model_output(request(new_prompt)))
        except Exception as e:
            print("Model Error:", e)

[{
    "5:schwarze olive": {
      "entities": [
        ("5", "Number"),
        ("schwarze olive", "Ingredients")
      ]
    }
  },
  {
    "750 ml:brühe": {
      "entities": [
        ("750", "Number"),
        ("ml", "Units"),
        ("brühe", "Ingredients")
      ]
    }
  },
  {
    "3 stück:paprikaschoten (rot, gelb,grün)": {
      "entities": [
        ("3", "Number"),
        ("stück", "Units"),
        ("paprikaschoten", "Ingredients"),
        ("(rot, gelb,grün)", "Type")
      ]
    }
  },
  {
    ":basilikumblätter gehackt": {
      "entities": [
        ("basilikumblätter", "Ingredients"),
        ("gehackt", "Type")
      ]
    }
  },
  {
    "450 gramm:blattspinat": {
      "entities": [
        ("450", "Number"),
        ("gramm", "Units"),
        ("blattspinat", "Ingredients")
      ]
    }
  },
  {
    "100 g:roggenschrot": {
      "entities": [
        ("100", "Number"),
        ("g", "Units"),
        ("roggenschrot", "Ingredients")
      ]
    }
  },
  {
    "

In [10]:
save_response("Gemini",DataType.train,res)