In [39]:
import os
import json
import requests
import cv2
import base64
import numpy as np

In [None]:
def send_request_to(url, content, type="post"):
    """
    Sends a request to url and reports the outcome as a tuple ``(payload, ok)``.

    None of the failures handled here raise: an unsupported request type, a timeout, any other transport error
    raised by ``requests``, a status code other than 200/201, or a successful response whose body is not valid
    json all yield ``({"ERROR": <message>}, False)``.

    :param url: url to send content to
    :param content: content to send to url as json body ("get" and "post"); not sent for "delete"
    :param type: post, get, or delete
    :return: tuple ``(payload, ok)``. On success ``ok`` is True and ``payload`` is the decoded json response
        ("get"/"post") or True ("delete"). On failure ``ok`` is False and ``payload`` is a dictionary with key
        "ERROR".
    """
    try:
        if type == "get":
            response = requests.get(url, json=content, timeout=100)  # timeout of 100 seconds
        elif type == "post":
            response = requests.post(url, json=content, timeout=100)
        elif type == "delete":
            response = requests.delete(url, timeout=100)
        else:
            return {"ERROR": "invalid request type"}, False
    except requests.exceptions.Timeout:
        return {"ERROR": "timed out"}, False
    except requests.exceptions.RequestException as error:
        # e.g. connection refused or an invalid url: report it like every other failure instead of raising
        return {"ERROR": str(error)}, False

    if response.status_code not in (200, 201):
        return {"ERROR": str(response.content)}, False
    if type == "delete":
        # a successful delete carries no payload of interest
        return True, True
    try:
        return response.json(), True
    except ValueError:
        # the json decoding errors raised by response.json() are ValueError subclasses
        return {"ERROR": "response is not valid json: " + str(response.content)}, False


def send_request_to_database(resource, content=None, type="post"):
    """
    Forwards a request to the database service and hands back whatever ``send_request_to`` returns for it.
    :param resource: the REST resource appended to the base url, e.g. /drawing/get/1 (include leading /)
    :param content: the payload of the request, e.g. json data for saving a drawing
    :param type: post, get, or delete
    :return: the result of ``send_request_to`` for the assembled url
    """
    database_base_url = "http://0.0.0.0:7201"  # change this to your url
    return send_request_to(database_base_url + resource, content, type)


def send_request_to_llm_backend(resource, content=None, type="post"):
    """
    Forwards a request to the llm backend and hands back whatever ``send_request_to`` returns for it.
    :param resource: the REST resource appended to the base url, e.g. /retrieve (include leading /)
    :param content: the payload of the request, e.g. a dictionary with the search query
    :param type: post, get, or delete
    :return: the result of ``send_request_to`` for the assembled url
    """
    llm_backend_base_url = "http://0.0.0.0:9201"  # change this to your url
    return send_request_to(llm_backend_base_url + resource, content, type)

def convert_bytestring_to_cv2(bytestring):
    """
    Decodes a base64 encoded image file into a cv2 image.
    :param bytestring: base64 data of an image file, either as bytes or as text (optionally still wrapped in the
        b'...' notation of a stringified bytes object)
    :return: result of cv2.imdecode (np array)
    """
    # strip the b' prefix and any quote characters left over from a stringified bytes object
    encoded = str(bytestring)
    for leftover in ("b'", "'"):
        encoded = encoded.replace(leftover, "")
    file_bytes = base64.b64decode(encoded)
    file_buffer = np.frombuffer(file_bytes, dtype=np.uint8)
    return cv2.imdecode(file_buffer, flags=1)

In [3]:
# prefix of the files and directories this notebook writes (result json files, image folders)
project_name = "tool_example"

### load prompts

In [4]:
# Every line holds one prompt in the form "<tag>:<prompt>" (complex prompts: "<tag1>,<tag2>:<prompt>").
# readlines() keeps the trailing newline of each line.
# The encoding is passed explicitly because the prompts contain non-ASCII characters and the platform default
# encoding is not UTF-8 everywhere (assumes the prompt files are stored as UTF-8 - TODO confirm).
with open("resources/complex_prompts.txt", encoding="utf-8") as f:
    complex_prompts = f.readlines()
with open("resources/prompts.txt", encoding="utf-8") as f:
    prompts = f.readlines()
print(len(prompts))
print(len(complex_prompts))

32
20


### get search results for all prompts

In [6]:
def get_search_results_for_prompts(prompt_list):
    """
    Sends every prompt to the /retrieve resource of the llm backend and collects the retrieved ids.
    :param prompt_list: lines of the form "<tag>:<prompt>"; only the first ":" separates tag and prompt
    :return: list with one dictionary per line, holding the keys "prompt", "tag" and "retrieved_ids"
    :raises ValueError: if a line contains no ":"
    :raises RuntimeError: if the backend request for a prompt fails
    """
    response_data = []
    for line in prompt_list:
        # split only once, so a prompt that itself contains a ":" stays intact
        tag, prompt = line.split(":", 1)
        print(prompt)
        results, is_ok = send_request_to_llm_backend("/retrieve", {"query": prompt})
        if not is_ok:
            # fail with the backend error instead of an unrelated KeyError on "results"
            raise RuntimeError(f"retrieval failed for prompt {prompt!r}: {results}")
        response_data.append({
            "prompt": prompt,
            "tag": tag,
            "retrieved_ids": results["results"],
        })
    return response_data
# normal prompts: run the retrieval for every prompt and store the results in a json file,
# which the following cell reads back in
response_data = get_search_results_for_prompts(prompts)
with open(f"{project_name}_results.json", "w") as f:
    json.dump(response_data, f)
# complex prompts: same procedure, stored in a separate json file
response_data_complex = get_search_results_for_prompts(complex_prompts)
with open(f"{project_name}_results_complex.json", "w") as f:
    json.dump(response_data_complex, f)

Find Deckel parts.

Retrieve Schalterkörper.

Find Adapter that were manufactured.

Look for Flansch.

Show me an Aufnehmer.

Retrieve parts that only specify ISO-2768 medium and nothing else.

Show parts that reference ISO-2768 medium and normal in their tolerance specification.

Look for parts that dont specify their general tolerances.

Which parts have fine and tight 2768 tolerances?

Find parts with a surface roughness of Ra 3.2.

Which parts have roughness Ra 20?

List parts that have roughness Ra 0.8.

Show parts that include a surface roughness of Ra 1.6.

Which parts reference roughness using Rz?

Which parts have M12 threads?

List parts containing thread M16.

Find parts that contain thread M5.

Show parts that include NPT threads.

Which parts have G 1/4 threads?

Show me parts made of material 3.2315.

Give me parts whose material is AlMgSi1.

Which parts are produced from stainless steel X2CrNiMo17-13-2?

Show parts that use material 1.4571.

Look for parts made of X6CrNi

In [8]:
# read the stored retrieval results of the normal and the complex prompts
with open(f"{project_name}_results.json", "r") as f:
    response_data = json.load(f)
with open(f"{project_name}_results_complex.json", "r") as f:
    response_data_complex = json.load(f)
# make dirs for images
image_dir = f"./{project_name}"
image_dir_complex = f"./{project_name}_complex"

# exist_ok=True: an already existing directory is not an error, so the cell can be run again
os.makedirs(image_dir, exist_ok=True)
os.makedirs(image_dir_complex, exist_ok=True)

# new list of dicts like {"data": {"prompt": prompt, "tag": tag, "llm_text": llm_text, "image": path_to_image}}
# basically we convert the search results into a format that labelstudio understands, so we can upload the file and label the relevancy of each result
def convert_results_to_labelstudio(responses, image_dir, output_path=None):
    """
    Converts retrieval results into labelstudio tasks and stores the drawing image of every considered result.

    For each prompt the first five retrieved ids are looked up in the database, the drawing is written to
    image_dir as "<drawing_id>.png" and one task of the form
    {"data": {"prompt": prompt, "tag": tag, "llm_text": llm_text, "image": path_to_image}} is created. All tasks
    are written as one json list to output_path.

    :param responses: list of dictionaries with the keys "prompt", "tag" and "retrieved_ids"
    :param image_dir: existing directory that receives the images
    :param output_path: json file the tasks are written to. Defaults to "<last component of image_dir>.json" in the
        current working directory, so exports for different image directories do not overwrite each other
    :return: None
    :raises RuntimeError: if a drawing cannot be fetched from the database
    """
    if output_path is None:
        # derive the file name from the image directory, e.g. "./tool_example_complex" -> "tool_example_complex.json"
        output_path = f"{os.path.basename(os.path.normpath(image_dir))}.json"

    data_list = []
    for prompt_data in responses:
        prompt = prompt_data["prompt"]
        print(prompt)
        retrieved_ids = prompt_data["retrieved_ids"]
        tag = prompt_data["tag"]
        for retrieved_id in retrieved_ids[:5]:  # we only look at the first five results
            print(retrieved_id)
            # get data from database
            drawing, is_ok = send_request_to_database(f"/drawing/get/{retrieved_id}", type="get")
            if not is_ok:
                # fail with the database error instead of an unrelated KeyError on "drawing_id"
                raise RuntimeError(f"could not fetch drawing {retrieved_id}: {drawing}")
            drawing_id = str(drawing["drawing_id"])
            # save the image in appropriate dir (os.path.join also works for absolute image_dir paths)
            img = convert_bytestring_to_cv2(drawing["original_drawing"])
            cv2.imwrite(os.path.join(image_dir, f"{drawing_id}.png"), img)
            # add entry to data_list
            data_list.append({
                "data": {
                    "prompt": prompt,
                    "tag": tag,
                    "llm_text": drawing["searchdata"]["llm_text"],
                    "image": str(os.path.join(f"data/local-files?d={image_dir}/", f"{drawing_id}.png")),
                }
            })
    # the context manager closes the file, so the json is completely written when the function returns
    with open(output_path, "w") as f:
        json.dump(data_list, f)

# convert both result sets; the images of the normal and the complex prompts go to their own directories
convert_results_to_labelstudio(response_data, image_dir)
convert_results_to_labelstudio(response_data_complex, image_dir_complex)

Find Deckel parts.

22398
18564
21763
20055
20057
Retrieve Schalterkörper.

23682
19641
23208
19821
20494
Find Adapter that were manufactured.

21852
19400
21822
10700
22729
Look for Flansch.

18797
13310
22131
22100
23756
Show me an Aufnehmer.

20774
21026
21774
21023
20884
Retrieve parts that only specify ISO-2768 medium and nothing else.

18120
24085
23018
22225
18247
Show parts that reference ISO-2768 medium and normal in their tolerance specification.

24085
21437
19524
21974
24012
Look for parts that dont specify their general tolerances.

19358
23018
23671
12983
10938
Which parts have fine and tight 2768 tolerances?

19085
19257
23018
24085
21237
Find parts with a surface roughness of Ra 3.2.

23759
21827
19085
23311
23453
Which parts have roughness Ra 20?

23311
21418
21284
18843
23415
List parts that have roughness Ra 0.8.

23311
23018
22542
24085
21754
Show parts that include a surface roughness of Ra 1.6.

23346
23453
23311
22978
14466
Which parts reference roughness using R

# Evaluation

### To reproduce the evaluation, you need to label the resulting json files yourself.
* we used labelstudio for this
* this is our labeling setup for the normal prompts:
```
<View style="font-size: 2rem;">
  <Header value="Prompt"/>
  <Text name="prompt" value="$prompt"/>
  <Image name="image" value="$image"/>
  <Header value="RAG result"/>
  <Choices name="relevance" toName="image">
    <Choice value="Relevant" style="zoom: 2;"/>
    <Choice value="NOT Relevant" style="zoom: 2;"/>
  </Choices>
  <Header value="Extraction"/>
  <Text name="llm_text" value="$llm_text"/>
  <Choices name="extraction" toName="image">
    <Choice value="extraction OK" style="zoom: 2;"/>
    <Choice value="extraction Faulty" style="zoom: 2;"/>
  </Choices>
</View>
```
* this is our labeling setup for the complex prompts:
```
<View style="font-size: 2rem;">
  <Header value="Prompt"/>
  <Text name="prompt" value="$prompt"/>
  <Image name="image" value="$image"/>
  <Header value="RAG result"/>
  <Choices name="relevance" toName="image">
    <Choice value="Both Relevant" style="zoom: 2;"/>
    <Choice value="Only Feature 1 Relevant" style="zoom: 2;"/>
    <Choice value="Only Feature 2 Relevant" style="zoom: 2;"/>
    <Choice value="NOT Relevant" style="zoom: 2;"/>
  </Choices>
  <Header value="Extraction"/>
  <Text name="llm_text" value="$llm_text"/>
  <Choices name="extraction" toName="image">
    <Choice value="extraction OK" style="zoom: 2;"/>
    <Choice value="extraction Faulty" style="zoom: 2;"/>
  </Choices>
</View>
```

# Normal prompts

In [9]:
# labels for the results of the normal prompts; the context manager closes the file after reading
with open("./your_prompt_labels.json", "r") as f:
    labels = json.load(f)

In [12]:
# sanity check: five labels are expected per prompt (only the first five results of each prompt were exported)
len(labels) == len(prompts) * 5

True

In [13]:
# Turn the labels into one list per prompt that is easier to work with:
# 1 = relevant, -1 = not relevant (the label text contains "NOT").
relevancy_flags = [-1 if "NOT" in label["relevance"] else 1 for label in labels]
# five consecutive labels belong to one prompt; an incomplete group at the end is not included
converted_labels = [
    relevancy_flags[start:start + 5]
    for start in range(0, len(relevancy_flags) // 5 * 5, 5)
]
converted_labels

[[1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, -1, -1, 1],
 [1, 1, 1, 1, 1],
 [1, -1, -1, -1, -1],
 [1, 1, 1, 1, 1],
 [1, -1, 1, -1, -1],
 [-1, 1, -1, -1, -1],
 [1, -1, -1, 1, 1],
 [-1, -1, -1, -1, -1],
 [1, -1, 1, 1, 1],
 [-1, 1, 1, -1, -1],
 [1, 1, -1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1],
 [1, 1, 1, -1, 1],
 [1, 1, 1, 1, 1],
 [1, -1, -1, -1, -1],
 [1, 1, 1, -1, -1],
 [1, 1, 1, 1, 1],
 [-1, -1, -1, -1, -1],
 [1, 1, 1, -1, 1],
 [1, 1, -1, 1, 1],
 [1, 1, 1, 1, 1],
 [-1, -1, -1, -1, 1],
 [-1, 1, -1, -1, -1],
 [-1, 1, 1, 1, -1],
 [1, 1, 1, 1, -1],
 [1, 1, 1, -1, -1],
 [-1, -1, 1, 1, 1]]

In [14]:
def get_prec_at_k(label_representation, k, verbose=True):
    """
    Computes precision@k for every prompt.
    :param label_representation: list of prompt results. A prompt result is a list with one relevancy label per
        retrieved drawing of a prompt; only the label 1 counts as relevant
    :param k: number of leading results to evaluate, must be at least 1. The number of relevant results is always
        divided by k, also if a prompt result holds fewer than k labels
    :param verbose: print every prompt result while processing (the default keeps the previous output)
    :return: np array with one precision value per prompt
    :raises ValueError: if k is smaller than 1
    """
    if k < 1:
        # k == 0 would divide by zero, a negative k would silently slice from the end of the list
        raise ValueError(f"k must be at least 1, got {k}")

    precs = []
    for prompt_result in label_representation:
        if verbose:
            print(prompt_result)
        relevant = sum(1 for label in prompt_result[:k] if label == 1)
        precs.append(relevant / k)

    return np.asarray(precs)

In [18]:
# precision per normal prompt for the first 3 and the first 5 results; the mean over all prompts is reported
prec_at_3 = get_prec_at_k(converted_labels, 3)
prec_at_5 = get_prec_at_k(converted_labels, 5)
print("Precisions: ")
print("@3: ", prec_at_3.mean(), "|| @5: ", prec_at_5.mean())

[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, -1, -1, 1]
[1, 1, 1, 1, 1]
[1, -1, -1, -1, -1]
[1, 1, 1, 1, 1]
[1, -1, 1, -1, -1]
[-1, 1, -1, -1, -1]
[1, -1, -1, 1, 1]
[-1, -1, -1, -1, -1]
[1, -1, 1, 1, 1]
[-1, 1, 1, -1, -1]
[1, 1, -1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[1, 1, 1, 1, 1]
[1, -1, -1, -1, -1]
[1, 1, 1, -1, -1]
[1, 1, 1, 1, 1]
[-1, -1, -1, -1, -1]
[1, 1, 1, -1, 1]
[1, 1, -1, 1, 1]
[1, 1, 1, 1, 1]
[-1, -1, -1, -1, 1]
[-1, 1, -1, -1, -1]
[-1, 1, 1, 1, -1]
[1, 1, 1, 1, -1]
[1, 1, 1, -1, -1]
[-1, -1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, -1, -1, 1]
[1, 1, 1, 1, 1]
[1, -1, -1, -1, -1]
[1, 1, 1, 1, 1]
[1, -1, 1, -1, -1]
[-1, 1, -1, -1, -1]
[1, -1, -1, 1, 1]
[-1, -1, -1, -1, -1]
[1, -1, 1, 1, 1]
[-1, 1, 1, -1, -1]
[1, 1, -1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[1, 1, 1, 1, 1]
[1, -1, -1, -1, -1]
[1, 1, 1, -1, -1]
[1, 1, 1, 1, 1]
[-1, -1, -1, -1, -1]
[1, 1, 1, -1, 1]
[1, 1, -1, 1

### Feature wise

In [19]:
# dict with feature: a list of search results, already in their label_representation
tags = {
    "name": [],
    "tolerance": [],
    "surface": [],
    "thread": [],
    "material": [],
    "dimension": [],
    "gdt": []
}

for conv_label, prompt in zip(converted_labels, prompts):
    # a prompt line looks like "<tag>:<query>"; split only at the first ":" so a ":" inside the query
    # does not break the unpacking, and ignore whitespace around the tag
    tag, query = str(prompt).split(":", 1)
    tags[tag.strip()].append(conv_label)
print(tags)

{'name': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, -1, -1, 1], [1, 1, 1, 1, 1]], 'tolerance': [[1, -1, -1, -1, -1], [1, 1, 1, 1, 1], [1, -1, 1, -1, -1], [-1, 1, -1, -1, -1]], 'surface': [[1, -1, -1, 1, 1], [-1, -1, -1, -1, -1], [1, -1, 1, 1, 1], [-1, 1, 1, -1, -1], [1, 1, -1, 1, 1]], 'thread': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, -1, 1], [1, 1, 1, 1, 1]], 'material': [[1, -1, -1, -1, -1], [1, 1, 1, -1, -1], [1, 1, 1, 1, 1], [-1, -1, -1, -1, -1], [1, 1, 1, -1, 1]], 'dimension': [[1, 1, -1, 1, 1], [1, 1, 1, 1, 1], [-1, -1, -1, -1, 1]], 'gdt': [[-1, 1, -1, -1, -1], [-1, 1, 1, 1, -1], [1, 1, 1, 1, -1], [1, 1, 1, -1, -1], [-1, -1, 1, 1, 1]]}


In [20]:
# mean precision@3 per feature tag, computed over the prompt results currently stored in tags
for tag, relevancies in tags.items():
    precs = get_prec_at_k(relevancies, 3)
    print(f"prec@3 for {tag}: ", precs.mean())

[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, -1, -1, 1]
[1, 1, 1, 1, 1]
prec@3 for name:  0.9333333333333332
[1, -1, -1, -1, -1]
[1, 1, 1, 1, 1]
[1, -1, 1, -1, -1]
[-1, 1, -1, -1, -1]
prec@3 for tolerance:  0.5833333333333334
[1, -1, -1, 1, 1]
[-1, -1, -1, -1, -1]
[1, -1, 1, 1, 1]
[-1, 1, 1, -1, -1]
[1, 1, -1, 1, 1]
prec@3 for surface:  0.4666666666666666
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[1, 1, 1, 1, 1]
prec@3 for thread:  1.0
[1, -1, -1, -1, -1]
[1, 1, 1, -1, -1]
[1, 1, 1, 1, 1]
[-1, -1, -1, -1, -1]
[1, 1, 1, -1, 1]
prec@3 for material:  0.6666666666666666
[1, 1, -1, 1, 1]
[1, 1, 1, 1, 1]
[-1, -1, -1, -1, 1]
prec@3 for dimension:  0.5555555555555555
[-1, 1, -1, -1, -1]
[-1, 1, 1, 1, -1]
[1, 1, 1, 1, -1]
[1, 1, 1, -1, -1]
[-1, -1, 1, 1, 1]
prec@3 for gdt:  0.6666666666666667


In [21]:
# mean precision@5 per feature tag, computed over the prompt results currently stored in tags
for tag, relevancies in tags.items():
    precs = get_prec_at_k(relevancies, 5)
    print(f"prec@5 for {tag}: ", precs.mean())

[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, -1, -1, 1]
[1, 1, 1, 1, 1]
prec@5 for name:  0.9199999999999999
[1, -1, -1, -1, -1]
[1, 1, 1, 1, 1]
[1, -1, 1, -1, -1]
[-1, 1, -1, -1, -1]
prec@5 for tolerance:  0.45
[1, -1, -1, 1, 1]
[-1, -1, -1, -1, -1]
[1, -1, 1, 1, 1]
[-1, 1, 1, -1, -1]
[1, 1, -1, 1, 1]
prec@5 for surface:  0.5199999999999999
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[1, 1, 1, 1, 1]
prec@5 for thread:  0.96
[1, -1, -1, -1, -1]
[1, 1, 1, -1, -1]
[1, 1, 1, 1, 1]
[-1, -1, -1, -1, -1]
[1, 1, 1, -1, 1]
prec@5 for material:  0.52
[1, 1, -1, 1, 1]
[1, 1, 1, 1, 1]
[-1, -1, -1, -1, 1]
prec@5 for dimension:  0.6666666666666666
[-1, 1, -1, -1, -1]
[-1, 1, 1, 1, -1]
[1, 1, 1, 1, -1]
[1, 1, 1, -1, -1]
[-1, -1, 1, 1, 1]
prec@5 for gdt:  0.56


# Complex Prompts

In [22]:
# labels for the results of the complex prompts; the context manager closes the file after reading
with open("./your_complex_prompt_labels.json", "r") as f:
    complex_labels = json.load(f)

In [23]:
# Same conversion as for the normal prompts, with an additional state for a partially correct retrieval:
# 1 = relevant, 0 = only one of the two features is relevant, -1 = not relevant
complex_flags = []
for entry in complex_labels:
    verdict = entry["relevance"]
    if "Feature 1" in verdict or "Feature 2" in verdict:
        complex_flags.append(0)
    elif "NOT" in verdict:
        complex_flags.append(-1)
    else:
        complex_flags.append(1)
# five consecutive labels belong to one prompt; an incomplete group at the end is not included
converted_complex_labels = [
    complex_flags[start:start + 5]
    for start in range(0, len(complex_flags) // 5 * 5, 5)
]
print(converted_complex_labels)

[[1, 1, 0, 0, 1], [0, -1, -1, -1, -1], [0, 0, 0, 0, 1], [1, 0, 1, 1, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 1, 0, 1, 0], [1, 0, 0, -1, 1], [-1, 1, 1, -1, 1], [0, 0, 0, 0, 0], [1, 1, 1, 0, 0], [0, -1, -1, -1, 0], [1, -1, 1, -1, 1], [0, 0, 1, -1, 0], [1, 0, 0, 0, 0], [0, 1, 0, 1, 1]]


In [24]:
# for this we also define a new function that computes prec with the partially correct results as either incorrect or correct
def get_prec_at_k_with_partiality(label_representation, k, verbose=True):
    """
    Computes precision@k for every prompt twice: once counting partially correct results as relevant and once
    counting them as not relevant.
    :param label_representation: list of prompt results. A prompt result is a list with one relevancy label per
        retrieved drawing of a prompt: 1 = relevant, 0 = partially correct, any other value = not relevant
    :param k: number of leading results to evaluate, must be at least 1. The counts are always divided by k, also
        if a prompt result holds fewer than k labels
    :param verbose: print every prompt result while processing (the default keeps the previous output)
    :return: tuple of two np arrays with one value per prompt: (precision with partially correct results counted
        as relevant, precision with only label 1 counted as relevant)
    :raises ValueError: if k is smaller than 1
    """
    if k < 1:
        # k == 0 would divide by zero, a negative k would silently slice from the end of the list
        raise ValueError(f"k must be at least 1, got {k}")

    precs_with_partly = []
    precs_without_partly = []
    for prompt_result in label_representation:
        if verbose:
            print(prompt_result)
        top_k = prompt_result[:k]
        fully_relevant = sum(1 for label in top_k if label == 1)
        partly_relevant = sum(1 for label in top_k if label == 0)
        precs_with_partly.append((fully_relevant + partly_relevant) / k)
        precs_without_partly.append(fully_relevant / k)

    return np.asarray(precs_with_partly), np.asarray(precs_without_partly)

In [25]:
# get results for complex prompts only
# "with partly" counts partially correct results (label 0) as relevant, "without partly" counts only label 1
precs_with_partly, precs_without_partly = get_prec_at_k_with_partiality(converted_complex_labels, 3)
print("prec@3 with the partly correct results: ", precs_with_partly.mean())
print("prec@3 without the partly correct results: ", precs_without_partly.mean())

[1, 1, 0, 0, 1]
[0, -1, -1, -1, -1]
[0, 0, 0, 0, 1]
[1, 0, 1, 1, 0]
[0, 1, 0, 0, 0]
[0, 0, 1, 0, 0]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 0, 0, 0, 0]
[0, 1, 0, 0, 0]
[0, 1, 0, 1, 0]
[1, 0, 0, -1, 1]
[-1, 1, 1, -1, 1]
[0, 0, 0, 0, 0]
[1, 1, 1, 0, 0]
[0, -1, -1, -1, 0]
[1, -1, 1, -1, 1]
[0, 0, 1, -1, 0]
[1, 0, 0, 0, 0]
[0, 1, 0, 1, 1]
prec@3 with the partly correct results:  0.9
prec@3 without the partly correct results:  0.4333333333333334


In [26]:
# same evaluation of the complex prompts for the first five results
precs_with_partly, precs_without_partly = get_prec_at_k_with_partiality(converted_complex_labels, 5)
print("prec@5 with the partly correct results: ", precs_with_partly.mean())
print("prec@5 without the partly correct results: ", precs_without_partly.mean())

[1, 1, 0, 0, 1]
[0, -1, -1, -1, -1]
[0, 0, 0, 0, 1]
[1, 0, 1, 1, 0]
[0, 1, 0, 0, 0]
[0, 0, 1, 0, 0]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 0, 0, 0, 0]
[0, 1, 0, 0, 0]
[0, 1, 0, 1, 0]
[1, 0, 0, -1, 1]
[-1, 1, 1, -1, 1]
[0, 0, 0, 0, 0]
[1, 1, 1, 0, 0]
[0, -1, -1, -1, 0]
[1, -1, 1, -1, 1]
[0, 0, 1, -1, 0]
[1, 0, 0, 0, 0]
[0, 1, 0, 1, 1]
prec@5 with the partly correct results:  0.8699999999999999
prec@5 without the partly correct results:  0.39


### combine prompts and get results for all

In [27]:
# the combined list holds the normal prompt results first, followed by the complex prompt results
print(len(converted_complex_labels), len(converted_labels))
all_converted_labels = converted_labels + converted_complex_labels
print(len(all_converted_labels))

20 32
52


In [28]:
# precision@3 over normal and complex prompts together
precs_with_partly, precs_without_partly = get_prec_at_k_with_partiality(all_converted_labels, 3)
print("prec@3 with the partly correct results: ", precs_with_partly.mean())
print("prec@3 without the partly correct results: ", precs_without_partly.mean())

[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, -1, -1, 1]
[1, 1, 1, 1, 1]
[1, -1, -1, -1, -1]
[1, 1, 1, 1, 1]
[1, -1, 1, -1, -1]
[-1, 1, -1, -1, -1]
[1, -1, -1, 1, 1]
[-1, -1, -1, -1, -1]
[1, -1, 1, 1, 1]
[-1, 1, 1, -1, -1]
[1, 1, -1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[1, 1, 1, 1, 1]
[1, -1, -1, -1, -1]
[1, 1, 1, -1, -1]
[1, 1, 1, 1, 1]
[-1, -1, -1, -1, -1]
[1, 1, 1, -1, 1]
[1, 1, -1, 1, 1]
[1, 1, 1, 1, 1]
[-1, -1, -1, -1, 1]
[-1, 1, -1, -1, -1]
[-1, 1, 1, 1, -1]
[1, 1, 1, 1, -1]
[1, 1, 1, -1, -1]
[-1, -1, 1, 1, 1]
[1, 1, 0, 0, 1]
[0, -1, -1, -1, -1]
[0, 0, 0, 0, 1]
[1, 0, 1, 1, 0]
[0, 1, 0, 0, 0]
[0, 0, 1, 0, 0]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 0, 0, 0, 0]
[0, 1, 0, 0, 0]
[0, 1, 0, 1, 0]
[1, 0, 0, -1, 1]
[-1, 1, 1, -1, 1]
[0, 0, 0, 0, 0]
[1, 1, 1, 0, 0]
[0, -1, -1, -1, 0]
[1, -1, 1, -1, 1]
[0, 0, 1, -1, 0]
[1, 0, 0, 0, 0]
[0, 1, 0, 1, 1]
prec@3 with the partly correct results:  0.782051282051282
prec@3 without the partly correct results:

In [29]:
# precision@5 over normal and complex prompts together
precs_with_partly, precs_without_partly = get_prec_at_k_with_partiality(all_converted_labels, 5)
print("prec@5 with the partly correct results: ", precs_with_partly.mean())
print("prec@5 without the partly correct results: ", precs_without_partly.mean())

[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, -1, -1, 1]
[1, 1, 1, 1, 1]
[1, -1, -1, -1, -1]
[1, 1, 1, 1, 1]
[1, -1, 1, -1, -1]
[-1, 1, -1, -1, -1]
[1, -1, -1, 1, 1]
[-1, -1, -1, -1, -1]
[1, -1, 1, 1, 1]
[-1, 1, 1, -1, -1]
[1, 1, -1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[1, 1, 1, 1, 1]
[1, -1, -1, -1, -1]
[1, 1, 1, -1, -1]
[1, 1, 1, 1, 1]
[-1, -1, -1, -1, -1]
[1, 1, 1, -1, 1]
[1, 1, -1, 1, 1]
[1, 1, 1, 1, 1]
[-1, -1, -1, -1, 1]
[-1, 1, -1, -1, -1]
[-1, 1, 1, 1, -1]
[1, 1, 1, 1, -1]
[1, 1, 1, -1, -1]
[-1, -1, 1, 1, 1]
[1, 1, 0, 0, 1]
[0, -1, -1, -1, -1]
[0, 0, 0, 0, 1]
[1, 0, 1, 1, 0]
[0, 1, 0, 0, 0]
[0, 0, 1, 0, 0]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 0, 0, 0, 0]
[0, 1, 0, 0, 0]
[0, 1, 0, 1, 0]
[1, 0, 0, -1, 1]
[-1, 1, 1, -1, 1]
[0, 0, 0, 0, 0]
[1, 1, 1, 0, 0]
[0, -1, -1, -1, 0]
[1, -1, 1, -1, 1]
[0, 0, 1, -1, 0]
[1, 0, 0, 0, 0]
[0, 1, 0, 1, 1]
prec@5 with the partly correct results:  0.7423076923076923
prec@5 without the partly correct results

### Expand the feature-wise evaluation: combine the results of the normal and the complex prompts

In [30]:
# Similar to converted_labels, but every complex prompt label is encoded with one of four states:
#  0: both features were retrieved correctly
#  1: only feature 1 was retrieved correctly
#  2: only feature 2 was retrieved correctly
# -1: neither feature 1 nor feature 2 was retrieved correctly
feature_codes = []
for entry in complex_labels:
    verdict = entry["relevance"]
    if "Feature 1" in verdict:
        feature_codes.append(1)
    elif "Feature 2" in verdict:
        feature_codes.append(2)
    elif "NOT" in verdict:
        feature_codes.append(-1)
    else:
        feature_codes.append(0)
# five consecutive labels belong to one prompt; an incomplete group at the end is not included
feature_correct = [
    feature_codes[start:start + 5]
    for start in range(0, len(feature_codes) // 5 * 5, 5)
]
print(feature_correct)

[[0, 0, 1, 1, 0], [1, -1, -1, -1, -1], [1, 1, 1, 1, 0], [0, 2, 0, 0, 1], [2, 0, 2, 2, 2], [2, 2, 0, 1, 2], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 2, 1, 2, 2], [1, 0, 1, 1, 1], [1, 0, 1, 0, 1], [0, 1, 1, -1, 0], [-1, 0, 0, -1, 0], [2, 2, 2, 2, 1], [0, 0, 0, 1, 2], [2, -1, -1, -1, 2], [0, -1, 0, -1, 0], [1, 1, 0, -1, 1], [0, 2, 2, 1, 1], [1, 0, 1, 0, 0]]


In [31]:
# sanity check: one list of feature states is expected per complex prompt
len(complex_prompts) == len(feature_correct)

True

In [32]:
# then we can convert this to a feature wise relevancy with the tags in the prompts file
def get_relevancy_for_feature(feature_id, feature_relevancy):
    """
    Translates the feature states of one prompt into the relevancy labels for a single feature.
    :param feature_id: the feature to evaluate (1 or 2)
    :param feature_relevancy: list of feature states (0: both features relevant, 1: only feature 1,
        2: only feature 2, -1: none)
    :return: list with 1 where the state is 0 or equals feature_id, and -1 everywhere else
    """
    return [
        1 if code == 0 or code == feature_id else -1
        for code in feature_relevancy
    ]

# we add this to the already existing tags from the normal prompts
# NOTE: the lists in tags are extended in place, so running this cell more than once adds the complex prompt
# results again; rebuild tags before re-running it
for prompt, feature_relevancy in zip(complex_prompts, feature_correct):
    # a complex prompt line looks like "<tag1>,<tag2>:<query>"; split only at the first ":" so a ":" inside
    # the query does not break the unpacking, and ignore whitespace around the tags
    prompt_tags, query = str(prompt).split(":", 1)
    tag1, tag2 = (name.strip() for name in prompt_tags.split(","))

    relevancy1 = get_relevancy_for_feature(1, feature_relevancy)
    relevancy2 = get_relevancy_for_feature(2, feature_relevancy)

    tags[tag1].append(relevancy1)
    tags[tag2].append(relevancy2)
tags

{'name': [[1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, -1, -1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, -1, -1, -1, -1],
  [1, 1, 1, 1, 1],
  [1, -1, 1, 1, 1],
  [-1, 1, -1, -1, -1],
  [-1, -1, 1, 1, -1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, -1, 1, -1, -1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, -1, 1],
  [-1, 1, 1, -1, 1]],
 'tolerance': [[1, -1, -1, -1, -1],
  [1, 1, 1, 1, 1],
  [1, -1, 1, -1, -1],
  [-1, 1, -1, -1, -1],
  [1, 1, -1, -1, 1],
  [-1, -1, -1, -1, -1],
  [-1, -1, -1, -1, 1],
  [1, 1, 1, 1, -1]],
 'surface': [[1, -1, -1, 1, 1],
  [-1, -1, -1, -1, -1],
  [1, -1, 1, 1, 1],
  [-1, 1, 1, -1, -1],
  [1, 1, -1, 1, 1],
  [-1, -1, -1, -1, 1],
  [1, 1, 1, 1, -1],
  [-1, -1, -1, -1, -1],
  [1, -1, 1, -1, 1]],
 'thread': [[1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, -1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1],
  [1, 1, 1, -1, 1],
  [1, 1, 1, 1, -1],
  [1, -1, -1, -1, 1],
  [1, 1, 1, -1, 1]],
 'material': [[1, -1, -1,

In [37]:
# mean precision@3 per feature tag over everything stored in tags
for tag, relevancies in tags.items():
    precs = get_prec_at_k(relevancies, 3)
    print(f"overall prec@3 for {tag}", precs.mean())

[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, -1, -1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, -1, -1, -1, -1]
[1, 1, 1, 1, 1]
[1, -1, 1, 1, 1]
[-1, 1, -1, -1, -1]
[-1, -1, 1, 1, -1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, -1, 1, -1, -1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[-1, 1, 1, -1, 1]
overall prec@3 for name 0.8148148148148148
[1, -1, -1, -1, -1]
[1, 1, 1, 1, 1]
[1, -1, 1, -1, -1]
[-1, 1, -1, -1, -1]
[1, 1, -1, -1, 1]
[-1, -1, -1, -1, -1]
[-1, -1, -1, -1, 1]
[1, 1, 1, 1, -1]
overall prec@3 for tolerance 0.49999999999999994
[1, -1, -1, 1, 1]
[-1, -1, -1, -1, -1]
[1, -1, 1, 1, 1]
[-1, 1, 1, -1, -1]
[1, 1, -1, 1, 1]
[-1, -1, -1, -1, 1]
[1, 1, 1, 1, -1]
[-1, -1, -1, -1, -1]
[1, -1, 1, -1, 1]
overall prec@3 for surface 0.4444444444444444
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[1, 1, 1, 1, -1]
[1, -1, -1, -1, 1]
[1, 1, 1, -1, 1]
overall prec@3 for thread 0.9333333333333333
[1, -1, -1, -1, -1]
[1,

In [38]:
# mean precision@5 per feature tag over everything stored in tags
for tag, relevancies in tags.items():
    precs = get_prec_at_k(relevancies, 5)
    print(f"overall prec@5 for {tag}", precs.mean())

[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, -1, -1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, -1, -1, -1, -1]
[1, 1, 1, 1, 1]
[1, -1, 1, 1, 1]
[-1, 1, -1, -1, -1]
[-1, -1, 1, 1, -1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, -1, 1, -1, -1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[-1, 1, 1, -1, 1]
overall prec@5 for name 0.7777777777777778
[1, -1, -1, -1, -1]
[1, 1, 1, 1, 1]
[1, -1, 1, -1, -1]
[-1, 1, -1, -1, -1]
[1, 1, -1, -1, 1]
[-1, -1, -1, -1, -1]
[-1, -1, -1, -1, 1]
[1, 1, 1, 1, -1]
overall prec@5 for tolerance 0.42500000000000004
[1, -1, -1, 1, 1]
[-1, -1, -1, -1, -1]
[1, -1, 1, 1, 1]
[-1, 1, 1, -1, -1]
[1, 1, -1, 1, 1]
[-1, -1, -1, -1, 1]
[1, 1, 1, 1, -1]
[-1, -1, -1, -1, -1]
[1, -1, 1, -1, 1]
overall prec@5 for surface 0.4666666666666667
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, -1, 1]
[1, 1, 1, 1, -1]
[1, -1, -1, -1, 1]
[1, 1, 1, -1, 1]
overall prec@5 for thread 0.8600000000000001
[1, -1, -1, -1, -1]
[1,