# Merge dataset to our dataset format

## Global variables:

In [65]:
dataset_name = "rewe-roboflow" # CHANGE
dataset_path = "datasets/rewe-roboflow/train" # CHANGE
dataset_labels = "datasets/rewe-roboflow/data.yaml"

# Source Dataset Paths
id_path = "Dataset/id.txt"
prices_path = "Dataset/prices.txt"
destination = "Dataset"

## Price & label preparation

Find the current image and label id count to label the new images and classes correctly

In [95]:
with open(id_path, 'r') as f:
    lines = f.readlines()
    id_count = int(lines[0].split()[1]) + 1
    label_count = int(lines[1].split()[1]) + 1

print("Id count: ", id_count)
print("Label count: ", label_count)

Id count:  2841
Label count:  7


Extract labels (This assumes yolo-like setup)

In [78]:
import yaml

# Open the labels_path file and load its content
with open(dataset_labels , 'r') as file:
    labels_data = yaml.safe_load(file)

# Extract the files under the 'name' key
labels = labels_data.get('names', [])
print(labels)

['ActiveO2_Orange_Bio', 'AlpensalzJodFluor', 'Barilla_Collezione', 'Burste', 'ChristiansGrod-Rotegruetze-sauerkirsch', 'CupNoodlesSobaWok', 'DallmayrProdomo-naturmild', 'DrOetketZitronenWolke', 'Eissbergsalat', 'HarryVitalFit', 'Heinz_Curry_mango', 'Heinz_tomato', 'Hitchcock-Zitrone', 'Ja-DinkelSpaghetti', 'Kinder_Riegel_Big', 'Kuehne_RoteBeete', 'Kuehne_Schlemmer_Topfchen_Balsamico', 'Lenor_Waschmittel', 'Maggi-PutenWok', 'Mesmer-HimbeerLemon', 'NicNacsDouble', 'Pedigree_Markies_Original', 'PringlesHotSpicy', 'ReweBW_Besto_Balsamico', 'ShebaSauceSpeciale', 'Speisekartoffeln_Regional', 'SweetKiss', 'hella_wellness', 'ja-Gouda', 'ja-H-Milch-0', 'ja-Salami', 'ja-_Kiechererbsen']


Enter the price and give the labels new labels if desired

In [96]:
label_to_price = {}
num_to_label = {}
i = 0
for label in labels:
    new_label = input(f"Enter the new label for {label} (Skip): ")
    price = float(input(f"Enter the price for {label}: "))
    label_to_price[label_count] = [new_label,price]
    num_to_label[i] = label_count
    label_count += 1
    print("Added label and price:", new_label, price)
    i += 1

print(label_to_price )

Added label and price: vitamine water 1.19
Added label and price: salt  1.29
Added label and price: barilla noodles 2.69
Added label and price: brush 0.79
Added label and price: red fruit jelly 2.29
Added label and price: cup noodles  2.19
Added label and price: coffee beans  7.99
Added label and price: lemon cake baking mix  3.49
Added label and price: lettuce 1.11
Added label and price: bread 1.89
Added label and price: curry sauce 2.29
Added label and price: tomato ketchup 2.29
Added label and price: lemon juice 1.59
Added label and price: spaghetti 1.29
Added label and price: kinder riegel 2.99
Added label and price: beetroot 1.79
Added label and price: pickles 3.49
Added label and price: detergent 3.79
Added label and price: maggi wok 1.05
Added label and price: tea 2.35
Added label and price: nic nacs 1.99
Added label and price: pedigree dog food 2.29
Added label and price: pringles 2.49
Added label and price: balsamico 2.69
Added label and price: cat food bowl 0.65
Added label a

## Saving the new dataset

Process the files this assumes the images to be in images and labels in a labels folder

In [98]:
from PIL import Image
import os

def process_and_save_files(source_folder, destination_folder, start_id, label_to_price, num_to_label):
    id = start_id
    for filename in os.listdir(source_folder+"/labels"):
        if filename.endswith(".txt"):
            print(filename)
            # Read the content of the txt file
            class_occurrences = {}
            with open(os.path.join(source_folder+"/labels", filename), 'r') as file:
                # Change this part if the labels are given differently then yolo
                for line in file:
                    first_number = int(line.split()[0])
                    new_label = num_to_label[first_number]
                    if new_label not in class_occurrences:
                        class_occurrences[new_label] = 1
                    else:    
                        class_occurrences[new_label] += 1
            
            # Save the new txt file in the destination folder with a new name
            new_filename = f"image_{id}.txt"
            overall_price = 0
            for key in class_occurrences:
                overall_price += label_to_price[key][1] * class_occurrences[key]

            with open(os.path.join(destination_folder, new_filename), 'w') as new_file:
                new_file.write("Objects: "+ str(class_occurrences)+ "\nTotal Price: "+ "{:.2f}".format(overall_price))
            
            # Assuming there is a corresponding jpg file with the same name
            jpg_filename = filename.replace(".txt", ".jpg")
            if os.path.exists(os.path.join(source_folder+"/images", jpg_filename)):
                # Open the image file
                image = Image.open(os.path.join(source_folder+"/images", jpg_filename))
                
                # Save the new image in the destination folder with a new name
                new_image_filename = f"image_{id}.jpg"
                image.save(os.path.join(destination_folder, new_image_filename))
            id += 1
    return id         


id_count = process_and_save_files(dataset_path, destination+"/"+dataset_name, id_count, label_to_price, num_to_label)

WhatsApp-Bild-2023-11-28-um-11-20-47_c2fbca43_jpg.rf.975a6412adec16afe517a6ef6f6115d3.txt
WhatsApp-Bild-2023-11-28-um-11-21-00_0cae93df_jpg.rf.e83ebf6d81f3db6929e6f38f66c97da1.txt
WhatsApp-Bild-2023-11-28-um-11-21-34_dada88be_jpg.rf.dde265621d169f080abd276dcee3eb23.txt
WhatsApp-Bild-2023-12-11-um-18-06-04_336d0fad_jpg.rf.f448b4579d427b50ed9d8ab626f70a9a.txt
WhatsApp-Bild-2023-12-11-um-18-05-49_20c91d8c_jpg.rf.93c109555350693e9da0453363f1cab6.txt
WhatsApp-Bild-2023-11-28-um-11-20-12_a042c62f_jpg.rf.521551c1c74dbb88ef16aa6b452eb372.txt
WhatsApp-Bild-2023-11-28-um-11-20-43_af3cb7bd_jpg.rf.d68efae71c1cdecf560c32d0c24818a3.txt
WhatsApp-Bild-2023-11-28-um-11-20-22_17b05296_jpg.rf.e0bc0b791bc10bea69bc299dce9263c9.txt
WhatsApp-Bild-2023-12-11-um-18-06-00_b92f78b8_jpg.rf.5a44e27b73f0167cbfbb0240cd862211.txt
WhatsApp-Bild-2023-11-28-um-11-21-33_8ed8c7a2_jpg.rf.b2f5a5f2ecfff70215b7d6bb55c41fff.txt
WhatsApp-Bild-2023-12-11-um-18-06-01_0d6d46f0_jpg.rf.42114c113f6e9b5cc24c91c95485ff40.txt
WhatsApp-B

Save new id and price dict

In [107]:
with open(id_path, 'w') as f:
    f.write(f"image_id: {id_count-1}\n")
    f.write(f"label_id: {label_count-1}")

with open(prices_path, 'a+') as file:
    # Move to the start of the file and read the content
    file.seek(0)
    content = file.read()

    # If the file is not empty and does not end with a newline, add one
    if content and not content.endswith('\n'):
        file.write('\n')

    # Write the new content on a new line
    file.write(dataset_name + ": " + str(label_to_price) + "\n")

## Automatic price finding testing 

In [None]:
import pandas as pd
from difflib import get_close_matches
def get_prices(labels):
    new_label_price = {}
    # Load the CSV file into a DataFrame
    csv_path = "schleswig-holstein-prices.csv"
    csv_data = pd.read_csv(csv_path)

    # Assuming the CSV has columns 'Name' and 'Price'
    csv_names = csv_data['name'].tolist()
    csv_prices = csv_data['price'].tolist()

    # Function to find the closest match and its price
    def find_closest_price(label, csv_names, csv_prices):
        closest_match = get_close_matches(label, csv_names, n=1, cutoff=0.0)
        if closest_match:
            index = csv_names.index(closest_match[0])
            print(label, "|",closest_match[0], '|', csv_prices[index])
            return csv_prices[index]
        return None

    # Update label_price with the closest prices from the CSV
    for label in labels:
        closest_price = find_closest_price(label, csv_names, csv_prices)
        if closest_price is not None:
            new_label_price[label] = closest_price

    return new_label_price

In [None]:
from translate import Translator
vegetables = [
    "avocado",
    "beans",
    "beet",
    "bell pepper",
    "broccoli",
    "brus capusta",
    "cabbage",
    "carrot",
    "cayliflower",
    "celery",
    "corn",
    "cucumber",
    "eggplant",
    "fasol",
    "garlic",
    "hot pepper",
    "onion",
    "peas",
    "potato",
    "pumpkin",
    "rediska",
    "redka",
    "salad",
    "squash-patisson",
    "tomato",
    "vegetable marrow"
]
for veggie in vegetables:
    translator = Translator(to_lang='de')
    translation = translator.translate(veggie)
    print(veggie, "|", translation)
#get_prices(vegetables)

avocado | Avocado
beans | Bohnen
beet | Rüben
bell pepper | Glockenpfeffer
broccoli | Brokkoli
brus capusta | brus capusta
cabbage | Kohl
carrot | Karotte
cayliflower | cayliflower
celery | Sellerie
corn | Mais
cucumber | Gurke
eggplant | aubergine
fasol | fasol
garlic | Knoblauch
hot pepper | Paprika
onion | Zwiebel
peas | Erbsen
potato | Kartoffel
pumpkin | Kürbis
rediska | rediska
redka | redka
salad | Salat
squash-patisson | squash-patisson
tomato | Tomate
vegetable marrow | Markkuerbis


In [None]:
import os

def find_highest_number_in_txt_files(directory):
    highest_number = None
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            number_str = filename.split('_')[-1].split('.')[0]
            try:
                number = int(number_str)
                if highest_number is None or number > highest_number:
                    highest_number = number
            except ValueError:
                continue
    return highest_number

highest_number = find_highest_number_in_txt_files(dataset_destination_path)
print(f"The highest number in the txt files is: {highest_number}")

The highest number in the txt files is: 2789
