In [1]:
import os
import pandas as pd
import shutil

In [2]:
def read_img_txt_labels(folder_path):
    """Collect the label rows of every ``.txt`` label file in a folder.

    A label line is expected to hold five whitespace-separated fields:
    ``object_class x_center y_center width height``. Lines with any other
    number of fields are skipped. ``classes.txt`` is not treated as a label
    file and is ignored, as is every file that does not end in ``.txt``.

    Parameters
    ----------
    folder_path : str or path-like
        Directory that contains the label files.

    Returns
    -------
    pandas.DataFrame
        One row per label line with the columns ``file_name``,
        ``object_class`` (int), ``x_center``, ``y_center``, ``width`` and
        ``height`` (floats). The columns are present even when no label line
        was found, so callers can merge on ``object_class`` unconditionally.

    Raises
    ------
    ValueError
        If a five-field line holds a value that cannot be parsed as
        ``int`` (class) or ``float`` (coordinates).
    """
    columns = ['file_name', 'object_class', 'x_center', 'y_center', 'width', 'height']
    data = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.txt') or file_name == 'classes.txt':
            continue

        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r') as file:
            for line in file:
                components = line.strip().split()
                # Skip blank or malformed lines instead of failing on unpacking
                if len(components) != 5:
                    continue
                object_class, x_center, y_center, width, height = components
                data.append({
                    'file_name': file_name,
                    'object_class': int(object_class),
                    'x_center': float(x_center),
                    'y_center': float(y_center),
                    'width': float(width),
                    'height': float(height)
                })

    # Passing `columns` guarantees the schema even when `data` is empty
    df = pd.DataFrame(data, columns=columns)
    if df.empty:
        # An empty frame would otherwise carry object dtype in every column;
        # give it the same numeric dtypes a populated frame has.
        df = df.astype({
            'object_class': 'int64',
            'x_center': 'float64',
            'y_center': 'float64',
            'width': 'float64',
            'height': 'float64',
        })
    return df

In [3]:
# Folder holding the raw dataset: per-image label files plus classes.txt
load_folder_path = '../datasets/training_data_complete'

# One row per label line, parsed from the per-image .txt files
imgs_txt_df = read_img_txt_labels(load_folder_path)

# classes.txt lists one class name per line; the row position of each name
# is exposed as the numeric `object_class` id.
classes_txt_df = (
    pd.read_csv(f'{load_folder_path}/classes.txt', sep=" ", header=None)
    .set_axis(['class_name'], axis=1)
    .reset_index(names=['object_class'])
)

# Attach the class name to every label row and show the combined table
df = classes_txt_df.merge(imgs_txt_df, on="object_class")
df

Unnamed: 0,object_class,class_name,file_name,x_center,y_center,width,height
0,0,bio_hexagon,IMG_20240515_175800.txt,0.515625,0.610938,0.056250,0.046875
1,0,bio_hexagon,IMG_20240515_175844.txt,0.076563,0.741667,0.078125,0.066667
2,0,bio_hexagon,IMG_20240515_175844.txt,0.625000,0.729167,0.041667,0.068750
3,0,bio_hexagon,IMG_20240515_175844.txt,0.350000,0.725000,0.070833,0.066667
4,0,bio_hexagon,IMG_20240515_180008.txt,0.042708,0.807292,0.037500,0.041667
...,...,...,...,...,...,...,...
454,37,demeter,IMG_20240515_180200.txt,0.404687,0.894271,0.098958,0.071875
455,37,demeter,IMG_20240515_180234.txt,0.436458,0.785417,0.050000,0.097917
456,37,demeter,IMG_20240515_180512.txt,0.171875,0.264583,0.139583,0.066667
457,37,demeter,IMG_20240515_180917.txt,0.304688,0.419271,0.107292,0.080208


In [4]:
# Inspect the class lookup table read from classes.txt:
# one row per class, mapping `object_class` (row position) to `class_name`.
classes_txt_df

Unnamed: 0,object_class,class_name
0,0,bio_hexagon
1,1,eco_stars
2,2,vegan_international
3,3,fairtrade_international
4,4,rainforest_alliance
5,5,bioland
6,6,regional_fenster
7,7,ohne_gentechnik
8,8,naturland
9,9,nutriscore_b


In [5]:
# Number of distinct class names present in the merged label table
df['class_name'].nunique(dropna=False)

38

In [6]:
# Classes with too few labels are removed from the dataset.
classes_to_delete = [
    'q_milch',
    'naturland_fair',
    'cocoa_for_future',
    'blauer_engel',
    'bw_gesicherte_qualitaet',
    'vegan_vegan_society',
    'bio_kreis']

# Keep only the rows whose `class_name` is not listed in `classes_to_delete`.
# The explicit .copy() turns the filtered result into an independent frame,
# so later in-place edits of `df` do not raise SettingWithCopyWarning.
df = df[~df['class_name'].isin(classes_to_delete)].copy()

# Number of distinct classes that remain after the deletion
len(df['class_name'].unique())

31

In [7]:
# Merge fine-grained labels into one class each: new class name -> old class names
classes_to_merge = {
    'haltungsform': ['haltungsform_2', 'haltungsform_3', 'haltungsform_4'],
    'nutriscore': ['nutriscore_a', 'nutriscore_b', 'nutriscore_c', 'nutriscore_d'],
}

# Invert the mapping to old name -> new name so every row needs one lookup only
old_to_new = {
    old_class: new_class
    for new_class, old_classes in classes_to_merge.items()
    for old_class in old_classes
}

# Build a new frame instead of assigning through .loc: `df` is the result of a
# boolean filter, and writing into it in place raises SettingWithCopyWarning.
# Names that are not being merged are passed through unchanged.
df = df.assign(
    class_name=df['class_name'].map(lambda name: old_to_new.get(name, name))
)

# Number of distinct classes that remain after merging
len(df['class_name'].unique())

26

In [8]:
# Classes flagged as poorly performing
classes_to_improve = ['regional_fenster', 'fsc', 'green_point', 'demeter']

In [9]:
save_folder = "../datasets/test"
os.makedirs(save_folder, exist_ok=True)

# Re-number the classes so the ids are contiguous and start at 0:
# the old ids have gaps after classes were deleted and merged.
save_df = df.copy().drop('object_class', axis=1)
save_df['object_class'] = pd.factorize(save_df['class_name'])[0]
save_df = save_df.sort_values('object_class').reset_index(drop=True)

# Save the new classes.txt. The frame is sorted by the new id, so line i of
# the file holds the class name with id i.
save_df[['class_name']].drop_duplicates().to_csv(f'{save_folder}/classes.txt', index=False, header=False)

# Write one label file per image. A single groupby pass replaces re-filtering
# the whole frame for every file; sort=False keeps the groups in order of
# first appearance and the rows inside each group in frame order.
label_columns = ['object_class', "x_center", "y_center", "width", "height"]
for file_name, subset_df in save_df.groupby('file_name', sort=False):
    subset_df[label_columns].to_csv(f'{save_folder}/{file_name}', sep=' ', index=False, header=False)

# Copy the image files into the new folder. The extension is matched at the
# end of the name and case-insensitively: a plain substring test would also
# copy names such as 'x.jpg.bak' and would miss 'x.JPG'.
image_extensions = ('.jpg', '.jpeg')
files = os.listdir(load_folder_path)

for f in files:
    if f.lower().endswith(image_extensions):
        shutil.copy(f'{load_folder_path}/{f}', f'{save_folder}/{f}')
