In [46]:
import os
import pandas as pd
import shutil

In [47]:
def read_img_txt_labels(folder_path):
    """Read all YOLO-style label files in ``folder_path`` into one DataFrame.

    Every ``*.txt`` file except ``classes.txt`` is scanned line by line. A line
    is kept only if it consists of exactly five whitespace-separated fields
    ``<object_class> <x_center> <y_center> <width> <height>``; any other line
    is skipped.

    Parameters
    ----------
    folder_path : str
        Directory that contains the label files.

    Returns
    -------
    pandas.DataFrame
        One row per label line with the columns ``file_name`` (name of the
        ``.txt`` file), ``object_class`` (int), ``x_center``, ``y_center``,
        ``width`` and ``height`` (float). The columns are present even when no
        label line was found, so callers can always merge on ``object_class``.

    Raises
    ------
    ValueError
        If a five-field line contains a value that cannot be converted to
        int (first field) or float (remaining fields).
    """
    columns = ['file_name', 'object_class', 'x_center', 'y_center', 'width', 'height']
    data = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        # Only label files are of interest; classes.txt holds class names, not boxes
        if not file_name.endswith('.txt') or file_name == 'classes.txt':
            continue

        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r') as file:
            for line in file:
                components = line.strip().split()
                # Skip anything that is not a complete five-field label line
                if len(components) != 5:
                    continue
                object_class, x_center, y_center, width, height = components
                data.append({
                    'file_name': file_name,
                    'object_class': int(object_class),
                    'x_center': float(x_center),
                    'y_center': float(y_center),
                    'width': float(width),
                    'height': float(height)
                })

    # Passing `columns` keeps the schema stable for an empty result; astype pins
    # the numeric dtypes, which would otherwise be `object` when there are no rows.
    df = pd.DataFrame(data, columns=columns)
    return df.astype({
        'object_class': 'int64',
        'x_center': 'float64',
        'y_center': 'float64',
        'width': 'float64',
        'height': 'float64',
    })

In [48]:
# Specify the folder path
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
load_folder_path = 'C:/Users/Felix/Documents/00_Master/03_SS-24/10_AISS_CV/datasets/training_data_augmented_V4_deleted_not_merged'
# Read the YOLO label files and create a DataFrame
imgs_txt_df = read_img_txt_labels(load_folder_path)

# classes.txt holds one class name per line; the position of a name among the
# non-blank lines is its class id. The file is read as plain text instead of
# through pd.read_csv so that a name containing a space, or a name such as
# "NA"/"null", is kept verbatim instead of being split or converted to NaN.
with open(os.path.join(load_folder_path, 'classes.txt'), 'r') as classes_file:
    class_names = [line.strip() for line in classes_file if line.strip()]
classes_txt_df = pd.DataFrame({
    'object_class': range(len(class_names)),
    'class_name': class_names,
})

# The merge below is an inner join: labels whose class id has no entry in
# classes.txt are dropped. Report them so the loss does not go unnoticed.
unknown_class_ids = sorted(set(imgs_txt_df['object_class']) - set(classes_txt_df['object_class']))
if unknown_class_ids:
    print(f'Warning: labels with class ids missing from classes.txt are dropped: {unknown_class_ids}')

# Attach the class name to every label row and display the result
df = pd.merge(classes_txt_df, imgs_txt_df, on = "object_class")
df

Unnamed: 0,object_class,class_name,file_name,x_center,y_center,width,height
0,0,bio_hexagon,IMG_20240515_175800_aug_1.txt,0.394132,0.520283,0.047265,0.055821
1,0,bio_hexagon,IMG_20240515_175800_aug_2.txt,0.747228,0.101662,0.119734,0.186722
2,0,bio_hexagon,IMG_20240515_175800_aug_3.txt,0.484375,0.610938,0.056250,0.046875
3,0,bio_hexagon,IMG_20240515_175800_aug_4.txt,0.525521,0.387500,0.051042,0.052083
4,0,bio_hexagon,IMG_20240515_175844_aug_1.txt,0.348282,0.714372,0.070702,0.066783
...,...,...,...,...,...,...,...
1795,30,demeter,IMG_20240515_180917_aug_4.txt,0.323958,0.598437,0.114583,0.090625
1796,30,demeter,IMG_20240515_183205_aug_1.txt,0.232084,0.605754,0.076337,0.075648
1797,30,demeter,IMG_20240515_183205_aug_2.txt,0.883268,0.495413,0.103761,0.121101
1798,30,demeter,IMG_20240515_183205_aug_3.txt,0.119792,0.643750,0.083333,0.068750


In [49]:
# Number of distinct class names currently in df (missing values count as one)
df['class_name'].nunique(dropna=False)

31

In [50]:
# Classes with too few labels --> shall be deleted
# List of class names to be deleted
classes_to_delete = [
    'q_milch', 
    'naturland_fair', 
    'cocoa_for_future', 
    'blauer_engel', 
    'bw_gesicherte_qualitaet', 
    'vegan_vegan_society', 
    'bio_kreis']

# Drop every row whose 'class_name' is listed in 'classes_to_delete'.
# .copy() makes the result an independent frame instead of a slice of the
# previous one, so later column assignments on df do not raise pandas'
# SettingWithCopyWarning.
df = df[~df['class_name'].isin(classes_to_delete)].copy()

# Number of distinct classes left; unchanged if none of the listed names occur in df
len(df['class_name'].unique())

31

In [51]:
# merge labels
# Maps each merged class name to the list of original class names it replaces.
classes_to_merge = {
    'haltungsform': ['haltungsform_2', 'haltungsform_3', 'haltungsform_4'],
    'nutriscore': ['nutriscore_a', 'nutriscore_b', 'nutriscore_c', 'nutriscore_d'],
}

# Invert the mapping to old name -> merged name so one pass over the column suffices.
merged_name_by_old_class = {
    old_class: new_class
    for new_class, old_classes in classes_to_merge.items()
    for old_class in old_classes
}

# Build a new frame instead of writing into df through .loc: the cell can be
# re-run safely and never assigns into a frame that may be a slice of another one.
# Names that are not part of any merge group are kept unchanged.
df = df.assign(
    class_name=df['class_name'].map(lambda name: merged_name_by_old_class.get(name, name))
)

In [52]:
# Classes flagged as performing poorly.
# NOTE(review): this list is not referenced by any other cell visible here.
classes_to_improve = ["regional_fenster", "fsc", "green_point", "demeter"]

In [53]:
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
save_folder = "C:/Users/Felix/Documents/00_Master/03_SS-24/10_AISS_CV/datasets/test"
os.makedirs(save_folder, exist_ok=True)

# Re-number the classes so that ids are contiguous and start at 0:
# pd.factorize assigns codes in order of first appearance of each class_name.
save_df = df.drop('object_class', axis = 1)
save_df['object_class'] = pd.factorize(save_df['class_name'])[0]
# A stable sort keeps the existing relative order of the rows within each class.
save_df = save_df.sort_values('object_class', kind='stable').reset_index(drop =True)

# Save new classes.txt: one class name per line, in ascending class-id order
save_df[['class_name']].drop_duplicates().to_csv(
    os.path.join(save_folder, 'classes.txt'), index=False, header=False
)

# Create new label files, one per original label file name.
# groupby visits each file's rows once instead of re-filtering the whole
# frame per file; sort=False keeps the first-appearance order of the files.
label_columns = ['object_class', "x_center", "y_center", "width", "height"]
for file_name, file_rows in save_df.groupby('file_name', sort=False):
    subset_df = file_rows[label_columns]
    subset_df.to_csv(os.path.join(save_folder, file_name), sep=' ', index=False, header=False)

# Copy image files into new folder
files = os.listdir(load_folder_path)

for f in files:
    # Match on the real file extension, case-insensitively, so that "x.JPG"
    # is copied and names that merely contain ".jpg" (e.g. "x.jpg.bak") are not.
    if f.lower().endswith('.jpg'):
        shutil.copy(os.path.join(load_folder_path, f), os.path.join(save_folder, f))
