# Formatting the source dataset into YOLOv5 training format

The point of this notebook is to re-format the source dataset from [kaggle by Esteban](https://www.kaggle.com/estebanpacanchique/cigarette-butt) as YOLOv5 compatible format.

\


##Prerequsites
To use this notebook on your own google colab, the dataset from [here](https://www.kaggle.com/estebanpacanchique/cigarette-butt) should be downloaded and unzipped in your google drive with the path `/cig butts detection/kaggle dataset` with two sub directories `train`, `test`, and two label files `train_labels.csv` and `test_labels.csv`

In [1]:
# Mounting the drive in order to access the image directories
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# create the directories for the output dataset dirs
!mkdir -p 'drive/MyDrive/cig butts detection/dataset in yolov5 format'
!mkdir -p 'drive/MyDrive/cig butts detection/dataset in yolov5 format/images'
!mkdir -p 'drive/MyDrive/cig butts detection/dataset in yolov5 format/labels'
!mkdir -p 'drive/MyDrive/cig butts detection/dataset in yolov5 format/images/train'
!mkdir -p 'drive/MyDrive/cig butts detection/dataset in yolov5 format/labels/val'
!mkdir -p 'drive/MyDrive/cig butts detection/dataset in yolov5 format/images/train'
!mkdir -p 'drive/MyDrive/cig butts detection/dataset in yolov5 format/labels/val'

In [5]:
# directories with all the train images
train_dirs = ['drive/MyDrive/cig butts detection/kaggle dataset/train']
#train_dirs = ['drive/MyDrive/cig butts detection/kaggle dataset/train', 
#              'drive/MyDrive/cig butts detection/immersive dataset/train/images']
# directories with all the validation images
val_dirs = ['drive/MyDrive/cig butts detection/kaggle dataset/test']
#val_dirs = ['drive/MyDrive/cig butts detection/kaggle dataset/test', 
#            'drive/MyDrive/cig butts detection/immersive dataset/val/images']

In [6]:
image_train_path = 'drive/MyDrive/cig butts detection/dataset in yolov5 format/images/train'
image_val_path = 'drive/MyDrive/cig butts detection/dataset in yolov5 format/images/val'

In [7]:
import shutil

# first we need to prepare train.txt and val.txt
# two text files that contains path to each individual train/validation images
all_train_fils = [os.listdir(d) for d in train_dirs]
#train_fil_paths = []
train_item_ct = 0
# go through the two train directories
for i in range(len(all_train_fils)):
    dir_path = train_dirs[i]
    for fname in all_train_fils[i]:
        if fname[-4:] != ".jpg": # skip if not a .jpg file
            continue
        # use shutil to copy the file over
        full_path = os.path.join(dir_path, fname)
        out_path = os.path.join(image_train_path, fname)
        shutil.copyfile(full_path, out_path)
        train_item_ct += 1

all_val_fils = [os.listdir(d) for d in val_dirs]
#val_fil_paths = []
val_item_ct = 0
# go through the two val directories
for i in range(len(all_val_fils)):
    dir_path = val_dirs[i]
    for fname in all_val_fils[i]:
        if fname[-4:] != ".jpg": # skip if not a .jpg file
            continue
        # use shutil to copy the file over
        full_path = os.path.join(dir_path, fname)
        out_path = os.path.join(image_val_path, fname)
        shutil.copyfile(full_path, out_path)
        val_item_ct += 1

# show the number
print("number of files in train set: ", train_item_ct)
print("number of files in validation set: ", val_item_ct)

'''
# then write to file
train_txt = open('/content/drive/cig butts detection/dataset in yolov5 format/train_imgs.txt', 'w')
train_txt.write('\n'.join(train_fil_paths))
train_txt.close()

val_txt = open('/content/drive/cig butts detection/dataset in yolov5 format/val_imgs.txt', 'w')
val_txt.write('\n'.join(val_fil_paths))
val_txt.close()
'''

number of files in train set:  3643
number of files in validation set:  399


"\n# then write to file\ntrain_txt = open('/content/drive/cig butts detection/dataset in yolov5 format/train_imgs.txt', 'w')\ntrain_txt.write('\n'.join(train_fil_paths))\ntrain_txt.close()\n\nval_txt = open('/content/drive/cig butts detection/dataset in yolov5 format/val_imgs.txt', 'w')\nval_txt.write('\n'.join(val_fil_paths))\nval_txt.close()\n"

In [13]:
# now we need to create the yolo dataset labels
# format: 1 txt file per image, each txt file have 0 or more rows of 5 vals seperated by space
# like this: 0 0.3814 0.2551 0.144 0.129
# they mean: [item class] [center_x] [center_y] [width] [height]
# coordinates are normalized to [0.0, 1.0]

# we start with the kaggle dataset, which labels are in one single csv file
kaggle_train_csv = 'drive/MyDrive/cig butts detection/kaggle dataset/train_labels.csv'
kaggle_val_csv = 'drive/MyDrive/cig butts detection/kaggle dataset/test_labels.csv'

# function to read a line in the labels
def img_label_line_read(splits):
    # there will be 8 items
    # first is imgae name
    img_name = splits[0]
    # then width and height (they're all 1000*80 here)
    width = float(splits[1])
    height = float(splits[2])
    # then class (it's all 'CigaretteButt')
    # then xmin, ymin, xmax, ymax
    xmin = float(splits[4])
    ymin = float(splits[5])
    xmax = float(splits[6])
    ymax = float(splits[7])
    # so we calculate the normalized xcenter, ycenter, boxwidth, boxheight
    x_center = ((xmax + xmin) / 2) / width
    y_center = ((ymax + ymin) / 2) / height
    box_width = (xmax - xmin) / width
    box_height = (ymax - ymin) / height

    return (0, x_center, y_center, box_width, box_height)

# function to read one dataset label .csv file
def kaggle_dataset_line_process(rlines, map):
    for line in rlines[1:]: # skip the first line cause it's header
        splits = line.replace('\n', '').split(',')
        img_name = splits[0]
        mark = img_label_line_read(splits)

        # then, we append to each list in the map(dict)
        # create it if it haven't existed
        if img_name not in map:
            map[img_name] = []
    
        # append to list
        map[img_name].append(mark)

# read the entire train dataset's labels
kaggle_train_content = open(kaggle_train_csv, 'r')
kaggle_train_map = {}
kaggle_dataset_line_process(kaggle_train_content.readlines(), kaggle_train_map)

# then the validation dataset's labels
kaggle_val_content = open(kaggle_val_csv, 'r')
kaggle_val_map = {}
kaggle_dataset_line_process(kaggle_val_content.readlines(), kaggle_val_map)



# now we will have train map and val map
# each key is name of image file, pointing to list of labels
train_export_path = '/content/drive/MyDrive/cig butts detection/dataset in yolov5 format/labels/train'
val_export_path = '/content/drive/MyDrive/cig butts detection/dataset in yolov5 format/labels/val'

def export_map_to_path(map, path):
    for key in map:
        objs = map[key]
        # join the by rows, each row have the 5 items
        fil_content = '\n'.join(['\t'.join([str(b) for b in o]) for o in objs])
        # now create a file and write the content into it
        opath = os.path.join(path, key[:-4] + '.txt')
        ofil = open(opath, 'w')
        ofil.write(fil_content)
        # close after writing
        ofil.close()

export_map_to_path(kaggle_train_map, train_export_path)
export_map_to_path(kaggle_val_map, val_export_path)

In [23]:
print(len(os.listdir('drive/MyDrive/cig butts detection/kaggle dataset/train')))

3285


In [25]:
print(len(os.listdir('drive/MyDrive/cig butts detection/dataset in yolov5 format/images/train')))
print(len(os.listdir('drive/MyDrive/cig butts detection/dataset in yolov5 format/images/val')))

1643
399


In [27]:
print(len(os.listdir('drive/MyDrive/cig butts detection/dataset in yolov5 format/labels/train')))
print(len(os.listdir('drive/MyDrive/cig butts detection/dataset in yolov5 format/labels/val')))

1637
399
