# 1. Import Dependencies

In [1]:
import cv2 
import uuid
import os
import time

# 2. Define Images Label

In [2]:
COLLECTED_IMG_PATH = r'.\Tensorflow\workspace\images\collectedimages'

labels = os.listdir(COLLECTED_IMG_PATH)
labels

['botolkaca',
 'botolplastik',
 'kaleng',
 'kardus',
 'karet',
 'kertas',
 'plastik',
 'sedotan']

# 3. Setup Folders 

In [3]:
IMAGES_PATH = os.path.join('Tensorflow', 'workspace', 'images', 'collectedimages')

In [4]:
if not os.path.exists(IMAGES_PATH):
    !mkdir {IMAGES_PATH}
        
for label in labels:
    path = os.path.join(IMAGES_PATH, label)
    if not os.path.exists(path):
        os.mkdir(path)

# 4. Image Labelling

In [5]:
!pip install --upgrade pyqt5 lxml



In [6]:
LABELIMG_PATH = os.path.join('Tensorflow', 'labelimg')

In [7]:
if not os.path.exists(LABELIMG_PATH):
    !mkdir {LABELIMG_PATH}
    !git clone https://github.com/tzutalin/labelImg {LABELIMG_PATH}

In [8]:
if os.name == 'posix':
    !make qt5py3
if os.name =='nt':
    !cd {LABELIMG_PATH} && pyrcc5 -o libs/resources.py resources.qrc

In [None]:
# Nanti muncul python gui, pakai python gui itu buat extract fitur
!cd {LABELIMG_PATH} && python labelImg.py

In [29]:
# Melihat jumlah foto yang kita punya
for label in labels:
    print(label, '\t=' if len(label) > 6 else '\t\t=' , int(len(os.listdir(os.path.join(COLLECTED_IMG_PATH, label)))/2))

botolkaca 	= 353
botolplastik 	= 311
kaleng 		= 220
kardus 		= 137
karet 		= 147
kertas 		= 275
plastik 	= 232
sedotan 	= 28


# 5. Copy them into a Training and Testing Partition

First delete the contents of the train and test directories!!

In [16]:
TRAIN_PATH = os.path.join('Tensorflow', 'workspace', 'images', 'train')
TEST_PATH = os.path.join('Tensorflow', 'workspace', 'images', 'test')
ARCHIVE_PATH = os.path.join('Tensorflow', 'workspace', 'images', 'archive.tar.gz')

In [17]:
if not os.path.exists(TRAIN_PATH):
    os.mkdir(TRAIN_PATH)

if not os.path.exists(TEST_PATH):
    os.mkdir(TEST_PATH)

## 5.1 Hasea

In [None]:
# Shuffle and copy collected image to train and test dir
import random
import shutil

TRAIN_TO_TOTAL_RATIO = 0.8


for file_folder_name in labels:
    file_dir = os.path.join(IMAGES_PATH, file_folder_name)
    xml_list = [file for file in os.listdir(file_dir) if file.endswith('xml')]
    train_num = int((TRAIN_TO_TOTAL_RATIO*len(xml_list))//1)
    file_for_training = list(random.sample(xml_list, train_num))
    file_for_test = [file for file in xml_list if file not in file_for_training]
    
    for file in file_for_training:
        file_xml = os.path.join(file_dir, file)
        file_jpg = os.path.join(file_dir, file[:-3]) + 'jpg'
        jpg_target = os.path.join(TRAIN_PATH, file[:-3]) + 'jpg'
        xml_target = os.path.join(TRAIN_PATH, file)
        shutil.copyfile(file_xml, xml_target)
        shutil.copyfile(file_jpg, jpg_target)
    
    for file in file_for_test:
        file_xml = os.path.join(file_dir, file)
        file_jpg = os.path.join(file_dir, file[:-3]) + 'jpg'
        jpg_target = os.path.join(TEST_PATH, file[:-3]) + 'jpg'
        xml_target = os.path.join(TEST_PATH, file)
        shutil.copyfile(file_xml, xml_target)
        shutil.copyfile(file_jpg, jpg_target)


## 5.2 Krisna

In [18]:
import numpy as np 
import re
import shutil

np.random.seed(101)
# moving random data from classes folder to test and train
test_size = .1
pattern1 = r'(.*)\.[jpg|xml]'

for label_dir in os.listdir(IMAGES_PATH):
    label_dir_path = os.path.join(IMAGES_PATH, label_dir)
    num_files = len( os.listdir( label_dir_path ) )
    print(label_dir, num_files)
    
    if num_files == 0:
        print(f'{label_dir} is copied, continue ...')
        continue
    
    # Karena ada 2 ekstensi file untuk 1 foto (jpg dan xml) 
    # maka diambil set nya
    data_name_temp = []
    for data in os.listdir(label_dir_path):
        searched = re.search(pattern1, data)
        data_name_temp.append(searched.groups()[0])
    
    data_name_temp = np.array(list(set(data_name_temp)), dtype=np.str)
    
    # untuk setiap data akan diacak 
    np.random.shuffle(data_name_temp)
    
    # Todo: Melakukan pengambilan data 
    break_point = int(len(data_name_temp) * test_size)
    
    test_candidate = data_name_temp[:break_point]
    
    # untuk setiap kandidat data test akan dipindahkan ke folder test 
    for data_test in test_candidate:
        # melakukan pencarian tiap data test pada direktori label dir path
        pattern2 = r'(' + data_test + r'\.[xmljpg]*)'
        
        for train_test_data in os.listdir(label_dir_path):
            searched = re.search(pattern2, train_test_data)
            # kalau bukan none maka test
            if searched is not None:
                test_data_use = searched.groups()[0]
                source_path = os.path.join(label_dir_path, test_data_use)
                # print("Test : ", source_path)
                shutil.copy(source_path, TEST_PATH)
            # jika none maka train
            else:
                source_path = os.path.join(label_dir_path, train_test_data)
                # print("Train : ", source_path)
                shutil.copy(source_path, TRAIN_PATH)
    # break

botolkaca 706


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data_name_temp = np.array(list(set(data_name_temp)), dtype=np.str)


botolplastik 622
kaleng 440
kardus 274
karet 294
kertas 550
plastik 464
sedotan 52


Cek ukuran test dan train

In [24]:
print(f"Train data : {len(os.listdir(TRAIN_PATH))} data")
print(f"Test data : {len(os.listdir(TEST_PATH))} data")

Train data : 3400 data
Test data : 310 data


# (OPSIONAL) 7. File tar jika mau menyimpan data dalam bentuk TAR dan diakses dari Google Collab

In [None]:
import tarfile

def make_tarfile(output_filename, source_dir):
    with tarfile.open(output_filename, "w:gz") as tar:
        for source in source_dir:
            tar.add(source, arcname=os.path.basename(source))

In [None]:
make_tarfile(ARCHIVE_PATH, [TRAIN_PATH, TEST_PATH])