-
Notifications
You must be signed in to change notification settings - Fork 0
/
sort_imgs.py
115 lines (96 loc) · 4.45 KB
/
sort_imgs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import sys
from os.path import join
import shutil
import numpy as np
from pandas import read_csv, DataFrame
import random
def move_img(img_list, new_dir, old_dir, *, move=False):
    """Transfer the named images from ``old_dir`` into ``new_dir``.

    Despite the function name, the default behaviour is to *copy* each file
    (``shutil.copy``), so ``old_dir`` is left untouched.

    Args:
        img_list: File names, relative to ``old_dir``, to transfer.
        new_dir: Destination passed to shutil; expected to be an existing
            directory so each file lands inside it under its own name.
        old_dir: Directory the files currently live in.
        move: Keyword-only. When true the files are moved with
            ``shutil.move`` instead of copied. Defaults to ``False``.
    """
    # Pick the transfer function once instead of toggling commented-out code.
    transfer = shutil.move if move else shutil.copy
    for name in img_list:
        transfer(join(old_dir, name), new_dir)
def rename_img(path, nr_list, testing=False):
    """Prefix every entry in ``path`` with a position number from ``nr_list``.

    An entry ``name`` is renamed to ``<nr>_<name>``, or to
    ``<nr>_interrater_<name>`` when ``testing`` is true. Entries are processed
    in sorted name order and the i-th entry receives ``nr_list[i]``; surplus
    numbers at the end of ``nr_list`` are ignored.

    Args:
        path: Directory whose entries are renamed in place.
        nr_list: Sequence of number strings used as prefixes.
        testing: When true, the ``_interrater`` marker is inserted after the
            number.

    Raises:
        IndexError: If ``nr_list`` holds fewer numbers than ``path`` has
            entries. Raised before anything is renamed, so the directory is
            never left half-renamed.
    """
    # os.listdir returns entries in arbitrary order; sort so that the
    # number-to-file mapping depends only on the inputs.
    filenames = sorted(os.listdir(path))
    if len(nr_list) < len(filenames):
        raise IndexError(
            f"nr_list has {len(nr_list)} numbers but {path!r} contains "
            f"{len(filenames)} files"
        )
    interrater = "_interrater" if testing else ""
    for nr, filename in zip(nr_list, filenames):
        os.rename(join(path, filename), join(path, f"{nr}{interrater}_{filename}"))
def copy_img(source, dest):
    """Copy every entry listed in ``source`` to ``dest`` using ``shutil.copy``.

    Args:
        source: Directory whose entries are copied.
        dest: Copy target handed to ``shutil.copy`` for each entry.
    """
    for entry in os.listdir(source):
        origin = join(source, entry)
        shutil.copy(origin, dest)
def _create_output_dirs(dirs):
    """Create all output folders, or exit before creating any if one already exists."""
    if any(os.path.exists(d) for d in dirs):
        print("ERROR: Output directory already exists. Please rename or delete.")
        # Non-zero status so calling shells/scripts can detect the failure.
        sys.exit(1)
    for d in dirs:
        os.mkdir(d)


def _draw_test_positions(test_nr, n_max):
    """Draw ``test_nr`` distinct positions in ``[0, n_max)`` for the test images.

    One third comes from the start, one third from the end and the remainder
    from the middle of the range, returned in the order start, end, middle.
    """
    third = test_nr // 3               # e.g. 50 for test_nr=150
    edge = (2 * test_nr) // 3          # e.g. 100: width of the start/end zones
    # The middle batch absorbs the remainder when test_nr is not a multiple
    # of 3, so exactly test_nr positions are always produced.
    middle_count = test_nr - 2 * third
    start = random.sample(range(1, edge), third)
    middle = random.sample(range(edge, n_max - edge), middle_count)
    end = random.sample(range(n_max - edge, n_max), third)
    return start + end + middle


def sort_images(data_dir, out_path, test_nr, label_nr):
    """
    Divide the raw images into separate label folders and an (interrater) testing folder.

    Each image receives a position prefix (e.g. 00365_) that is unique within
    its label folder. Testing images are at the same position in all label
    folders.

    Example Output:
        00000_interrater_dataset_filename.jpg
        00001_dataset_filename.jpg
        00002_dataset_filename.jpg
        00003_interrater_dataset_filename.jpg
        ...

    Args:
        data_dir: Folder holding the raw images and ``img_metadata.csv``; the
            CSV's ``labelimg`` column is read as paths prefixed by ``data_dir``.
        out_path: Folder in which the five ``imgs_*_02`` folders are created.
        test_nr: Number of interrater (testing) images shared by all labellers.
        label_nr: Number of unique images per label folder.

    Raises:
        SystemExit: If any of the output folders already exists (nothing is
            created in that case).
    """
    # Create 5 folders
    katya_dir = join(out_path, 'imgs_katya_02')
    ulku_dir = join(out_path, 'imgs_ulku_02')
    josefine_dir = join(out_path, 'imgs_josefine_02')
    rest_dir = join(out_path, 'imgs_rest_02')
    testing_dir = join(out_path, 'imgs_testing_02')
    label_dirs = [katya_dir, ulku_dir, josefine_dir]
    _create_output_dirs(label_dirs + [rest_dir, testing_dir])

    # Load image names as a list (from the metadata CSV)
    metadata_df = read_csv(join(data_dir, "img_metadata.csv"), sep=",")
    # Entries are stripped by length, not by matching the prefix.
    # NOTE(review): assumes every path starts with data_dir + separator - confirm.
    path_len = len(data_dir) + 1
    stripped = (str(s)[path_len:] for s in metadata_df["labelimg"])
    # Drop unwanted names, then de-duplicate while keeping first-seen order.
    img_all = list(dict.fromkeys(s for s in stripped if s not in ("SESS06", "")))

    # Take 5 random groups of images (not overlapping)
    random.shuffle(img_all)
    katya_end = test_nr + label_nr
    ulku_end = katya_end + label_nr
    josefine_end = ulku_end + label_nr
    testing_list = img_all[:test_nr]
    katya_list = img_all[test_nr:katya_end]
    ulku_list = img_all[katya_end:ulku_end]
    josefine_list = img_all[ulku_end:josefine_end]
    rest_list = img_all[josefine_end:]

    # Save list of leftover image names into imgs_rest folder
    # NOTE(review): the stored paths still point into data_dir, not rest_dir.
    rest_df = DataFrame({"labelimg": [join(data_dir, f) for f in rest_list]})
    rest_df.to_csv(f'{rest_dir}/img_metadata.csv')

    # Sort the corresponding images into the new folders
    move_img(testing_list, testing_dir, data_dir)
    move_img(katya_list, katya_dir, data_dir)
    move_img(ulku_list, ulku_dir, data_dir)
    move_img(josefine_list, josefine_dir, data_dir)
    move_img(rest_list, rest_dir, data_dir)

    # Give the test images the correct number/their future position
    n_max = test_nr + label_nr  # e.g. 2000 positions per label folder
    test_positions = _draw_test_positions(test_nr, n_max)
    testbatch = [str(n).zfill(5) for n in test_positions]
    rename_img(testing_dir, testbatch, testing=True)

    # Give label images the remaining positions. Shuffle explicitly so the
    # assignment is random via `random` rather than via set iteration order.
    taken = set(test_positions)
    label_positions = [n for n in range(n_max) if n not in taken]
    random.shuffle(label_positions)
    labelbatch = [str(n).zfill(5) for n in label_positions]
    for label_dir in label_dirs:
        rename_img(label_dir, labelbatch)

    # Put a copy of the test images into each label folder
    for label_dir in label_dirs:
        copy_img(testing_dir, label_dir)
if __name__ == "__main__":
    # Folder with the raw images and their img_metadata.csv.
    data_dir = "/LOCAL/jzerbe/imgs/imgs_rest"
    # Parent folder in which the output folders are created.
    out_path = '/LOCAL/jzerbe/imgs'
    # How many interrater images do you want?
    test_nr = 150
    # How many unique label images do you want?
    label_nr = 1850
    sort_images(
        data_dir=data_dir,
        out_path=out_path,
        test_nr=test_nr,
        label_nr=label_nr,
    )