In [1]:
import numpy as np
import pandas as pd
import glob
import sys
from tqdm import tqdm
from pathlib import Path
from PIL import Image

### 0.API

In [2]:
model2logic = {
    0: 10,  # 鞋子
    1: 8,  # 包包
    2: 4,  # 上装
    3: 5,  # 裤子
    4: 6,  # 裙子
    5: 7,  # 连体装
}

id9_map_logic6 = {
    0: 10,  # 鞋子
    1: 8,  # 包包
    2: 4,  # 上装_短款
    3: 5,  # 裤子_长款
    4: 6,  # 裙子
    5: 7,  # 连体装
    6: 4,  # 上装_长款
    7: 5,  # 超短裤
    8: 4,  # 超短裙
}

id2chi9 = {
    0: '鞋子',
    1: '包包',
    2: '上装',
    3: '裤子',
    4: '半身裙',
    5: '连体装',
    6: '上装-长款',
    7: '超短裤',
    8: '超短裙',
}

id2eng9 = {
    0: 'shoe',  
    1: 'bag',  
    2: 'upper_short', 
    3: 'paints',  
    4: 'skirt',  
    5: 'wholebody',  
    6: 'upper_long',
    7: 'paints_hot',
    8: 'miniskirt',
}

In [3]:
def get_list2dict(k_list, v_list):    
    return dict(zip(k_list, v_list))

chi2id9 = dict(zip(id2chi9.values(), id2chi9.keys()))
eng2id9 = get_list2dict(id2eng9.values(), id2eng9.keys())

### 1. Dataset Information.

#### csv读取和保存

In [4]:
root = Path('/nas/chenyi/datasets_nas/deploy_system/deploy_manual_sep/')
!ls $root

images	xml  xml_rewrite


In [5]:
csv_fname = 'anomoly2k_aug'
root_csv = Path('/nas/chenyi/datasets_nas/deploy_system/deploy_manual_sep/xml_rewrite/anomaly0826_xml/csv/')
csv_p = root_csv /csv_fname
csv_p = csv_p.with_suffix('.csv')
bboxes_df = pd.read_csv(csv_p)
len(bboxes_df)

2238

#### 对现有数据拷贝软连接

In [106]:
!tree -d -L 1 /home/chenyi/workspace/yolov5/datasets/query/

[01;34m/home/chenyi/workspace/yolov5/datasets/query/[00m
└── [01;34mqueryv7_1[00m

1 directory


In [6]:
fname = 'dadetv6_3'
img_list = glob.glob(f'/home/chenyi/workspace/yolov5/datasets/{fname}/images/*/*')
label_list = glob.glob(f'/home/chenyi/workspace/yolov5/datasets/{fname}/labels/*/*')
len(img_list), len(label_list)

(122800, 122800)

In [7]:
target_folder = 'query/queryv7_1'
img_t_list = [p.replace(fname, target_folder) for p in img_list]
label_t_list = [p.replace(fname, target_folder) for p in label_list]

In [8]:
df = pd.DataFrame()
df['img_sp'] = img_list
df['label_sp'] = label_list
df['img_tp'] = img_t_list
df['label_tp'] = label_t_list

In [9]:
df.head(1).values

array([['/home/chenyi/workspace/yolov5/datasets/dadetv6_3/images/train/20140305213907823.jpg',
        '/home/chenyi/workspace/yolov5/datasets/dadetv6_3/labels/train/551df69f15ca760f19dd6f40bfda2aa7.txt',
        '/home/chenyi/workspace/yolov5/datasets/query/queryv7_1/images/train/20140305213907823.jpg',
        '/home/chenyi/workspace/yolov5/datasets/query/queryv7_1/labels/train/551df69f15ca760f19dd6f40bfda2aa7.txt']],
      dtype=object)

### 2. 训练集和测试集划分

#### 测试集均匀分布

In [None]:
df.value_counts('label_eng'), len(df.value_counts('label_eng'))

In [None]:
sample_num_s = 100
df_temp = df
select_bx_df = pd.DataFrame()

In [None]:
cls_fetch = 'miniskirt'
sample_num = sample_num_s - 0
cls1_bx_df = df_temp[df_temp['label_eng']==cls_fetch]
select_bx_df = pd.concat([select_bx_df, cls1_bx_df.sample(sample_num)], axis=0)
select_img_df = df[df['stem'].isin(select_bx_df['stem'])]
counter = select_img_df.value_counts('label_eng')
df_temp = df_temp[~df_temp['stem'].isin(select_img_df['stem'])]

In [None]:
counter[cls_fetch], cls_fetch

In [None]:
cls_fetch = 'paints_hot'
sample_num = sample_num_s - counter[cls_fetch] if cls_fetch in counter.index else sample_num_s
sample_num = 0 if sample_num<0 else sample_num
cls1_bx_df = df_temp[df_temp['label_eng']==cls_fetch]
select_bx_df = pd.concat([select_bx_df, cls1_bx_df.sample(sample_num)], axis=0)
select_img_df = df[df['stem'].isin(select_bx_df['stem'])]
counter = select_img_df.value_counts('label_eng')
df_temp = df[~df['stem'].isin(select_img_df['stem'])]

In [None]:
counter

In [None]:
cls_fetch = 'wholebody'
sample_num = sample_num_s - counter[cls_fetch] if cls_fetch in counter.index else sample_num_s
sample_num = 0 if sample_num<0 else sample_num
cls1_bx_df = df_temp[df_temp['label_eng']==cls_fetch]
select_bx_df = pd.concat([select_bx_df, cls1_bx_df.sample(sample_num)], axis=0)
select_img_df = df[df['stem'].isin(select_bx_df['stem'])]
counter = select_img_df.value_counts('label_eng')
df_temp = df[~df['stem'].isin(select_img_df['stem'])]

In [None]:
counter

In [None]:
cls_fetch = 'paints'
sample_num = sample_num_s - counter[cls_fetch] if cls_fetch in counter.index else sample_num_s
sample_num = 0 if sample_num<0 else sample_num
cls1_bx_df = df_temp[df_temp['label_eng']==cls_fetch]
select_bx_df = pd.concat([select_bx_df, cls1_bx_df.sample(sample_num)], axis=0)
select_img_df = df[df['stem'].isin(select_bx_df['stem'])]
counter = select_img_df.value_counts('label_eng')
df_temp = df[~df['stem'].isin(select_img_df['stem'])]

In [None]:
counter

In [None]:
cls_fetch = 'upper_long'
sample_num = sample_num_s - counter[cls_fetch] if cls_fetch in counter.index else sample_num_s
sample_num = 0 if sample_num<0 else sample_num
cls1_bx_df = df_temp[df_temp['label_eng']==cls_fetch]
select_bx_df = pd.concat([select_bx_df, cls1_bx_df.sample(sample_num)], axis=0)
select_img_df = df[df['stem'].isin(select_bx_df['stem'])]
counter = select_img_df.value_counts('label_eng')
df_temp = df[~df['stem'].isin(select_img_df['stem'])]
counter

In [None]:
cls_fetch = 'skirt'
sample_num = sample_num_s - counter[cls_fetch] if cls_fetch in counter.index else sample_num_s
sample_num = 0 if sample_num<0 else sample_num
cls1_bx_df = df_temp[df_temp['label_eng']==cls_fetch]
select_bx_df = pd.concat([select_bx_df, cls1_bx_df.sample(sample_num)], axis=0)
select_img_df = df[df['stem'].isin(select_bx_df['stem'])]
counter = select_img_df.value_counts('label_eng')
df_temp = df[~df['stem'].isin(select_img_df['stem'])]
counter, len(counter)

In [None]:
len(select_bx_df), len(select_img_df), len(select_img_df.drop_duplicates('stem'))

In [None]:
df['mode'] = 'train'

In [None]:
df.loc[select_img_df.index, 'mode'] = 'val'

In [None]:
df.value_counts('mode')

In [None]:
df.columns

### 3. 生成txt label.

In [6]:
df = bboxes_df

In [13]:
!tree -d -L 2 /home/chenyi/workspace/dataset/labels/deploy_system/

[01;34m/home/chenyi/workspace/dataset/labels/deploy_system/[00m
└── [01;34mlabels6[00m
    ├── [01;34mlabels60_june[00m
    └── [01;34mlabels_july[00m

3 directories


In [20]:
# txt路径.
folder_name = csv_fname
root_label = Path('/home/chenyi/workspace/dataset/labels/deploy_system/labels6/')
label_path = root_label /f'{folder_name}/labels'
if not label_path.is_dir(): label_path.mkdir(parents=True)
!ls $root_label

anomoly2k_aug  labels60_june  labels_july


##### 训练集测试集划分

In [None]:
df.head(2)

In [None]:
df['mode'] = 'train'

In [None]:
# 切分训练集和测试集
import random
img_df = df.drop_duplicates('img_sp').copy(deep=True)
train_list = ['train' if random.random()<0.85 else 'val' for i in range(len(img_df))]
img_df['mode'] =  train_list
img2mode = get_list2dict(img_df['img_sp'].values, img_df['mode'].values)
df['mode'] = df['img_sp'].map(img2mode)

In [None]:
df.value_counts('mode')

##### label target path.

In [22]:
df['mode'] = 'train'

In [23]:
df['label_tp'] = df.apply(lambda row: Path('{0}/{1}/{2}.txt'.format(label_path, row['mode'], Path(row['img_sp']).stem)), axis=1)
df['label_tp'].values[0]

PosixPath('/home/chenyi/workspace/dataset/labels/deploy_system/labels6/anomoly2k_aug/labels/train/098c9124-0fbb-428f-afbf-09941194d177.txt')

#### 生成txt

In [24]:
sys.path.append('../../')
from utils.bbox.conversation import bbox_coco2voc,bbox_voc2yolo,bbox_yolo2voc

In [27]:
# df = xml_df
df.head(1)

Unnamed: 0,xml_path,width,height,labelimg,bx1,bx2,bx3,bx4,stem,label9,label_eng,label6,img_sp,label_model,mode,label_tp
0,/Users/chenyi/Desktop/data_aug/deploy_manual_s...,384,682,上装,2,104,376,419,098c9124-0fbb-428f-afbf-09941194d177,2,upper_short,4,/Users/chenyi/Desktop/data_aug/deploy_manual_s...,2,train,/home/chenyi/workspace/dataset/labels/deploy_s...


In [28]:
label_name = 'label_model'
bboxes_list = []
for ind,row in tqdm(df.iterrows()):
    bbox5 = row.loc[[label_name, 'bx1', 'bx2', 'bx3', 'bx4']].values.tolist()
    w, h = row['width'], row['height']
    bbox5[1:] = bbox_voc2yolo(bbox5[1:], w, h)
    bbox5_str = '{0:.0f} {1:.6f} {2:.6f} {3:.6f} {4:.6f}\n'.format(*bbox5)
    bboxes_list.append(bbox5_str)
    # break

2238it [00:00, 4507.95it/s]


In [29]:
def write_txt(label_path, text):
    # label_path, text = input_list[0], input_list[1]
    label_path = Path(label_path)
    p_parent = label_path.parent
    if not p_parent.is_dir():
        p_parent.mkdir(parents=True)
    if not p_parent.is_file():
        with open(label_path, 'a+') as f:
            f.write(text)

In [30]:
# 注意生成的测试label，要删除，再重新生成。
for label_path, text in tqdm(zip(df['label_tp'].values, bboxes_list)):
    write_txt(label_path, text)
    # break
!cat $label_path

2238it [00:00, 30173.20it/s]


##### 拷贝labels

In [None]:
label_troot = Path('/home/chenyi/workspace/dataset/labels/query6/')

In [None]:
!ls $label_troot

In [None]:
folder_name = 'dadetv6_2'
label_tp = label_troot / folder_name
# label_sp = /nas/chenyi/datasets_nas/deploy_system/labels

In [None]:
# !mkdir -p $label_tp

In [None]:
label_tp

In [None]:
# !cp -r $root_label $label_tp

In [None]:
# !cp -r /nas/chenyi/datasets_nas/deploy_system/labels \
#         /home/chenyi/workspace/dataset/labels

### 4. 构建slink_df

In [10]:
root_yolo = Path('/home/chenyi/workspace/yolov5/datasets/query/')
!tree -d -L 1 $root_yolo

[01;34m/home/chenyi/workspace/yolov5/datasets/query[00m
└── [01;34mqueryv7_1[00m

1 directory


In [36]:
root_img = Path('/home/chenyi/workspace/dataset/data/deploy_system/deploy_manual_sep/images/apparel/anomaly0826/')
# !tree -d -L 1 $root_data

In [41]:
root_label = Path('/home/chenyi/workspace/dataset/labels/deploy_system/labels6/anomoly2k_aug/')
!ls $root_label

labels


In [42]:
# df = xml_df
# df['img_sp'] = df['img_sp'].apply(lambda x: '{0}/{1}'.format(root_data, Path(x).name))
df['img_sp'] = df['img_sp'].apply(lambda x: Path(x))
df['img_sp'] = df['img_sp'].apply(lambda x: root_img /x.name)
# df['img_sp'] = df['img_sp'].apply(lambda x: root_img /x.parts[-4] /x.parts[-3] /x.parts[-2] /x.name)
# df['label_tp'] = df['label_tp'].apply(lambda x: '{0}/{1}/{2}'.format(root_label, Path(x).parts[-2], Path(x).name))

In [47]:
folder_name = 'queryv7_1'
troot = Path(root_yolo /folder_name)
if not troot.is_dir(): troot.mkdir(parents=True)
!ls $troot

In [48]:
slink_df = pd.DataFrame()
drop_df = df.drop_duplicates('img_sp')
slink_df['img_sp'] = drop_df['img_sp']
slink_df['label_sp'] = drop_df['label_tp']

In [49]:
slink_df['label_tp'] = slink_df['label_sp'].apply(lambda x: troot /'labels' /Path(x).parts[-2] /Path(x).parts[-1])
slink_df['img_tp'] = slink_df['label_sp'].apply(lambda x: troot /'images' /Path(x).parts[-2] /Path(x).parts[-1])
slink_df['img_tp'] = slink_df['img_tp'].apply(lambda x: x.with_suffix('.jpg'))

In [51]:
lp = slink_df['label_sp'].values[0]
!cat $lp

2 0.492188 0.383431 0.973958 0.461877
2 0.506510 0.696481 0.658854 0.307918


#### 直接读取slink.csv

In [None]:
!tree -d -L 1 /home/chenyi/workspace/dataset/labels/dada_det/labels202205/shoeBag5k0515/

In [None]:
# slink_df = pd.read_csv('/home/chenyi/workspace/myPyModule99/dataset/dada_det/data/sys_error_sample/sys_error_slin_dfk.csv')

In [None]:
slink_df.head(1).values

In [None]:
slink_df.head(1)

In [None]:
origin_folder = 'gallery2mv6'
folder_name = 'gallery2m7k'

In [None]:
slink_df['img_tp'] = slink_df['img_tp'].apply(lambda x: str(x).replace(f'/{origin_folder}/images/', f'/{folder_name}/images/'))
slink_df['label_tp'] = slink_df['label_tp'].apply(lambda x: str(x).replace(f'/{origin_folder}/labels/', f'/{folder_name}/labels/'))

### 5. 生成sym link

In [46]:
root = '/home/chenyi/workspace/yolov5/datasets/'
!tree  -d -L 1 $root

[01;34m/home/chenyi/workspace/yolov5/datasets/[00m
├── [01;34mbenchmark[00m
├── [01;34mcoco128[00m
├── [01;34mdadet100kv2[00m
├── [01;34mdadet100kv3_june[00m
├── [01;34mdadet100kv4_1_june[00m
├── [01;34mdadet100kv4_june[00m
├── [01;34mdadet_gallv4_june[00m
├── [01;34mdadetv5[00m
├── [01;34mdadetv5_query[00m
├── [01;34mdadetv6_1[00m
├── [01;34mdadetv6_2[00m
├── [01;34mdadetv6_3[00m
├── [01;34mdataExps[00m
├── [01;34mdeepedia[00m
├── [01;34mgallery_train[00m
├── [01;34mhaowei_bag_shoe[00m
├── [01;34mhwpedia[00m
├── [01;34mhwpedia70k[00m
├── [01;34mhwpedia80k[00m
├── [01;34mtuneGallery10k[00m
├── [01;34mtuneGallery5k[00m
├── [01;34mtuneQuery10k[00m
└── [01;34mvalidation_all[00m

23 directories


[01;34m/home/chenyi/workspace/yolov5/datasets/query[00m
└── [01;34mqueryv7_1[00m

1 directory


#### 对现有数据拷贝软连接

In [None]:
folder_name = 'gallery_train/gallery2mv6'
folder_copy = 'hwpedia80k'
folder_copy

In [None]:
img_list = glob.glob(f'/home/chenyi/workspace/yolov5/datasets/{folder_copy}/images/*/*')
len(img_list)

In [None]:
label_list = glob.glob(f'/home/chenyi/workspace/yolov5/datasets/{folder_copy}/labels/*/*')
len(label_list)

In [None]:
img_t_list = [p.replace(folder_copy, folder_name) for p in img_list]
label_t_list = [p.replace(folder_copy, folder_name)  for p in label_list]

In [None]:
copy_df = pd.DataFrame()
copy_df['img_sp'] = img_list
copy_df['label_sp'] = label_list
copy_df['img_tp'] = img_t_list
copy_df['label_tp'] = label_t_list

#### 生成软连接

##### 生成文件夹

In [41]:
root = '/home/chenyi/workspace/yolov5/datasets/query'
!tree  -d -L 1 $root
# folder_name = 'gallery2mv6_test'

[01;34m/home/chenyi/workspace/yolov5/datasets/query[00m
└── [01;34mqueryv7_1[00m

1 directory


In [42]:
folder_name

'queryv7_1'

In [43]:
# 产生路径
import shutil
# folder_name = 'gallery2mv6_test'
folder_name = folder_name
folder_root = Path(root) /folder_name
# folder_root = root
folder1 = ['images', 'labels']
folder2 = ['train', 'val']
# # if folder_root.is_dir() and False:
# #     shutil.rmtree(folder_root)
# for f1 in folder1:
#     for f2 in folder2:
#         tg = folder_root / f1 / f2
#         tg.mkdir(parents=True)   

In [44]:
!tree  -d -L 2 $folder_root

[01;34m/home/chenyi/workspace/yolov5/datasets/query/queryv7_1[00m
├── [01;34mimages[00m
│   ├── [01;34mtrain[00m
│   └── [01;34mval[00m
└── [01;34mlabels[00m
    ├── [01;34mtrain[00m
    └── [01;34mval[00m

6 directories


##### 生成训练集

In [45]:
slink_df = df

In [46]:
len(glob.glob(f'{folder_root}/images/*/*.jpg')), len(glob.glob(f'{folder_root}/labels/*/*.txt'))

(124869, 124869)

In [47]:
slink_df.head(1).values

array([['/home/chenyi/workspace/yolov5/datasets/dadetv6_3/images/train/20140305213907823.jpg',
        '/home/chenyi/workspace/yolov5/datasets/dadetv6_3/labels/train/551df69f15ca760f19dd6f40bfda2aa7.txt',
        '/home/chenyi/workspace/yolov5/datasets/query/queryv7_1/images/train/20140305213907823.jpg',
        '/home/chenyi/workspace/yolov5/datasets/query/queryv7_1/labels/train/551df69f15ca760f19dd6f40bfda2aa7.txt',
        '20140305213907823']], dtype=object)

In [48]:
%%time
for p in slink_df['label_tp'].values:
    if Path(p).is_symlink():
        Path(p).unlink()
_ = slink_df.apply(lambda row: Path.symlink_to(Path(row['label_tp']), Path(row['label_sp'])), axis=1)
# _ = slink_df.apply(lambda row: os.symlink(str(row['label_sp']), str(row['label_tp'])), axis=1)

CPU times: user 3.32 s, sys: 3.55 s, total: 6.87 s
Wall time: 7.18 s


In [49]:
%%time
for p in slink_df['img_tp'].values:
    if Path(p).is_symlink():
        Path(p).unlink()
_ = slink_df.apply(lambda row: Path.symlink_to(Path(row['img_tp']), Path(row['img_sp'])), axis=1)

CPU times: user 3.26 s, sys: 3.67 s, total: 6.93 s
Wall time: 7.83 s


In [50]:
len(slink_df),len(glob.glob(f'{folder_root}/images/*/*.jpg')), len(glob.glob(f'{folder_root}/labels/*/*.txt'))

(122800, 124869, 124869)

In [51]:
p = str(slink_df['label_tp'].values[1])
!cat $p

5 0.506840 0.487305 0.953488 0.849609


#### 核查软连接

In [52]:
folder_name = 'queryv7_1'

In [53]:
folder_name

'queryv7_1'

In [54]:
# folder_name = 'hwpedia'
folder_name = folder_name
root = f'/home/chenyi/workspace/yolov5/datasets/query/{folder_name}'
!tree  -d -L 2 $root

[01;34m/home/chenyi/workspace/yolov5/datasets/query/queryv7_1[00m
├── [01;34mimages[00m
│   ├── [01;34mtrain[00m
│   └── [01;34mval[00m
└── [01;34mlabels[00m
    ├── [01;34mtrain[00m
    └── [01;34mval[00m

6 directories


In [55]:
import os
def check_syslink(root, mode='images'):
    subfix = '.txt' if mode=='labels' else '.jpg'
    pathes = glob.glob(f'{root}/{mode}/*/*{subfix}')
    flag_list = [os.path.isfile(os.readlink(p)) for p in pathes]
    txt_df = pd.DataFrame()
    txt_df['flag'] = flag_list
    txt_df['path'] = pathes
    return txt_df

In [56]:
%%time
counter_images = check_syslink(root, mode='images')

CPU times: user 266 ms, sys: 391 ms, total: 657 ms
Wall time: 657 ms


In [57]:
%%time
counter_labels = check_syslink(root, mode='labels')

CPU times: user 241 ms, sys: 419 ms, total: 660 ms
Wall time: 660 ms


In [58]:
counter_images.value_counts('flag')

flag
True    124869
dtype: int64

In [59]:
counter_labels.value_counts('flag')

flag
True    124869
dtype: int64

In [None]:
flase_df = counter_images[~counter_images['flag']]
flase_df['stem'] =  flase_df['path'].apply(lambda x: Path(x).stem)

In [25]:
df['stem'] = df['img_sp'].apply(lambda x: Path(x).stem)

In [29]:
df1 = df[df['stem'].isin(flase_df['stem'])]

In [None]:
img_root = Path('/home/chenyi/workspace/dataset/data/dada_det/shoeBag5k0515/images/')
df1['img_sp'] = df1['img_sp'].apply(lambda x: '{0}/{1}'.format(img_root, Path(x).name))

In [38]:
df.loc[df1.index, 'img_sp'] = df1['img_sp'].values