In [7]:
from lxml import etree
import glob

In [8]:
from pathlib import Path
from tqdm import tqdm

In [9]:
import pandas as pd
import numpy as np
import os,sys

In [10]:
import re
from PIL import Image

### 接口

In [1]:
def voc2yolo(xmin, ymin, xmax, ymax, width, height):
    """Convert a corner-format box to a normalized centre/size box.

    The original version took no arguments, read names that are not defined
    in its scope and returned nothing; the inputs are now explicit parameters
    and the computed values are returned.

    Args:
        xmin, ymin: Minimum-corner coordinates of the box, in pixels.
        xmax, ymax: Maximum-corner coordinates of the box, in pixels.
        width, height: Size of the image the box belongs to, in pixels.

    Returns:
        Tuple ``(center_x, center_y, bdb_w, bdb_h)``; x values are divided by
        ``width`` and y values by ``height``.

    Raises:
        ValueError: If ``width`` or ``height`` is not positive.
    """
    if width <= 0 or height <= 0:
        raise ValueError(
            'image size must be positive, got {}x{}'.format(width, height))
    center_x = (xmin + (xmax-xmin)/2)/width
    center_y = (ymin + (ymax-ymin)/2)/height
    bdb_w = (xmax-xmin)/width
    bdb_h = (ymax-ymin)/height
    return center_x, center_y, bdb_w, bdb_h

In [2]:
def parse_xml(p):
    """Read one annotation XML file and return one flat row per <object>.

    Args:
        p: Path of the XML file; it is stored unchanged as the first field
            of every returned row.

    Returns:
        A list of 8-item lists:
        ``[p, width, height, name, xmin, ymin, xmax, ymax]``.
        ``width``/``height`` come from the ``size`` element and the four
        coordinates from each object's ``bndbox``; all six are cast to int.
        The list is empty when the file contains no ``object`` element.
    """
    doc = etree.parse(p)
    img_w = int(doc.find('size/width').text)
    img_h = int(doc.find('size/height').text)

    def _row(node):
        # One output row for a single <object> element.
        box = node.find('bndbox')
        label = node.find('name').text
        corners = [int(box.find(tag).text)
                   for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        return [p, img_w, img_h, label] + corners

    return [_row(node) for node in doc.findall('object')]

In [64]:
# Model class index -> category id for the 6-class model.
# NOTE(review): the target ids are presumably the business "L1" ids used
# later in the notebook (label6) - confirm.
model_label_map = dict(zip(
    range(6),
    (
        10,  # 0: shoes
        8,   # 1: bags
        4,   # 2: tops
        5,   # 3: trousers
        6,   # 4: skirts
        7,   # 5: one-piece outfits
    ),
))
# 8-class model: classes 0-5 keep the same targets (2 and 3 are the short
# variants of tops / trousers here); 6 and 7 are the long variants and share
# the targets of their short counterparts.
model_label_map8 = {
    **model_label_map,
    6: 4,  # tops, long
    7: 5,  # trousers, long
}

In [107]:
# Model class index -> category id for the 9-class model.
model_label_map9 = {
    0: 10,  # shoes
    1: 8,  # bags
    2: 4,  # tops, short
    3: 5,  # trousers, long
    4: 6,  # skirts
    5: 7,  # one-piece outfits
    6: 4,  # tops, long
    7: 5,  # hot pants
    # Miniskirts belong with skirts (6). The previous value 4 is the id used
    # for tops, which looks like a copy/paste slip from the line for class 6.
    8: 6,  # miniskirt
}

# 9-class id -> Chinese class name.
id2chi9 = {
    0: '鞋子',
    1: '包包',
    2: '上装_短款',
    3: '裤子',
    4: '裙子',
    5: '连体装',
    6: '上装_长款',
    7: '超短裤',
    8: '超短裙',
}

# 9-class id -> English class name.
# NOTE(review): 'paints' / 'paints_hot' are presumably typos for 'pants';
# left unchanged because these strings are written to the label_eng column.
id2eng9 = {
    0: 'shoe',  # shoes
    1: 'bag',  # bags
    2: 'upper_short',  # tops, short
    3: 'paints',  # trousers
    4: 'skirt',  # skirts
    5: 'wholebody',  # one-piece outfits
    6: 'upper_long',
    7: 'paints_hot',
    8: 'miniskirt',
}

In [66]:
def get_list2dict(k_list, v_list):
    """Build a dict by pairing keys and values position by position.

    Pairing stops at the shorter of the two inputs. When a key occurs more
    than once, the value paired with its last occurrence is kept.

    Args:
        k_list: Iterable of (hashable) keys.
        v_list: Iterable of values.

    Returns:
        A new ``dict`` mapping each key to its paired value.
    """
    return dict(zip(k_list, v_list))

In [71]:
# Inverse lookups: Chinese / English class name -> 9-class id.
chi2id9 = get_list2dict(id2chi9.values(), id2chi9.keys())
# The annotation data spells the skirt class '半身裙' (see the value_counts
# output in this notebook) while id2chi9 uses '裙子'. Without this alias every
# '半身裙' row maps to NaN when the labelimg column is mapped through chi2id9.
chi2id9.setdefault('半身裙', chi2id9['裙子'])
eng2id9 = get_list2dict(id2eng9.values(), id2eng9.keys())

### 1.Data Loader.

In [13]:
!pwd

/Users/chenyi/Desktop/code/datasetMudule/dataset/dada_xml_parse


In [14]:
sys.path.append('../../../../')

In [21]:
root = Path('../../../../dada_data/gallery2m_clr_july/xml_labelimg/part5_img_xml/')
!ls $root

[34mimages[m[m [34mxml[m[m


In [22]:
# Search one directory level below `root`, with images / xml files either
# directly inside images/ and xml/ or one sub-folder deeper.
# (The recorded run of this cell matched nothing: output was (0, 0).)
imgs_list = []
xml_list = []
for img_glob, xml_glob in (('*/images/*.jpg', '*/xml/*.xml'),
                           ('*/images/*/*.jpg', '*/xml/*/*.xml')):
    imgs_list.extend(glob.glob(str(root / img_glob)))
    xml_list.extend(glob.glob(str(root / xml_glob)))
len(imgs_list), len(xml_list)

(0, 0)

In [23]:
# Flat layout: files sit directly in root/images and root/xml.
imgs_list, xml_list = (
    glob.glob(str(root / pattern))
    for pattern in ('images/*.jpg', 'xml/*.xml')
)
len(imgs_list), len(xml_list)

(993, 908)

In [24]:
# Check whether any XML file is duplicated.
# NOTE(review): this cell only builds the stem column and shows the row
# count; no duplicate test is actually performed here.
xml_df = pd.DataFrame({'path': xml_list})
xml_df['stem'] = xml_df['path'].map(lambda xml_path: Path(xml_path).stem)
len(xml_df)

908

#### csv读取和保存

In [112]:
# Reload the box table from the CSV export.
# NOTE(review): this cell's execution counter (112) is higher than that of
# the export cell below it (111) and the labelling cell after that (109), so
# the recorded run was not top-to-bottom; on a fresh kernel this read needs
# the CSV to exist already.
root_csv = Path('../../data/xml_csv/')
bboxes_df = pd.read_csv(root_csv /'gall7k_xml_20220715.csv')
# parts_df = pd.read_csv('../../data/xml_csv/gall7k_xml_20220714.csv')

In [111]:
# Export the current box table to CSV without the index column.
# Relies on `root_csv` and `bboxes_df` already being defined by other cells.
# !mkdir -p $root_csv
# bboxes_df = bboxes_all_df
bboxes_df.to_csv(str(root_csv /'gall7k_xml_20220715.csv'), index=False)

In [109]:
# Chinese class name -> 9-class id -> English class name.
# Names missing from chi2id9 become NaN in both columns.
bboxes_df['label9'] = bboxes_df['labelimg'].map(chi2id9)
bboxes_df['label_eng'] = bboxes_df['label9'].map(id2eng9)

#### 数据分析

In [113]:
bboxes_df.head(1)

Unnamed: 0,xml_path,width,height,labelimg,bx1,bx2,bx3,bx4,stem,img_sp,man,label9,label_eng
0,../../dada_data/gallery2m_clr_july/xml_labelim...,1279,1706,包包,440,596,664,1028,0d99bc28-a05b-11ec-8062-5cfb3aa6f5e6,../../dada_data/gallery2m_clr_july/xml_labelim...,part1_img_xml,1.0,bag


In [94]:
bboxes_df.value_counts('labelimg'), len(bboxes_df), len(bboxes_df.drop_duplicates('stem')), len(bboxes_df.value_counts('labelimg'))

(labelimg
 鞋子       10008
 上装_短款     5623
 包包        3304
 裤子        2843
 上装_长款     1132
 半身裙       1024
 连体装        948
 超短裤        736
 超短裙        723
 dtype: int64,
 26341,
 7230,
 9)

### 解析XML

#### 解析XML

In [28]:
# Flatten the per-file rows returned by parse_xml into one list of boxes.
bboxes_list = []
for p in tqdm(xml_list):
    bboxes_list.extend(parse_xml(p))

100%|██████████| 908/908 [00:00<00:00, 1663.56it/s]


In [29]:
# Names for the eight fields parse_xml emits per box, in order:
# source path, image width/height, class name, then xmin, ymin, xmax, ymax
# (stored as bx1..bx4).
name_stand = ['xml_path', 'width', 'height', 'labelimg', 'bx1', 'bx2','bx3', 'bx4']
# Naming the columns at construction replaces the positional, in-place rename.
# It also works when no boxes were parsed: an empty frame has no positional
# columns, so the old columns[0] lookup raised IndexError.
bboxes_df = pd.DataFrame(bboxes_list, columns=name_stand)

In [114]:
bboxes_df.value_counts('labelimg'), len(bboxes_df), len(bboxes_df.value_counts('labelimg').index)

(labelimg
 鞋子       10008
 上装_短款     5623
 包包        3304
 裤子        2843
 上装_长款     1132
 半身裙       1024
 连体装        948
 超短裤        736
 超短裙        723
 dtype: int64,
 26341,
 9)

In [32]:
# Annotation file name without its extension.
bboxes_df['stem'] = bboxes_df['xml_path'].map(lambda xml_path: Path(xml_path).stem)

In [105]:
# bboxes_df['label86'] = bboxes_df['labelimg'].map(chi8_map_name8)

##### 修正label

In [33]:
len(bboxes_df.value_counts('labelimg'))

9

In [35]:
# label_name_chi  = '鞋'
# sample_df = bboxes_df[bboxes_df['labelimg']==label_name_chi]
# p_sample = sample_df['img_sp'].values[0]

In [121]:
# sample_df

In [122]:
p_sample

'../../dada_data/gallery2m_clr_july/xml_labelimg/part3_img_xml/images/1f/1f936fa2-0a71-4015-b09f-611234c3c327-490343.jpg'

In [123]:
# Rename a mis-spelled class label to its canonical name.
# `label_name_chi` was only assigned in a commented-out line of an earlier
# cell, so on a fresh kernel this cell raised NameError; it is set here with
# the value shown in that commented-out line.
label_name_chi = '鞋'
name_new = '鞋子'
# A boolean mask selects exactly the matching rows.
bboxes_df.loc[bboxes_df['labelimg'] == label_name_chi, 'labelimg'] = name_new

#### 获取对应图片信息

In [36]:
# imgs_list = glob.glob(str(root /'images/*/*/*/*'))
# One row per image path collected by the glob cell.
img_df = pd.DataFrame({'path': imgs_list})
len(img_df)

993

In [37]:
img_df.head(1).values

array([['../../../../dada_data/gallery2m_clr_july/xml_labelimg/part5_img_xml/images/2ec86dde-f8c2-4213-bf67-be170e45ac1f-871416.jpg']],
      dtype=object)

In [38]:
# Image file name without its extension.
img_df['stem'] = img_df['path'].map(lambda img_path: Path(img_path).stem)

In [39]:
# img_df['parts3'] = img_df['path'].apply(lambda x: Path(x).parts[-3])

In [40]:
# img_df.value_counts('parts3')

In [41]:
len(img_df)

993

##### 获取bbox对应的图片

In [42]:
# Does every XML file have a matching image?
# Keep the images whose stem occurs among the annotation rows.
has_annotation = img_df['stem'].isin(bboxes_df['stem'])
cross_df = img_df.loc[has_annotation]
len(cross_df)

908

In [43]:
# Lookup table: image stem -> image path (a repeated stem keeps its last path).
stem2path = dict(zip(cross_df['stem'].values, cross_df['path'].values))

In [44]:
bboxes_df['img_sp'] = bboxes_df['stem'].map(stem2path)

In [45]:
# Keep only the boxes for which an image path was found.
bboxes_clr_df = bboxes_df[bboxes_df['img_sp'].notna()]

In [46]:
len(bboxes_df), len(bboxes_clr_df)

(3687, 3687)

##### 获取图片的宽和高

In [55]:
# Continue with the filtered table. Take a real copy: bboxes_clr_df is a
# boolean-filtered selection, and the column assignments made later on
# bboxes_df would otherwise trigger pandas' SettingWithCopyWarning.
bboxes_df = bboxes_clr_df.copy()

In [56]:
# Measure every referenced image on disk (one entry per box row).
# cv2 is only imported in a later cell of this notebook, so import it here
# for the fallback path.
import cv2

w_list = []
h_list = []
for ind, row in tqdm(bboxes_df.iterrows(), total=len(bboxes_df)):
    img_sp = row['img_sp']
    try:
        # The context manager closes the file handle after reading the size.
        with Image.open(img_sp) as img:
            w, h = img.size
    except Exception:
        # Pillow could not open the file: retry with OpenCV. The old fallback
        # read an undefined name (`sample_p`), so it always ended in
        # (None, None); it now reads the same path as the first attempt.
        img = cv2.imread(str(img_sp))
        if img is None:  # cv2.imread returns None for unreadable files
            w, h = None, None
        else:
            h, w = img.shape[:2]
    w_list.append(w)
    h_list.append(h)

3687it [00:00, 7000.20it/s]


In [62]:
bboxes_df.head(1).values

array([['../../../../dada_data/gallery2m_clr_july/xml_labelimg/part5_img_xml/xml/0ebafc7c-a1ce-11ec-aa89-5cfb3aa6f5e6.xml',
        1242, 1638, '上装_短款', 658, 677, 807, 896,
        '0ebafc7c-a1ce-11ec-aa89-5cfb3aa6f5e6',
        '../../../../dada_data/gallery2m_clr_july/xml_labelimg/part5_img_xml/images/0ebafc7c-a1ce-11ec-aa89-5cfb3aa6f5e6.jpg']],
      dtype=object)

In [58]:
bboxes_df['width'] = w_list
bboxes_df['height'] = h_list

In [59]:
na_df = bboxes_df[bboxes_df['width'].isna()]

#### label处理

In [54]:
# 'labelimg' holds Chinese class names, so the id lookup has to go through
# chi2id9 (name -> id), as in the CSV section of this notebook. id2eng9 is
# keyed by integer id and mapped every row to NaN.
bboxes_df['label9'] = bboxes_df['labelimg'].map(chi2id9)

In [55]:
# Copy label86 into label_model, then rewrite id 6 as 2 and id 7 as 3.
# NOTE(review): presumably these are 8-class ids, where 6/7 are the long
# variants of tops/trousers and 2/3 the short ones - confirm.
# NOTE(review): no visible cell creates the 'label86' column (only a
# commented-out line mentions it), so this fails unless it exists already.
bboxes_df['label_model'] = bboxes_df['label86'].values
bboxes_df.loc[bboxes_df['label_model']==6, 'label_model'] = 2
bboxes_df.loc[bboxes_df['label_model']==7, 'label_model'] = 3

In [95]:
bboxes_df.head(1)

Unnamed: 0,xml_path,width,height,labelimg,bx1,bx2,bx3,bx4,stem,img_sp,man
0,../../dada_data/gallery2m_clr_july/xml_labelim...,1279,1706,包包,440,596,664,1028,0d99bc28-a05b-11ec-8062-5cfb3aa6f5e6,../../dada_data/gallery2m_clr_july/xml_labelim...,part1_img_xml


In [57]:
bboxes_df.value_counts('label86').sort_values()

label86
5     53
6     53
1     64
3     67
7     79
4     83
0    138
2    161
dtype: int64

In [96]:
bboxes_df.value_counts('labelimg').sort_values()

labelimg
超短裙        723
超短裤        736
连体装        948
半身裙       1024
上装_长款     1132
裤子        2843
包包        3304
上装_短款     5623
鞋子       10008
dtype: int64

##### 统计人员的bbox

In [104]:
df = bboxes_df
df.tail(1).values

array([['../../../../dada_data/gallery2m_clr_july/xml_labelimg/part5_img_xml/xml/1edd4d58-3912-4f0c-9647-33c81a3a0eee-406286.xml',
        1280, 1706, '鞋子', 458, 1571, 596, 1696,
        '1edd4d58-3912-4f0c-9647-33c81a3a0eee-406286',
        '../../../../dada_data/gallery2m_clr_july/xml_labelimg/part5_img_xml/images/1edd4d58-3912-4f0c-9647-33c81a3a0eee-406286.jpg',
        'part5_img_xml']], dtype=object)

In [105]:
df.head(1)

Unnamed: 0,xml_path,width,height,labelimg,bx1,bx2,bx3,bx4,stem,img_sp,man
0,../../dada_data/gallery2m_clr_july/xml_labelim...,1279,1706,包包,440,596,664,1028,0d99bc28-a05b-11ec-8062-5cfb3aa6f5e6,../../dada_data/gallery2m_clr_july/xml_labelim...,part1_img_xml


In [106]:
# df['man'] = df['xml_path'].apply(lambda x: Path(x).parts[5])
df.value_counts('man').sort_index()

man
part1_img_xml    12034
part2_img_xml     5179
part3_img_xml     2518
part4_img_xml     2923
part5_img_xml     3687
dtype: int64

### one object.

In [60]:
# English class-folder name -> class id (folder names without an id prefix).
eng2num = dict(
    shoe=0,
    upperbody_short=2,
    bag=1,
    lowbody_long=7,
    upperbody_long=6,
    skirt=4,
    wholebody=5,
    lowbody_short=3,
)

In [61]:
# Folder names of the form '<id>_<name>' -> class id. The id prefix equals
# the position in this ordered tuple, so the mapping is derived from it.
_hard_names = (
    'shoe', 'bag', 'upperbody_short', 'lowbody_short',
    'skirt', 'wholebody', 'upperbody_long', 'lowbody_long',
)
eng2num_hard = {
    '{}_{}'.format(idx, name): idx for idx, name in enumerate(_hard_names)
}

In [62]:
# Name of the directory that directly contains each image.
bboxes_df['cls_folder'] = bboxes_df['img_sp'].map(lambda img_path: Path(img_path).parts[-2])

In [63]:
# Class id implied by the folder name; folder names missing from the mapping
# become NaN. The id-prefixed naming (eng2num_hard) is the active variant.
# bboxes_df['label_folder'] = bboxes_df['cls_folder'].map(eng2num)
bboxes_df['label_folder'] = bboxes_df['cls_folder'].map(eng2num_hard)

In [64]:
# Boxes whose annotated class (label86) equals the class of their folder.
# NOTE(review): 'label86' is not created by any visible cell - confirm.
bboxes_eq_df = bboxes_df[bboxes_df['label_folder']==bboxes_df['label86']]

In [65]:
# Boolean flag per box: True when its label86 equals its folder's class id.
bboxes_df['label_query'] = bboxes_df['label_folder']==bboxes_df['label86']

In [66]:
len(bboxes_eq_df)

618

### 数据展示

In [47]:
!tree -d -L 1 /home/chenyi/workspace/dataset/data/dada_det

[01;34m/home/chenyi/workspace/dataset/data/dada_det[00m
├── [01;34mhard_sample_0610[00m
├── [01;34m__MACOSX[00m
├── [01;34mshoe_bag11k_may[00m
├── [01;34mshoeBag5k0515[00m
└── [01;34mwholebody0601_all[00m

5 directories


In [25]:
ROOT = Path('/home/chenyi/workspace/dataset/data/dada_det/hard_sample_0610/')

#### 处理打标标签

##### 构建中文标签和业务id映射

In [45]:
# counter = df.value_counts(df.columns[3])
# name_list = [item[0] for item in list(counter.index)]
# name_df = pd.DataFrame()
# name_df['name'] = name_list

In [121]:
# Load the name -> id dictionary from JSON; it is used right after this cell
# to map the label column to 'label6'.
# NOTE(review): `json` is imported inside this cell (and again in later
# cells) instead of with the imports at the top of the notebook.
import json
with open('../../../data/dict/name_chi2l1.json', 'r') as f:
    name_chi2l1 = json.load(f)

In [122]:
# Select the label column by name rather than by position. In the freshly
# parsed frame position 3 is 'labelimg', but the frame reloaded from CSV has
# an extra leading 'Unnamed: 0' column, so position 3 is 'height' there.
df['label6'] = df['labelimg'].map(name_chi2l1)

In [123]:
df.value_counts('label6')

label6
4    27
7    19
6    8 
8    4 
5    2 
dtype: int64

In [115]:
# 增加多出的中文标签映射
# name_chi2l1['皮衣外套'] = 4

In [116]:
# Write the (possibly extended) name -> id dictionary back to the same JSON
# file it is loaded from; the previous file content is overwritten.
import json
with open('../../../data/dict/name_chi2l1.json', 'w') as f:
    json.dump(name_chi2l1, f)

In [124]:
model_label_map

{0: 10, 1: 8, 2: 4, 3: 5, 4: 6, 5: 7}

In [126]:
# Invert model_label_map so the ids stored in 'label6' can be translated back
# to model class indices; ids without an entry become NaN.
l1_map_model = {l1_id: model_id for model_id, l1_id in model_label_map.items()}
df['label_model'] = df['label6'].map(l1_map_model)

##### 存储目标index

In [None]:
# df = pd.read_csv('../data/wh.csv')

In [None]:
!ls /nas/chenyi/datasets_cls/dada_det_drawed/wholebody_jul_drawed

In [None]:
!mkdir -p /nas/chenyi/datasets_cls/dada_det_drawed/wholebody_jul_drawed

In [None]:
draw_root = Path('/nas/chenyi/datasets_cls/dada_det_drawed/wholebody_jul_drawed')

In [None]:
df.head(2).values

In [None]:
def _drawn_path(src):
    # Target path: draw_root / <source parent folder name> / <source file name>.
    src = Path(src)
    return '{0}/{1}/{2}'.format(draw_root, src.parts[-2], src.name)

df['img_tp'] = df['img_sp'].apply(_drawn_path)

In [None]:
sys.path.append('../../../')
from utils.plots.plots import draw_bbox
import cv2
from PIL import Image

In [None]:
# error_p_list = []
# for ind, row in tqdm(df.iterrows()):
#     row_info = row.loc[df.columns[:8]].values
#     w,h = row_info[1], row_info[2]
#     name = row.loc[['label_model']].values[0]
#     bbox = list(row_info[4:8])
#     bbox.insert(0,name)
#     tp = Path(row.loc[['img_tp']].values[0])
#     if not tp.parent.is_dir():
#         tp.parent.mkdir()
#     rp = tp if tp.is_file() else row.loc[['img_sp']].values[0]
#     img = cv2.imread(str(rp))
#     img = draw_bbox(img, bbox)
#     try:
#         cv2.imwrite(str(tp), img)
#     except:
#         error_p_list.append(tp)
#     # break

In [None]:
error_p_list

In [None]:
# Table of the paths collected in error_p_list, with their stems.
# NOTE(review): error_p_list is only filled by the drawing loop, which is
# commented out in this notebook - confirm it is defined before running.
error_df = pd.DataFrame({'path': error_p_list})
error_df['stem'] = error_df['path'].map(lambda failed_path: Path(failed_path).stem)

In [None]:
df.head()

In [None]:
df = df[~df['stem'].isin(error_df['stem'])]

In [None]:
# df.to_csv('../data/wholebody5k0610/wholebody5k_july.csv', index=False)

#### Display Html

In [None]:
sys.path.append('../../../')
from utils.display.html import *
from IPython.display import HTML
from io import BytesIO

In [None]:
draw_root = Path('/nas/chenyi/datasets_cls/dada_det_drawed/wholebody_jul_drawed/')

In [None]:
img_list = glob.glob(str(draw_root/'*/*.jpg'))
len(img_list), img_list[-1]

In [None]:
# img_list = glob.glob(draw_root + '*.jpg')
# Preview table restricted to the last 100 entries of img_list.
df_drawed = pd.DataFrame({'path': img_list[-100:]})

In [None]:
size = 500
df_drawed['res'] = df_drawed['path'].map(lambda x: get_thumbnail(x, size))

In [None]:
f = df_drawed[['res', 'path']].to_html(formatters={'res': image_formatter}, escape=False)

In [None]:
HTML(f)

### 解析json文件

In [None]:
json_list = glob.glob('/home/chenyi/workspace/dataset/haowei34k_labelImg/xml_label/*/*.json')

In [None]:
json_df = pd.DataFrame()
json_df['spath'] = json_list

In [None]:
xml_dir = Path('/home/chenyi/workspace/dataset/haowei34k_labelImg/json2xml/')

In [None]:
def _json_target(src):
    # Part of the source path after a 1-3 digit directory, re-rooted under xml_dir.
    # Raises IndexError when the source path does not match the pattern.
    rel_name = re.findall(r'/[0-9]{1,3}/(.*?.json)', src)[0]
    return '{0}/{1}'.format(xml_dir, rel_name)

json_df['tpath'] = json_df['spath'].apply(_json_target)

In [None]:
sys.path.append('/home/chenyi/workspace/myPyModule99/data_format/json_to_xml')
from json_to_xml.main import *

In [None]:
root_json_dir = xml_dir
root_save_xml_dir = '/home/chenyi/workspace/dataset/haowei34k_labelImg/json2xml/xml'

In [None]:
import json
# Load only the first directory entry (the loop breaks after one iteration),
# e.g. to inspect the JSON structure.
for json_filename in tqdm(os.listdir(root_json_dir)):
    json_path = os.path.join(root_json_dir, json_filename)
    # Context manager closes the handle; json.load(open(...)) left it open.
    with open(json_path) as fh:
        json_data = json.load(fh)
    break

In [None]:
# Paths of the JSON files found directly inside root_json_dir.
# NOTE(review): json_df already has rows (from json_list); pandas raises if
# this glob returns a different number of files - confirm the counts match.
json_df['path'] = glob.glob(str(root_json_dir/'*.json'))

In [None]:
# Part of the path between '/json2xml/' and the first following '<any char>json'.
# NOTE(review): the dot before 'json' is unescaped, so it matches any
# character; a path without a match raises IndexError at [0].
json_df['fname'] =  json_df['path'].apply(lambda x: re.findall(r'/json2xml/(.*?).json',x)[0])

In [None]:
# Build one text block per JSON file: a line "<class id> x0 y0 x1 y1" for
# every shape, keyed by file name.
# NOTE(review): `classes_name` is not defined in any visible cell; presumably
# it comes from the `from json_to_xml.main import *` star import - confirm.
json_labels = {}
for ind, row in tqdm(json_df.iterrows(), total=len(json_df)):
    json_path = row['path']
    fname = row['fname']
    # Context manager closes each file; json.load(open(...)) leaked one handle
    # per row.
    with open(json_path) as fh:
        json_data = json.load(fh)
    json_labels[fname] = ''
    for item in json_data['shapes']:
        name = item['label']
        points = item['points']
        # assumes points[0] is the min corner and points[1] the max corner
        # - TODO confirm against the annotation tool.
        xmin, ymin = points[0]
        xmax, ymax = points[1]

        # Unseen class names get the next free integer id.
        classes_name.setdefault(name, len(classes_name))
        json_labels[fname] += '{} {:6f} {:6f} {:6f} {:6f}\n'.format(
            classes_name[name], xmin, ymin, xmax, ymax)

In [None]:
# Collect one entry per annotated shape across all JSON files, as parallel
# lists (file name, label and the four coordinates).
fname_list = []
xmin_list = []
ymin_list = []
xmax_list = []
ymax_list = []
labels_list = []
for ind, row in tqdm(json_df.iterrows(), total=len(json_df)):
    json_path = row['path']
    fname = row['fname']
    # Context manager closes each file; json.load(open(...)) leaked one handle
    # per row.
    with open(json_path) as fh:
        json_data = json.load(fh)
    for item in json_data['shapes']:
        name = item['label']
        points = item['points']
        # assumes points[0] is the min corner and points[1] the max corner
        # - TODO confirm against the annotation tool.
        xmin, ymin = points[0]
        xmax, ymax = points[1]

        fname_list.append(fname)
        labels_list.append(name)
        xmin_list.append(xmin)
        ymin_list.append(ymin)
        xmax_list.append(xmax)
        ymax_list.append(ymax)

In [None]:
json_bboxes = pd.DataFrame()

In [None]:
# Fill the frame column by column from the parallel lists.
for column, values in (
    ('fname', fname_list),
    ('label', labels_list),
    ('xmin', xmin_list),
    ('ymin', ymin_list),
    ('xmax', xmax_list),
    ('ymax', ymax_list),
):
    json_bboxes[column] = values

In [None]:
json_bboxes.head()