# <center>DOTA_Workflow</center>
## 用于对DOTA数据集进行切分、整理，并将其转化为符合VOC标准的结构
## 1. ImgSplit
切分数据集
## 2. GetSpecificClass
对切分后的数据集进行清理，清除掉无目标图片
## 3. Txt2Xml
将数据集转化为符合VOC标准的结构

# DOTA 目标类别

In [None]:
"""
class_list = ['plane', 'baseball-diamond', 'bridge', 'ground-track-field', 'small-vehicle',
                  'large-vehicle', 'ship', 'tennis-court', 'basketball-court',  'storage-tank', 'soccer-ball-field', 
                  'roundabout', 'harbor', 'swimming-pool', 'helicopter']
"""

# 加载相关依赖项
## 比较关键的几个：
- from DOTA_devkit import DOTA （Modified）
- import shutil (built-in)
- from txt2xml import * （DIY）

In [8]:
"""
加载依赖项
"""
#%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import pylab
import sys
# sys.path.append('/home/raymond/project/PytorchSSD_DOTA/data/DOTA_devkit') # 保证DOTA_devkit可用的关键
from DOTA_devkit import DOTA
import torchvision.transforms as transforms
from PIL import Image
import cv2
import shutil
from txt2xml import *

# 全局参数
- **Mode**：标注的解析方式
- **main_folder**: DOTA原始数据集的文件夹
- **main_type**：选择使用DOTA原始数据集的哪一部分（train、test、val）

In [9]:
# Global configuration shared by every stage of the workflow.
# Annotation parsing mode; passed as parseMode to DOTA.DOTA and txt2xml.
Mode = "parse_dota_rec"
# Root folder of the original DOTA dataset.
main_folder = "/home/buaab622/data/DOTA"
# Which part of the original dataset to process (train / test / val).
main_type = "val"

# ImgSplit参数
- **subset_name**：设定待切分子集的名称
- **SaveClasses**：选择保留的目标类别
- **subimg_size**：选择切分子图的尺寸
- **subimg_gap**： 选择子图之间重叠的范围
- **thresh_iou**：在切分图片的时候，可能有目标会被切割，目标剩余面积占比超过该阈值才被视作有效目标，保留包围框

In [25]:
# ImgSplit configuration.
# Name of the sub-dataset folder that receives the split images.
subset_name = "subset_car_ship_500"
# Object categories to keep when selecting images.
SaveClasses = ["ship", "small-vehicle"]
# Sub-image size and the overlap between neighbouring sub-images
# (passed to ImgSplit.splitbase as subsize / gap).
subimg_size, subimg_gap = 500, 200
# Threshold handed to ImgSplit.splitbase as `thresh`: a cut-off object keeps
# its bounding box only if its remaining area ratio exceeds this value.
thresh_iou = 0.8
# Get Specific Classes (GSC) has no parameters of its own.

# txt2xml参数
- **img_shape**：数据集图片尺寸信息
- **class_list**：保留的目标类别
- **subset_type**：设定.txt文件的名称（需反映该数据集的用途：train/test/val）

In [26]:
# txt2xml configuration.
# Image size information for the dataset
# (presumably (height, width, channels) -- confirm against txt2xml).
img_shape = (500, 500, 3)
# Object categories written to the VOC annotations.
# NOTE(review): duplicates SaveClasses from the ImgSplit config; keep both in sync.
class_list = ["ship", "small-vehicle"]
# Base name of the generated image-id .txt list.
subset_type = "car_ship_train"

# 建立相关的路径信息（不需手动更改）
- **raw_path**：待处理的DOTA原始数据集路径（DOTA + train/val/test）
- **subset_path**：子集存放路径（DOTA + subset_name）
- **GSC_folder**：被清洗后的子集路径（DOTA + subset_name + '_GSC'）
- **origin_folder**：待送入txt2xml进行VOC格式转化的文件夹
- **processed_folder**：VOC结构转化完成的输出文件夹

In [27]:
# Derived paths: everything below is computed from the configuration
# variables, so nothing here needs manual editing.

# Raw DOTA split to process: <main_folder>/<main_type>
raw_path = os.path.join(main_folder, main_type)

# Output folder of ImgSplit: <main_folder>/<subset_name>
subset_path = os.path.join(main_folder, subset_name)

# Output folder of the GSC cleaning step: <main_folder>/<subset_name>_GSC
gsc_dirname = subset_name + '_GSC'
GSC_folder = os.path.join(main_folder, gsc_dirname)

# txt2xml reads from, and writes back into, the cleaned GSC folder.
origin_folder = processed_folder = GSC_folder

# 加载原始数据集

In [13]:
"""
生成一个DOTA实例
"""
# Build a DOTA instance over the raw split located at raw_path.
# Paths used in earlier experiments, kept for reference:
#   test path : '/home/raymond/project/DOTA_PyTorch/DOTA_devkit/example'
#   train path: '/media/b622/MainDrive/Dataset/DOTA/train'
#dataset = DOTA.DOTA('/home/raymond/project/PytorchSSD_DOTA/data/DOTA_devkit/classsplit',parseMode='parse_dota_rec')
dataset = DOTA.DOTA(raw_path, parseMode = Mode)

 Error: /home/buaab622/data/DOTA/val/labelTxt/labelTxt.zip is not a .txt file
 Error: /home/buaab622/data/DOTA/val/labelTxt/Val_Task2_gt.zip is not a .txt file
 DOTA dataset has been successfully loaded 


# ImgSplit操作
### 裁剪图片+筛选类别

In [14]:
# ImgSplit: crop the raw images together with their annotations and write
# the result to subset_path as a new dataset.
from DOTA_devkit import ImgSplit

# Restrict the split to the images returned for the SaveClasses filter.
imagelist = imgids = dataset.getImgIds(catNms=SaveClasses)

split = ImgSplit.splitbase(
    raw_path,
    subset_path,
    gap=subimg_gap,
    subsize=subimg_size,
    thresh=thresh_iou,
    choosebestpoint=True,
)

# Signature note: split.splitdata(self, rate: int, imglist: List[str])
split.splitdata(1, imagelist)

加载图片ID完成：共有 219 张图片符合筛选条件
already finished: 0 %
already finished: 0 %
already finished: 10 %
already finished: 10 %
already finished: 20 %
already finished: 20 %
already finished: 30 %
already finished: 30 %
already finished: 40 %
already finished: 40 %
already finished: 50 %
already finished: 50 %
already finished: 60 %
already finished: 60 %
already finished: 70 %
already finished: 70 %
already finished: 80 %
already finished: 80 %
already finished: 90 %
already finished: 90 %
already finished: 100 %
already finished: 100 %


# Pre-Process of GSC
### 加载子集，筛选类别，构造路径

In [15]:
"""
生成一个DOTA实例"""
# Re-load the dataset, this time from the split subset produced by ImgSplit.
dataset = DOTA.DOTA(subset_path, parseMode=Mode)

# IDs of the sub-images returned for the SaveClasses filter.
imgids = dataset.getImgIds(catNms=SaveClasses)
print(len(imgids))

# Source directories (split subset) and destination directories (cleaned
# GSC subset) for the images and their label files.
img_root, label_root = (
    os.path.join(subset_path, sub) for sub in ('images', 'labelTxt')
)
img_save, label_save = (
    os.path.join(GSC_folder, sub) for sub in ('images', 'labelTxt')
)


 DOTA dataset has been successfully loaded 
加载图片ID完成：共有 2919 张图片符合筛选条件
2919


# GSC操作
### 清理无效数据（空目标图片）

In [17]:
"""
建立相关文件夹
"""
# Create the cleaned-subset directory tree. os.makedirs also creates missing
# parent directories, so the cell does not fail if GSC_folder's parent is absent.
for folder in (GSC_folder, img_save, label_save):
    if not os.path.exists(folder):
        os.makedirs(folder)
        print('Directory is built:'+ folder)

num_imgids = len(imgids)
# Progress is reported once per tenth of the image list. The step is clamped
# to 1 because `num_imgids // 10` is 0 for fewer than 10 images, which made
# the modulo below raise ZeroDivisionError. With the clamped step no progress
# lines are printed for such tiny sets; the copy itself still runs.
progress_step = max(num_imgids // 10, 1)
for index, imgid in enumerate(imgids):
    imgname = str(imgid) + '.png'
    labelname = str(imgid) + '.txt'
    if index % progress_step == 1:
        print("already finished: " + str(index // progress_step * 10) + " %")
    # Copy the image and its label file into the cleaned subset.
    shutil.copy(os.path.join(img_root, imgname), os.path.join(img_save, imgname))
    shutil.copy(os.path.join(label_root, labelname), os.path.join(label_save, labelname))
print('All copy operation is done!')

Directory is built:/home/buaab622/data/DOTA/subset_car_ship_500_val_GSC
Directory is built:/home/buaab622/data/DOTA/subset_car_ship_500_val_GSC/images
Directory is built:/home/buaab622/data/DOTA/subset_car_ship_500_val_GSC/labelTxt
already finished: 0 %
already finished: 10 %
already finished: 20 %
already finished: 30 %
already finished: 40 %
already finished: 50 %
already finished: 60 %
already finished: 70 %
already finished: 80 %
already finished: 90 %
already finished: 100 %
All copy operation is done!


# txt2xml操作1
### 将.txt格式的标注文件转化为符合VOC标准的.xml标注文件

In [18]:
# txt2xml: convert the .txt annotations under origin_folder into VOC-style
# .xml annotations, using the configured image shape, class list and parse mode.
txt2xml(origin_folder, img_shape, class_list, parseMode=Mode)

Directory is built:/home/buaab622/data/DOTA/subset_car_ship_500_val_GSC/Annotations
 DOTA dataset has been successfully loaded 
加载图片ID完成：返回所有图片ID
already finished: 0 %
already finished: 10 %
already finished: 20 %
already finished: 30 %
already finished: 40 %
already finished: 50 %
already finished: 60 %
already finished: 70 %
already finished: 80 %
already finished: 90 %
already finished: 100 %


### 生成符合VOC标准的路径，并生成包含所有图片id的.txt

In [19]:
# Generate the .txt list of image ids for the dataset in origin_folder;
# subset_type is passed as the dataset name.
generate_txt_imgids(origin_folder, dataset_name = subset_type)

### 生成符合VOC标准的JPEGImages软链接

In [28]:
# Expose the 'images' folder under the VOC-standard name 'JPEGImages' via a
# symbolic link.
jpeg_link = os.path.join(origin_folder, 'JPEGImages')
# os.symlink raises FileExistsError when the destination already exists, so
# re-running this cell used to fail. lexists() is also true for a dangling
# link, which exists() would miss.
if not os.path.lexists(jpeg_link):
    os.symlink(os.path.join(origin_folder, 'images'), jpeg_link)