# <center>DOTA_Workflow</center>
## 用于对DOTA数据集进行切分、整理，并将其转化为符合VOC标准的结构
## 1. ImgSplit
切分数据集
## 2. GetSpecificClass
对切分后的数据集进行清理，清除掉无目标图片
## 3. Txt2Xml
将数据集转化为符合VOC标准的结构

# DOTA 目标类别

In [None]:
# Reference only: the full list of the 15 DOTA object categories.
# It is wrapped in a string literal, so running this cell assigns nothing;
# the `class_list` actually used is set in the txt2xml config cell.
"""
class_list = ['plane', 'baseball-diamond', 'bridge', 'ground-track-field', 'small-vehicle',
                  'large-vehicle', 'ship', 'tennis-court', 'basketball-court',  'storage-tank', 'soccer-ball-field', 
                  'roundabout', 'harbor', 'swimming-pool', 'helicopter']
"""

# 加载相关依赖项
## 比较关键的几个：
- from DOTA_devkit import DOTA （Modified）
- import shutil (built-in)
- from txt2xml import * （DIY）

In [2]:
"""
加载依赖项
"""
#%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import pylab
import sys
# sys.path.append('/home/raymond/project/PytorchSSD_DOTA/data/DOTA_devkit') # 保证DOTA_devkit可用的关键
from DOTA_devkit import DOTA
import torchvision.transforms as transforms
from PIL import Image
import cv2
import shutil
from txt2xml import *

ModuleNotFoundError: No module named 'cv2'

# 全局参数
- **Mode**：标注的解析方式
- **main_folder**: DOTA原始数据集的文件夹
- **main_type**：选择使用DOTA原始数据集的哪一部分（train、test、val）

In [None]:
# Config (Global)
# Annotation parsing mode; forwarded as `parseMode` to DOTA.DOTA() and txt2xml().
Mode = 'parse_dota_rec'
# Root folder of the original DOTA dataset.
main_folder = '/media/raymond/MainDrive/Dataset/DOTA'
# Which part of the original dataset to process (train / test / val).
# NOTE(review): `subset_name` and `subset_type` below both end in "val" and the
# recorded output of the loading cell shows paths under DOTA/val — confirm that
# 'train' is really the intended value before re-running.
main_type = 'train'

# ImgSplit参数
- **subset_name**：设定待切分子集的名称
- **SaveClasses**：选择保留的目标类别
- **subimg_size**：选择切分子图的尺寸
- **subimg_gap**： 选择子图之间重叠的范围
- **thresh_iou**：在切分图片的时候，可能有目标会被切割，目标剩余面积占比超过该阈值才被视作有效目标，保留包围框

In [None]:
# Config (ImgSplit)
# Name of the folder (created under main_folder) that holds the split subset.
subset_name = 'subset_planes_500_gap200_val'
# Object categories to keep; used as `catNms` when querying image ids.
SaveClasses = ['plane']
# Size of each sub-image; forwarded to ImgSplit.splitbase as `subsize`.
subimg_size = 500
# Overlap between neighbouring sub-images; forwarded as `gap`.
subimg_gap = 200
# Forwarded as `thresh`: an object cut by a sub-image border is kept as a valid
# target only if the share of its area that remains exceeds this value.
thresh_iou = 0.8
# Config (Get Specific Classes)

# txt2xml参数
- **img_shape**：数据集图片尺寸信息
- **class_list**：保留的目标类别
- **subset_type**：设定.txt文件的名称（需反映该数据集的用途：train/test/val）

In [19]:
# Config (txt2xml)
# Image shape handed to txt2xml(); presumably (height, width, channels) of the
# split sub-images — TODO confirm the expected order against txt2xml.
img_shape =(500, 500, 3)
# Object categories handed to txt2xml().
class_list = ['plane']
# Forwarded to generate_txt_imgids() as `dataset_name`; names the generated
# image-id .txt file and should reflect the subset's purpose (train/test/val).
subset_type = 'plane_val'

# 建立相关的路径信息（不需手动更改）
- **raw_path**：待处理的DOTA原始数据集路径（DOTA + train/val/test）
- **subset_path**：子集存放路径（DOTA + subset_name）
- **GSC_folder**：被清洗后的子集路径（DOTA + subset_name + 'GSC'）
- **origin_folder**：待送入txt2xml进行VOC格式转化的文件夹
- **processed_folder**：VOC结构转化完成的输出文件夹

In [20]:
# Derive every working directory from the config cells (no manual edits needed).

# Raw DOTA split to process: <main_folder>/<main_type>
raw_path = os.path.join(main_folder, main_type)

# Destination of the split subset: <main_folder>/<subset_name>
subset_path = os.path.join(main_folder, subset_name)

# Destination of the cleaned subset: <main_folder>/<subset_name>_GSC
GSC_folder = os.path.join(main_folder, ''.join((subset_name, '_GSC')))

# Input folder for txt2xml and the folder meant to receive the VOC-style
# output; both point at the cleaned subset.
origin_folder = processed_folder = GSC_folder

# 加载原始数据集

In [9]:
"""
Create a DOTA instance
"""
# Test path: '/home/raymond/project/DOTA_PyTorch/DOTA_devkit/example'
# Train path: '/media/b622/MainDrive/Dataset/DOTA/train'
#dataset = DOTA.DOTA('/home/raymond/project/PytorchSSD_DOTA/data/DOTA_devkit/classsplit',parseMode='parse_dota_rec')
# Load the raw split found at raw_path, parsing annotations with the configured Mode.
dataset = DOTA.DOTA(raw_path, parseMode = Mode)

 Error: /media/raymond/MainDrive/Dataset/DOTA/val/labelTxt/labelTxt.zip is not a .txt file
 Error: /media/raymond/MainDrive/Dataset/DOTA/val/labelTxt/Val_Task2_gt.zip is not a .txt file
 DOTA dataset has been successfully loaded 


# ImgSplit操作
### 裁剪图片+筛选类别

In [10]:
# ImgSplit
# Crop the images together with their annotations to produce a new dataset.
from DOTA_devkit import ImgSplit

# Ids of the images selected by the SaveClasses filter; only these are split.
imagelist = imgids = dataset.getImgIds(catNms=SaveClasses)

# Splitter reading from raw_path and writing the sub-images to subset_path.
split = ImgSplit.splitbase(
    raw_path,
    subset_path,
    gap=subimg_gap,
    subsize=subimg_size,
    thresh=thresh_iou,
    choosebestpoint=True,
)

# Signature reminder: split.splitdata(self, rate: int, imglist: List[str])
split.splitdata(1, imagelist)

加载图片ID完成：共有 70 张图片符合筛选条件


# Pre-Process of GSC
### 加载子集，筛选类别，构造路径

In [13]:
# Create a DOTA instance for the split subset.
# (earlier hard-coded variant, kept for reference)
#root = '/media/raymond/MainDrive/Dataset/DOTA'
#raw_dataset = 'subset_planes_500_gap200'
#open_folder = os.path.join(root,subset_path)
dataset = DOTA.DOTA(subset_path, parseMode=Mode)

# Load the ids of the images selected by the SaveClasses filter and report how many there are.
imgids = dataset.getImgIds(catNms=SaveClasses)
print(len(imgids))

# Build the source (split subset) and destination (cleaned subset) paths
# for the images and their label files.
img_root, label_root = (
    os.path.join(subset_path, leaf) for leaf in ('images', 'labelTxt')
)
img_save, label_save = (
    os.path.join(GSC_folder, leaf) for leaf in ('images', 'labelTxt')
)


 DOTA dataset has been successfully loaded 
加载图片ID完成：共有 1507 张图片符合筛选条件
1507


# GSC操作
### 清理无效数据（空目标图片）

In [16]:
# GSC: copy every selected image and its label file from the split subset
# (img_root / label_root) into the cleaned subset (img_save / label_save).

# Create the output folders. One loop replaces three copy-pasted stanzas, and
# os.makedirs (unlike os.mkdir) also creates missing parent folders, so the
# step no longer fails when an intermediate directory does not exist yet.
# The message is still printed only for folders that are actually created.
for folder in (GSC_folder, img_save, label_save):
    if not os.path.exists(folder):
        os.makedirs(folder)
        print('Directory is built:' + folder)

for imgid in imgids:
    imgname = str(imgid) + '.png'
    labelname = str(imgid) + '.txt'
    # copy img
    shutil.copy(os.path.join(img_root, imgname), os.path.join(img_save, imgname))
    # copy label
    shutil.copy(os.path.join(label_root, labelname), os.path.join(label_save, labelname))
    # Printed after both copies succeeded for this id.
    print('processing: ' + str(imgid))
print('All copy operation is done!')

processing: P1390__1__18__6
processing: P1508__1__1__1
processing: P1397__1__4__7
processing: P1397__1__19__9
processing: P0179__1__12__11
processing: P1397__1__3__7
processing: P1398__1__2__9
processing: P0179__1__12__14
processing: P0170__1__1__1
processing: P1397__1__2__5
processing: P2236__1__2__1
processing: P1384__1__8__8
processing: P1398__1__4__13
processing: P1809__1__23__11
processing: P2231__1__4__4
processing: P1143__1__7__4
processing: P1390__1__17__4
processing: P1398__1__0__6
processing: P2778__1__8__6
processing: P1143__1__1__6
processing: P1809__1__18__10
processing: P1513__1__5__7
processing: P1088__1__2__4
processing: P2802__1__3__8
processing: P0161__1__4__5
processing: P1397__1__4__6
processing: P1854__1__21__4
processing: P1103__1__1__0
processing: P1390__1__8__7
processing: P1508__1__1__7
processing: P1376__1__7__8
processing: P1398__1__0__8
processing: P1397__1__25__9
processing: P2231__1__1__1
processing: P2778__1__4__6
processing: P0841__1__5__2
processing: P1

processing: P2791__1__11__15
processing: P1398__1__2__3
processing: P0841__1__3__9
processing: P0161__1__6__3
processing: P0249__1__1__3
processing: P2802__1__9__8
processing: P2218__1__2__0
processing: P1397__1__0__2
processing: P1474__1__7__16
processing: P1390__1__2__3
processing: P1397__1__21__8
processing: P1154__1__3__8
processing: P0213__1__2__1
processing: P1390__1__14__3
processing: P2242__1__0__3
processing: P0841__1__3__4
processing: P1390__1__1__7
processing: P1397__1__19__7
processing: P0161__1__4__1
processing: P2220__1__3__2
processing: P1738__1__12__8
processing: P1598__1__12__2
processing: P1854__1__21__12
processing: P2689__1__9__5
processing: P1179__1__7__4
processing: P1156__1__7__3
processing: P1854__1__18__6
processing: P1541__1__0__3
processing: P2802__1__6__9
processing: P2230__1__4__1
processing: P1397__1__6__1
processing: P2791__1__12__15
processing: P2220__1__2__1
processing: P0841__1__1__3
processing: P1809__1__6__3
processing: P1397__1__13__9
processing: P1

processing: P2230__1__6__2
processing: P1397__1__1__2
processing: P2231__1__2__0
processing: P1397__1__14__4
processing: P1854__1__39__12
processing: P2778__1__8__8
processing: P1601__1__1__5
processing: P1178__1__4__10
processing: P1390__1__14__8
processing: P1384__1__7__6
processing: P0841__1__3__0
processing: P1376__1__8__9
processing: P1088__1__1__2
processing: P1529__1__6__4
processing: P1854__1__4__6
processing: P0179__1__8__8
processing: P1598__1__9__1
processing: P1397__1__20__6
processing: P1390__1__3__3
processing: P1143__1__2__9
processing: P1854__1__31__7
processing: P2230__1__6__1
processing: P1390__1__19__5
processing: P1390__1__11__6
processing: P1156__1__9__1
processing: P1397__1__8__6
processing: P2791__1__8__8
processing: P1397__1__18__4
processing: P1384__1__5__7
processing: P2791__1__0__4
processing: P1390__1__18__7
processing: P2778__1__7__5
processing: P1390__1__6__4
processing: P0841__1__2__4
processing: P1397__1__16__3
processing: P1179__1__7__7
processing: P139

processing: P1854__1__15__6
processing: P0179__1__15__10
processing: P1373__1__5__12
processing: P2220__1__1__0
processing: P2271__1__4__2
processing: P1088__1__1__1
processing: P1601__1__1__8
processing: P0168__1__2__3
processing: P0179__1__6__3
processing: P1397__1__3__9
processing: P1390__1__16__9
processing: P1738__1__11__4
processing: P2689__1__7__7
processing: P2689__1__7__6
processing: P0179__1__1__1
processing: P1390__1__17__1
processing: P1376__1__8__10
processing: P1397__1__21__2
processing: P2778__1__6__2
processing: P1397__1__7__6
processing: P0179__1__9__5
processing: P1390__1__3__8
processing: P1809__1__22__9
processing: P1398__1__3__10
processing: P0249__1__2__4
processing: P1513__1__6__13
processing: P1390__1__7__7
processing: P0179__1__11__14
processing: P1474__1__5__7
processing: P1397__1__7__7
processing: P2778__1__7__6
processing: P0179__1__13__13
processing: P1483__1__1__1
processing: P1397__1__13__7
processing: P1397__1__24__3
processing: P1512__1__2__1
processing

processing: P1854__1__41__12
processing: P1384__1__5__6
processing: P1566__1__9__8
processing: P1483__1__2__0
processing: P1809__1__5__3
processing: P1583__1__5__6
processing: P2242__1__0__1
processing: P2802__1__16__16
processing: P0179__1__12__10
processing: P1149__1__1__6
processing: P1184__1__4__14
processing: P1513__1__2__9
processing: P0168__1__5__1
processing: P1178__1__5__4
processing: P2802__1__14__15
processing: P1397__1__24__7
processing: P1809__1__20__11
processing: P1390__1__2__1
processing: P1154__1__2__9
processing: P1397__1__2__2
processing: P0249__1__1__2
processing: P0179__1__12__9
processing: P2242__1__1__2
processing: P1398__1__0__7
processing: P1397__1__20__4
processing: P2802__1__14__14
processing: P1474__1__4__3
processing: P1854__1__0__0
processing: P1854__1__4__3
processing: P1373__1__3__9
processing: P1398__1__0__11
processing: P1390__1__7__6
processing: P2271__1__3__1
processing: P0841__1__1__1
processing: P1376__1__5__7
processing: P0249__1__0__3
processing:

processing: P2802__1__14__10
processing: P0259__1__3__5
processing: P0213__1__5__3
processing: P2242__1__5__3
processing: P1376__1__8__3
processing: P1397__1__13__4
processing: P1809__1__19__11
processing: P1397__1__9__4
processing: P2791__1__14__17
processing: P1390__1__13__6
processing: P2236__1__0__0
All copy operation is done!


# txt2xml操作1
### 将.txt格式的标注文件转化为符合VOC标准的.xml标注文件
### 生成符合VOC标准的路径，并生成包含所有图片id的.txt

In [21]:
# txt2xml
# Convert the .txt annotation files under origin_folder into VOC-style .xml
# annotation files, using the configured image shape, categories and parse mode.
# NOTE(review): the recorded run of this cell reports "0 annotation files" —
# verify that origin_folder actually contains the copied label files (i.e. the
# GSC step was run with the same subset_name) before relying on the output.
txt2xml(origin_folder, img_shape, class_list, parseMode=Mode)
# Generate the .txt file listing all image ids; its name is taken from subset_type.
generate_txt_imgids(origin_folder, dataset_name = subset_type)

Directory is built:/media/raymond/MainDrive/Dataset/DOTA/subset_planes_500_gap200_val_GSC/Annotations
 DOTA dataset has been successfully loaded 
加载图片ID完成：返回所有图片ID
0 annotation files has finished!
