# Getting Kaggle Dataset Arthropod Taxanomy 

In [None]:
# Lets get the data for training purposes from our kaggle dataset of Arthropod Taxanomy 
# Follow this link : https://www.kaggle.com/rohanprospects/account?isEditing=False&verifyPhone=False
!pip install -q kaggle
from google.colab import files
files.upload()
!mkdir ~/.kaggle
!cp "/content/kaggle.json" ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets list

In [None]:
!kaggle datasets download -d mistag/arthropod-taxonomy-orders-object-detection-dataset

In [None]:
!unzip /content/arthropod-taxonomy-orders-object-detection-dataset.zip

In [4]:
# YOLO annotations look like this in a txt file 
# category_id bbox 

!mkdir data
!mkdir data/train data/val
!mkdir data/train/images data/train/annotations data/val/images data/val/annotations


In [6]:
# written by Cai
#You can find this code in https://github.com/ct990912/kaggle/blob/main/Arthropod_convert2coco.py
import os
import json
import random
import shutil

In [7]:
def getFile(root_path):
    file_list = list()
    for file in os.listdir(root_path):
        if ('.' in file) == False and file != "coco":
            file_list.append(file)
    return file_list


def getAllJson(path):
    file_list = list()
    for file in os.listdir(path):
        if file.split('.')[1] != "vott":
            file_list.append(path + '/' + file)
    return file_list


def read_json(path: str):
    with open(path, 'r') as f:
        data = json.load(f)
    return data


def getImg(path):
    img_list = list()
    for img_file in os.listdir(path):
        if img_file != "annotations":
            img_list.append(img_file)
    return img_list



In [8]:
def convert2coco(root_path, split_rate=0.1,saveFile=None):
    """
    Args:
        root_path(str): your data's root path, e.g., "/home/cai/data/ArTaxOr/".
        split_rate(float): Divide the dataset into training and validation sets.
        saveFile(str): if saveFile is None, converted data will store \
                in 'root_path/coco/'; if given, converted data will \
                store your own "saveFile".
    Examples:
        >>>convert2coco("/home/cai/data/ArTaxOr/", 0.1, "/home/cai/coco/")
    """
    
    if saveFile is None:
        saveFile=root_path+"coco/"
        if not os.path.exists(root_path + "coco/"):
            os.makedirs(root_path + "coco/")
        if not os.path.exists(root_path + "coco/annotations/"):
            os.makedirs(root_path + "coco/annotations/")
        if not os.path.exists(root_path + "coco/train/"):
            os.makedirs(root_path + "coco/train/")
        if not os.path.exists(root_path + "coco/val/"):
            os.makedirs(root_path + "coco/val/")
    else:
        if not os.path.exists(saveFile + "annotations/"):
            os.makedirs(saveFile + "annotations/")
        if not os.path.exists(saveFile + "train/"):
            os.makedirs(saveFile + "train/")
        if not os.path.exists(saveFile + "val/"):
            os.makedirs(saveFile + "val/")
    
    all_class_file = getFile(root_path)  # 分别获取每个分类的目录名，存在list里面
    coco_train_annotations = dict(images=list(), annotations=list(), categories=list())  # coco训练集annotations
    coco_val_annotations = dict(images=list(), annotations=list(), categories=list())  # coco验证集annotations
    coco_annotations = [coco_train_annotations, coco_val_annotations]  # 用于划分测试集和训练集
    img_idx = 10
    bbox_idx = 100000
    label_idx = 0
    # ----------------获取coco格式中的categories
    for idx, label in enumerate(all_class_file):
        category = dict(id=idx, supercategory="Arthropod", name=label)
        coco_annotations[0]["categories"].append(category)
        coco_annotations[1]["categories"].append(category)

    # ---------------获取coco格式中的images和annotations
    for each_class in all_class_file:
        print("start to convert "+each_class+'\n')
        root_img = root_path + each_class + '/'  # 某类图片的根路径
        all_annotations = getAllJson(root_path + each_class + "/annotations")  # 获取所有的json annotation
        for json_file in all_annotations:  # 访问单个json，并按splite_rate数据存入到coco_anotations中
            data = read_json(json_file)
            flag = 1 if random.random() < split_rate else 0  # 划分测试集和训练集 train:0; val: 1;
            shutil.copyfile(root_img + data["asset"]["name"],
                        saveFile + ("train/" if flag == 0 else "val/") + data["asset"]["name"])
            # ---------------获取coco格式中的image
            img = dict(file_name=data["asset"]["name"],
                       height=data["asset"]["size"]["height"],
                       width=data["asset"]["size"]["width"],
                       id=img_idx)
            coco_annotations[flag]["images"].append(img)
            #  [x, y, w, h]， 下面的代码是一张图里面的所有bbox append到annotations里面
            for region in data["regions"]:
                bbox = region["boundingBox"]
                anno = dict(image_id=img_idx, segmentation=[[]], area=240, iscrowd=0,
                            bbox=[bbox["left"], bbox["top"], bbox["width"], bbox["height"]],
                            id=bbox_idx, category_id=label_idx)
                bbox_idx += 1
                coco_annotations[flag]["annotations"].append(anno)

            img_idx += 1
        label_idx += 1

    with open(saveFile+"annotations/train.json", "w") as f:
        json.dump(coco_annotations[0], f)
    with open(saveFile + "annotations/val.json", "w") as f:
        json.dump(coco_annotations[1], f)
    print("finish...")

In [9]:
if __name__ == "__main__":
    convert2coco("/content/ArTaxOr/",0.1,"./")

start to convert Hymenoptera

start to convert Lepidoptera

start to convert Coleoptera

start to convert Araneae

start to convert Diptera

start to convert Hemiptera

start to convert Odonata

finish...


In [10]:
!rm -rf "/content/ArTaxOr"

In [15]:
# For train data 
json_file = "/content/annotations/train.json"
with open(json_file) as datafile:
  data = json.load(datafile)

print(data['images'][0])
print(data['annotations'][1])

{'file_name': '9d29e0d08d8d.jpg', 'height': 2048, 'width': 1536, 'id': 10}
{'image_id': 11, 'segmentation': [[]], 'area': 240, 'iscrowd': 0, 'bbox': [439.1205357142857, 225.04789272030652, 183.34285714285713, 342.00191570881225], 'id': 100001, 'category_id': 0}


In [16]:
print(data['annotations'][4])

{'image_id': 15, 'segmentation': [[]], 'area': 240, 'iscrowd': 0, 'bbox': [315.83141762452107, 874.911877394636, 947.4942528735631, 504.15325670498083], 'id': 100005, 'category_id': 0}


In [23]:
# For training data 
for image in data["images"]:
    image_name = image['file_name'].split(".")[0]
    txt_file_path = os.path.join('/content/train',image_name+".txt")
    image_list = []
    for annotations in data['annotations']:
      if annotations['image_id'] == image['id']:
            #annotations['category_id'] = category_dict[annotations['category_id']]
            bbox = annotations['bbox']
            category = annotations['category_id']
            image_list.append((category,bbox))
    input = []
    for output in image_list:
      string_line = "{} {} {} {} {}".format(output[0], output[1][0], output[1][1],output[1][2],output[1][3])
      input.append(string_line)
    with open(txt_file_path, 'w') as fp:
      for line in input:
          fp.write(line)
          fp.write('\n') 

In [24]:
# For val data 
json_file = "/content/annotations/val.json"
with open(json_file) as datafile:
  data = json.load(datafile)

for image in data["images"]:
    image_name = image['file_name'].split(".")[0]
    txt_file_path = os.path.join('/content/val',image_name+".txt")
    image_list = []
    for annotations in data['annotations']:
      if annotations['image_id'] == image['id']:
            #annotations['category_id'] = category_dict[annotations['category_id']]
            bbox = annotations['bbox']
            category = annotations['category_id']
            image_list.append((category,bbox))
    input = []
    for output in image_list:
      string_line = "{} {} {} {} {}".format(output[0], output[1][0], output[1][1],output[1][2],output[1][3])
      input.append(string_line)
    with open(txt_file_path, 'w') as fp:
      for line in input:
          fp.write(line)
          fp.write('\n') 

In [25]:
!mkdir images
!mkdir images/train
!mkdir images/test
!mkdir images/train/images
!mkdir images/train/labels
!mkdir images/test/images
!mkdir images/test/labels


In [28]:
import os
import shutil

files = os.listdir('/content/train')
for file_name in files:
  if ".txt" in file_name:
    shutil.move(os.path.join('/content/train',file_name),os.path.join('/content/images/train/labels',file_name))
  elif ".jpg" in file_name:
    shutil.move(os.path.join('/content/train',file_name),os.path.join('/content/images/train/images',file_name))

In [30]:
# for test data 
import os
import shutil

files = os.listdir('/content/val')
for file_name in files:
  if ".txt" in file_name:
    shutil.move(os.path.join('/content/val',file_name),os.path.join('/content/images/test/labels',file_name))
  elif ".jpg" in file_name:
    shutil.move(os.path.join('/content/val',file_name),os.path.join('/content/images/test/images',file_name))

In [32]:
# Lets create a YAML file data.yaml 
import yaml
'''
train: /content/images/train/images
val: /content/images/test/images

nc: 7
names: ['Coleoptera','Diptera','Hymenoptera','Hemiptera','Lepidoptera','Araneae','Odonata']
'''
# Create this yaml file and save it under images folder only 

"\ntrain: /content/images/train/images\nval: /content/images/test/images\n\nnc: 7\nnames: ['Coleoptera','Diptera','Hymenoptera','Hemiptera','Lepidoptera','Araneae','Odonata']\n"

# Lets start with teh google drive setup now 

In [None]:
# setting up YOLO is very simple actually. Its super easy 
# YOLO is also one of the most used algorithm for object detection purpose 

# Clone the github repository 
!git clone https://github.com/ultralytics/yolov5
%cd yolov5

In [None]:
# Install requirements.txt
!pip install -r requirements.txt

In [36]:
!pip install utils

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting utils
  Downloading utils-1.0.1-py2.py3-none-any.whl (21 kB)
Installing collected packages: utils
Successfully installed utils-1.0.1


In [2]:
import torch

from IPython.display import Image, clear_output  # to display images
#from utils.google_utils import gdrive_download  # to download models/datasets

# clear_output()
print('Setup complete. Using torch %s %s' % (torch.__version__, torch.cuda.get_device_properties(0) if torch.cuda.is_available() else 'CPU'))

Setup complete. Using torch 1.12.1+cu113 _CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15109MB, multi_processor_count=40)


In [4]:
# this is the YAML file
%cat /content/images/data.yaml

train: /content/images/train/images
val: /content/images/test/images

nc: 7
names: ['Coleoptera','Diptera','Hymenoptera','Hemiptera','Lepidoptera','Araneae','Odonata']

# Define Model configuration and Architecture

In [5]:
# define number of classes based on YAML
import yaml
with open("/content/images/data.yaml", 'r') as stream:
    num_classes = str(yaml.safe_load(stream)['nc'])

In [6]:
#this is the model configuration we will use for our tutorial 
%cat /content/yolov5/models/yolov5m.yaml

# YOLOv5 🚀 by Ultralytics, GPL-3.0 license

# Parameters
nc: 80  # number of classes
depth_multiple: 0.67  # model depth multiple
width_multiple: 0.75  # layer channel multiple
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 v6.0 backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 6, 2, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 3, C3, [1024]],
   [-1, 1, SPPF, [1024, 5]],  # 9
  ]

# YOLOv5 v6.0 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C3, [512, False]],  # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]]

In [7]:
#customize iPython writefile so we can write variables
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def writetemplate(line, cell):
    with open(line, 'w') as f:
        f.write(cell.format(**globals()))

In [9]:
%%writetemplate /content/yolov5/models/yolov5m.yaml

# parameters
nc: {num_classes}  # number of classes
depth_multiple: 0.67  # model depth multiple
width_multiple: 0.75  # layer channel multiple

# anchors
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Focus, [64, 3]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 9, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 1, SPP, [1024, [5, 9, 13]]],
   [-1, 3, C3, [1024, False]],  # 9
  ]

# YOLOv5 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C3, [512, False]],  # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C3, [256, False]],  # 17 (P3/8-small)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],  # cat head P4
   [-1, 3, C3, [512, False]],  # 20 (P4/16-medium)

   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],  # cat head P5
   [-1, 3, C3, [1024, False]],  # 23 (P5/32-large)

   [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]

# Train Custom YOLOv5 Detector

### Next, we'll fire off training!


Here, we are able to pass a number of arguments:
- **img:** define input image size
- **batch:** determine batch size
- **epochs:** define the number of training epochs. (Note: often, 3000+ are common here!)
- **data:** set the path to our yaml file
- **cfg:** specify our model configuration
- **weights:** specify a custom path to weights. (Note: you can download weights from the Ultralytics Google Drive [folder](https://drive.google.com/open?id=1Drs_Aiu7xx6S-ix95f9kNsA6ueKRpN2J))
- **name:** result names
- **nosave:** only save the final checkpoint
- **cache:** cache images for faster training

In [10]:
!cat /content/images/data.yaml

train: /content/images/train/images
val: /content/images/test/images

nc: 7
names: ['Coleoptera','Diptera','Hymenoptera','Hemiptera','Lepidoptera','Araneae','Odonata']

In [12]:
# train yolov5s on custom data for 100 epochs
# time its performance
%%time
%cd /content/yolov5
!python train.py --img 416 --batch 16 --epochs 100 --data '/content/images/data.yaml' --cfg /content/yolov5/models/yolov5m.yaml --weights '' --name yolov5m_results

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
      127.12      96.027      1081.9      838.71      161.02      145.45       659.6      981.34       398.3       220.3      816.38      690.44      146.89       115.8      689.26      820.36      108.76      70.608      686.43      711.62      161.02      138.39      620.05       779.4      86.157      122.86
      522.59      792.11      135.59      108.74      320.62      595.82      282.48       348.8      257.06      868.37      402.54      186.41      560.73      862.72      484.46      163.81      829.09      773.75      406.78      228.77      954.79      890.96      457.62      286.67      470.33      90.268
       477.4      405.29]
      942.34       394.3      89.747         128      795.22      478.16      166.25      204.51      839.36      354.57      113.29      116.23      751.08      250.11      154.48      167.72      845.24      192.74      173.61      160.37      690.76      306.02      141.24      1

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



CPU times: user 373 ms, sys: 69.6 ms, total: 443 ms
Wall time: 12.4 s
