In [12]:
# Leftover debugging aid: both prints are commented out, so running this cell does nothing.
# NOTE(review): the path output recorded under this cell presumably predates the
# commenting-out -- consider clearing the output or deleting the cell.
# print(deepweeds_path)
# print(relativePath)

/Users/hlyd4326/Documents/sih_projects/Weed-ID-Interchange/conversion_tools
/deepweeds_to_json/deepweeds_images/20161207-112029-0.jpg


In [3]:
"""
deepweeds_to_json.py

Convert a DeepWeeds .csv file to CCT format.

Modified from code from Microsoft's CameraTraps repo:
https://github.com/microsoft/CameraTraps
"""


"""Constants and environment"""

import pandas as pd
import pathlib
import uuid
import json
import time
import datetime
from tqdm import tqdm
import humanfriendly
import exifread
import sys
import os
import PIL
from PIL import Image
#from visualization import visualize_db
#import path_utils

# Root against which every data path is resolved: the parent of the current
# working directory, with symlinks resolved.
deepweeds_path = pathlib.Path(os.path.realpath('..'))

# Image folder, expressed relative to deepweeds_path (used to build per-image paths).
dirName = "deepweeds_to_json/deepweeds_images_full"

# Input CSV, output JSON, and the image folder as full paths.
input_metadata_file = deepweeds_path.joinpath("deepweeds_to_json/labels.csv")
output_file = deepweeds_path.joinpath("deepweeds_to_json/deepweeds_imageinfo.json")
image_folder = deepweeds_path.joinpath("deepweeds_to_json/deepweeds_images_full")

#filename_replacements = {dirName:'DeepWeeds'}

# Lower-cased source category name -> replacement category name.
category_mappings = dict(none='empty')


"""
Read source data

DeepWeeds annotations have Filename, Label, and Species columns.
"""

# Load the whole annotation table into memory; one row per labelled image.
input_metadata = pd.read_csv(input_metadata_file)

# DataFrame.shape is (rows, columns); the message reports columns first.
print('Read {} columns and {} rows from metadata file'.format(
    input_metadata.shape[1], input_metadata.shape[0]))


"""Main loop over labels"""

startTime = time.time()

# Scientific name and EPPO taxon code for each known common name.
# Common names missing from this table (e.g. 'negative') get neither field.
# NOTE(review): the DeepWeeds publication appears to list snake weed as
# Stachytarpheta spp.; confirm the 'snake weed' species and EPPO code below.
speciesInfoByCommonName = {
    'chinee apple': ('ziziphus mauritiana', 'ZIPMA'),
    'lantana': ('lantana camara', 'LANCA'),
    'snake weed': ('gutierrezia sarothrae', 'GUESA'),
    'siam weed': ('chromolaena odorata', 'EUPOD'),
    'prickly acacia': ('vachellia nilotica', 'ACANL'),
    'parthenium': ('parthenium hysterophorus', 'PTNHY'),
    'rubber vine': ('cryptostegia grandiflora', 'CVRGR'),
    'parkinsonia': ('parkinsonia aculeata', 'PAKAC'),
}

# Relative image path -> image record, used to detect repeated rows
relativePathToImage = {}

images = []
annotations = []
categoryIDToCategories = {}
missingFiles = []

duplicateImageIDs = set()

# iRow = 0; row = input_metadata.iloc[iRow]
for iRow,row in tqdm(input_metadata.iterrows(),total=len(input_metadata)):

    # Columns read from each row: Filename, Label, Species
    imageID = str(row['Filename'])
    fn = row['Filename']
    relativePath = os.path.join(dirName,fn)
    imageFileName = 'deepweeds/' + os.path.basename(relativePath)

    # This makes an assumption of one annotation per image, which happens to be
    # true in this data set.
    if relativePath in relativePathToImage:

        im = relativePathToImage[relativePath]
        # im['id'] holds the row index of the first occurrence (an int), so it can
        # never equal the filename string; check the stored file name instead.
        assert im['file_name'] == imageFileName
        duplicateImageIDs.add(imageID)

    else:
        im = {}
        im['id'] = iRow
        im['file_name'] = imageFileName
        im['license'] = 0
        images.append(im)
        relativePathToImage[relativePath] = im

        fullPath = os.path.join(deepweeds_path,relativePath)

        if not os.path.isfile(fullPath):

            # The image record is kept (without size fields) and the path is reported.
            missingFiles.append(fullPath)

        else:
            # Retrieve image width and height; the context manager closes the file
            # so handles do not accumulate over the loop.
            with PIL.Image.open(fullPath) as pilImage:
                width, height = pilImage.size
            im['width'] = width
            im['height'] = height
            im['resolution'] = width*height

    # Normalise the species name and apply any configured renaming
    categoryName = row['Species'].lower()
    categoryName = category_mappings.get(categoryName, categoryName)

    categoryID = row['Label']
    assert isinstance(categoryID,int)

    # Generate category objects (once per distinct label)
    if categoryID not in categoryIDToCategories:
        category = {}
        category['common_name'] = categoryName
        category['id'] = categoryID
        categoryIDToCategories[categoryID] = category
        category['role'] = 'na' if categoryName == 'negative' else 'weed'
        if categoryName in speciesInfoByCommonName:
            species, eppoTaxonCode = speciesInfoByCommonName[categoryName]
            category['species'] = species
            category['eppo_taxon_code'] = eppoTaxonCode

    # Create an annotation
    ann = {}

    # This creates a unique ID, however this feature may not be needed
    ann['id'] = iRow
    ann['image_id'] = im['id']
    ann['category_id'] = categoryID
    ann['agcontext_id'] = 0
    ann['agcontext_name'] = 'deepweeds'

    annotations.append(ann)

categories = list(categoryIDToCategories.values())

elapsed = time.time() - startTime
print('Finished verifying file loop in {}, {} images, {} missing images, {} repeat labels'.format(
        humanfriendly.format_timespan(elapsed), len(images), len(missingFiles), len(duplicateImageIDs)))


"""
Check for images that aren't included in the metadata file

This function is disabled for now.
"""

# Everything below is commented out, so this section currently executes nothing.
# Re-enabling it needs `path_utils`, whose import is also commented out at the
# top of this cell.
# NOTE(review): the keys of relativePathToImage are built as
# os.path.join(dirName, filename), while `fn` below would be relative to
# image_folder (no dirName prefix) -- as written, every file would be reported
# as unmatched. Confirm and align the two before re-enabling.

# Enumerate all images
# list(relativePathToImage.keys())[0]

#imageFullPaths = path_utils.find_images(image_folder,bRecursive=True)
#unmatchedFiles = []

#for iImage,imagePath in enumerate(imageFullPaths):
    
#    fn = os.path.relpath(imagePath,image_folder)    
#    if fn not in relativePathToImage:
#        unmatchedFiles.append(fn)

#print('Finished checking {} images to make sure they\'re in the metadata, found {} mismatches'.format(
#        len(imageFullPaths),len(unmatchedFiles)))

"""Create info array and object"""

# Single-element list holding the dataset-level provenance record.
info = [
    dict(
        year=2019,
        version=1,
        description="CSV annotations and JPEG images converted into WeedCOCO",
        secondary_contributor="Converted to WeedCOCO by Henry Lydecker",
        contributor="Alex Olsen",
        id=0,
    )
]

"""Create license array and object"""

# Single-element list; image records refer to this entry through its id (0).
license = [
    dict(
        id=0,
        license_name="CC BY 4.0",
        license_fullname="Creative Commons Attribution 4.0",
        license_version="4.0",
        url="https://creativecommons.org/licenses/by/4.0/",
    )
]

"""
Create collection object
"""
# Bibliographic record for the source publication, wrapped in a one-element list.
collections = [
    dict(
        author="Olsen, Alex",
        title="DeepWeeds: A Multiclass Weed Species Image Dataset for Deep Learning",
        year=2019,
        identifier="doi:10.1038/s41598-018-38343-3",
        rights="Apache License 2.0",
        accrual_policy="Closed",
        id=0,
    )
]

# TODO: Create collection memberships from multiple csv files...

"""
Create dataset object

This information is invariant across a dataset upon upload.

Datasets can then be concatenated to include multiple agdatasets
"""
# Individual dataset level agdata: one record, referenced by annotations via id 0.
# upload_time is stamped with the local clock at the moment this cell runs.
agcontext = [
    dict(
        id=0,
        agcontext_name="deepweeds",
        crop_type="pastoral_grassland",
        crop_stage="multiple",
        location_continent="Australia",
        location_country="Australia",
        location_state="Queensland",
        upload_time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        camera_type="weedlogger",
        camera_height=100,
        camera_angle=90,
        camera_fov=85,
        emr_channels="na",
        url="https://github.com/AlexOlsen/DeepWeeds",
    )
]


"""Write output"""

# Assemble the top-level document; key order here is the order written to disk.
json_data = {}
json_data['images'] = images
json_data['annotations'] = annotations
json_data['categories'] = categories
json_data['info'] = info
json_data['license'] = license
json_data['agcontexts'] = agcontext
json_data['collections'] = collections

# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of leaving an open handle in the kernel.
with open(output_file,'w') as outputHandle:
    json.dump(json_data, outputHandle, indent=4)

print('Finished writing .json file with {} images, {} annotations, and {} categories'.format(
        len(images),len(annotations),len(categories)))

  2%|▏         | 360/17509 [00:00<00:04, 3591.09it/s]

Read 3 columns and 17509 rows from metadata file


100%|██████████| 17509/17509 [00:04<00:00, 3617.65it/s]


Finished verifying file loop in 4.84 seconds, 17509 images, 0 missing images, 0 repeat labels
Finished writing .json file with 17509 images, 17509 annotations, and 9 categories


## Utility Functions

For now I have moved these utility functions out of the converter. They were difficult to get working, and we may want to use different solutions once we are further along.

In [3]:
"""Sanity-check the database's integrity"""

# NOTE(review): in the recorded run this import raised
# "ModuleNotFoundError: No module named 'data_management'", so nothing after it
# in this cell was executed.
from data_management.databases import sanity_check_json_db

# Run the checker against the JSON file written by the converter cell
# (output_file is defined there, not in this cell).
options = sanity_check_json_db.SanityCheckOptions()
sortedCategories,data = sanity_check_json_db.sanity_check_json_db(output_file, options)

    
"""Render a bunch of images to make sure the labels got carried along correctly"""
# NOTE(review): `preview_base` and `visualize_bbox_db` are not defined or imported
# in any visible cell, so this part would raise NameError even once the import
# above succeeds. `image_folder` comes from the converter cell.
bbox_db_path = output_file
output_dir = preview_base

# `options` is rebound here from the sanity-check options to the visualization options.
options = visualize_bbox_db.BboxDbVizOptions()
options.num_to_visualize = 1000
options.sort_by_filename = False

htmlOutputFile = visualize_bbox_db.process_images(bbox_db_path,output_dir,image_folder,options)

ModuleNotFoundError: No module named 'data_management'