In [1]:
import numpy as np
import pandas as pd
import os
import glob
import time
import cv2

from sklearn.preprocessing import LabelEncoder

In [2]:
# Root folders for the bounding-box annotation JSON files and the training images.
label_path = '/home/w/DS_Projects/Kaggle/Nature Conservancy/Scripts/Bounding Boxes/Liu Weijie/'
data_path = '/home/w/DS_Projects/Kaggle/Nature Conservancy/train/'

# Every class folder of the training set.
labels = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']

# One annotation file / image folder per class, in `labels` order.
# 'NoF' is left out of both lists (presumably because it has no boxes to
# annotate -- confirm).
label_files = [label_path + '{}.json'.format(name) for name in labels if name != 'NoF']

data_dirs = [data_path + '{}/'.format(name) for name in labels if name != 'NoF']

In [3]:
def to_df():
    """Flatten the per-class JSON annotation files into per-box tables.

    Every file listed in the module-level ``label_files`` is read with
    ``pd.read_json``.  One output row is produced per annotation; an image
    whose annotation list is empty contributes no rows (and is not opened).

    Returns
    -------
    data : pandas.DataFrame
        Columns ``Filename`` (``data_path`` + the filename stored in the
        JSON), ``Class``, ``x``, ``y``, ``width``, ``height``.
    shapes : pandas.DataFrame
        Columns ``img_height``, ``img_width`` (first two entries of the
        image's ``.shape``) and ``Filename``; row-aligned with ``data``.
    short_filenames : pandas.DataFrame
        Single column (label ``0``) holding the filename exactly as stored
        in the JSON; row-aligned with ``data``.

    Raises
    ------
    IOError
        If ``cv2.imread`` cannot load an annotated image (it returns ``None``
        instead of raising, which previously surfaced as an opaque
        ``AttributeError``).
    """
    filenames = []
    short_filenames = []
    classes = []
    xs = []
    ys = []
    heights = []
    widths = []
    img_heights = []
    img_widths = []

    for label_file in label_files:
        annotated = pd.read_json(label_file)
        if annotated.empty:
            continue

        for short_name, annotations in zip(annotated['filename'], annotated['annotations']):
            if len(annotations) == 0:
                continue

            # Decode the image once per file rather than once per box.
            full_name = data_path + short_name
            image = cv2.imread(full_name)
            if image is None:
                raise IOError('Could not read image: {}'.format(full_name))
            img_height, img_width = image.shape[:2]

            for box in annotations:
                filenames.append(full_name)
                short_filenames.append(short_name)
                img_heights.append(img_height)
                img_widths.append(img_width)
                xs.append(box['x'])
                ys.append(box['y'])
                heights.append(box['height'])
                widths.append(box['width'])
                classes.append(box['class'])

    # Sanity output: all six lists must have the same length.
    for column in (filenames, classes, xs, ys, heights, widths):
        print(len(column))

    # Building from a dict with explicit column order also works when there
    # are no annotations at all.
    data = pd.DataFrame(
        {'Filename': filenames, 'Class': classes, 'x': xs, 'y': ys,
         'width': widths, 'height': heights},
        columns=['Filename', 'Class', 'x', 'y', 'width', 'height'])

    shapes = pd.DataFrame(
        {'img_height': img_heights, 'img_width': img_widths, 'Filename': filenames},
        columns=['img_height', 'img_width', 'Filename'])

    short_filenames = pd.DataFrame(short_filenames)

    return data, shapes, short_filenames

def preprocess(data, drop = True):
    """Encode the class labels and convert boxes to corner format.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Filename``, ``Class``, ``x``, ``y``,
        ``width`` and ``height``.  It is not modified.
    drop : bool, default True
        When True the ``Class`` and ``Filename`` columns are left out of the
        returned frame.  When False they are kept, right after ``Class_enc``.

    Returns
    -------
    data2 : pandas.DataFrame
        Columns ``Class_enc, x, x_max, y, y_max`` (``drop=True``) or
        ``Class_enc, Filename, Class, x, x_max, y, y_max`` (``drop=False``),
        where ``x_max = x + width`` and ``y_max = y + height``.
    filenames : pandas.Series
        The ``Filename`` column of the input.
    """
    encoded = data.copy()
    encoded['Class_enc'] = LabelEncoder().fit_transform(encoded['Class'])
    filenames = encoded['Filename']

    encoded['x_max'] = encoded['x'] + encoded['width']
    encoded['y_max'] = encoded['y'] + encoded['height']

    # Select by name: the previous positional selection only picked the right
    # columns when Class/Filename had been dropped.
    box_cols = ['x', 'x_max', 'y', 'y_max']
    if drop:
        selected = ['Class_enc'] + box_cols
    else:
        selected = ['Class_enc', 'Filename', 'Class'] + box_cols

    return encoded[selected], filenames

# Build the per-annotation tables from the JSON label files, then derive the
# label-encoded corner-format boxes that the converter functions consume.
data, shapes, short_names = to_df()
data2, filenames = preprocess(data)

4371
4371
4371
4371
4371
4371


In [10]:
def convert_darknet2(data2, shapes):
    """Convert corner-format boxes to normalised centre/size boxes.

    Parameters
    ----------
    data2 : pandas.DataFrame
        Must contain a ``Class_enc`` column.  The remaining columns are read
        by position and are expected in the order x_min, x_max, y_min, y_max
        (the layout ``preprocess`` returns).
    shapes : pandas.DataFrame
        Row-aligned (by position) with ``data2``; column 0 is the image
        height and column 1 the image width.

    Returns
    -------
    pandas.DataFrame
        Columns ``Class_enc, x, y, width, height`` on a fresh 0..n-1 index,
        where ``x``/``y`` are the box centre and ``width``/``height`` the box
        size, each divided by the image width/height respectively.

    Raises
    ------
    ValueError
        If ``shapes`` has fewer rows than ``data2``.
    """
    boxes = data2.drop(['Class_enc'], axis = 1)
    n_boxes = boxes.shape[0]
    if shapes.shape[0] < n_boxes:
        raise ValueError('shapes has {} rows but data2 has {}'.format(shapes.shape[0], n_boxes))

    x_min = boxes.iloc[:, 0].values.astype(float)
    x_max = boxes.iloc[:, 1].values.astype(float)
    y_min = boxes.iloc[:, 2].values.astype(float)
    y_max = boxes.iloc[:, 3].values.astype(float)

    # Reciprocal image size, then multiply -- same operation order as a
    # per-row computation, so results are numerically identical.
    dw = 1. / shapes.iloc[:n_boxes, 1].values.astype(float)
    dh = 1. / shapes.iloc[:n_boxes, 0].values.astype(float)

    # Raw arrays (.values) are used throughout so rows are matched by
    # position; matching by index label would scramble or NaN-fill the class
    # column whenever data2 does not carry a default 0..n-1 index.
    converted = pd.DataFrame(
        {'Class_enc': data2['Class_enc'].values,
         'x': ((x_min + x_max) / 2.0) * dw,
         'y': ((y_min + y_max) / 2.0) * dh,
         'width': (x_max - x_min) * dw,
         'height': (y_max - y_min) * dh},
        columns=['Class_enc', 'x', 'y', 'width', 'height'])

    return converted


def convert_faster(data2):
    """Re-order corner-format boxes and shift class ids to start at 1.

    Parameters
    ----------
    data2 : pandas.DataFrame
        Must contain a ``Class_enc`` column.  The remaining columns are read
        by position and are expected in the order x_min, x_max, y_min, y_max
        (the layout ``preprocess`` returns).

    Returns
    -------
    pandas.DataFrame
        Columns ``Class_enc, x, y, x_end, y_end`` on a fresh 0..n-1 index,
        with ``Class_enc`` equal to the input value plus one.
    """
    boxes = data2.drop(['Class_enc'], axis = 1)

    # Positions 0/2 are the top-left corner, 1/3 the bottom-right corner.
    converted = boxes.iloc[:, [0, 2, 1, 3]].reset_index(drop=True)
    converted.columns = ['x', 'y', 'x_end', 'y_end']

    # Insert by position (.values) so the class ids stay attached to the
    # right rows even when data2 does not carry a default 0..n-1 index.
    converted.insert(0, 'Class_enc', (data2['Class_enc'] + 1).values)

    return converted


def to_dict(data, filenames):
    """Group the rows of ``data`` by the filename they belong to.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per box.  It is not modified.
    filenames : single-column DataFrame, Series or sequence
        The filename for each row of ``data``, matched by position.

    Returns
    -------
    dict
        Maps each filename to a list of row Series (one per box, in input
        order).  Each Series holds the columns of ``data`` without the
        filename.

    Raises
    ------
    ValueError
        If ``filenames`` has more than one column or its length differs from
        the number of rows in ``data``.
    """
    # Normalise to a plain list so rows are paired by position; pairing by
    # index label breaks as soon as `data` has a non-default index.
    if isinstance(filenames, pd.DataFrame):
        if filenames.shape[1] != 1:
            raise ValueError('filenames must have exactly one column')
        names = filenames.iloc[:, 0].tolist()
    elif isinstance(filenames, pd.Series):
        names = filenames.tolist()
    else:
        names = list(filenames)

    if len(names) != len(data):
        raise ValueError('got {} filenames for {} rows'.format(len(names), len(data)))

    datac = data.copy()
    datac['Filename'] = names

    mydict = {}
    for position, currentid in enumerate(names):
        # Slice off the trailing Filename column, keeping the row as before.
        currentvalue = datac.iloc[position, :-1]
        mydict.setdefault(currentid, []).append(currentvalue)

    return mydict

In [18]:
# Normalised centre/size pipeline: convert the boxes, group them per image
# file, then flatten every grouped row into a plain list of values.
# NOTE(review): iter_dict is defined in a later cell (In [17]); on a fresh
# kernel that cell has to be run before this one.
conv = convert_darknet2(data2, shapes)
conv_dict = to_dict(conv, short_names)
new = iter_dict(conv_dict)



In [19]:
# Display the per-file grouping built above (this renders the entire dict).
conv_dict

{'ALB/img_04102.jpg': [Class_enc           0
  x            0.344531
  y            0.647917
  width        0.139063
  height       0.126389
  Name: 1350, dtype: object], 'ALB/img_01314.jpg': [Class_enc           0
  x            0.519531
  y            0.165278
  width        0.301563
  height       0.180556
  Name: 442, dtype: object], 'ALB/img_05674.jpg': [Class_enc            0
  x             0.263672
  y             0.476667
  width        0.0835938
  height        0.225333
  Name: 1784, dtype: object], 'YFT/img_07217.jpg': [Class_enc           6
  x            0.687888
  y            0.453113
  width         0.42387
  height       0.411742
  Name: 4293, dtype: object], 'ALB/img_03706.jpg': [Class_enc           0
  x            0.345313
  y            0.642361
  width        0.096875
  height       0.179167
  Name: 1236, dtype: object, Class_enc           0
  x            0.107031
  y            0.803472
  width          0.0625
  height       0.304167
  Name: 1237, dtype: object]

In [7]:
# Corner-format pipeline (presumably for Faster R-CNN, going by the function
# name): integer boxes with class ids shifted by one, grouped per image file
# and flattened to plain lists.
# The three steps below were commented out, which left `faster_dict` undefined
# on a fresh kernel.
# NOTE(review): iter_dict is defined in a later cell (In [17]); on a fresh
# kernel that cell has to be run before this one.
conv_faster = convert_faster(data2)
conv_faster = conv_faster.astype('int64')
faster_dict = to_dict(conv_faster, short_names)
new_faster = iter_dict(faster_dict)

In [17]:
def iter_dict(dictionary):
    """Return a copy of ``dictionary`` with every row turned into a list.

    Each value of ``dictionary`` is an iterable of rows; every row is itself
    iterated and its items collected into a plain list.  Keys and the order
    of rows are kept.
    """
    return {
        name: [list(row) for row in rows]
        for name, rows in dictionary.items()
    }




In [None]:
def save_testnames(test_dir='/home/w/DS_Projects/Kaggle/Nature Conservancy/test_stg1/test_stg1/',
                   path='/home/w/DS_Projects/Kaggle/Nature Conservancy/Scripts/BoundingBoxes_Liu Weijie/',
                   list_name='2017_test.txt'):
    """Append one path per test image to a text file.

    Parameters
    ----------
    test_dir : str
        Directory whose entries are listed.
    path : str
        Directory the list file is written to.  It is also the prefix put in
        front of every listed name.
    list_name : str
        Name of the list file inside ``path``.

    The file is opened in append mode, so calling this twice writes every
    entry twice.  Names are written in sorted order so the output does not
    depend on the order ``os.listdir`` happens to return.
    """
    test_names = sorted(os.listdir(test_dir))

    # NOTE(review): each entry is prefixed with `path` (the output directory),
    # not with `test_dir` where the images were listed -- confirm this is
    # intended.
    full_test = [os.path.join(path, name) for name in test_names]

    with open(os.path.join(path, list_name), "a+") as myfile:
        for line in full_test:
            myfile.write(line + '\n')

    return

# Write the test-image list (the file is opened in append mode, so re-running
# this cell adds the entries again).
save_testnames()