In [1]:
# Import the required modules.

import numpy as np
import pandas as pd
import json
import cv2
import os
from matplotlib import pyplot as plt
import matplotlib.patches as patches
from matplotlib import gridspec

from imutils import perspective
from imutils import contours
import imutils
import sqlite3

from tqdm.notebook import tqdm

%matplotlib inline

In [2]:
# Connect to the SQLite database.
conn = sqlite3.connect('E:\\Documents\\PostDoc\\know_fusion_labels\\know_fusion_labels.lbldb')

# Set a start point to remove the non-tag tool results.
preset = 5087

# Read in the zooniverse classifications.
data = pd.read_csv('astro-ecology-classifications.csv')[preset:]
data.head()

Unnamed: 0,classification_id,user_name,user_id,user_ip,workflow_id,workflow_name,workflow_version,created_at,gold_standard,expert,metadata,annotations,subject_data,subject_ids
5087,135872963,RossMcWhirter,1813215.0,1a400102c1fc2a1080f9,7916,Knowsley Safari rhino enclosure,67.165,2018-12-10 17:30:41 UTC,,,"{""source"":""api"",""session"":""dd0eff20c2f11dc45f8...","[{""task"":""T0"",""task_label"":""Use the tools belo...","{""26793143"":{""retired"":{""id"":23500958,""workflo...",26793143
5088,135872977,RossMcWhirter,1813215.0,1a400102c1fc2a1080f9,7916,Knowsley Safari rhino enclosure,67.165,2018-12-10 17:30:44 UTC,,,"{""source"":""api"",""session"":""dd0eff20c2f11dc45f8...","[{""task"":""T0"",""task_label"":""Use the tools belo...","{""26793399"":{""retired"":{""id"":23502221,""workflo...",26793399
5089,135873070,RossMcWhirter,1813215.0,1a400102c1fc2a1080f9,7916,Knowsley Safari rhino enclosure,67.165,2018-12-10 17:31:21 UTC,,,"{""source"":""api"",""session"":""dd0eff20c2f11dc45f8...","[{""task"":""T0"",""task_label"":""Use the tools belo...","{""26795947"":{""retired"":{""id"":23503609,""workflo...",26795947
5090,135873112,RossMcWhirter,1813215.0,1a400102c1fc2a1080f9,7916,Knowsley Safari rhino enclosure,67.165,2018-12-10 17:31:41 UTC,,,"{""source"":""api"",""session"":""dd0eff20c2f11dc45f8...","[{""task"":""T0"",""task_label"":""Use the tools belo...","{""26793713"":{""retired"":{""id"":23502754,""workflo...",26793713
5091,135873216,RossMcWhirter,1813215.0,1a400102c1fc2a1080f9,7916,Knowsley Safari rhino enclosure,67.165,2018-12-10 17:32:09 UTC,,,"{""source"":""api"",""session"":""dd0eff20c2f11dc45f8...","[{""task"":""T0"",""task_label"":""Use the tools belo...","{""26783731"":{""retired"":{""id"":23494735,""workflo...",26783731


In [3]:
# Open a set of lists to store import classification data.
classification_id = []
filename = []
tool_label = []
tool = []
x = []
y = []

# Run through each of the rows and extract every tagged coordinate and associated class.
for i in tqdm(range(len(data))):
    place = i+preset
    for j in range(len(json.loads(data.annotations[place])[0]['value'])):
        classification_id.append(data.classification_id[place])
        filename.append(json.loads(data.subject_data[place])[str(data.subject_ids[place])]['Filename'][:-4] + '.png')
        index = json.loads(data.annotations[place])[0]['value'][j]
        tool_label.append(index['tool_label'])
        tool.append(index['tool'])
        x.append(index['x'])
        y.append(index['y'])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=307282.0), HTML(value='')))




In [4]:
# Turn the lists into a pandas dataframe.
df = pd.DataFrame({'classification_id': classification_id, 'filename': filename, 'tool_label': tool_label, 'tool': tool, 'x': x, 'y': y}, columns=['classification_id','filename','tool_label','tool','x','y'])

# Get the list of unique filenames.
filenameuni = sorted(list(set(filename)))

In [5]:
# Perform some maintenance to change labels to be equal to names file.
df = df[df['tool'] < 3]
df.loc[df['tool'] == 0, 'tool'] = 4
df.loc[df['tool'] == 2, 'tool'] = 3
df.loc[df['tool'] == 1, 'tool'] = 2
df.sort_values(by=['filename'])
df.index = range(len(df))
df

Unnamed: 0,classification_id,filename,tool_label,tool,x,y
0,135872963,20180412_01_01_0185.png,Rhino,4,331.984375,278.000000
1,135872963,20180412_01_01_0185.png,Rhino,4,358.984375,280.000000
2,135873070,20180412_02_01_0116.png,Rhino,4,236.984375,264.000000
3,135873070,20180412_02_01_0116.png,Rhino,4,306.984375,273.000000
4,135873070,20180412_02_01_0116.png,Antelope,2,293.984375,335.000000
...,...,...,...,...,...,...
715635,203037303,20170717_01_01_0263.png,Antelope,2,481.698364,3.445114
715636,203037303,20170717_01_01_0263.png,Antelope,2,453.945312,6.397568
715637,203037303,20170717_01_01_0263.png,Antelope,2,443.316467,28.836182
715638,203037303,20170717_01_01_0263.png,Antelope,2,434.459106,3.445114


In [6]:
def midpoint(ptA, ptB):
    return ((ptA[0] + ptB[0]) * 0.5, (ptA[1] + ptB[1]) * 0.5)

In [7]:
count = 1

for n in tqdm(range(len(filenameuni))):

    targ = filenameuni[n]

    # Read in the images for creating bounding boxes from coordinate tags.
    img = cv2.imread(os.path.join('E:\\Documents\\PostDoc\\know_fusion_images', targ))

    # Convert into grayscale and into a numpy array.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = np.array(img)

    labs = df[df.filename == filenameuni[n]]
    labs.index = range(len(labs))
    
    # For each of the tags, find the brightest pixel near them.

    x_t = []
    y_t = []
    b_t = []

    cut = 255 - img

    for i in range(len(labs)):
    
        # Get the coordinates of the maximum pixel in a euclidean-distance weighted region around the tags.
    
        xx = np.repeat(np.power(range(640) - labs.x[i], 2.0)[np.newaxis,:], 512, 0)
    
        yy = np.repeat(np.power(range(512) - labs.y[i], 2.0)[np.newaxis,:], 640, 0).transpose()
    
        xy = np.sqrt(xx + yy)
    
        fin = 255 * (xy / np.max(xy))
    
        both = 0.9*fin + 0.1*cut
    
        x_c = np.where(both == np.amin(both))[0][0]
        y_c = np.where(both == np.amin(both))[1][0]

        x_t.append(y_c)
        y_t.append(x_c)
        b_t.append(img[x_c, y_c])
    
    labs['x_t'] = pd.Series(x_t)
    labs['y_t'] = pd.Series(y_t)
    labs['b_t'] = pd.Series(b_t)
    
    # For each classified image, work out the boxes for each of the tags.

    unique_class = np.unique(labs['classification_id'])

    for j in range(len(unique_class)):
        labset = labs[labs.classification_id == unique_class[j]]
        labset.sort_values(by=['x'])
        labset.index = range(len(labset))
    
        # Find the boxes.
    
        x_min = []
        x_max = []
        y_min = []
        y_max = []
    
        for k in range(len(labset)):
        
            x_cand_1 = labset.x_t[k]
            x_cand_2 = labset.x_t[k]
            y_cand_1 = labset.y_t[k]
            y_cand_2 = labset.y_t[k]
        
            x_min.append(x_cand_1 - 5)
            x_max.append(x_cand_2 + 5)
        
            y_min.append(y_cand_1 - 5)
            y_max.append(y_cand_2 + 5)
        
        labset['x_min'] = x_min
        labset['x_max'] = x_max
        labset['y_min'] = y_min
        labset['y_max'] = y_max

    img = cv2.imread(os.path.join('E:\\Documents\\PostDoc\\know_fusion_images', targ))
    
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = cv2.medianBlur(img,5)

    sizey, sizex = img.shape

    rang = 5
    ofs = 2

    th = cv2.adaptiveThreshold(img,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,rang,ofs)
    
    image = th
    image = cv2.bitwise_not(image)

    edged = cv2.Canny(image, 50, 100)
    edged = cv2.dilate(edged, None, iterations=1)
    edged = cv2.erode(edged, None, iterations=1)

    cnts, harchy = cv2.findContours(edged, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    #cnts = cnts[0] if imutils.is_cv2() else cnts[1]

    cv2.drawContours(th, cnts, 0, (0, 255, 0), -1)

    (cnts, _) = contours.sort_contours(cnts)

    areas = []

    box_x = []
    box_y = []
    box_w = []
    box_h = []

    for c in cnts:

        areas.append(cv2.contourArea(c))

        orig = img.copy()
        box = cv2.minAreaRect(c)
        box = cv2.cv.BoxPoints(box) if imutils.is_cv2() else cv2.boxPoints(box)
        box = np.array(box, dtype="int")

        box = perspective.order_points(box)

        (tl, tr, br, bl) = box
        (tltrX, tltrY) = midpoint(tl, tr)
        (blbrX, blbrY) = midpoint(bl, br)

        (tlblX, tlblY) = midpoint(tl, bl)
        (trbrX, trbrY) = midpoint(tr, br)

        x, y, w, h = cv2.boundingRect(c)
    
        box_x.append(x)
        box_y.append(y)
        box_w.append(w)
        box_h.append(h)
        
    areas = []

    box_x = []
    box_y = []
    box_w = []
    box_h = []

    for c in cnts:

        areas.append(cv2.contourArea(c))

        orig = img.copy()
        box = cv2.minAreaRect(c)
        box = cv2.cv.BoxPoints(box) if imutils.is_cv2() else cv2.boxPoints(box)
        box = np.array(box, dtype="int")

        box = perspective.order_points(box)

        (tl, tr, br, bl) = box
        (tltrX, tltrY) = midpoint(tl, tr)
        (blbrX, blbrY) = midpoint(bl, br)

        (tlblX, tlblY) = midpoint(tl, bl)
        (trbrX, trbrY) = midpoint(tr, br)

        x, y, w, h = cv2.boundingRect(c)
    
        box_x.append(x)
        box_y.append(y)
        box_w.append(w)
        box_h.append(h)
    
    # Turn the lists into a pandas dataframe.
    boxes = pd.DataFrame({'x': box_x, 'y': box_y, 'w': box_w, 'h': box_h}, columns=['x','y','w','h'])

    boxes['a'] = boxes['w'] * boxes['h']

    boxes = boxes[boxes['a'] < 1000]

    boxes['keep'] = 0

    boxes.index = range(len(boxes))

    # Reject boxes without more than one tag inside.

    for i in range(len(labs)):
        for j in range(len(boxes)):
            if (boxes['x'][j] <= labs.x_t[i] and (boxes['x'][j]+boxes['w'][j]) >= labs.x_t[i] and boxes['y'][j] <= labs.y_t[i] and (boxes['y'][j]+boxes['h'][j]) >= labs.y_t[i]):
                boxes['keep'][j] += 1        
            
    boxes = boxes[boxes['keep'] > 1]

    boxes.index = range(len(boxes))

    labs['empty'] = 0

    boxfin = []

    for i in range(len(labs)):
        boxlabs = boxes
        boxlabs['keep'] = 0
        for j in range(len(boxlabs)):
            if (boxlabs['x'][j] <= labs.x_t[i] and (boxlabs['x'][j]+boxlabs['w'][j]) >= labs.x_t[i] and boxlabs['y'][j] <= labs.y_t[i] and (boxlabs['y'][j]+boxlabs['h'][j]) >= labs.y_t[i]):
                boxlabs['keep'][j] = 1
            
        boxlabs = boxlabs[boxlabs['keep'] == 1]
    
        if boxlabs.empty:
            labs['empty'][i] = 1
        else:
            boxlabs = boxlabs[boxlabs['a'] == np.amax(boxlabs['a'].values)]
        
            boxlabs.index = range(len(boxlabs))
    
            boxlabs = boxlabs.iloc[[0]]
        
            boxfin.append(boxlabs)
    
    if len(boxfin) > 1:
        boxfin = pd.concat(boxfin)
    elif len(boxfin) == 1:
        boxfin = boxlabs
    else:
        continue
        
    boxfin.index = range(len(boxfin))
        
    labs = labs[labs['empty'] == 0]

    labs.index = range(len(labs))

    # Now that we have a dataframe of labels and a dataframe of associated maxsize boxes, compute unique and aggregate.

    boxuni = boxfin.drop_duplicates()

    boxuni.index = range(len(boxuni))

    tool_final = []

    tool_label_final = []

    for i in range(len(boxuni)):
    
        duplicates = (boxfin[(boxfin['x'] == boxuni['x'][i]) & (boxfin['y'] == boxuni['y'][i]) & (boxfin['w'] == boxuni['w'][i]) & (boxfin['h'] == boxuni['h'][i])].index.values)

        tool_final.append(labs.iloc[duplicates].tool.value_counts().idxmax())
    
        tool_label_final.append(labs.iloc[duplicates].tool_label.value_counts().idxmax())

    boxuni['tool_final'] = tool_final

    boxuni['tool_label_final'] = tool_label_final
    
    for i in range(len(boxuni)):
        
        c = conn.cursor()
        
        c.execute("SELECT image_id FROM images WHERE path = '../../know_fusion_images/" + targ + "'")
        
        result = c.fetchall()
        
        res = result[0][0]
        
        c.execute("INSERT INTO labels VALUES (" + str(count) + "," + str(res) + "," + str(boxuni['tool_final'][i]) + "," + str(boxuni['x'][i]) + "," + str(boxuni['y'][i]) + "," + str(boxuni['w'][i]) + "," + str(boxuni['h'][i]) + "," + str(count) + ")")
        
        count = count + 1
        
conn.commit()
        
conn.close()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=24128.0), HTML(value='')))

  labs['x_t'] = pd.Series(x_t)
  labs['y_t'] = pd.Series(y_t)
  labs['b_t'] = pd.Series(b_t)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labs['x_t'] = pd.Series(x_t)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labs['y_t'] = pd.Series(y_t)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labs['b_t'] = pd.Series(b_t)
A value is trying to be s


