# Bees vs Wasps : Generate labels.csv for the observations.

#### Imports

In [9]:
from pathlib import Path
import pandas as pd
import numpy as np
import random

#### Constants

In [10]:
# Root folder of the curated image set; the 'path' column of labels.csv is relative to it.
p_base=Path('f:/kaggle/bee_vs-wasp/manual_curation_pass2/')

# Sub-folders of p_base, one per image source.
p_bee1='bee1'  # well curated bees; 'photo_quality' = 1
p_bee2='bee2'  # bees from a bee-counting camera, probably easy to overfit on these; 'photo_quality'=0
p_wasp1='wasp1' # well curated wasps 'photo_quality'=1
p_wasp2='wasp2' # less-well curated wasps 'photo_quality' = 0
p_insect='other_insect' # some other insects but not bees and not wasps, 'photo_quality'=1
# NOTE(review): the execution cell passes photo_quality=0 for this folder, not 1 -- confirm which is intended.
p_noinsect='other_noinsect' # random photos, no insects there 'photo_quality'=1

# Column layout of the labels table (a 'label' column is added later, in the execution cell).
columns = ['id', 'path', 'is_bee', 'is_wasp', 'is_otherinsect','is_other','photo_quality', 'is_validation', 'is_final_validation']

validation_ratio = 0.3 # fraction of all photos held out of training (hyperparameter-tuning validation plus final test)
final_validation_ratio = 0.5 # fraction of the held-out photos reserved for the final test (that is, do not use these for hyperparameter tuning)

# Fixed seed for the random validation split drawn in row_adder().
random.seed(99)


#### Helper functions

In [11]:
def row_maker(idx=0, path=None, is_bee=False, is_wasp=False, is_otherinsect = False, is_other = False, photo_quality = 0, is_validation=False, is_final_validation=False, base_path=None):
    """Build a one-row DataFrame describing a single image.

    Args:
        idx: Numeric id of the observation; stored in the 'id' column and
            also used as the row's index label.
        path: ``pathlib.Path`` of the image. Required in practice: it must
            be located under ``base_path``.
        is_bee, is_wasp, is_otherinsect, is_other: Class flags, stored as 0/1.
        photo_quality: Curation quality marker, stored as an integer.
        is_validation: True when the image belongs to the tuning validation set.
        is_final_validation: True when the image belongs to the final test set.
        base_path: Folder the stored path is made relative to. Defaults to
            the module-level ``p_base``.

    Returns:
        A ``pandas.DataFrame`` with exactly one row.

    Raises:
        ValueError: If ``path`` is not located under ``base_path``
            (raised by ``Path.relative_to``).
    """
    if base_path is None:
        base_path = p_base
    # Builtin int() replaces np.int, which was only an alias for it and has
    # been removed from recent NumPy releases.
    observation = pd.DataFrame(
        data={
            'id': int(idx),
            # str() of a Path uses the platform's separator (backslash on Windows).
            'path': str(path.relative_to(base_path)),
            'is_bee': int(is_bee),
            'is_wasp': int(is_wasp),
            'is_otherinsect': int(is_otherinsect),
            'is_other': int(is_other),
            'photo_quality': int(photo_quality),
            'is_validation': int(is_validation),
            'is_final_validation': int(is_final_validation),
        },
        index=[idx],
    )
    return observation
 

def row_adder(dataset_labels, source_path, is_bee=False, is_wasp=False, is_otherinsect = False, is_other = False, photo_quality = 0):
    """Append one labelled row per entry of a source folder.

    Every entry found directly inside ``p_base / source_path`` gets the next
    value of the module-level counter ``idx`` as its id, the class flags and
    photo quality passed in, and a randomly drawn split: with probability
    ``validation_ratio`` it is held out of training, and a held-out entry
    goes to the final test set with probability ``final_validation_ratio``
    (otherwise to the tuning validation set).

    Args:
        dataset_labels: DataFrame collected so far.
        source_path: Folder name relative to ``p_base``.
        is_bee, is_wasp, is_otherinsect, is_other: Class flags applied to
            every entry of the folder.
        photo_quality: Quality marker applied to every entry of the folder.

    Returns:
        A DataFrame holding the previous rows followed by the new ones. When
        the folder is empty, ``dataset_labels`` is returned unchanged.
    """
    global idx
    new_rows = []
    # iterdir() yields entries in arbitrary order; sorting keeps the id
    # assignment and the seeded random split the same on every filesystem.
    for thispath in sorted((p_base/source_path).iterdir()):
        idx = idx + 1
        is_validation = False
        is_final_validation = False
        if random.random() < validation_ratio:
            if random.random() < final_validation_ratio:
                is_final_validation = True
            else:
                is_validation = True
        new_rows.append(
            row_maker(
                idx,
                path=thispath,
                is_bee=is_bee,
                is_wasp=is_wasp,
                is_otherinsect=is_otherinsect,
                is_other=is_other,
                photo_quality=photo_quality,
                is_validation=is_validation,
                is_final_validation=is_final_validation,
            )
        )
    if not new_rows:
        return dataset_labels
    # DataFrame.append no longer exists in pandas 2.x; a single concat also
    # avoids re-copying the whole table once per image.
    return pd.concat([dataset_labels, *new_rows])

#### Variables

In [12]:
# Empty labels table with the expected column layout; row_adder() fills it.
dataset_labels = pd.DataFrame(columns=columns)
# The 'id' index is set only once all rows are collected: set_index() returns
# a new frame, so calling it here without keeping the result had no effect.
# Running image counter; row_adder() increments it through `global idx`.
idx = 0

#### Execution

In [8]:
# Add each source folder in turn, with the class flag and quality marker it carries.
folder_specs = (
    (p_bee1, dict(is_bee=True, photo_quality=1)),
    (p_bee2, dict(is_bee=True, photo_quality=0)),
    (p_wasp1, dict(is_wasp=True, photo_quality=1)),
    (p_wasp2, dict(is_wasp=True, photo_quality=0)),
    (p_insect, dict(is_otherinsect=True, photo_quality=1)),
    (p_noinsect, dict(is_other=True, photo_quality=0)),
)
for folder, flags in folder_specs:
    dataset_labels = row_adder(dataset_labels, folder, **flags)
dataset_labels = dataset_labels.set_index('id')

# create the kind of label format that fastai expects
dataset_labels['label'] = ""
label_for_flag = (
    ('is_bee', "bee"),
    ('is_wasp', "wasp"),
    ('is_otherinsect', "insect"),
    ('is_other', "other"),
)
# Applied in this order, so a later flag overrides an earlier one on the same row.
for flag_column, label_name in label_for_flag:
    dataset_labels.loc[dataset_labels[flag_column] == 1, 'label'] = label_name


dataset_labels.to_csv('../labels.csv', header=True, index=True, index_label='id')

#### Review

In [6]:
# Show the last rows of the labels table as a quick sanity check.
dataset_labels.tail()

id,path,is_bee,is_wasp,is_otherinsect,is_other,photo_quality,is_validation,is_final_validation,label
11417,other_noinsect\581704.jpg,0,0,0,1,0,0,1,other
11418,other_noinsect\581873.jpg,0,0,0,1,0,0,1,other
11419,other_noinsect\581880.jpg,0,0,0,1,0,1,0,other
11420,other_noinsect\6778651038_294d392627_n.jpg,0,0,0,1,0,0,1,other
11421,other_noinsect\9695883931_d5efb955d2_m.jpg,0,0,0,1,0,0,0,other


In [7]:
# Pull the 0/1 flag columns out once as plain arrays.
bee_flags = dataset_labels.is_bee.values
wasp_flags = dataset_labels.is_wasp.values
insect_flags = dataset_labels.is_otherinsect.values
other_flags = dataset_labels.is_other.values
tuning_flags = dataset_labels.is_validation.values
final_flags = dataset_labels.is_final_validation.values

# Per-class totals: summing a 0/1 column counts the photos of that class.
n_bees = np.sum(bee_flags)
n_wasps = np.sum(wasp_flags)
n_insects = np.sum(insect_flags)
n_other = np.sum(other_flags)

# Split sizes: a training photo is one that is in neither held-out set.
in_final = final_flags == 1
n_training = np.sum((tuning_flags == 0) & (final_flags == 0))
n_tuning = np.sum(tuning_flags == 1)
n_final = np.sum(in_final)

print(f'we have:')
print(f' bees..........: {n_bees}')
print(f' wasps.........: {n_wasps}')
print(f' other insects.: {n_insects}')
print(f' other.........: {n_other}')
print(f'')
print(f'in that, there is:')
print(f' training photos : {n_training}')
print(f' hyperparameter tuning (1st level validation) photos : {n_tuning}')
print(f' final validation (brag about your result with these) photos : {n_final}')
print(f'')

# One final-test bee/wasp photo changes the reported score by this many percent.
valid_bees = np.sum(in_final & (bee_flags == 1))
valid_wasps = np.sum(in_final & (wasp_flags == 1))
result_resolution = 100*1.0/(valid_bees+valid_wasps)
print(f'In the final validation, there is {valid_bees} bees and {valid_wasps} wasps, meaning that the resolution of the result is {result_resolution:0.2f}%')


we have:
 bees..........: 3183
 wasps.........: 4943
 other insects.: 2439
 other.........: 856

in that, there is:
 training photos : 7939
 hyperparameter tuning (1st level validation) photos : 1719
 final validation (brag about your result with these) photos : 1763

In the final validation, there is 504 bees and 753 wasps, meaning that the resolution of the result is 0.08%


Have fun!