In [40]:
import json
import os
import pickle
import cv2
import numpy as np 
import pandas as pd
from PIL import Image, ImageFile, ImageFont, ImageDraw
from clahe import CLAHE, convert_dir_to_clahe
import tqdm
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
from loaddata import loaddets_md3, loaddata, load_merged, loaddets_md4, load_testinfo

In [42]:
from exporter import Exporter

In [43]:
exporter = Exporter()

In [44]:
ANNOTATIONS_PATH = "metadata/"

SIZES = [(256,256), (380,380)]

TRAIN_PATHS = ["train/", "train_clahe/"]
TEST_PATHS = ["test/", "test_clahe/"]

LOWER_THRESHOLD = 0.5

In [45]:
df_dets_md3 = loaddets_md3()

In [46]:
df_anno, df_categories, df_images = loaddata()

In [47]:
testinfo = load_testinfo()

In [48]:
testinfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60214 entries, 0 to 60213
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   image_id  60214 non-null  object
 1   seq_id    60214 non-null  object
dtypes: object(2)
memory usage: 941.0+ KB


In [49]:
df_dets_train_md4, df_dets_test_md4 = loaddets_md4()

In [50]:
df_dets_md3.columns

Index(['detections', 'image_id', 'max_detection_conf', 'nr_boxes'], dtype='object')

In [51]:
df_dets_train_md4.head()

Unnamed: 0,file,max_detection_conf,detections,image_id,nr_boxes
0,/home/beuving/kaggle/train/8b1f7ab6-21bc-11ea-...,0.0,[],8b1f7ab6-21bc-11ea-a13a-137349068a90,0
1,/home/beuving/kaggle/train/957ebe40-21bc-11ea-...,0.998,"[{'category': '1', 'conf': 0.998, 'bbox': [0.2...",957ebe40-21bc-11ea-a13a-137349068a90,8
2,/home/beuving/kaggle/train/97fe2070-21bc-11ea-...,0.999,"[{'category': '1', 'conf': 0.999, 'bbox': [0.4...",97fe2070-21bc-11ea-a13a-137349068a90,4
3,/home/beuving/kaggle/train/8fe37110-21bc-11ea-...,0.0,[],8fe37110-21bc-11ea-a13a-137349068a90,0
4,/home/beuving/kaggle/train/90a1b8d2-21bc-11ea-...,0.0,[],90a1b8d2-21bc-11ea-a13a-137349068a90,0


In [52]:
det_md3_images = list(df_dets_md3["image_id"])
train_images_md3 = list(df_images["image_id"])
test_images_md3 = list(set(det_md3_images) - set(train_images_md3))

In [53]:
train_images_md4 =  list(df_dets_train_md4["image_id"])
test_images_md4 = list(df_dets_test_md4["image_id"])

In [54]:
df_dets_train_md3 = df_dets_md3[df_dets_md3["image_id"].isin(train_images_md3)]
df_dets_test_md3 = df_dets_md3[df_dets_md3["image_id"].isin(test_images_md3)]

In [55]:
df_dets_train_md3["nr_boxes"].sum()

247460

In [56]:
df_dets_test_md3["nr_boxes"].sum()

48525

In [57]:
df_dets_train_md4["nr_boxes"].sum()

303171

In [58]:
df_dets_test_md4["nr_boxes"].sum()

54329

In [59]:
df_dets_train_md3

Unnamed: 0,detections,image_id,max_detection_conf,nr_boxes
0,"[{'category': '1', 'bbox': [0.6529, 0.5425, 0....",905a3c8c-21bc-11ea-a13a-137349068a90,0.999,1
1,"[{'category': '1', 'bbox': [0.0147, 0.0, 0.985...",905a3fc0-21bc-11ea-a13a-137349068a90,0.696,1
2,[],905a420e-21bc-11ea-a13a-137349068a90,0.000,0
3,"[{'category': '1', 'bbox': [0.0, 0.4669, 0.185...",905a4416-21bc-11ea-a13a-137349068a90,1.000,7
4,"[{'category': '1', 'bbox': [0.0, 0.0494, 0.528...",905a579e-21bc-11ea-a13a-137349068a90,0.999,1
...,...,...,...,...
261568,"[{'category': '1', 'bbox': [0.2771, 0.1184, 0....",8ebf1582-21bc-11ea-a13a-137349068a90,0.669,1
261569,"[{'category': '1', 'bbox': [0.656, 0.4392, 0.2...",8ebf18e8-21bc-11ea-a13a-137349068a90,1.000,1
261570,"[{'category': '1', 'bbox': [0.3886, 0.7674, 0....",8ebf1f50-21bc-11ea-a13a-137349068a90,1.000,1
261571,"[{'category': '1', 'bbox': [0.3287, 0.1321, 0....",8ebf2284-21bc-11ea-a13a-137349068a90,1.000,2


In [60]:
# we add the path to the dataframe (copied from 00)
def add_path(df, path):
    out = df.copy()
    out["filename"] = df["image_id"].apply(lambda x: os.path.join(path, x + ".jpg"))
    out["path"] = path
    return out

df_train_md3_clahe = add_path(df_dets_train_md3, TRAIN_PATHS[1])
df_test_md3_clahe = add_path(df_dets_test_md3, TEST_PATHS[1])
df_train_md4_clahe = add_path(df_dets_train_md4, TRAIN_PATHS[1])
df_test_md4_clahe = add_path(df_dets_test_md4, TEST_PATHS[1])

In [61]:
print(df_train_md3_clahe.info())
print(df_train_md3_clahe.info())
print(df_train_md3_clahe.info())
print(df_train_md3_clahe.info())
df_train_md3_clahe.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203314 entries, 0 to 261572
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   detections          203314 non-null  object 
 1   image_id            203314 non-null  object 
 2   max_detection_conf  203314 non-null  float64
 3   nr_boxes            203314 non-null  int64  
 4   filename            203314 non-null  object 
 5   path                203314 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 10.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 203314 entries, 0 to 261572
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   detections          203314 non-null  object 
 1   image_id            203314 non-null  object 
 2   max_detection_conf  203314 non-null  float64
 3   nr_boxes            203314 non-null  int64  
 4   filenam

Unnamed: 0,detections,image_id,max_detection_conf,nr_boxes,filename,path
0,"[{'category': '1', 'bbox': [0.6529, 0.5425, 0....",905a3c8c-21bc-11ea-a13a-137349068a90,0.999,1,train_clahe/905a3c8c-21bc-11ea-a13a-137349068a...,train_clahe/
1,"[{'category': '1', 'bbox': [0.0147, 0.0, 0.985...",905a3fc0-21bc-11ea-a13a-137349068a90,0.696,1,train_clahe/905a3fc0-21bc-11ea-a13a-137349068a...,train_clahe/
2,[],905a420e-21bc-11ea-a13a-137349068a90,0.0,0,train_clahe/905a420e-21bc-11ea-a13a-137349068a...,train_clahe/
3,"[{'category': '1', 'bbox': [0.0, 0.4669, 0.185...",905a4416-21bc-11ea-a13a-137349068a90,1.0,7,train_clahe/905a4416-21bc-11ea-a13a-137349068a...,train_clahe/
4,"[{'category': '1', 'bbox': [0.0, 0.0494, 0.528...",905a579e-21bc-11ea-a13a-137349068a90,0.999,1,train_clahe/905a579e-21bc-11ea-a13a-137349068a...,train_clahe/


In [67]:
df_train_md3_clahe = df_train_md3_clahe.merge(df_anno, on="image_id")

In [69]:
df_test_md3_clahe["category_id"] = 0

In [71]:
# md3 run
#boxes_train_md4 = export_detections(df_dets_train_md4, SIZES[1])
box = exporter.export_detections(df_train_md3_clahe, "md3", SIZES[0])
exporter.to_csv()
box = exporter.export_detections(df_test_md3_clahe, "md3", SIZES[0])
exporter.to_csv()

In [114]:
boxes_train_md4.head()

Unnamed: 0,path,image_id,idx,filename
0,_train_bbox256x256/,957ebe40-21bc-11ea-a13a-137349068a90,1,_train_bbox256x256/957ebe40-21bc-11ea-a13a-137...
1,_train_bbox256x256/,957ebe40-21bc-11ea-a13a-137349068a90,2,_train_bbox256x256/957ebe40-21bc-11ea-a13a-137...
2,_train_bbox256x256/,957ebe40-21bc-11ea-a13a-137349068a90,3,_train_bbox256x256/957ebe40-21bc-11ea-a13a-137...
3,_train_bbox256x256/,957ebe40-21bc-11ea-a13a-137349068a90,4,_train_bbox256x256/957ebe40-21bc-11ea-a13a-137...
4,_train_bbox256x256/,957ebe40-21bc-11ea-a13a-137349068a90,5,_train_bbox256x256/957ebe40-21bc-11ea-a13a-137...


We want several datasets prepared, two cropped bounding boxes at 256 and one at 380
Also add the inaturalist data

# Main procressing

In [83]:
def convert_images(annotation):

    img_id = annotation["image_id"]
    is_train = True 
    
    try:
        detections = annotation["detections"]
    except:
        print(f"Passed {img_id}. There are no detection data.")
        return
    
    path_for_train = TRAIN_PATH + img_id + ".jpg"
    path_for_test = TEST_PATH + img_id + ".jpg"
    
    if os.path.exists(path_for_train):
        file_path = path_for_train
        is_train = True
        return
    elif os.path.exists(path_for_test):
        file_path = path_for_test
        is_train = False
    else:
        print(f"Passed {img_id}. There are no data.")
        return
    
    
    try:      
        img = Image.open(file_path)
    except:
        print(f"Passed {img_id}. Fail to open image.")
        print(f"pass {file_path}.")
        return
    
    for i, detection in enumerate(detections, 1):
        
        if detection["conf"] < threshold:
            continue

        if detection["category"] != "1":
            continue
            
        if len(detection) == 0:
            img = img.resize(size)
            save_image(img, img_id, 0, is_train)
            x_tot, x2_tot = calc_mean_and_var(img)
            x_tot_list.append(x_tot)
            x2_tot_list.append(x2_tot)
            
        else:
            crop_area = get_crop_area(detection["bbox"], img.size)
            img_croped = img.crop(crop_area).resize(size)
            save_image(img_croped, img_id, i, is_train)
        
            x_tot, x2_tot = calc_x_and_x2_tot(img_croped)
            x_tot_list.append(x_tot)
            x2_tot_list.append(x2_tot)

In [84]:
for i, annotation in tqdm.tqdm(df_dets.iterrows()):
    convert_images(annotation)

In [96]:
#image stats
img_avr =  np.array(x_tot_list).mean(0)
img_std =  np.sqrt(np.array(x2_tot_list).mean(0) - img_avr**2)
print('mean:',img_avr, ', std:', img_std)

mean: [0.33705012 0.33705231 0.33704654] , std: [0.22359819 0.22362261 0.22359884]


In [97]:
croped_train = {"id": img_ids_train, "idx":img_idx_train}
df_croped_train = pd.DataFrame(croped_train)
#df_train_annotation = pd.DataFrame(train_annotations["annotations"])

ValueError: arrays must all be same length

In [57]:
df_croped_train.head()

Unnamed: 0,id,idx
0,905a3c8c-21bc-11ea-a13a-137349068a90,1
1,905a4416-21bc-11ea-a13a-137349068a90,1
2,905a4416-21bc-11ea-a13a-137349068a90,2
3,905a4416-21bc-11ea-a13a-137349068a90,3
4,905a579e-21bc-11ea-a13a-137349068a90,1


In [85]:
croped_test = {"id": img_ids_test, "idx":img_idx_test}
df_croped_test = pd.DataFrame(croped_test)

In [86]:
df_croped_test["image_id"] = df_croped_test["id"] + "_" + df_croped_test["idx"].astype(str)
df_croped_test["category_id"] = 0

In [93]:
df_croped_test = df_croped_test.drop_duplicates()


In [91]:
print(len(df_croped_test))
df_croped_test.head()
#path,image_id,idx,filename,confidence

79348


Unnamed: 0,id,idx,image_id,category_id
0,915879a0-21bc-11ea-a13a-137349068a90,1,915879a0-21bc-11ea-a13a-137349068a90_1,0
1,91588116-21bc-11ea-a13a-137349068a90,1,91588116-21bc-11ea-a13a-137349068a90_1,0
2,9158a2f4-21bc-11ea-a13a-137349068a90,1,9158a2f4-21bc-11ea-a13a-137349068a90_1,0
3,9158aaa6-21bc-11ea-a13a-137349068a90,1,9158aaa6-21bc-11ea-a13a-137349068a90_1,0
4,9158f1a0-21bc-11ea-a13a-137349068a90,1,9158f1a0-21bc-11ea-a13a-137349068a90_1,0
