In [160]:
import pandas as pd
from sklearn import preprocessing
import cv2
from PIL.Image import fromarray
from random import uniform
import json
import numpy as np
import os
from shutil import copy
from tqdm.notebook import tqdm
import magic
import re
import matplotlib.pyplot as plt

In [3]:
# train = pd.read_csv('./data/train.csv')
# train.species.replace({"globis": "short_finned_pilot_whale",
#                           "pilot_whale": "short_finned_pilot_whale",
#                           "kiler_whale": "killer_whale",
#                           "bottlenose_dolpin": "bottlenose_dolphin"}, inplace=True)

# enc = preprocessing.LabelEncoder()
# train['Y'] = enc.fit_transform(train['individual_id'])
# train.to_csv('/content/happywhale/csv/train.csv', index=False)

In [None]:
def replace_nan(x):
    """Map every float input to ``np.nan``; return anything else unchanged.

    Used on the ``box1`` column, where a float cell is treated as
    "no box" and every other cell is passed through as-is.
    """
    return np.nan if isinstance(x, float) else x

def get_box(x):
    """Extract the first box from a JSON-encoded list of boxes.

    Parameters
    ----------
    x : str
        JSON text decoding to a list whose first element is a sequence of
        coordinates, e.g. ``'[[1, 2, 3, 4], ...]'``. Non-string input
        (such as a NaN cell) is treated as "no box".

    Returns
    -------
    str or float
        The first box's coordinates joined by single spaces, or ``np.nan``
        when there is no usable first box.
    """
    if not isinstance(x, str):
        # A missing CSV cell arrives as a float NaN, which json.loads rejects.
        return np.nan

    bbox = json.loads(x)
    if len(bbox) == 0:
        return np.nan

    first = bbox[0]
    # NaN never compares equal to anything (including itself), so an
    # equality test against np.nan cannot detect it; check explicitly.
    if first is None or (isinstance(first, float) and np.isnan(first)):
        return np.nan

    return ' '.join(str(i) for i in first)


def get_conf(x):
    """Return the first entry of a JSON-encoded list of confidences.

    The result is ``np.nan`` when the decoded list is empty or when its
    first entry is NaN; otherwise the first entry is returned as decoded.
    """
    scores = json.loads(x)
    # Short-circuit keeps scores[0] from being touched on an empty list.
    missing = len(scores) == 0 or np.isnan(scores[0])
    return np.nan if missing else scores[0]


def wipe_low_conf(row):
    """Clear both boxes on a row whose ``conf`` is low and ``box1`` is a float.

    When ``row['conf'] < 0.25`` and ``row['box1']`` is a float (the form a
    missing box takes), ``box1`` and ``box2`` are both set to ``np.nan``.
    The row is modified in place and returned.
    """
    low_confidence = row['conf'] < 0.25
    if low_confidence and isinstance(row['box1'], float):
        for column in ('box1', 'box2'):
            row[column] = np.nan
    return row

def get_shape(x):
    """Return the size of image file ``x`` as the string ``'<w> <h>'``.

    The size is parsed from the libmagic description of ``src + x`` (``src``
    is the module-level image directory): the first ``<digits>x<digits>``
    token whose left-hand number has at least two digits.
    NOTE(review): the two numbers are presumably width and height in that
    order, as libmagic prints them - confirm for the image formats in use.

    Raises
    ------
    ValueError
        If the description contains no such size token.
    """
    path = src + x
    description = magic.from_file(path)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'(\d{2,})x(\d+)', description)
    if match is None:
        raise ValueError(f'no image size found for {path!r}: {description!r}')
    w, h = match.groups()
    return f'{w} {h}'

def get_absolute_box(row):
    """Rescale the pixel box in ``row['box']`` to fractions of the image size.

    Despite the name, the returned coordinates are relative: x values are
    divided by the first number of the ``<digits>x<digits>`` size token in
    the libmagic description of ``src + row['image']`` and y values by the
    second, each rounded to 4 decimals.

    Returns
    -------
    str or float
        ``'x0 y0 x1 y1'`` with the rescaled values, or ``np.nan`` when
        ``row['box']`` is a float (i.e. missing).

    Raises
    ------
    ValueError
        If no size token can be found for the image.
    """
    if isinstance(row['box'], float):
        return np.nan

    path = src + row['image']
    description = magic.from_file(path)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'(\d{2,})x(\d+)', description)
    if match is None:
        raise ValueError(f'no image size found for {path!r}: {description!r}')
    w, h = (int(g) for g in match.groups())

    box = [int(i) for i in row['box'].split()]
    x0 = round(box[0] / w, 4)
    y0 = round(box[1] / h, 4)
    x1 = round(box[2] / w, 4)
    y1 = round(box[3] / h, 4)

    return f'{x0} {y0} {x1} {y1}'


def get_box_distance(row):
    """Return a size-normalised distance between ``box1`` and ``box2``.

    Each of the four coordinate differences is taken in absolute value,
    divided by the matching image dimension (x by w, y by h), rounded to
    4 decimals, and the four terms are summed.

    Parameters
    ----------
    row : mapping
        Must provide ``box1`` and ``box2`` (space-separated integer strings,
        or a float when missing) and ``shape`` in the ``'<w> <h>'`` form
        written by ``get_shape``.

    Returns
    -------
    float
        The summed distance, or ``np.nan`` if either box is missing.
    """
    if isinstance(row['box1'], float) or isinstance(row['box2'], float):
        return np.nan

    box1 = [int(i) for i in row['box1'].split()]
    box2 = [int(i) for i in row['box2'].split()]
    # get_shape stores the first size number first ('<w> <h>'), the same
    # order get_absolute_box divides by. Unpacking it as (h, w) would scale
    # x deltas by the height and y deltas by the width.
    # NOTE(review): thresholds applied to this distance may have been tuned
    # against the previously swapped values - re-check them.
    w, h = (int(v) for v in row['shape'].split())

    x0 = round(abs(box1[0] - box2[0]) / w, 4)
    y0 = round(abs(box1[1] - box2[1]) / h, 4)
    x1 = round(abs(box1[2] - box2[2]) / w, 4)
    y1 = round(abs(box1[3] - box2[3]) / h, 4)

    return x0 + y0 + x1 + y1

def wipe_high_dist(row):
    """Clear both boxes on a row whose ``dst`` exceeds 0.6.

    Rows with a NaN ``dst`` are left alone, because the comparison is False
    for NaN. The row is modified in place and returned.
    """
    if not row['dst'] > 0.6:
        return row
    row['box1'] = row['box2'] = np.nan
    return row


def get_final_box(row):
    """Blend ``box1`` and ``box2`` into one box, weighting them 0.75 / 0.25.

    Each coordinate of the result is the rounded weighted average of the
    matching coordinates of the two boxes. Returns ``np.nan`` when either
    box is a float (i.e. missing), otherwise the string ``'x0 y0 x1 y1'``.
    """
    if any(isinstance(row[column], float) for column in ('box1', 'box2')):
        return np.nan

    primary = [int(value) for value in row['box1'].split()]
    secondary = [int(value) for value in row['box2'].split()]
    weights = [0.75, 0.25]

    blended = [
        round(np.average([primary[k], secondary[k]], weights=weights))
        for k in range(4)
    ]
    x0, y0, x1, y1 = blended
    return f'{x0} {y0} {x1} {y1}'

In [74]:
# Build the training table: combine the two box sources, drop the images
# listed in deleted.txt, reconcile the two boxes per image and write train_.csv.

# get_shape / get_absolute_box read this module-level `src`, so it must point
# at the directory holding the images of the frame being processed.
src = '/home/kutsenko/kaggle/data/train_images/'

# Primary boxes: exposed as `box1`; float cells are mapped to NaN.
train2 = pd.read_csv('/home/kutsenko/kaggle/data/train2.csv')
train2 = train2.rename(columns={'box': 'box1'})
train2['box1'] = train2['box1'].apply(replace_nan)

# Secondary boxes: `bbox` and `conf` are JSON-encoded lists; keep only the
# first entry of each and key the rows by the image file name.
train3 = pd.read_csv('/home/kutsenko/kaggle/data/train3.csv')

train3['bbox'] = train3['bbox'].apply(get_box)
train3['conf'] = train3['conf'].apply(get_conf)
train3['image'] = train3['image_path'].apply(lambda x: x.split('/')[-1])
train3 = train3[['image', 'bbox', 'conf']].rename(columns={'bbox': 'box2'})

train2 = train2.merge(train3, on='image')

with open('/home/kutsenko/kaggle/happywhale/data/deleted.txt', 'r') as f:
    deleted = [line.strip() for line in f]

# .copy(): the filtered frame gets new columns assigned below; an explicit
# copy makes it independent of the pre-filter frame and avoids
# SettingWithCopyWarning on pandas versions without copy-on-write.
train2 = train2[~train2['image'].isin(deleted)].copy()

train2['shape'] = train2['image'].apply(get_shape)

train2 = train2.apply(wipe_low_conf, axis=1)


train2['dst'] = train2.apply(get_box_distance, axis=1)
# dst_frame = train2[~train2['dst'].isna()].sort_values('dst', ascending=False)
train2 = train2.apply(wipe_high_dist, axis=1)
train2['box'] = train2.apply(get_final_box, axis=1)

with open('/home/kutsenko/kaggle/happywhale/data/json/target_to_id.json', 'r') as f:
    target_to_id = json.load(f)
# The JSON maps key -> target; invert it to target -> int(key).
target_to_id = {target: int(key) for key, target in target_to_id.items()}

# .copy() for the same reason as above: `Y` and `box` are assigned next.
train2 = train2[['image', 'individual_id', 'box']].copy()
train2['Y'] = train2['individual_id'].apply(lambda x: target_to_id[x])

# Final step: rescale the pixel boxes to fractions of the image size.
train2['box'] = train2.apply(get_absolute_box, axis=1)
train2.to_csv('/home/kutsenko/kaggle/happywhale/data/train_.csv', index=False)

In [295]:
# Reload the table from the train_.csv path that the training pipeline cell writes.
train = pd.read_csv('/home/kutsenko/kaggle/happywhale/data/train_.csv')

In [301]:
# Write `train` back to the same train_.csv path it was loaded from.
# NOTE(review): no visible cell modifies `train` between the load and this
# write - presumably the edits happened in cells that no longer exist; confirm.
train.to_csv('/home/kutsenko/kaggle/happywhale/data/train_.csv', index=False)

In [43]:
# low_conf_ids = train2[(train2['box1'].isna()) & ((train2['conf'] < 0.22))].index
# low_conf = train2[(train2['box1'].isna()) & ((train2['box2'].isna()) | (train2['conf'] < 0.02))]

In [235]:
# Index labels of the rows in `dst_frame` whose `dst` exceeds 0.54.
# NOTE(review): in the visible notebook `dst_frame` is only assigned in
# commented-out lines, so on a fresh kernel this raises NameError unless it
# is defined somewhere not shown here.
indexs_no_box = dst_frame[dst_frame['dst'] > 0.54].index

In [291]:
# Random sample of 200 rows that have a non-null `box`, used by the drawing loops below.
view_sample = train2.dropna(subset=['box']).sample(200)

In [None]:
# Draw both candidate boxes on every sampled image and save the result for
# visual inspection: box1 uses colour (255, 0, 0), box2 uses (0, 255, 0).
# Images loaded with cv2.imread are in BGR channel order.
for i, row in view_sample.iterrows():
    img_n = row['image']
    img = cv2.imread(src + img_n)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # instead of raising; skip it rather than fail inside cv2.rectangle.
        print(f'skipped unreadable image: {src + img_n}')
        continue
    for column, colour in (('box1', (255, 0, 0)), ('box2', (0, 255, 0))):
        box = [int(v) for v in row[column].split()]
        cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), colour, 2)
    cv2.imwrite('/home/kutsenko/kaggle/view3/' + img_n, img)
    
    

In [294]:
# Draw the `box` column's rectangle on every sampled image with colour
# (0, 255, 0) and save the result for visual inspection.
for i, row in view_sample.iterrows():
    img_n = row['image']
    img = cv2.imread(src + img_n)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # instead of raising; skip it rather than fail inside cv2.rectangle.
        print(f'skipped unreadable image: {src + img_n}')
        continue
    box = [int(v) for v in row['box'].split()]
    cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
    cv2.imwrite('/home/kutsenko/kaggle/view3/' + img_n, img)
    
    

In [None]:
# sample = na_images.iloc[i]
# # sample = train2[train2['image'] == 'db571a3686e9ae.jpg'].iloc[0]
# img_n = sample['image']
# print(i, img_n)
# box = sample['box1']
# img = cv2.imread(src + img_n)
# h, w, c = img.shape
# if isinstance(box, str):
#     box = [int(i) for i in box.split()]
#     cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), (255,0,0), 2)
#     ratio = round(uniform(0, 1), 4)
#     x0 = round(round(box[0] / w * ratio, 4) * w)
#     y0 = round(round(box[1] / h  * ratio, 4) * h)
#     x1 = round(round(1 - (1 - box[2] / w)  * ratio, 4) * w)
#     y1 = round(round(1 - (1 - box[3] / h)  * ratio, 4) * h)
#     print('ratio', ratio)
#     cv2.rectangle(img, (x0, y0), (x1, y1), (0,255,255), 2)
    
# else:
#     print('box none')

# i += 1 
# fromarray(img)


In [47]:
# for img_n in low_conf['image']:
#     copy(src + img_n, '/home/kutsenko/kaggle/view2/' + img_n)

# low_conf_origin = low_conf['image'].tolist()
# low_conf_clean = os.listdir('/home/kutsenko/kaggle/view2')
# to_del2 = list(set(low_conf_origin) - set(low_conf_clean))
# to_del2 = list(set(to_del2))
# len(to_del2)

27

In [43]:
# na_images = train2[(train2['box1'].isna()) & (train2['box2'].isna())]
# na_images = na_images.reset_index(drop=True)

# na_images_origin = na_images['image'].tolist()

# na_images_clean = os.listdir('/home/kutsenko/kaggle/view')
# to_del = list(set(na_images_origin) - set(na_images_clean))
# to_del.append('cd5fe465c60cb9.jpg')
# to_del.append('083a0fee112e3c.jpg')
# to_del.append('f7942e041d9963.jpg')
# to_del.append('9f94de1a3c768b.jpg')
# to_del.append('bc6c01a7baf94b.jpg')
# to_del.append('bb875ffcb8d064.jpg')
# to_del = list(set(to_del))
# len(to_del)


# with open('/home/kutsenko/kaggle/happywhale/data/deleted.txt', 'w') as f:
#     for i in deleted:
#         f.write(i + '\n')

In [320]:
# Point the module-level `src` (read by get_shape and get_absolute_box) at the test images.
src = '/home/kutsenko/kaggle/data/test_images/'

In [302]:
# Image directory used by the helpers that read files from `src`.
src = '/home/kutsenko/kaggle/data/test_images/'

# Load the primary test boxes, expose them as `box1` and map float cells to NaN.
test2 = pd.read_csv('/home/kutsenko/kaggle/data/test2.csv').rename(columns={'box': 'box1'})
test2 = test2.assign(box1=test2['box1'].apply(replace_nan))

In [311]:
# Load the secondary test boxes: keep the first entry of the JSON-encoded
# `bbox` / `conf` lists, derive the image file name from `image_path`, and
# expose the box as `box2`.
test3 = (
    pd.read_csv('/home/kutsenko/kaggle/data/test3.csv')
    .assign(
        bbox=lambda df: df['bbox'].apply(get_box),
        conf=lambda df: df['conf'].apply(get_conf),
        image=lambda df: df['image_path'].apply(lambda x: x.split('/')[-1]),
    )
    .loc[:, ['image', 'bbox', 'conf']]
    .rename(columns={'bbox': 'box2'})
)


In [312]:
# Join the two box tables on `image` (pandas' default inner join).
# The result overwrites `test2`, so re-running this cell merges `test3` in again.
test2 = test2.merge(test3, on='image')

In [319]:
# Show the merged row(s) for one specific image file.
test2[test2.image == '42fb13a2a6e9d3.jpg']

Unnamed: 0,image,predictions,box1,box2,conf
7453,42fb13a2a6e9d3.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,1476 1125 2027 1253,1478 1101 2057 1245,0.19434


In [322]:
# Display the merged test table.
test2

Unnamed: 0,image,predictions,box1,box2,conf
0,000110707af0ba.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,329 874 2934 1271,93 807 3546 1377,0.154790
1,0006287ec424cb.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,1039 1636 1833 1875,772 1623 2220 1885,0.113400
2,000809ecb2ccad.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,787 787 1541 948,382 772 1970 989,0.082519
3,00098d1376dab2.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,77 228 2029 714,31 222 1939 713,0.608400
4,000b8d89c738bd.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,4 20 289 317,0 12 293 317,0.854980
...,...,...,...,...,...
27951,fff6ff1989b5cd.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,0 31 1225 524,32 20 1213 532,0.568850
27952,fff8fd932b42cb.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,5 44 888 836,0 3 889 835,0.769040
27953,fff96371332c16.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,,285 848 3133 1218,0.166140
27954,fffc1c4d3eabc7.jpg,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,1477 1178 2413 1422,1413 1173 2386 1444,0.142700


In [None]:
# Record each test image's size string via get_shape (which reads the file
# at `src` + image name), then apply wipe_low_conf to every row.
# NOTE(review): the os.path.isfile check elsewhere in this notebook returned
# False for 42fb13a2a6e9d3.jpg, so get_shape may fail on that row - confirm
# how missing files should be handled before running this cell.
test2['shape'] = test2['image'].apply(get_shape)
test2 = test2.apply(wipe_low_conf, axis=1)

In [321]:
# Check whether this test image exists on disk (the recorded output is False).
os.path.isfile('/home/kutsenko/kaggle/data/test_images/42fb13a2a6e9d3.jpg')

False

In [None]:
# NOTE(review): this cell repeats the tail of the training pipeline cell
# verbatim and still operates on `train2`, although the preceding cells load
# `test2` and point `src` at test_images. Presumably it was meant to be
# adapted for the test set - confirm before running:
#   * get_absolute_box reads images from `src`, which now names the test
#     image directory;
#   * if the training pipeline cell has already run, `train2` was reduced to
#     image / individual_id / box / Y there, so the box1 / box2 / shape
#     columns that get_box_distance reads are no longer present.
train2['dst'] = train2.apply(get_box_distance, axis=1)
# dst_frame = train2[~train2['dst'].isna()].sort_values('dst', ascending=False)
train2 = train2.apply(wipe_high_dist, axis=1)
train2['box'] = train2.apply(get_final_box, axis=1)

# Load the id mapping and invert it to target -> int(key).
with open('/home/kutsenko/kaggle/happywhale/data/json/target_to_id.json', 'r') as f:
    target_to_id = json.load(f)
target_to_id = {target_to_id[key]: int(key) for key in target_to_id}

# Keep only the output columns and attach the integer label `Y`.
train2 = train2[['image', 'individual_id', 'box']]
train2['Y'] = train2['individual_id'].apply(lambda x: target_to_id[x])

# Rescale boxes to fractions of the image size and overwrite train_.csv.
train2['box'] = train2.apply(get_absolute_box, axis=1)
train2.to_csv('/home/kutsenko/kaggle/happywhale/data/train_.csv', index=False)