In [1]:
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

def set_seed(seed=42):
    """Seed the random number generators used in this notebook.

    Seeds Python's built-in ``random`` module and NumPy's legacy global
    generator, and exports ``PYTHONHASHSEED``.

    Parameters
    ----------
    seed : int, default 42
        Value used for every generator.
    """
    # `random` is imported at the top of the notebook, so it must be seeded
    # too; otherwise any stdlib randomness stays non-deterministic.
    random.seed(seed)
    np.random.seed(seed)
    # NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it
    # here does not change hashing in the running kernel; it is only
    # inherited by processes spawned afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)


set_seed(42)

In [2]:
# Dataset root; every other path below is derived from it.
ROOT_DIR = '/home/workspace/happy-whale-and-dolphin'

# Folders holding the original (uncropped) images.
TRAIN_DIR = f"{ROOT_DIR}/train_images"
TEST_DIR = f"{ROOT_DIR}/test_images"

# CSV files with one bounding box per image, for each split and crop type.
train_fullbody_csv_dir = ROOT_DIR + "/train_fullbody.csv"
train_backfins_csv_dir = ROOT_DIR + "/train_backfins.csv"
test_fullbody_csv_dir = ROOT_DIR + "/test_fullbody.csv"
test_backfins_csv_dir = ROOT_DIR + "/test_backfins.csv"

In [3]:
def df_bbox(bbox_list, data_type="fullbody"):
    """Parse stringified bounding boxes into lists of four ints.

    Parameters
    ----------
    bbox_list : list
        One entry per row. Each entry is either a missing value (anything
        ``pd.isna`` reports as missing) or a string holding four integers
        inside square brackets, e.g. ``"[ 12  34  56  78]"``.
    data_type : {"fullbody", "backfins"}, default "fullbody"
        Which CSV the boxes come from. The value is validated; both kinds
        are parsed the same whitespace-tolerant way.

    Returns
    -------
    list of list of int
        Same length and order as ``bbox_list``. Missing entries become ``[]``;
        every other entry becomes a list of four ints.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values, or a token in a
        box string is not an integer.
    AssertionError
        If a box string does not contain exactly four numbers.
    """
    # Raising a plain string is itself a TypeError, which the old bare
    # `except` swallowed; validate up front with a real exception instead.
    if data_type not in ("fullbody", "backfins"):
        raise ValueError("data_type is not right")

    new_bboxes_list = []
    for bbox in tqdm(bbox_list):
        if pd.isna(bbox):
            new_bboxes_list.append([])
            continue
        # Treat brackets and commas as separators and split on any run of
        # whitespace, so the result does not depend on how many spaces the
        # writer used to pad the numbers.
        tokens = bbox.replace("[", " ").replace("]", " ").replace(",", " ").split()
        parsed = [int(token) for token in tokens]
        assert len(parsed) == 4, f"expected 4 bbox values, got {len(parsed)}: {bbox!r}"
        new_bboxes_list.append(parsed)
    assert len(bbox_list) == len(new_bboxes_list)
    return new_bboxes_list

# fullbody: crop images to the full-body bounding boxes

In [None]:
# Read the train full-body CSV and replace its stringified "bbox" column
# with the parsed lists returned by df_bbox.
train_fullbody_df = pd.read_csv(train_fullbody_csv_dir)
bbox_list = train_fullbody_df["bbox"].tolist()
train_fullbody_df["bbox"] = df_bbox(bbox_list, data_type="fullbody")
train_fullbody_df.head()

In [None]:
# Crop every training image to its full-body box and save the result.
train_fullbody_out_dir = f'{ROOT_DIR}/train_fullbody_images'
os.makedirs(train_fullbody_out_dir, exist_ok=True)  # imwrite does not create folders

for _, image, bbox, _ in tqdm(train_fullbody_df.itertuples(), total=len(train_fullbody_df)):
    # Always load the current image. Previously a row without a box reused
    # `img` from the previous iteration (or hit a NameError on the first row),
    # saving another image's crop under this file name.
    img = cv2.imread(f"{TRAIN_DIR}/{image}")
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        print(f"could not read {image}; skipped")
        continue
    if bbox != []:
        x1, y1, x2, y2 = bbox
        # Clamp at 0: a negative index would wrap around to the far edge.
        img = img[max(y1, 0):max(y2, 0), max(x1, 0):max(x2, 0)]
    # Rows without a box keep the uncropped image (presumably the intent,
    # since the write was always outside the `if` -- confirm).
    if img.size == 0:
        # A zero-area box cannot be encoded; report it instead of crashing.
        print(f"empty crop for {image} (bbox={bbox}); skipped")
        continue
    cv2.imwrite(f'{train_fullbody_out_dir}/{image}', img)

In [None]:
# Read the test full-body CSV and replace its stringified "bbox" column
# with the parsed lists returned by df_bbox.
test_fullbody_df = pd.read_csv(test_fullbody_csv_dir)
bbox_list = test_fullbody_df["bbox"].tolist()
test_fullbody_df["bbox"] = df_bbox(bbox_list, data_type="fullbody")
test_fullbody_df.head()

In [None]:
# Crop every test image to its full-body box and save the result.
test_fullbody_out_dir = f'{ROOT_DIR}/test_fullbody_images'
os.makedirs(test_fullbody_out_dir, exist_ok=True)  # imwrite does not create folders

for _, image, bbox, _ in tqdm(test_fullbody_df.itertuples(), total=len(test_fullbody_df)):
    # Always load the current image. Previously a row without a box reused
    # `img` from the previous iteration (or hit a NameError on the first row),
    # saving another image's crop under this file name.
    img = cv2.imread(f"{TEST_DIR}/{image}")
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        print(f"could not read {image}; skipped")
        continue
    if bbox != []:
        x1, y1, x2, y2 = bbox
        # Clamp at 0: a negative index would wrap around to the far edge.
        img = img[max(y1, 0):max(y2, 0), max(x1, 0):max(x2, 0)]
    # Rows without a box keep the uncropped image (presumably the intent,
    # since the write was always outside the `if` -- confirm).
    if img.size == 0:
        # A zero-area box cannot be encoded; report it instead of crashing.
        print(f"empty crop for {image} (bbox={bbox}); skipped")
        continue
    cv2.imwrite(f'{test_fullbody_out_dir}/{image}', img)

# backfins: crop images to the back-fin bounding boxes

In [None]:
# Read the train back-fin CSV and replace its stringified "bbox" column
# with the parsed lists returned by df_bbox.
train_backfins_df = pd.read_csv(train_backfins_csv_dir)
bbox_list = train_backfins_df["bbox"].tolist()
train_backfins_df["bbox"] = df_bbox(bbox_list, data_type="backfins")
train_backfins_df.head()

In [None]:
# Crop every training image to its back-fin box and save the result.
train_backfins_out_dir = f'{ROOT_DIR}/train_backfins_images'
os.makedirs(train_backfins_out_dir, exist_ok=True)  # imwrite does not create folders

for _, image, bbox in tqdm(train_backfins_df[["image","bbox"]].itertuples(), total=len(train_backfins_df)):
    # Always load the current image. Previously a row without a box reused
    # `img` from the previous iteration (or hit a NameError on the first row),
    # saving another image's crop under this file name.
    img = cv2.imread(f"{TRAIN_DIR}/{image}")
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        print(f"could not read {image}; skipped")
        continue
    if bbox != []:
        x1, y1, x2, y2 = bbox
        # Clamp at 0: a negative index would wrap around to the far edge.
        img = img[max(y1, 0):max(y2, 0), max(x1, 0):max(x2, 0)]
    # Rows without a box keep the uncropped image (presumably the intent,
    # since the write was always outside the `if` -- confirm).
    if img.size == 0:
        # A zero-area box cannot be encoded; report it instead of crashing.
        print(f"empty crop for {image} (bbox={bbox}); skipped")
        continue
    cv2.imwrite(f'{train_backfins_out_dir}/{image}', img)

In [4]:
# Read the test back-fin CSV and replace its stringified "bbox" column
# with the parsed lists returned by df_bbox.
test_backfins_df = pd.read_csv(test_backfins_csv_dir)
bbox_list = test_backfins_df["bbox"].tolist()
test_backfins_df["bbox"] = df_bbox(bbox_list, data_type="backfins")
test_backfins_df.head()

100%|██████████| 27956/27956 [00:00<00:00, 379757.95it/s]


Unnamed: 0,bbox,conf,height,image,image_id,image_path,label_path,predictions,split,width
0,"[3529, 2029, 3599, 2359]",[ 0.0087128],2399,000110707af0ba.jpg,000110707af0ba.jpg,../input/happy-whale-and-dolphin/test_images/0...,/kaggle/working/output/test/labels/000110707af...,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,Test,3599
1,"[1246, 1612, 1608, 1780]",[ 0.6001],2400,0006287ec424cb.jpg,0006287ec424cb.jpg,../input/happy-whale-and-dolphin/test_images/0...,/kaggle/working/output/test/labels/0006287ec42...,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,Test,3600
2,"[947, 762, 1156, 828]",[ 0.10522],1488,000809ecb2ccad.jpg,000809ecb2ccad.jpg,../input/happy-whale-and-dolphin/test_images/0...,/kaggle/working/output/test/labels/000809ecb2c...,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,Test,2240
3,"[751, 232, 1355, 408]",[ 0.80859],892,00098d1376dab2.jpg,00098d1376dab2.jpg,../input/happy-whale-and-dolphin/test_images/0...,/kaggle/working/output/test/labels/00098d1376d...,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,Test,2048
4,"[0, 2, 293, 318]",[ 0.89453],319,000b8d89c738bd.jpg,000b8d89c738bd.jpg,../input/happy-whale-and-dolphin/test_images/0...,/kaggle/working/output/test/labels/000b8d89c73...,37c7aba965a5 114207cab555 a6e325d8e924 19fbb96...,Test,293


In [5]:
# Crop every test image to its back-fin box and save the result. Images that
# already exist in the output folder are skipped, so the cell can be re-run
# to resume an interrupted pass.
test_backfins_out_dir = f'{ROOT_DIR}/test_backfins_images'
os.makedirs(test_backfins_out_dir, exist_ok=True)  # listdir/imwrite need the folder
saved_list = os.listdir(f'{test_backfins_out_dir}/')
saved_images = set(saved_list)  # set lookup keeps the resume check O(1) per row

for _, image, bbox in tqdm(test_backfins_df[["image","bbox"]].itertuples(), total=len(test_backfins_df)):
    if image in saved_images:
        continue
    # Always load the current image. Previously a row without a box reused
    # `img` from the previous iteration (or hit a NameError on the first row),
    # saving another image's crop under this file name.
    img = cv2.imread(f"{TEST_DIR}/{image}")
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        print(f"could not read {image}; skipped")
        continue
    if bbox != []:
        x1, y1, x2, y2 = bbox
        # Clamp at 0: a negative index would wrap around to the far edge.
        img = img[max(y1, 0):max(y2, 0), max(x1, 0):max(x2, 0)]
    # Rows without a box keep the uncropped image (presumably the intent,
    # since the write was always outside the `if` -- confirm).
    if img.size == 0:
        # Zero-area boxes do occur in this data; report them the same way
        # the old except-branch did (file name, then shape) and move on.
        print(image)
        print(img.shape)
        continue
    try:
        cv2.imwrite(f'{test_backfins_out_dir}/{image}', img)
    except cv2.error:
        # Best effort: report the offending file and continue with the rest.
        print(image)
        print(img.shape)

 87%|████████▋ | 24288/27956 [00:02<00:00, 10595.66it/s]

dfccc5735c4023.jpg
(0, 236, 3)


100%|██████████| 27956/27956 [02:19<00:00, 200.80it/s]  
