# 검은색 마스킹 제거

In [1]:
import pandas as pd
import numpy as np
import os
import time
import random
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import cv2
import gc
import concurrent

import timm
import torch
import albumentations as A
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader
from albumentations.core.transforms_interface import ImageOnlyTransform
from augraphy import *

from sklearn.metrics import accuracy_score, f1_score

import wandb

In [6]:
train_df = pd.read_csv('/data/ephemeral/home/data/train.csv')
train_df

Unnamed: 0,ID,target
0,002f99746285dfdd.jpg,16
1,008ccd231e1fea5d.jpg,10
2,008f5911bfda7695.jpg,10
3,009235e4c9c07af5.jpg,4
4,00b2f44967580c74.jpg,16
...,...,...
1565,fed9e9ec4a77bc06.jpg,4
1566,feeade617aa68c45.jpg,7
1567,ff51dd281a8423f1.jpg,11
1568,ff8a6a251ce51c95.jpg,5


In [3]:
train_df = df[(df['target']==3) |  (df['target']==4) | (df['target']==7) | (df['target']==14)]
train_df.reset_index(drop=True, inplace=True)
train_df

Unnamed: 0,ID,target
0,009235e4c9c07af5.jpg,4
1,012913977fd1d980.jpg,14
2,0250ee8107091ade.jpg,7
3,02dad82a9420ae86.jpg,7
4,03084c1b03921a99.jpg,7
...,...,...
345,fe30845b79d1d42f.jpg,7
346,fe73481aadfbec34.jpg,3
347,fe9683317086dbab.jpg,3
348,fed9e9ec4a77bc06.jpg,4


In [6]:
img = cv2.imread('/data/ephemeral/home/data/train/1ec14a14bbe633db.jpg')
img[200][200].tolist()

[193, 191, 180]

## 마킹 제거 함수

In [7]:
def find_mask(img):
    # 검은색 박스 탐지 (예: 색상 범위 설정)
    lower_black = np.array([0, 0, 0])
    upper_black = np.array([100, 100, 100])

    # 검은색 영역 마스크 생성
    mask = cv2.inRange(img, lower_black, upper_black)

    return mask

def del_mask(data_df):
    l = data_df.shape[0]
    for i in range(l):
        img = cv2.imread(os.path.join('/data/ephemeral/home/data/train', data_df.iloc[i, 0]))
        after = cv2.medianBlur(img, 11)
        mask = find_mask(after)
        img[mask!=0] = [np.mean(img[0]), np.mean(img[1]), np.mean(img[2])] # [204, 202, 202] or img[200][200].tolist()

        cv2.imwrite(os.path.join('/data/ephemeral/home/data/train_mask_deleted', data_df.iloc[i, 0]), img)

        del img, after
        gc.collect()

In [8]:
del_mask(train_df)

: 

# test 데이터 박스 제거

In [3]:
data_df = pd.read_csv('/data/ephemeral/home/data/sample_submission.csv')

In [4]:
def find_mask(img):
    # 검은색 박스 탐지 (예: 색상 범위 설정)
    lower_black = np.array([0, 0, 0])
    upper_black = np.array([100, 100, 100])

    # 검은색 영역 마스크 생성
    mask = cv2.inRange(img, lower_black, upper_black)

    return mask

def del_mask(data_df):
    l = data_df.shape[0]
    for i in range(l):
        img = cv2.imread(os.path.join('/data/ephemeral/home/data/test', data_df.iloc[i, 0]))
        after = cv2.medianBlur(img, 11)
        mask = find_mask(after)
        img[mask!=0] = [np.mean(img[0]), np.mean(img[1]), np.mean(img[2])] # [204, 202, 202] or img[200][200].tolist()

        cv2.imwrite(os.path.join('/data/ephemeral/home/data/test_mask_deleted', data_df.iloc[i, 0]), img)

        del img, after
        gc.collect()

In [5]:
del_mask(data_df)

### 멀티쓰레드 활용

In [16]:
data_df

Unnamed: 0,ID,target
0,0008fdb22ddce0ce.jpg,0
1,00091bffdffd83de.jpg,0
2,00396fbc1f6cc21d.jpg,0
3,00471f8038d9c4b6.jpg,0
4,00901f504008d884.jpg,0
...,...,...
3135,ffb4b6f619fb60ea.jpg,0
3136,ffb54299b1ad4159.jpg,0
3137,ffc2c91dff8cf2c0.jpg,0
3138,ffc4e330a5353a2a.jpg,0


In [19]:
def find_mask(img):
    # 검은색 박스 탐지 (예: 색상 범위 설정)
    lower_black = np.array([0, 0, 0])
    upper_black = np.array([100, 100, 100])

    # 검은색 영역 마스크 생성
    mask = cv2.inRange(img, lower_black, upper_black)

    return mask

def process_chunk(chunk_range):
    for i in range(chunk_range[0], chunk_range[1]):
        img = cv2.imread(os.path.join('/data/ephemeral/home/data/test', data_df.iloc[i, 0]))
        after = cv2.medianBlur(img, 11)
        mask = find_mask(after)
        img[mask!=0] = [np.mean(img[0]), np.mean(img[1]), np.mean(img[2])] # [204, 202, 202] or img[200][200].tolist()

        cv2.imwrite(os.path.join('/data/ephemeral/home/data/test_mask_deleted2', data_df.iloc[i, 0]), img)

        del img, after
        gc.collect()

l = data_df.shape[0]
save_path_root = '/data/ephemeral/home/data/test_mask_deleted2'

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = []

    for start in (0, l, 628):
        chunk_range = (start, start+628)
        future = executor.submit(process_chunk, chunk_range)
        futures.append(future)

        del future
        gc.collect()

In [22]:
len(os.listdir('/data/ephemeral/home/data/test_mask_deleted2'))

1256

In [15]:
3140/5


628.0