In [1]:
import os
import time
import gc

import timm
import torch
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn

from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder
from torch.optim.lr_scheduler import CosineAnnealingLR

from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

import matplotlib.pyplot as plt
import seaborn as sns

INFO:albumentations.check_version:A new version of Albumentations is available: 1.4.13 (you have 1.4.12). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.


In [7]:
MODEL_PATH = '/upstage-cv-classification-cv2/data/results/best_model.pth'
MODEL_NAME = 'efficientnet_b4'

TRAIN_CSV_PATH = '/upstage-cv-classification-cv2/data/train.csv'

TEST_CSV_PATH = '/upstage-cv-classification-cv2/data/sample_submission.csv'
TEST_IMAGE_PATH = '/upstage-cv-classification-cv2/data/test'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 데이터 로드

In [3]:
# training config
img_size = 380
LR = 1e-3
BATCH_SIZE = 32

patience = 5
min_delta = 0.001 # 성능 개선의 최소 변화량

# test image 변환
data_transform = A.Compose([
    A.Resize(height = img_size, width = img_size),
    A.Normalize(mean=[0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225]),
    ToTensorV2()
])

class ImageDataset(Dataset):
    def __init__(self, csv, path, transform=None):
        self.df = pd.read_csv(csv).values
        self.path = path
        self.transform = transform

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        name, target = self.df[idx]
        img = np.array(Image.open(os.path.join(self.path, name)))
        if self.transform:
            img = self.transform(image = img)['image']
    
        return img, target, name

    def get_labels(self):
        return self.df[:, 1] 

In [4]:
test_dataset = ImageDataset(
    TEST_CSV_PATH,
    TEST_IMAGE_PATH,
    transform = data_transform
)

test_loader = DataLoader(
    test_dataset,
    batch_size = BATCH_SIZE,
    num_workers = 0,
    pin_memory = True,
    drop_last = False
)

# 모델 로드

In [5]:
torch.cuda.empty_cache()
gc.collect()

36

In [8]:
model = timm.create_model(MODEL_NAME, pretrained=False, num_classes=17).to(device)
model.load_state_dict(torch.load(MODEL_PATH))

  model.load_state_dict(torch.load(MODEL_PATH))


<All keys matched successfully>

# 모델 평가

In [9]:
test_preds_list = []
test_ids = []
test_probs_list = []

loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr = LR)

model.eval()

with torch.no_grad():
    pbar = tqdm(test_loader)
    for step, (image, targets, id) in enumerate(pbar):
        image = image.to(device)
        targets = targets.to(device)

        outputs = model(image)
        loss = loss_fn(outputs, targets)

        probs = torch.softmax(outputs, dim=1)
        probs, preds = torch.max(probs, dim=1)
        
        # 제일 확률 높은 데이터
        test_preds_list.extend(preds.detach().cpu().numpy()) 
        test_probs_list.extend(probs.detach().cpu().numpy())

        test_ids.extend(id)

        pbar.set_description(f"Loss: {loss.item():.4f}")



Loss: 13.8069: 100%|██████████| 99/99 [00:16<00:00,  5.92it/s]


In [10]:
test_result_df = pd.DataFrame()
test_result_df['id'] = test_ids
test_result_df['pred'] = test_preds_list
test_result_df['prob'] = test_probs_list

test_result_df

Unnamed: 0,id,pred,prob
0,0008fdb22ddce0ce.jpg,2,1.000000
1,00091bffdffd83de.jpg,12,1.000000
2,00396fbc1f6cc21d.jpg,5,1.000000
3,00471f8038d9c4b6.jpg,4,0.816450
4,00901f504008d884.jpg,2,1.000000
...,...,...,...
3135,ffb4b6f619fb60ea.jpg,6,0.999964
3136,ffb54299b1ad4159.jpg,10,0.968894
3137,ffc2c91dff8cf2c0.jpg,8,1.000000
3138,ffc4e330a5353a2a.jpg,0,1.000000


In [11]:
high_pred_df = test_result_df[test_result_df['prob'] > 0.999]
len(high_pred_df)

2512

In [12]:
high_pred_df.value_counts('pred')

pred
0     199
9     199
15    199
16    199
2     196
10    194
8     186
5     185
4     164
11    159
12    154
6     135
13    127
1      72
7      69
3      51
14     24
Name: count, dtype: int64

In [15]:
smaple_37_df = high_pred_df[(high_pred_df['pred'] == 3) | (high_pred_df['pred'] == 7)]
smaple_37_df.rename({'id': 'ID', 'pred' : 'target'})
smaple_37_df

Unnamed: 0,id,pred,prob
24,01c918594307c6f2.jpg,3,0.999945
54,0546cab14c4ee65b.jpg,7,0.999349
117,0b0e7e754b5a103c.jpg,7,0.999857
136,0c5b6caa96ec9882.jpg,3,0.999999
146,0d2833d1992b660a.jpg,3,0.999997
...,...,...,...
3032,f831eba2f22f6104.jpg,3,0.999997
3041,f8a3a3bffd832fb0.jpg,7,0.999984
3052,f95c84c901644f43.jpg,3,0.999966
3069,fa1a9c70401b36ff.jpg,3,0.999996


In [16]:
smaple_37_df.to_csv('/upstage-cv-classification-cv2/data/train_semi_37.csv', index=False)