## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os
import cv2
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import torchvision.models as models

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Hyperparameter Setting

In [3]:
# Central place for the knobs used throughout the notebook.
CFG = dict(
    IMG_SIZE=800,        # second argument of A.Resize in the transform pipelines
    IMG_SIZE_D=800,      # first argument of A.Resize in the transform pipelines
    EPOCHS=30,           # number of passes over the training loader
    LEARNING_RATE=1e-4,  # initial learning rate handed to Adam
    BATCH_SIZE=8,        # batch size shared by all DataLoaders
    SEED=42,             # seed for seed_everything() and train_test_split
)

## Fix Random Seed

In [4]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed (int): Seed value applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every visible GPU (relevant when the model is wrapped in
    # nn.DataParallel); this is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick kernels per run, which makes
    # results vary between runs; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # fix the seed before any data splitting or model creation

## Data Pre-processing
#### 1. Load Dataframe
#### 2. Handle missing values (결측치 보완)
#### 3. Train / Validation Split
#### 4. Numeric Feature Scaling / Categorical Feature Label-Encoding

In [5]:
# Colab-only step: mount Google Drive so the CSV files read below
# (under /content/drive/MyDrive/...) are accessible.
# NOTE(review): this import lives outside the main import cell; it only works on Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Show the working directory; relative paths used later (e.g. './submit.csv') resolve against it.
os.getcwd()

'/content'

In [7]:
# !unzip -qq "/content/drive/MyDrive/Colab Notebooks/cropped_train.zip"
# !unzip -qq "/content/drive/MyDrive/Colab Notebooks/cropped_test.zip"


In [8]:
# Load the tabular metadata for the train and test sets from the mounted Drive.
# Each row carries an `img_path` column that CustomDataset uses to locate the image.
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/test.csv')

In [9]:
# Korean column headers -> ASCII identifiers. The mapping is defined once so the
# train and test frames are guaranteed to be renamed identically.
column_aliases = {
    '나이': 'age',
    '수술연월일': 'yr_mn_dt',
    '진단명': 'diag_cat',
    '암의 위치': 'location',
    '암의 개수': 'sng_mul',
    '암의 장경': 'size_mm',
    'DCIS_or_LCIS_여부': 'DCIS_or_LCIS',
}

for frame in (train_df, test_df):
    frame.rename(columns=column_aliases, inplace=True)

In [10]:
# #전처리 
# for i in range(1000):
#     y, m, d = map(int, train_df.iloc[i,4].split('-'))
#     num =  ((y-2000)*365)+(m*31)+d
#     train_df.iloc[i,4] = num/1000

# for i in range(250):
#     y, m, d = map(int, test_df.iloc[i,3].split('-'))
#     num =  ((y-2000)*365)+(m*31)+d
#     test_df.iloc[i,3] = num/1000

# train_df['size_mm'] = train_df['size_mm'].fillna(0) #mean
# train_df['HG'] = train_df['HG'].fillna(4)
# train_df['HG_score_1'] = train_df['HG_score_1'].fillna(4)
# train_df['HG_score_3'] = train_df['HG_score_3'].fillna(4)
# train_df[train_df['T_category'] == 1|2|3] = 1
# train_df['T_category'] = train_df['T_category'].fillna(-1)
# train_df = train_df.fillna(0)

# test_df['size_mm'] = test_df['size_mm'].fillna(0) #mean
# test_df['HG'] = test_df['HG'].fillna(4)
# test_df['HG_score_1'] = test_df['HG_score_1'].fillna(4)
# test_df['HG_score_3'] = test_df['HG_score_3'].fillna(4)
# test_df[test_df['T_category'] == 1|2|3] = 1
# test_df['T_category'] = test_df['T_category'].fillna(-1)
# test_df = test_df.fillna(0)

In [11]:
### Additional preprocessing

# Drop the rows whose PR_Allred_score is greater than 9.
# NOTE(review): presumably these are invalid entries - confirm the valid score range.
train_df.drop(labels= train_df[train_df.PR_Allred_score > 9].index, axis = 0, inplace = True)
# Remove columns that are not used downstream. 'ER' used to be listed twice;
# a single mention is enough.
# NOTE(review): confirm the duplicate was not meant to be a different column.
train_df.drop(['ER', 'HG_score_2', 'DCIS_or_LCIS_type'], axis = 1, inplace = True)

print(train_df.shape)
train_df

(998, 25)


Unnamed: 0,ID,img_path,mask_path,age,yr_mn_dt,diag_cat,location,sng_mul,size_mm,NG,...,ER_Allred_score,PR,PR_Allred_score,KI-67_LI_percent,HER2,HER2_IHC,HER2_SISH,HER2_SISH_ratio,BRCA_mutation,N_category
0,BC_01_0001,./train_imgs/BC_01_0001.png,-,63,2015-10-23,1,2,1,19.0,2.0,...,8.0,1.0,6.0,12.0,0.0,1.0,,,,0
1,BC_01_0002,./train_imgs/BC_01_0002.png,-,51,2015-10-28,1,1,1,22.0,3.0,...,,0.0,,70.0,0.0,0.0,,,,1
2,BC_01_0003,./train_imgs/BC_01_0003.png,-,37,2015-10-29,1,2,1,,2.0,...,7.0,1.0,4.0,7.0,0.0,1.0,,,0.0,0
3,BC_01_0004,./train_imgs/BC_01_0004.png,-,54,2016-03-08,1,2,1,0.0,3.0,...,,0.0,,1.0,1.0,3.0,,,,0
4,BC_01_0005,./train_imgs/BC_01_0005.png,-,57,2015-10-30,1,2,1,8.0,2.0,...,8.0,0.0,,8.0,1.0,2.0,1.0,5.44,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,BC_01_3464,./train_imgs/BC_01_3464.png,-,65,2006-12-22,1,2,1,25.0,1.0,...,,0.0,,,0.0,0.0,,,,1
996,BC_01_3482,./train_imgs/BC_01_3482.png,-,48,2006-11-17,1,1,1,7.0,1.0,...,,1.0,,,0.0,0.0,,,,0
997,BC_01_3485,./train_imgs/BC_01_3485.png,-,64,2006-11-10,1,2,1,15.0,1.0,...,,1.0,,,0.0,0.0,,,,1
998,BC_01_3502,./train_imgs/BC_01_3502.png,-,50,2006-09-22,1,1,1,7.0,1.0,...,,0.0,,,0.0,0.0,,,,0


In [12]:
# Test-data preprocessing: drop the same unused columns that are dropped from
# the training frame. 'ER' used to be listed twice; a single mention is enough.
test_df.drop(['ER', 'HG_score_2', 'DCIS_or_LCIS_type'], axis = 1, inplace = True)
test_df

Unnamed: 0,ID,img_path,age,yr_mn_dt,diag_cat,location,sng_mul,size_mm,NG,HG,...,T_category,ER_Allred_score,PR,PR_Allred_score,KI-67_LI_percent,HER2,HER2_IHC,HER2_SISH,HER2_SISH_ratio,BRCA_mutation
0,BC_01_0011,./test_imgs/BC_01_0011.png,55,2015-11-17,2,2,1,23.0,2.0,2.0,...,2.0,8.0,0.0,2.0,5.00,0.0,1.0,,,
1,BC_01_0220,./test_imgs/BC_01_0220.png,43,2020-06-09,4,2,1,13.0,3.0,2.0,...,1.0,4.0,1.0,8.0,8.67,0.0,0.0,,,
2,BC_01_0233,./test_imgs/BC_01_0233.png,76,2020-05-14,1,1,1,,,,...,0.0,6.0,1.0,6.0,,,2.0,,,
3,BC_01_0258,./test_imgs/BC_01_0258.png,58,2020-05-20,1,2,1,1.3,2.0,2.0,...,1.0,7.0,0.0,,21.17,1.0,3.0,,,0.0
4,BC_01_0260,./test_imgs/BC_01_0260.png,56,2020-05-20,1,2,2,15.0,3.0,3.0,...,1.0,8.0,1.0,3.0,20.57,1.0,3.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,BC_01_3328,./test_imgs/BC_01_3328.png,61,2009-10-30,1,1,1,18.0,2.0,2.0,...,1.0,5.0,1.0,3.0,,0.0,0.0,,,
246,BC_01_3404,./test_imgs/BC_01_3404.png,42,2009-05-19,1,2,1,20.0,1.0,2.0,...,1.0,5.0,1.0,5.0,,0.0,1.0,,,
247,BC_01_3418,./test_imgs/BC_01_3418.png,37,2009-04-24,1,1,1,17.0,1.0,1.0,...,1.0,5.0,1.0,5.0,,0.0,0.0,,,
248,BC_01_3438,./test_imgs/BC_01_3438.png,37,2009-02-06,1,1,1,7.0,1.0,1.0,...,1.0,5.0,1.0,5.0,,0.0,0.0,,,


In [13]:
# Hold out 10% of the rows for validation. Features and the N_category target
# are split together so rows and labels stay aligned; random_state makes the
# split repeatable.
# NOTE(review): the split is not stratified on N_category - consider `stratify=`
# if the classes are imbalanced.
train_df, val_df, train_labels, val_labels = train_test_split(
                                                    train_df.drop(columns=['N_category']), 
                                                    train_df['N_category'], 
                                                    test_size=0.1,  # previously tried: 0.2
                                                    random_state=CFG['SEED']
                                                )

In [14]:
def get_values(value):
    """Return the values of a pandas Series as a 2-D column vector of shape (n, 1)."""
    return value.values.reshape(-1, 1)

# Columns that are standardised; every other non-ignored column is label-encoded.
numeric_cols = ['age', 'size_mm', 'ER_Allred_score', 'PR_Allred_score', 'KI-67_LI_percent', 'HER2_SISH_ratio']
# Identifiers, paths, dates and the target are left untouched.
ignore_cols = ['ID', 'img_path', 'mask_path', 'yr_mn_dt', 'N_category']

for col in train_df.columns:
    if col in ignore_cols:
        continue
    if col in numeric_cols:
        # Fit on the training split only, then reuse the statistics for val/test.
        # StandardScaler works on 2-D input; flatten the result so a plain 1-D
        # array is stored back in the column.
        scaler = StandardScaler()
        train_df[col] = scaler.fit_transform(get_values(train_df[col])).ravel()
        val_df[col] = scaler.transform(get_values(val_df[col])).ravel()
        test_df[col] = scaler.transform(get_values(test_df[col])).ravel()
    else:
        # LabelEncoder expects 1-D input; passing an (n, 1) column vector only
        # works through an implicit ravel that emits a DataConversionWarning.
        # NOTE(review): transform() raises ValueError for categories that were
        # not present in the training split - verify val/test cannot contain any.
        le = LabelEncoder()
        train_df[col] = le.fit_transform(train_df[col].values)
        val_df[col] = le.transform(val_df[col].values)
        test_df[col] = le.transform(test_df[col].values)

## CustomDataset

In [15]:
# Directory prefix that CustomDataset prepends to the file-name part of `img_path`.
base = '/content/train_imgs'

In [16]:
# Sanity check of the path construction: the [12:] slice strips the 12-character
# './train_imgs' prefix, leaving '/<file>.png' to append to `base`.
# NOTE(review): test paths start with './test_imgs/' (also 12 characters, but
# including the slash), so the same slice does not produce a valid path for them.
base + train_df['img_path'].iloc[0][12:]

'/content/train_imgs/BC_01_2648.png'

In [17]:
class CustomDataset(Dataset):
    """Image dataset backed by a metadata DataFrame.

    Args:
        medical_df: DataFrame with an ``img_path`` column holding paths such as
            ``./train_imgs/<id>.png`` or ``./test_imgs/<id>.png``.
        labels: Indexable sequence of labels aligned with ``medical_df`` rows,
            or ``None`` for unlabelled (inference) data.
        transforms: Optional albumentations pipeline called as
            ``transforms(image=image)['image']``.
        img_root: Directory against which the relative ``img_path`` values are
            resolved. Defaults to the parent directory of the notebook-level
            ``base`` variable, which keeps train paths unchanged and makes
            ``./test_imgs/...`` paths resolve as well.
            NOTE(review): assumes test images were extracted next to the train
            images - confirm.
    """

    def __init__(self, medical_df, labels, transforms=None, img_root=None):
        self.medical_df = medical_df
        self.transforms = transforms
        self.labels = labels
        self.img_root = img_root

    def _resolve_img_path(self, index):
        """Return the absolute image path for row ``index``.

        The previous ``base + path[12:]`` slicing only worked for the
        './train_imgs' prefix; for './test_imgs/...' it dropped the directory
        separator and pointed into the train folder.
        """
        root = self.img_root
        if root is None:
            # `base` is the train image directory; its parent is the data root.
            root = os.path.dirname(os.path.normpath(base))
        relative = os.path.normpath(str(self.medical_df['img_path'].iloc[index]))
        return os.path.join(root, relative)

    def __getitem__(self, index):
        """Load, colour-convert and transform one image.

        Returns:
            ``(image, label)`` when labels were supplied, otherwise ``image``.

        Raises:
            FileNotFoundError: If the image cannot be read from disk.
        """
        img_path = self._resolve_img_path(index)
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Could not read image: {img_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transforms is not None:
            image = self.transforms(image=image)['image']

        if self.labels is not None:
            label = self.labels[index]
            return image, label
        else:
            return image

    def __len__(self):
        """Number of rows (images) in the dataset."""
        return len(self.medical_df)

In [18]:
def _resize_normalize_to_tensor():
    """Build fresh instances of the steps shared by the train and test pipelines."""
    return [
        A.Resize(CFG['IMG_SIZE_D'], CFG['IMG_SIZE']),
        # Channel mean/std commonly used with ImageNet-pretrained backbones.
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
        ToTensorV2(),
    ]


# Training pipeline: random flips, rotation and noise, then the shared tail.
_train_augmentations = [
    A.HorizontalFlip(),
    A.VerticalFlip(),
    A.Rotate(limit=180, border_mode=cv2.BORDER_CONSTANT, p=0.3),
    A.GaussNoise(p=0.5),
]
train_transforms = A.Compose(_train_augmentations + _resize_normalize_to_tensor())

# Validation / test pipeline: the deterministic steps only.
test_transforms = A.Compose(_resize_normalize_to_tensor())

In [19]:
# Training data: augmented images, reshuffled every epoch.
# `.values` turns the label Series into a plain array so CustomDataset can index
# it positionally (the Series index is no longer 0..n-1 after the split).
train_dataset = CustomDataset(train_df, train_labels.values, train_transforms)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

# Validation data: deterministic transforms, fixed order.
val_dataset = CustomDataset(val_df, val_labels.values, test_transforms)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

## Model Architecture

In [20]:
# Exploration only: instantiate a throwaway ResNet-50 and print, for every
# parameter tensor, its index and the length of its first dimension.
ex_model = models.resnet50(weights=True)
for i, param in enumerate(ex_model.parameters()):
  print(i, len(param))

0 64
1 64
2 64
3 64
4 64
5 64
6 64
7 64
8 64
9 256
10 256
11 256
12 256
13 256
14 256
15 64
16 64
17 64
18 64
19 64
20 64
21 256
22 256
23 256
24 64
25 64
26 64
27 64
28 64
29 64
30 256
31 256
32 256
33 128
34 128
35 128
36 128
37 128
38 128
39 512
40 512
41 512
42 512
43 512
44 512
45 128
46 128
47 128
48 128
49 128
50 128
51 512
52 512
53 512
54 128
55 128
56 128
57 128
58 128
59 128
60 512
61 512
62 512
63 128
64 128
65 128
66 128
67 128
68 128
69 512
70 512
71 512
72 256
73 256
74 256
75 256
76 256
77 256
78 1024
79 1024
80 1024
81 1024
82 1024
83 1024
84 256
85 256
86 256
87 256
88 256
89 256
90 1024
91 1024
92 1024
93 256
94 256
95 256
96 256
97 256
98 256
99 1024
100 1024
101 1024
102 256
103 256
104 256
105 256
106 256
107 256
108 1024
109 1024
110 1024
111 256
112 256
113 256
114 256
115 256
116 256
117 1024
118 1024
119 1024
120 256
121 256
122 256
123 256
124 256
125 256
126 1024
127 1024
128 1024
129 512
130 512
131 512
132 512
133 512
134 512
135 2048
136 2048
137 2048
138

In [21]:
class ImgFeatureExtractor(nn.Module):
    """ResNet-50 backbone followed by a linear projection to a 512-d embedding.

    Args:
        freeze_until (int | None): When given, backbone parameters whose
            position in ``backbone.parameters()`` is ``<= freeze_until`` are
            frozen and the remaining ones stay trainable. ``None`` (default)
            keeps every parameter trainable, which is what the previous code
            effectively did: its ``if i > 139: requires_grad = True`` loop was
            a no-op because parameters are trainable by default.
    """

    def __init__(self, freeze_until=None):
        super(ImgFeatureExtractor, self).__init__()
        # Explicit weights enum instead of the deprecated boolean form; this
        # selects the same ImageNet checkpoint that `weights=True` resolved to.
        self.backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        if freeze_until is not None:
            self._freeze_leading_params(self.backbone, freeze_until)
        # The backbone keeps its 1000-way classification layer; project that
        # output down to 512 features.
        self.embedding = nn.Linear(1000, 512)

    @staticmethod
    def _freeze_leading_params(module, last_frozen_index):
        """Freeze parameters 0..last_frozen_index of ``module``; unfreeze the rest.

        Only ``requires_grad`` is changed; layers such as BatchNorm still
        update their running statistics while the module is in train mode.
        """
        for i, param in enumerate(module.parameters()):
            param.requires_grad = i > last_frozen_index

    def forward(self, x):
        """Map a batch of images to a batch of 512-d embeddings."""
        x = self.backbone(x)
        x = self.embedding(x)
        return x

In [22]:
class ClassificationModel(nn.Module):
    """Binary image classifier: image embedding -> small MLP head -> sigmoid.

    The head ends in ``nn.Sigmoid``, so ``forward`` returns probabilities in
    (0, 1), not logits. Pair it with a loss that expects probabilities.
    """

    def __init__(self):
        super(ClassificationModel, self).__init__()
        self.img_feature_extractor = ImgFeatureExtractor()
        head_layers = [
            nn.Linear(in_features=512, out_features=256),
            nn.LeakyReLU(),
            nn.Linear(in_features=256, out_features=128),
            nn.LeakyReLU(),
            nn.Linear(in_features=128, out_features=1),
            nn.Sigmoid(),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, img):
        """Return one probability per image, shape (batch, 1)."""
        return self.classifier(self.img_feature_extractor(img))

## Train

In [23]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train ``model`` and return it loaded with its best-scoring weights.

    After every epoch the model is evaluated with ``validation``; whenever the
    validation score improves, the weights are snapshotted and written to
    ``./{epoch}_model.pth``.

    Args:
        model: Network whose output is a probability per sample (the model in
            this notebook ends with ``nn.Sigmoid``).
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_loader: DataLoader yielding ``(img, label)`` batches.
        val_loader: DataLoader yielding ``(img, label)`` batches.
        scheduler: Optional scheduler stepped with the validation score
            (e.g. ``ReduceLROnPlateau(mode='max')``), or ``None``.
        device: Device on which to run.

    Returns:
        ``model`` with the weights of the best validation epoch restored.
    """
    model.to(device)
    # The model already applies a sigmoid, so the loss must consume
    # probabilities. BCEWithLogitsLoss would apply a second sigmoid and
    # squash the predictions.
    criterion = nn.BCELoss().to(device)

    best_score = 0
    best_epoch = 0
    best_state = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for img, label in tqdm(iter(train_loader)):
            img = img.float().to(device)
            label = label.float().to(device)

            optimizer.zero_grad()

            model_pred = model(img)

            # Labels arrive as shape (batch,); match the (batch, 1) model output.
            loss = criterion(model_pred, label.reshape(-1,1))

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        val_loss, val_score = validation(model, criterion, val_loader, device)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] Val Score : [{val_score:.5f}]')

        if scheduler is not None:
            scheduler.step(val_score)

        # Always keep the first epoch so a best state exists even if the score
        # never rises above 0.
        if best_state is None or best_score < val_score:
            best_epoch = epoch
            best_score = val_score
            # Snapshot a *copy* of the weights. Keeping a reference to `model`
            # would silently follow later epochs and return last-epoch weights.
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}
            torch.save(best_state, f'./{epoch}_model.pth')

    if best_state is not None:
        model.load_state_dict(best_state)
        print(f'Best epoch : [{best_epoch}] Best Val Score : [{best_score:.5f}]')

    return model

In [24]:
def validation(model, criterion, val_loader, device):
    """Evaluate ``model`` on ``val_loader``.

    The raw model outputs are thresholded at 0.5 to obtain hard labels, so the
    model is expected to emit probabilities.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        criterion: Loss called as ``criterion(output, target)`` with targets
            reshaped to ``(batch, 1)``.
        val_loader: DataLoader yielding ``(img, label)`` batches.
        device: Device on which to run.

    Returns:
        tuple: ``(mean validation loss, macro-averaged F1 score)``.
    """
    model.eval()
    threshold = 0.5
    batch_losses, scores, targets = [], [], []

    with torch.no_grad():
        for img, label in tqdm(iter(val_loader)):
            # Record the ground truth before the tensor is moved to the device.
            targets.extend(label.tolist())

            img = img.float().to(device)
            label = label.float().to(device)

            output = model(img)
            batch_losses.append(criterion(output, label.reshape(-1,1)).item())

            scores.extend(output.squeeze(1).to('cpu').tolist())

    hard_preds = np.where(np.array(scores) > threshold, 1, 0)
    val_score = metrics.f1_score(y_true=targets, y_pred=hard_preds, average='macro')
    return np.mean(batch_losses), val_score

## WANDB setting

## Run!!

In [None]:
# Build the model (wrapped for multi-GPU use), the optimizer and the scheduler,
# then launch training.
model = nn.DataParallel(ClassificationModel())
# NOTE(review): train() calls model.train() at the start of every epoch, so this
# eval() call has no effect on training.
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# mode='max' because the scheduler is stepped with the validation score; the
# learning rate is halved when that score stops improving.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1, threshold_mode='abs',min_lr=1e-8, verbose=True)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/113 [00:00<?, ?it/s]

## Inference

In [None]:
# Unlabelled test data: labels=None makes CustomDataset return images only.
# shuffle=False keeps predictions in the same order as the rows of test_df.
test_dataset = CustomDataset(test_df, None, test_transforms)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [None]:
def inference(model, test_loader, device):
    """Predict hard 0/1 labels for every image in ``test_loader``.

    The model outputs are thresholded at 0.5, so the model is expected to emit
    probabilities.

    Args:
        model: Trained network; moved to ``device`` and put in eval mode.
        test_loader: DataLoader yielding image batches only (no labels).
        device: Device on which to run.

    Returns:
        numpy.ndarray: One 0/1 prediction per image, in loader order.
    """
    model.to(device)
    model.eval()
    threshold = 0.5
    scores = []

    with torch.no_grad():
        for batch in tqdm(iter(test_loader)):
            output = model(batch.float().to(device))
            scores.extend(output.squeeze(1).to('cpu').tolist())

    return np.where(np.array(scores) > threshold, 1, 0)

In [None]:
# Predict 0/1 labels for the test set with the model returned by train().
preds = inference(infer_model, test_loader, device)

## Submission

In [None]:
# Load the submission template.
# NOTE(review): the path is relative to the working directory, whereas the
# train/test CSVs are read from Drive - confirm the file exists here.
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Fill in the predictions and write the submission file.
# NOTE(review): assumes the template rows are in the same order as test_df - confirm.
submit['N_category'] = preds
submit.to_csv('./submit.csv', index=False)

In [None]:
# Show the directory into which './submit.csv' was written.
os.getcwd()