In [1]:
import sys
import os
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import math
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

import torchvision.models as models

from sklearn import metrics
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action='ignore') 

In [2]:
os.getcwd()

'/Users/krc/Documents/breast_dacon/dacon_bc_prediction'

In [22]:
device = torch.device('cpu') if torch.backends.mps.is_available() else torch.device('cpu')

In [4]:
base_path = '/Users/krc/Documents/breast_dacon'

train_df = pd.read_csv('/Users/krc/Documents/breast_dacon/open/train.csv')
test_df = pd.read_csv('/Users/krc/Documents/breast_dacon/open/test.csv')

# os.mkdir('./new_train') 
# os.mkdir('./new_test')

--- 

## resize (background delete) - K means - score evaluation - apply crop

In [5]:
os.getcwd()

'/Users/krc/Documents/breast_dacon/dacon_bc_prediction'

In [6]:
from numpy.linalg import norm
import copy 

#From t.ly/TLq_


class KMeans():
    def __init__(self, n_clusters, max_iter = 20, random_state = 60):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state
        
    def initialize_centroids(self, X): # center 초기화 - 이미지 내 특정 점으로 고정
        np.random.RandomState(self.random_state)
        random_idx = np.random.permutation(X.shape[0])
        centroids = X[random_idx[:self.n_clusters]]

        return centroids
        
    def compute_centroids(self, X, labels): # centroids 계산 
        centroids = np.zeros((self.n_clusters, X.shape[1]))
        for k in range(self.n_clusters):
            centroids[k, :] = np.mean(X[labels == k, :], axis=0)
        return centroids
        
    def compute_distance(self, X, centroids): #
        distance = np.zeros((X.shape[0], self.n_clusters))
        for k in range(self.n_clusters):
            row_norm = norm(X - centroids[k, :], axis=1)
            distance[:, k] = np.square(row_norm)
        return distance
    
    def find_closest_cluster(self, distance):
        return np.argmin(distance, axis=1)
    
    def compute_sse(self, X, labels, centroids):
        distance = np.zeros(X.shape[0])
        for k in range(self.n_clusters):
            distance[labels == k] = norm(X[labels == k] - centroids[k], axis=1)
        return np.sum(np.square(distance))
    
    def fit(self, X):
        self.centroids = self.initialize_centroids(X)
 
        for i in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            self.centroids = self.compute_centroids(X, self.labels)
            if np.all(old_centroids == self.centroids):
                break
        self.error = self.compute_sse(X, self.labels, self.centroids)
        # return self.labels
        
    def predict(self, X):
        distance = self.compute_distance(X, self.centroids)
        return self.find_closest_cluster(distance)

In [41]:
# file_path = '/Users/krc/Documents/breast_dacon/train_imgs/'
# file_list = os.walk(file_path)  # OS로 불러오기
# img_files = [file for file in file_list if file[-1][-1].endswith(".png")]



# if not img_files:  # if empty folder
#     print("there are no png files")
#     sys.exit()

# for i, f in enumerate(img_files[0][2]):
#     if i% 20 == 0:
#         print(f'index {i} is starting.. ')
#     # t.ly/zgLP
#     image = cv2.imread(file_path + f)
#     y_orig, x_orig, channel= image.shape
#     if x_orig / y_orig < 1.5:
#         cv2.imwrite(f'./new_train/{f}', image)
#         print(f'{f} has one image')
#         continue
#     gray_sample = image.copy()
#     gray_sample = cv2.cvtColor(gray_sample, cv2.COLOR_RGB2GRAY)
    
#     # 모양 맞추기 flip 
#     gray_sample =cv2.flip(gray_sample, 0)
#     # resize to 400 * 200
#     gray_sample = cv2.resize(gray_sample, dsize=(400, 200), interpolation=cv2.INTER_LINEAR) 
    
#     #전처리
#     coord = np.where( gray_sample < 239 )
#     co_array = np.array(coord)
#     co_array = np.float32(co_array).T
#     from sklearn.metrics import silhouette_samples, silhouette_score

#     best_score = 0
#     best_k = 1
#     best_centroids = []

#     for j, k in enumerate([ 4, 3, 2]):
        
#         # Run the Kmeans algorithm
#         km = KMeans(n_clusters=k)
#         km.fit(co_array)
#         labels = km.predict(co_array) # input data

#         centroids = km.compute_centroids(co_array, labels) # cluster_centers_
#         # Get silhouette samples
#         silhouette_vals = silhouette_samples(co_array, labels)

#         # Get the average silhouette score and plot it
#         avg_score = np.mean(silhouette_vals) ## score 

#         # print('K = ',k, 'avg_score:', avg_score)

#         if best_score < avg_score:
#             if best_k == 3 and k ==2 and (avg_score-best_score) < 0.09:
#                 break
#             best_score = avg_score
#             best_k = k
#             best_centroids = centroids
#             x_coord = centroids[ : , 1]
#             x_coord.sort()        
#             if best_k == 2:
#                 x1, x2 = x_coord
#             elif best_k ==3  :
#                 x1, x2, x3 = x_coord
#             elif best_k ==4  :
#                 x1, x2, x3, x4 = x_coord

#     if best_k == 2:
#         crop_image = image[:, : int( (x1+x2)/2  *x_orig / 400 ), :]
#     elif best_k > 2:
#         alpha = int(y_orig / 2)
#         crop_image = image[:, max(0, int(x2 *x_orig /400) -alpha)  :int(x2 *x_orig /400) +alpha , : ] ## 마이너스나옴 

#     cv2.imwrite(f'./new_train/{f}', crop_image)
    
    

index 0 is starting.. 
BC_01_1092.png has one image
index 20 is starting.. 
index 40 is starting.. 
index 60 is starting.. 
BC_01_1939.png has one image
BC_01_1093.png has one image
BC_01_1091.png has one image
index 80 is starting.. 
index 100 is starting.. 
BC_01_2348.png has one image
BC_01_0007.png has one image
index 120 is starting.. 
index 140 is starting.. 
index 160 is starting.. 
BC_01_0215.png has one image
BC_01_2358.png has one image
BC_01_0214.png has one image
index 180 is starting.. 
BC_01_0348.png has one image
BC_01_2575.png has one image
index 200 is starting.. 
index 220 is starting.. 
BC_01_0217.png has one image
index 240 is starting.. 
BC_01_0304.png has one image
index 260 is starting.. 
BC_01_0674.png has one image
BC_01_1436.png has one image
BC_01_2659.png has one image
index 280 is starting.. 
index 300 is starting.. 
BC_01_1345.png has one image
index 320 is starting.. 
BC_01_1582.png has one image
index 340 is starting.. 
BC_01_0298.png has one image
index

In [43]:
# # test imgs crop 
# file_path = '/Users/krc/Documents/breast_dacon/test_imgs/'
# file_list = os.walk(file_path)  # OS로 불러오기
# img_files = [file for file in file_list if file[-1][-1].endswith(".png")]



# if not img_files:  # if empty folder
#     print("there are no png files")
#     sys.exit()

# for i, f in enumerate(img_files[0][2]):
#     if i% 20 == 0:
#         print(f'index {i} is starting.. ')
#     # t.ly/zgLP
#     image = cv2.imread(file_path + f)
#     y_orig, x_orig, channel= image.shape
#     if x_orig / y_orig < 1.5:
#         cv2.imwrite(f'./new_test/{f}', image)
#         print(f'image {f} has one image')
#         continue
#     gray_sample = image.copy()
#     gray_sample = cv2.cvtColor(gray_sample, cv2.COLOR_RGB2GRAY)
    
#     # 모양 맞추기 flip 
#     gray_sample =cv2.flip(gray_sample, 0)
#     # resize to 400 * 200
#     gray_sample = cv2.resize(gray_sample, dsize=(400, 200), interpolation=cv2.INTER_LINEAR) 
    
#     #전처리
#     coord = np.where( gray_sample < 239 )
#     co_array = np.array(coord)
#     co_array = np.float32(co_array).T
#     from sklearn.metrics import silhouette_samples, silhouette_score

#     best_score = 0
#     best_k = 1
#     best_centroids = []

#     for j, k in enumerate([ 4, 3, 2]):
        
#         # Run the Kmeans algorithm
#         km = KMeans(n_clusters=k)
#         km.fit(co_array)
#         labels = km.predict(co_array) # input data

#         centroids = km.compute_centroids(co_array, labels) # cluster_centers_
#         # Get silhouette samples
#         silhouette_vals = silhouette_samples(co_array, labels)

#         # Get the average silhouette score and plot it
#         avg_score = np.mean(silhouette_vals) ## score 

#         # print('K = ',k, 'avg_score:', avg_score)

#         if best_score < avg_score:
#             if best_k == 3 and k ==2 and (avg_score-best_score) < 0.09:
#                 break
#             best_score = avg_score
#             best_k = k
#             best_centroids = centroids
#             x_coord = centroids[ : , 1]
#             x_coord.sort()        
#             if best_k == 2:
#                 x1, x2 = x_coord
#             elif best_k ==3  :
#                 x1, x2, x3 = x_coord
#             elif best_k ==4  :
#                 x1, x2, x3, x4 = x_coord

#     if best_k == 2:
#         crop_image = image[:, : int( (x1+x2)/2  *x_orig / 400 ), :]
#     elif best_k > 2:
#         alpha = int(y_orig / 2)
#         crop_image = image[:, max(0, int(x2 *x_orig /400) -alpha)  :int(x2 *x_orig /400) +alpha , : ] ## 마이너스나옴 

#     cv2.imwrite(f'./new_test/{f}', crop_image)
    
    

index 0 is starting.. 
image BC_01_3257.png has one image
image BC_01_1246.png has one image
index 20 is starting.. 
image BC_01_2774.png has one image
image BC_01_2629.png has one image
index 40 is starting.. 
image BC_01_0933.png has one image
index 60 is starting.. 
image BC_01_2263.png has one image
index 80 is starting.. 
index 100 is starting.. 
image BC_01_2933.png has one image
image BC_01_2718.png has one image
index 120 is starting.. 
image BC_01_2527.png has one image
index 140 is starting.. 
image BC_01_0848.png has one image
index 160 is starting.. 
image BC_01_0526.png has one image
index 180 is starting.. 
image BC_01_0233.png has one image
image BC_01_0390.png has one image
index 200 is starting.. 
image BC_01_2803.png has one image
index 220 is starting.. 
index 240 is starting.. 


--- 
## Model input

In [7]:
# hypter parameter

CFG = {
    'IMG_SIZE':1024,
    'IMG_SIZE_D':1024,
    'EPOCHS':20,
    'LEARNING_RATE':1e-4,
    'BATCH_SIZE':8,
    'SEED':27
}

In [8]:
import random

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
    torch.backends.mps.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Seed 고정

### data load 

In [9]:

train_df = pd.read_csv('../open/train.csv')
test_df = pd.read_csv('../open/test.csv')

In [10]:
train_df, val_df, train_labels, val_labels = train_test_split(
                                                    train_df.drop(columns=['N_category']), 
                                                    train_df['N_category'], 
                                                    test_size=0.1,  # 0.2 
                                                    random_state=CFG['SEED']
                                                )

In [11]:
base = './new_train'

In [33]:
base + train_df['img_path'].iloc[0][-14:]

'./new_test/BC_01_2318.png'

In [13]:
class CustomDataset(Dataset):
    def __init__(self, medical_df, labels, transforms=None):
        self.medical_df = medical_df
        self.transforms = transforms
        self.labels = labels
        
    def __getitem__(self, index):
        img_path = base + self.medical_df['img_path'].iloc[index][-14:]
        # print(img_path)
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        if self.transforms is not None:
            image = self.transforms(image=image)['image']
                
        if self.labels is not None:
            label = self.labels[index]
            return image,  label #tabular = 2nd
        else:
            return image #, tabular
          
    def __len__(self):
        return len(self.medical_df)

In [14]:
train_transforms = A.Compose([
                            A.HorizontalFlip(),
                            A.VerticalFlip(),
                            A.Rotate(limit=180, border_mode=cv2.BORDER_CONSTANT,p=0.3),
                            A.GaussNoise(p=0.2), #NOISE ADD 
                            A.Resize(CFG['IMG_SIZE_D'],CFG['IMG_SIZE']),
                            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
                            ToTensorV2()
                            ])

test_transforms = A.Compose([
                            A.Resize(CFG['IMG_SIZE_D'],CFG['IMG_SIZE']),
                            A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, always_apply=False, p=1.0),
                            ToTensorV2()
                            ])

In [15]:
train_dataset = CustomDataset(train_df, train_labels.values, train_transforms)
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True, num_workers=0)

val_dataset = CustomDataset(val_df, val_labels.values, test_transforms)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

## model architecture

In [17]:
class ImgFeatureExtractor(nn.Module):
    def __init__(self):
        super(ImgFeatureExtractor, self).__init__()
        self.backbone = models.swin_s(weights=True)
        for param in self.backbone.parameters():
            param.requires_grad = False
        for param in self.backbone.head.parameters():
            param.requires_grad = True
        self.embedding = nn.Linear(1000,512)

        
    def forward(self, x):
        x = self.backbone(x)

        x = self.embedding(x)
        return x

In [18]:
class ClassificationModel(nn.Module):
    def __init__(self):
        super(ClassificationModel, self).__init__()
        self.img_feature_extractor = ImgFeatureExtractor()
        self.classifier = nn.Sequential(
            nn.Linear(in_features=512, out_features=256),
            nn.LeakyReLU(),

            nn.Linear(in_features=256, out_features=128),
            nn.LeakyReLU(),

            nn.Linear(in_features=128, out_features=64),
            nn.LeakyReLU(),

            nn.Linear(in_features=64, out_features=1),
            nn.Sigmoid(),
        )
        
    def forward(self, img): 
        img_feature = self.img_feature_extractor(img)
        output = self.classifier(img_feature) 
        return output

In [19]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)
    
    best_score = 0
    best_epcoh = 0 
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for img, label in tqdm(iter(train_loader)): 
            img = img.float().to(device)
            label = label.float().to(device)
            
            optimizer.zero_grad()
            
            model_pred = model(img)
            
            loss = criterion(model_pred, label.reshape(-1,1))
            
            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())
        
        val_loss, val_score = validation(model, criterion, val_loader, device)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] Val Score : [{val_score:.5f}]')
        
        if scheduler is not None:
            scheduler.step(val_score)
        
        if best_score < val_score:
            best_epoch = epoch
            best_score = val_score
            best_model = model
            torch.save(best_model.state_dict(), f'./{epoch}_model.pth')
    
    return best_model

In [20]:
def validation(model, criterion, val_loader, device):
    model.eval()
    pred_labels = []
    true_labels = []
    val_loss = []
    threshold = 0.5
    with torch.no_grad():
        for img, label in tqdm(iter(val_loader)): 
            true_labels += label.tolist()
            
            img = img.float().to(device)
            label = label.float().to(device)
            
            model_pred = model(img)
            
            loss = criterion(model_pred, label.reshape(-1,1))
            
            val_loss.append(loss.item())
            
            model_pred = model_pred.squeeze(1).to('cpu')  
            pred_labels += model_pred.tolist()
    
    pred_labels = np.where(np.array(pred_labels) > threshold, 1, 0)
    val_score = metrics.f1_score(y_true=true_labels, y_pred=pred_labels, average='macro')
    return np.mean(val_loss), val_score

In [23]:
model = nn.DataParallel(ClassificationModel())

model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1, threshold_mode='abs',min_lr=1e-8, verbose=True)
scheduler = None

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.69816] Val Loss : [0.68247] Val Score : [0.27536]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.68059] Val Loss : [0.63513] Val Score : [0.51240]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.66264] Val Loss : [0.60371] Val Score : [0.63986]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.64995] Val Loss : [0.60523] Val Score : [0.63870]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.64508] Val Loss : [0.61122] Val Score : [0.60682]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [6], Train Loss : [0.64180] Val Loss : [0.58031] Val Score : [0.72780]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [7], Train Loss : [0.64211] Val Loss : [0.59177] Val Score : [0.68922]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [8], Train Loss : [0.64355] Val Loss : [0.58100] Val Score : [0.71820]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [9], Train Loss : [0.64586] Val Loss : [0.59613] Val Score : [0.64968]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [10], Train Loss : [0.63818] Val Loss : [0.58540] Val Score : [0.71717]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [11], Train Loss : [0.63210] Val Loss : [0.58244] Val Score : [0.71717]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [12], Train Loss : [0.63478] Val Loss : [0.58395] Val Score : [0.69697]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [13], Train Loss : [0.63846] Val Loss : [0.60473] Val Score : [0.64968]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [14], Train Loss : [0.63552] Val Loss : [0.59417] Val Score : [0.68997]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [15], Train Loss : [0.63705] Val Loss : [0.58932] Val Score : [0.71899]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [16], Train Loss : [0.63607] Val Loss : [0.59419] Val Score : [0.69988]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [17], Train Loss : [0.62968] Val Loss : [0.59983] Val Score : [0.64968]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [18], Train Loss : [0.63611] Val Loss : [0.60441] Val Score : [0.65986]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [19], Train Loss : [0.63013] Val Loss : [0.60187] Val Score : [0.64996]


  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch [20], Train Loss : [0.63100] Val Loss : [0.60139] Val Score : [0.66000]


## inference

In [26]:
base = './new_test/'

In [27]:
test_dataset = CustomDataset(test_df, None, test_transforms)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=0)

In [28]:
def inference(model, test_loader, device):
    #model.to(device)
#     model.eval()
    preds = []
    threshold = 0.5
    
    with torch.no_grad():
        for img in tqdm(iter(test_loader)): # , tabular   = 2nd
            img = img.float().to(device)
            # tabular = tabular.float().to(device)
            
            model_pred = model(img)
            
            model_pred = model_pred.squeeze(1).to('cpu')
            
            preds += model_pred.tolist()
    
    preds = np.where(np.array(preds) > threshold, 1, 0)
    
    return preds

In [29]:
submit = pd.read_csv('../open/sample_submission.csv')

In [30]:
preds = inference(infer_model, test_loader, device)

  0%|          | 0/32 [00:00<?, ?it/s]

In [31]:
submit['N_category'] = preds
submit.to_csv('./submit.csv', index=False)