In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from glob import glob
from random import shuffle, seed
import cv2
import torch
import torch.nn as nn
from torch.nn import Module
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from torch.nn import BCELoss
from tqdm.notebook import tqdm
from torchinfo import summary

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [3]:
# Seed every random number generator this notebook imports so that a
# Restart & Run All reproduces the same shuffles and initial weights.
SEED = 42
torch.manual_seed(SEED)   # PyTorch
seed(SEED)                # Python's `random` (used by shuffle)
np.random.seed(SEED)      # NumPy's global generator

In [4]:
# Image directories, given relative to the notebook's working directory.
train_folder = 'train_images'
test_folder = 'test_images'

In [5]:
# CSV files holding the per-image metadata for the train and test sets.
train_metadata = 'train.csv'
test_metadata = 'test.csv'

In [6]:
# Load both metadata tables into DataFrames.
train_df = pd.read_csv(train_metadata)
test_df = pd.read_csv(test_metadata)

In [7]:
# Preview the first rows of the training metadata.
train_df.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True


In [8]:
# Names of the entries inside the training image directory.
# os.listdir returns them in arbitrary order.
train_folders = os.listdir(train_folder)

In [9]:
# Show a few of the entry names.
train_folders[:5]

['33624', '64153', '65117', '19605', '40910']

In [10]:
# All metadata rows of one patient; 33624 is the first folder name listed above.
train_df.loc[train_df['patient_id'] == 33624]

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
22215,1,33624,1111416410,L,MLO,56.0,0,0,0,,0,B,49,False
22216,1,33624,1462078398,L,CC,56.0,0,0,0,,0,B,49,False
22217,1,33624,788897862,R,CC,56.0,0,1,0,0.0,0,B,49,True
22218,1,33624,1362210397,R,MLO,56.0,0,1,0,0.0,0,B,49,True


In [11]:
# True when no image_id occurs twice in the training metadata.
train_df['image_id'].nunique() == len(train_df)

True

Each image_id is unique, so the image ids can be used to assign the scans to the folds.

# Getting the CV-Setup done

The training data will be split into 5 folds, where each fold should contain roughly the same number of cancer cases.

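For each patient, the left and right breast can probably be placed in different folds without causing data leakage. This is an assumption: images of the same patient may still look alike across sides, so a stricter setup would keep all scans of a patient in a single fold.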

The patient_id equals the folder in train_images and the image_id equals the scan inside the folder

In [12]:
# this function returns the image ids assigned to each fold (cancer folds first, then no-cancer folds)

def get_indices_folds(DF, n_folds=5):
    """Distribute the image ids of ``DF`` over label-separated folds.

    Images are grouped per breast (``patient_id`` + ``laterality``). Each
    group is placed as a whole into the currently smallest fold of its class,
    so all scans of one breast share a fold and the folds of a class end up
    with roughly the same number of images.

    Parameters
    ----------
    DF : pandas.DataFrame
        Needs the columns ``patient_id``, ``laterality``, ``image_id`` and
        ``cancer`` (0 or 1). The frame is only read, never modified.
        Only rows whose laterality is ``'L'`` or ``'R'`` are assigned.
    n_folds : int, default 5
        Number of folds per class.

    Returns
    -------
    tuple of list
        ``n_folds`` lists of cancer image ids followed by ``n_folds`` lists
        of no-cancer image ids (ten lists with the default).

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1, if the images of one breast carry
        more than one ``cancer`` value, or if that value is neither 0 nor 1.

    Notes
    -----
    The patient order comes from ``random.shuffle``; seed ``random`` right
    before the call to obtain the same folds again.
    """
    if n_folds < 1:
        raise ValueError(f'n_folds must be at least 1, got {n_folds}')

    # Take the patients from the frame that was passed in, not from a
    # notebook-level global, so the function works for any frame.
    patient_ids = list(DF['patient_id'].unique())
    shuffle(patient_ids)

    # One list of image ids per fold and class.
    cancer_folders = [[] for _ in range(n_folds)]
    no_cancer_folders = [[] for _ in range(n_folds)]

    # Group the frame once instead of filtering it for every patient and side.
    # Rows inside a group keep the order they have in DF.
    breasts = {}
    for key, current_df in DF.groupby(['patient_id', 'laterality'], sort=False):
        breasts[key] = (current_df['image_id'].values, current_df['cancer'].unique())

    # Visit the breasts in shuffled patient order, left side before right side.
    for ID in tqdm(patient_ids):
        for side in ['L', 'R']:
            if (ID, side) not in breasts:
                # This patient has no scans for this side.
                continue

            values_to_assign, cancer_value = breasts[(ID, side)]

            # All scans of one breast must agree on the label, otherwise the
            # breast cannot be placed in a single class.
            if len(cancer_value) != 1:
                raise ValueError(
                    f'patient {ID}, side {side}: got different cancer values '
                    f'{list(cancer_value)}'
                )
            cancer_value = cancer_value[0]

            if cancer_value == 0:
                folders = no_cancer_folders
            elif cancer_value == 1:
                folders = cancer_folders
            else:
                raise ValueError(
                    f'patient {ID}, side {side}: unexpected cancer value {cancer_value!r}'
                )

            # min() returns the first of several equally small folds, which
            # matches the tie-breaking of argmin.
            min(folders, key=len).extend(values_to_assign)

    return (*cancer_folders, *no_cancer_folders)

In [13]:
# Build the five cancer folds and the five no-cancer folds from the training metadata.
(
    cancer_fold_1,
    cancer_fold_2,
    cancer_fold_3,
    cancer_fold_4,
    cancer_fold_5,
    no_cancer_fold_1,
    no_cancer_fold_2,
    no_cancer_fold_3,
    no_cancer_fold_4,
    no_cancer_fold_5,
) = get_indices_folds(train_df)

  0%|          | 0/11913 [00:00<?, ?it/s]

In [15]:
# Print the running total of the fold sizes, cancer folds first.
all_folds = [
    cancer_fold_1, cancer_fold_2, cancer_fold_3, cancer_fold_4, cancer_fold_5,
    no_cancer_fold_1, no_cancer_fold_2, no_cancer_fold_3, no_cancer_fold_4, no_cancer_fold_5,
]
sum_len = 0
for folder in all_folds:
    sum_len = sum_len + len(folder)
    print(sum_len)


234
466
696
928
1158
11869
22578
33287
43997
54706


In [16]:
# The summed fold sizes should match the number of rows in train_df.
sum_len == len(train_df)

True

In [17]:
# Inspect the image ids assigned to the first cancer fold.
# NOTE(review): this displays the whole list; consider a slice such as cancer_fold_1[:10].
cancer_fold_1

[1274788012,
 1306633485,
 812707999,
 1256209009,
 1847998717,
 568569032,
 637984831,
 602369659,
 2132016159,
 759765009,
 1456539286,
 1705616801,
 698067885,
 1846840029,
 1863446,
 783906176,
 1874344878,
 1283242087,
 2058514481,
 1338411990,
 1568613395,
 401151862,
 825403089,
 1597116280,
 235064489,
 274766802,
 1481006643,
 1015929339,
 1494928875,
 212955895,
 623439984,
 892289164,
 1350576272,
 1378580532,
 259642159,
 1923100903,
 551129512,
 1191890529,
 515330003,
 1291644410,
 2142591939,
 4493744,
 2082874992,
 853840135,
 1956200588,
 42886513,
 685765674,
 11944138,
 102668587,
 998306444,
 1319601220,
 943062334,
 1735459739,
 1106334075,
 1731900075,
 1925785468,
 238449586,
 390797277,
 776982430,
 1824044872,
 728564060,
 1167308429,
 1719011144,
 384023835,
 1998384452,
 406764921,
 1897884654,
 187375781,
 831862963,
 378623093,
 801684731,
 1510269247,
 1710665236,
 22058316,
 732919056,
 291721849,
 1151546567,
 18384498,
 286472263,
 1096308702,
 18100045