In [33]:
import matplotlib.pyplot as plt
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import random

In [30]:
# Global constants

TRAIN_FOLDER = './train_tfrecords'
TEST_FOLDER = './test_tfrecords'
TRAIN_CSV = './train.csv'
TEST_CSV = './test.csv'

VAL_RATIO = 0.2

In [43]:
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

if 1:
    print(f'Train examples\n{train_df.head()}\n')
    print(f'Test examples\n{test_df.head()}')

Train examples
     image_name  patient_id     sex  age_approx anatom_site_general_challenge  \
0  ISIC_2637011  IP_7279968    male        45.0                     head/neck   
1  ISIC_0015719  IP_3075186  female        45.0               upper extremity   
2  ISIC_0052212  IP_2842074  female        50.0               lower extremity   
3  ISIC_0068279  IP_6890425  female        45.0                     head/neck   
4  ISIC_0074268  IP_8723313  female        55.0               upper extremity   

  diagnosis benign_malignant  target  
0   unknown           benign       0  
1   unknown           benign       0  
2     nevus           benign       0  
3   unknown           benign       0  
4   unknown           benign       0  

Test examples
     image_name  patient_id     sex  age_approx anatom_site_general_challenge
0  ISIC_0052060  IP_3579794    male        70.0                           NaN
1  ISIC_0052349  IP_7782715    male        40.0               lower extremity
2  ISIC_0058510

In [34]:
# Split train and valid while making sure there is no patients simultaneously in train and valid
unique_patient_ids = set(train_df['patient_id'])
unique_patient_ids = list(unique_patient_ids)

if 1:
    print(unique_patient_ids[:5])
    
random.shuffle(unique_patient_ids)

if 1:
    print(unique_patient_ids[:5])

['IP_2482649', 'IP_1564049', 'IP_3078108', 'IP_2350511', 'IP_9802994']
['IP_5142207', 'IP_6275614', 'IP_1386934', 'IP_9086201', 'IP_8759634']


In [44]:
start_val_idx = int( (1 - VAL_RATIO) * len(unique_patient_ids))
train_ids = unique_patient_ids[:start_val_idx]
valid_ids = unique_patient_ids[start_val_idx:]

small_train_df = train_df[train_df['patient_id'].isin(train_ids)].sample(frac=1).reset_index(drop=True)
valid_df = train_df[train_df['patient_id'].isin(valid_ids)].sample(frac=1).reset_index(drop=True)

# Checking that there is no common patient id
a = set(small_train_df['patient_id'])
b = set(valid_df['patient_id'])
c = a.intersection(b)

assert len(c) == 0, 'Patients simultaneously in training and validation set'

# Checking the size
print(f'There are {len(small_train_df)} samples in the small training set.')
print(f'There are {len(valid_df)} samples in the validation set.')

if 1:
    print(f'Train examples\n{small_train_df.head()}\n')
    print(f'Validation examples\n{valid_df.head()}')

There are 26239 samples in the small training set.
There are 6887 samples in the validation set.
Train examples
     image_name  patient_id     sex  age_approx anatom_site_general_challenge  \
0  ISIC_9750403  IP_3710233    male        60.0                         torso   
1  ISIC_9271514  IP_3854976  female        45.0                     head/neck   
2  ISIC_8021960  IP_9045497    male        70.0               lower extremity   
3  ISIC_8078202  IP_4938349    male        70.0               lower extremity   
4  ISIC_8336956  IP_5561220  female        20.0               upper extremity   

  diagnosis benign_malignant  target  
0   unknown           benign       0  
1   unknown           benign       0  
2     nevus           benign       0  
3   unknown           benign       0  
4   unknown           benign       0  

Validation examples
     image_name  patient_id     sex  age_approx anatom_site_general_challenge  \
0  ISIC_6300814  IP_4208266  female        35.0                  

In [39]:
train_records_list = os.listdir(TRAIN_FOLDER)
test_records_list = os.listdir(TEST_FOLDER)

if 0:
    print(train_records_list)
    print(test_records_list)

In [50]:
train_patient_ids = train_df['patient_id']
test_patient_ids = test_df['patient_id']

print(f'{len(train_patient_ids)} present patient IDs with {len(set(train_patient_ids))} unique IDs in train set')
print(f'{len(test_patient_ids)} present patient IDs with {len(set(test_patient_ids))} unique IDs in test set')

33126 present patient IDs with 2056 unique IDs in train set
10982 present patient IDs with 690 unique IDs in test set
