# Notebook- Create train, validation, test datasets 
# Author : V.Albors   Date : 20.01.2020
# Purpose :
Create the train, validation and tests datasets from the ISIC images.  
**Input** :  
  The .json files describing the type of images ( one .json per image )
  Files are located in directory :   
   * Json files : root_dir + json_dir 
  
   
**Output**:  
  Files :dftrain.csv, dfval.csv, dftest.csv - train, validation and test datasets. 
  Files separated by ";"  containing the following attributes per image :  
    * Image file name 
    * sex
    * benign/malignant
    * age 
    * site ( part where the nevus is located )
    * age_rang  (Rang of ages ['0-10', '11-20', '21-30', '31-40','41-50','51-60','61-70','71-80','81-90'])
    * age_rang2 (Rangs of ages ['0-30', '31-60', '61-91'] ) 
    * Image file name with extension .png
    
   * Convert the Crosstabs in dataframes to do a Chi ^2 test 
     Dataframes : 
        * table_bm_sex  
        * table_bm_age  
        * table_bm_sex_age
   * Save the data frames in the csv directory 
   
  These files are downloaded in directory : root_dir + csv_dir  

**Method** :
  Creation of a balanced train, validation, and test dataset 
  From total # of images 11.953  images reduce to a Total = 4.356 samples 
  
      * Total samples to test = 3.500  
      * 3.500  = 1.750 Benigm & 1.750 Malignant
      * Distribute the 3.500 images randomly into 60% train, 20% Val and 20% test
  
  * Read image characteristic from the .json file
  * Create groups acording to the ages ( Rangs of ages ['0-30'-young, '31-60'-middle, '61-91'-old]
  * Select the number of images for each of the 12 groups 
  * Create the .csv files that contain the train, validation and test datasets ( shuffled ) 
  * CSV files with fields :
        * internal image id
        * file name
        * sex  ( male, female )
        * bm ( benigm / malignant )
        * age
        * site ( location of the nevus ) 
        * age_rang   ( age rang : Rang of ages ['0-10', '11-20', '21-30', '31-40','41-50','51-60','61-70','71-80','81-90'] ) 
        * age_rang2 ( (Rangs of ages ['0-30', '31-60', '61-91'] ) 
        * file_name_ext  ( file name with extension ) 

In [1]:
# Import routines
import sys  
subrc_dir = "/home/user/Documentos/UOC/PFM/PFMProject/"
sys.path.append(subrc_dir) 
from  Models_routines import *
import inspect

# List functions inside the module
import Models_routines as module
functions = inspect.getmembers(module, inspect.isfunction)
lsfunctions = [item[0] for item in functions]
print ( lsfunctions )

['confusion_ROC_AUC', 'create_column_tensor', 'create_label_tensor', 'create_val_test', 'define_dirs', 'extract_images_bm', 'extract_images_train', 'load_hist_model', 'load_images', 'model_load', 'plot_save_acc_loss', 'print_network', 'process_clinical_info', 'read_dataframes', 'read_dataframes_tables', 'reproducible_results', 'save_model', 'save_network_json', 'start', 'stop', 'to_one_hot', 'to_one_hot_words', 'xi_squared']


In [2]:
# Reproducible results 
reproducible_results ()

Using TensorFlow backend.


In [3]:
# Define directories
(root_dir,json_dir,imag_dir,csv_dir,model_json_dir,model_bin_dir,results_dir,Tensor_dir) = define_dirs("")

In [4]:
print (csv_dir)
print (json_dir)
print (imag_dir)

/home/valborsf/Documents/UOC/PFMProject 
/home/valborsf/Documents/UOC/PFMProject/DataNew/ALL_JSON/
/home/valborsf/Documents/UOC/PFMProject/DataNew/ALL_IMAGES/


In [5]:
# New dataset without SONIC disturbing images
json_dir =  root_dir +"/DataNew/ALL_JSON/"                # .json dir images
imag_dir =  root_dir +"/DataNew/ALL_IMAGES/"              # .png dir - images

# directories for  CSV's
csv_dir =  root_dir +"/DataNew4/CSV/"                      # .csv dir - dftrain, dfval, dftest

In [6]:
# Get a list of the files that are located in the directory: root_dir +"/Data/ALL_JSON/"
import glob
import numpy

files_json = (glob.glob(json_dir+"*.json"))

print (len(files_json))                                # Number of images

11953


In [7]:
# Extract data from the .json file 
import json
import pandas as pd 

#data = json.load(open('/home/user/Documentos/UOC/PFM/Data/Original/Alot/ISIC-images/ISIC_0024524.json'

#  A dataframe is created (dfimages) with the following attributes : 
#                 file_name (image file), sex, benign-malignant, age
#  With .get load = None if no information exist

columns = ['file_name','sex', 'bm','age','site']

dfimages = pd.DataFrame(columns=columns)
data = {}
for i in range(len(files_json)):
   data[i] = json.load(open(files_json[i]))
   dfimages = dfimages.append ({'file_name': data[i]['name'],
                                'sex':       data[i]['meta']['clinical']['sex'],
                                'bm':        data[i]['meta']['clinical']['benign_malignant'],
                                'age':       data[i]['meta']['clinical']['age_approx'],
                                'site':      data[i]['meta']['clinical'].get('anatom_site_general')
                               },ignore_index=True)



In [8]:
# How to access info to the data
print(data[0]['meta']['clinical']['sex'])
print(data[0]['meta']['clinical']['benign_malignant'])
print(data[0]['meta']['clinical']['age_approx'])
print(data[0]['meta']['clinical']['anatom_site_general'])
print(dfimages.info())

male
benign
45
posterior torso
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11953 entries, 0 to 11952
Data columns (total 5 columns):
file_name    11953 non-null object
sex          11953 non-null object
bm           11953 non-null object
age          11953 non-null object
site         9888 non-null object
dtypes: object(5)
memory usage: 467.0+ KB
None


In [9]:
# Print type of dataframe attributes
# Convert attribute age to numeric 

dfimages["age"] = pd.to_numeric(dfimages["age"])
print(dfimages.dtypes)

file_name    object
sex          object
bm           object
age           int64
site         object
dtype: object


In [10]:
# Print example of data frame dfimages
print (dfimages[0:1])
print (dfimages['age'][3])

      file_name   sex      bm  age             site
0  ISIC_0032490  male  benign   45  posterior torso
20


In [11]:
# Try a first gouping for the images based on age  : age_rang
#     a second grouping                            : age_rang2
# age_rang = ['0-10', '11-20', '21-30', '31-40','41-50','51-60','61-70','71-80','81-90'] 
# age_rang2 = ['0-30', '31-60', '61-91'] 
# list compresion technique : [f(x) for x in sequence if condition]
#                             [f(x) if condition else g(x) for x in sequence]
# for index, row in df.iterrows():
#     df.at[index, 'new_column'] = new_value

for index, row in dfimages.iterrows():
    y = dfimages['age'][index]
    if y <= 10 :
        dfimages.at[index,'age_rang'] = '0-10'
        dfimages.at[index,'age_rang2'] = '0-30'
    elif  y >= 11 and y <= 20 :
        dfimages.at[index,'age_rang'] = '11-20'
        dfimages.at[index,'age_rang2'] = '0-30'
    elif  y >= 21 and y <= 30 :
        dfimages.at[index,'age_rang'] = '21-30'
        dfimages.at[index,'age_rang2'] = '0-30'
    elif  y >= 31 and y <= 40 :
        dfimages.at[index,'age_rang'] = '31-40'
        dfimages.at[index,'age_rang2'] ='31-60'
    elif  y >= 41 and y <= 50 :
        dfimages.at[index,'age_rang'] = '41-50'
        dfimages.at[index,'age_rang2'] ='31-60'
    elif  y >= 51 and y <= 60 :
        dfimages.at[index,'age_rang'] = '51-60'
        dfimages.at[index,'age_rang2'] ='31-60'
    elif  y >= 61 and y <= 70 :
        dfimages.at[index,'age_rang'] = '61-70'
        dfimages.at[index,'age_rang2'] ='61>91'
    elif  y >= 71 and y <= 80 :
        dfimages.at[index,'age_rang'] = '71-80'
        dfimages.at[index,'age_rang2'] ='61>91'
    elif  y >= 81 and y <= 90 :
        dfimages.at[index,'age_rang'] = '81-90'
        dfimages.at[index,'age_rang2'] ='61>91'
    elif  y >= 91 :
        dfimages.at[index,'age_rang'] = '>91'
        dfimages.at[index,'age_rang2'] ='61>91'

print(dfimages[0:15])

       file_name     sex         bm  age             site age_rang age_rang2
0   ISIC_0032490    male     benign   45  posterior torso    41-50     31-60
1   ISIC_0032224  female     benign   45             None    41-50     31-60
2   ISIC_0027277    male  malignant   40  lower extremity    31-40     31-60
3   ISIC_0034231  female     benign   20  posterior torso    11-20      0-30
4   ISIC_0032483  female     benign   45             None    41-50     31-60
5   ISIC_0034275    male  malignant   70             None    61-70     61>91
6   ISIC_0032218  female     benign   40             None    31-40     31-60
7   ISIC_0031441    male     benign   55             None    51-60     31-60
8   ISIC_0034263    male  malignant   80  upper extremity    71-80     61>91
9   ISIC_0024273    male  malignant   70             None    61-70     61>91
10  ISIC_0024813    male     benign   30  posterior torso    21-30      0-30
11  ISIC_0028322    male     benign   45             None    41-50     31-60

In [12]:
# Print crosstabs
#df_images[surveys_images.sex == "female"]
pd.crosstab (dfimages.sex, dfimages.age_rang, margins=True)
pd.crosstab ([dfimages.bm, dfimages.sex], dfimages.age_rang, margins=True)

Unnamed: 0_level_0,age_rang,0-10,11-20,21-30,31-40,41-50,51-60,61-70,71-80,81-90,All
bm,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
benign,female,91,221,647,1106,1406,721,370,179,77,4818
benign,male,93,163,425,897,1216,956,691,406,110,4957
malignant,female,1,5,59,114,129,165,169,105,69,816
malignant,male,1,9,39,47,168,267,423,268,140,1362
All,,186,398,1170,2164,2919,2109,1653,958,396,11953


In [13]:
pd.crosstab (dfimages.sex, dfimages.age_rang2, margins=True)

age_rang2,0-30,31-60,61>91,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,1024,3641,969,5634
male,730,3551,2038,6319
All,1754,7192,3007,11953


In [14]:
pd.crosstab (dfimages.bm, dfimages.age_rang2, margins=True)

age_rang2,0-30,31-60,61>91,All
bm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
benign,1640,6302,1833,9775
malignant,114,890,1174,2178
All,1754,7192,3007,11953


In [15]:
pd.crosstab ([dfimages.bm, dfimages.sex], dfimages.age_rang2, margins=True)

Unnamed: 0_level_0,age_rang2,0-30,31-60,61>91,All
bm,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
benign,female,959,3233,626,4818
benign,male,681,3069,1207,4957
malignant,female,65,408,343,816
malignant,male,49,482,831,1362
All,,1754,7192,3007,11953


In [16]:
#Contar nº de filas con una condicion en una columna
#df[df.a > 1].sum()   
#
# dfimages[dfimages.bm == 'malignant'].count()
print ('Malignant:', dfimages.bm.value_counts()['malignant'])
print ('Benign:', dfimages.bm.value_counts()['benign'])

Malignant: 2178
Benign: 9775


In [None]:
# Convert the Crosstabs in dataframes to do a Chi ^2 test 
# Save the data frames in the csv directory 
table_bm_sex  = pd.crosstab (dfimages.bm, dfimages.sex, margins=False)
table_bm_age  = pd.crosstab (dfimages.bm, dfimages.age_rang2, margins=False)
table_bm_sex_age  = pd.crosstab (dfimages.bm, [dfimages.sex, dfimages.age_rang2], margins=False)
#print( type(table_bm_sex ))
#print (  table_bm_sex.shape)
#print( type(table_bm_age ))
#print (  table_bm_age.shape)
#print( type(table_bm_sex_age ))
#print (  table_bm_sex_age.shape)
#print (  table_bm_sex_age)
table_bm_sex.to_csv(csv_dir+'table_bm_sex.csv')
table_bm_age.to_csv(csv_dir+'table_bm_age.csv')
table_bm_sex_age.to_csv(csv_dir+'table_bm_sex_age.csv')

In [17]:
#Shuffle dataframes in sklearn
from sklearn.utils import shuffle
# Shuffle before distributing 
dfimages_shuf = shuffle(dfimages, random_state=20)

# Images are selected 1/2 Malignant & 1/2 Benign  
# Create train dataframe
# def extract_images_bm ( df,benign,n_benign,n_malignant,perc_train, perc_val
# Selects the number of benigm and malignant samples, shuffles them and then extracts the % of training, validation
# and rest test 
# benigm = 'benign'
# n_benign = nº of benign samples
# n_malignant = nº of malignant samples
# perc_train = % training
# perc_val = % validation 

(dftrain, dfval, dftest)  = extract_images_bm(dfimages_shuf,'benign',1750, 1750, 60, 20)



In [18]:
print ( dftrain.shape)
print ( dfval.shape)
print ( dftest.shape)
print ( dftrain[1:3])
print ( dfval[1:3])
print ( dftest[1:3])

(2100, 7)
(700, 7)
(700, 7)
         file_name   sex         bm  age             site age_rang age_rang2
5043  ISIC_0028860  male  malignant   65  posterior torso    61-70     61>91
3076  ISIC_0025918  male     benign   75  posterior torso    71-80     61>91
         file_name     sex         bm  age             site age_rang age_rang2
6417  ISIC_0028244    male     benign   50  lower extremity    41-50     31-60
5959  ISIC_0032993  female  malignant   35   anterior torso    31-40     31-60
         file_name     sex         bm  age  site age_rang age_rang2
9498  ISIC_0024233    male  malignant   60  None    51-60     31-60
4964  ISIC_0029624  female     benign   45  None    41-50     31-60


In [19]:
print ('Benign Train :', dftrain.bm.value_counts()['benign'])
print ('Malignant Train:', dftrain.bm.value_counts()['malignant'])
print ('Benign val :', dfval.bm.value_counts()['benign'])
print ('Malignant val:', dfval.bm.value_counts()['malignant'])
print ('Benign Test :', dftest.bm.value_counts()['benign'])
print ('Malignant Test:', dftest.bm.value_counts()['malignant'])

Benign Train : 1049
Malignant Train: 1051
Benign val : 361
Malignant val: 339
Benign Test : 340
Malignant Test: 360


In [20]:
# Add extension to the file name
dftrain['file_name_ext'] = dftrain['file_name'] + ".jpg" 
dfval['file_name_ext'] = dfval['file_name'] + ".jpg" 
dftest['file_name_ext'] = dftest['file_name'] + ".jpg" 

In [21]:
# To check if file exist 
#from pathlib import Path
# my_file = Path("/path/to/file")
#if my_file.is_file():
    # file exists

In [22]:
print( csv_dir)

/home/valborsf/Documents/UOC/PFMProject/DataNew4/CSV/


In [23]:
# writte the result to a .csv  file
# Shuffle 
# Once the train, val and test are set, shuffle them
dftrain_shuf = shuffle(dftrain, random_state=20)
dfval_shuf = shuffle(dfval, random_state=20)
dftest_shuf = shuffle(dftest, random_state=20)

# Print head
print(dftrain_shuf[:5])
print(dftrain_shuf[210:215])

#Create files with shuffle result
dftrain_shuf.to_csv(csv_dir+'dftrain.csv')
dfval_shuf.to_csv(csv_dir+'dfval.csv')
dftest_shuf.to_csv(csv_dir+'dftest.csv')

         file_name     sex         bm  age             site age_rang  \
3921  ISIC_0025496  female     benign   45             None    41-50   
1791  ISIC_0033804  female  malignant   30  posterior torso    21-30   
7930  ISIC_0013707  female     benign   85  lower extremity    81-90   
8347  ISIC_0010901  female     benign   65  posterior torso    61-70   
5139  ISIC_0009882  female  malignant   60  upper extremity    51-60   

     age_rang2     file_name_ext  
3921     31-60  ISIC_0025496.jpg  
1791      0-30  ISIC_0033804.jpg  
7930     61>91  ISIC_0013707.jpg  
8347     61>91  ISIC_0010901.jpg  
5139     31-60  ISIC_0009882.jpg  
          file_name     sex         bm  age             site age_rang  \
11141  ISIC_0010834    male  malignant   55  lower extremity    51-60   
9383   ISIC_0033920  female     benign   25  upper extremity    21-30   
8651   ISIC_0010760    male     benign   45  posterior torso    41-50   
9761   ISIC_0001106    male  malignant   70  lower extremity    6

In [None]:
# Copy Images in directory Collab
#import os, shutil 

#collab_dir = root_dir + "/Data/COLLAB/"

#for index, row in dftrain.iterrows():
#    file = dftrain['file_name'][index]
#    file_jpg  = file + ".jpg"
#    file_json = file + ".json" 
#    dst = collab_dir + file_json     
#    src = json_dir + file_json 
#    shutil.copyfile ( src,dst)   # Copy file .json to COLLAB dir
#    dst = collab_dir + file_jpg 
#    src = imag_dir + file_jpg 
#    shutil.copyfile ( src,dst)   # Copy file .png to COLLAB dir

#for index, row in dfval.iterrows():
#    file = dfval['file_name'][index]
#    file_jpg  = file + ".jpg"
#    file_json = file + ".json" 
#    dst = collab_dir + file_json     
#    src = json_dir + file_json 
#    shutil.copyfile ( src,dst)   # Copy file .json to COLLAB dir
#    dst = collab_dir + file_jpg 
#    src = imag_dir + file_jpg 
#    shutil.copyfile ( src,dst)   # Copy file .png to COLLAB dir

#for index, row in dftest.iterrows():
#    file = dftest['file_name'][index]
#    file_jpg  = file + ".jpg"
#    file_json = file + ".json" 
#    dst = collab_dir + file_json     
#    src = json_dir + file_json 
#    shutil.copyfile ( src,dst)   # Copy file .json to COLLAB dir
#    dst = collab_dir + file_jpg 
#   src = imag_dir + file_jpg 
#    shutil.copyfile ( src,dst)   # Copy file .png to COLLAB dir

In [None]:
#!ls

In [24]:
# Train set of data 
pd.crosstab ([dftrain_shuf.bm, dftrain_shuf.sex], dftrain_shuf.age_rang2, margins=True)

Unnamed: 0_level_0,age_rang2,0-30,31-60,61>91,All
bm,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
benign,female,112,337,74,523
benign,male,80,320,126,526
malignant,female,32,207,164,403
malignant,male,21,219,408,648
All,,245,1083,772,2100


In [25]:
# Validation set of data
pd.crosstab ([dfval_shuf.bm, dfval_shuf.sex], dfval_shuf.age_rang2, margins=True)

Unnamed: 0_level_0,age_rang2,0-30,31-60,61>91,All
bm,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
benign,female,31,121,20,172
benign,male,34,114,41,189
malignant,female,13,51,57,121
malignant,male,7,78,133,218
All,,85,364,251,700


In [26]:
# Test set of data
pd.crosstab ([dftest_shuf.bm, dftest_shuf.sex], dftest_shuf.age_rang2, margins=True)

Unnamed: 0_level_0,age_rang2,0-30,31-60,61>91,All
bm,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
benign,female,34,113,22,169
benign,male,18,113,40,171
malignant,female,13,68,57,138
malignant,male,9,86,127,222
All,,74,380,246,700
