In [274]:
import pandas as pd
import numpy as np

# set data source
data_url = 'raw_data/ISIC_2019_Training_GroundTruth.csv'
categories= ['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC']
sizes =     [0.15,  0.25, 0.15,  0.10, 0.15,  0.05,  0.05,  0.10]

# to load the raw data
def load_raw_data():
    # load data
    data = pd.read_csv(data_url)
    # Added a Cat column for conviniences
    data['Cat']=''
    for c in categories:
        data.loc[data[c]>0, 'Cat']=c
        
    # Added a label/target column
    data['y']=0
    data['y']=(data.MEL+data.BCC+data.AK+data.SCC).astype(int)
    return data

def get_data_number(data, number):
    result = pd.DataFrame()
    for c in categories:
        result = pd.concat([result, data[data[c]>0].sample(number, random_state=42)], ignore_index=True)
    return result

def get_data_propotional(data, number):
    result = pd.DataFrame()
    numbers=np.multiply(sizes, number).astype(int)
    for i in range(8):
        result = pd.concat([result, data[data[categories[i]]>0].sample(numbers[i], random_state=42)], ignore_index=True)
    return result

def get_data(number=80):
    """
    when number is lesst than 800, return the 8 categories evenly.
    when the number is larger than 800 return the 8 categories as per the following percentages:
    NV=25%, BKL=15%, DF=5%, UASC=5%, MEL=15%, BCC=15%, AK=10%, SCC=10% 
    if the number is negative, return the entire dataset
    """
    data = load_raw_data()

    if number<0:
            result = data
    if (number<800):
            result =get_data_number(data, number=number//8)
    else:
            result = get_data_propotional(data, number=number)
    
    # randomize the data
    result.sample(frac=1, random_state=42).reset_index(drop=True, inplace=True)
    return result

In [277]:
data = get_data(80).reset_index(drop=True)
data.head(20)

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK,Cat,y
0,ISIC_0068778,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEL,1
1,ISIC_0058285,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEL,1
2,ISIC_0073194,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEL,1
3,ISIC_0058003,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEL,1
4,ISIC_0070105,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEL,1
5,ISIC_0064161,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEL,1
6,ISIC_0026652,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEL,1
7,ISIC_0053615,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEL,1
8,ISIC_0054787,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEL,1
9,ISIC_0061194,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MEL,1
