In [2]:
!pip install -U -q PyDrive

In [7]:
# Authentication for loading data from Google Drive
# Import packages
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import drive
from os import path

In [8]:
# Authenticate User
# Run the google.colab authentication step for the current session.
auth.authenticate_user()
# Create a PyDrive auth object and give it the application-default Google
# credentials obtained through oauth2client.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# PyDrive client built on top of that auth object.
# NOTE(review): `auth_drive` is not used by any of the cells visible here.
auth_drive = GoogleDrive(gauth)

In [51]:
# Directory inside the Colab filesystem where Google Drive is mounted.
DRIVE_PATH = '/content/drive'
# Mount Google Drive at DRIVE_PATH. As the recorded output shows, calling this
# when the drive is already mounted only prints a notice; a forced remount
# needs force_remount=True.
drive.mount(DRIVE_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
# Folder on the mounted Drive that holds the ISIC skin-cancer dataset files.
DATA_PATH = path.join(
    DRIVE_PATH,
    'My Drive',
    'LinkedIn_Articles',
    'Datasets',
    'ISIC_Skin_Cancer',
)

In [65]:
# Import necessary packages
import pandas as pd
# glob is used to list files instead of the `ls` shell command, because of
# double-quote handling problems with `ls` on Colab.
import glob

In [66]:
# Show every entry that sits directly inside the dataset folder.
glob.glob(path.join(DATA_PATH, '*'))

['/content/drive/My Drive/LinkedIn_Articles/Datasets/ISIC_Skin_Cancer/ISIC_2019_Training_GroundTruth.csv',
 '/content/drive/My Drive/LinkedIn_Articles/Datasets/ISIC_Skin_Cancer/ISIC_2019_Training_Metadata.csv',
 '/content/drive/My Drive/LinkedIn_Articles/Datasets/ISIC_Skin_Cancer/ISIC_2019_Training_Input']

In [67]:
# Locate the two training CSV files inside the dataset folder ...
truth_csv = path.join(DATA_PATH, 'ISIC_2019_Training_GroundTruth.csv')
meta_csv = path.join(DATA_PATH, 'ISIC_2019_Training_Metadata.csv')

# ... and load each one into its own DataFrame.
df_train_truth = pd.read_csv(truth_csv)
df_train_meta = pd.read_csv(meta_csv)

In [62]:
# Summarise the ground-truth table: row count, columns, non-null counts, dtypes.
df_train_truth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25331 entries, 0 to 25330
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   image   25331 non-null  object 
 1   MEL     25331 non-null  float64
 2   NV      25331 non-null  float64
 3   BCC     25331 non-null  float64
 4   AK      25331 non-null  float64
 5   BKL     25331 non-null  float64
 6   DF      25331 non-null  float64
 7   VASC    25331 non-null  float64
 8   SCC     25331 non-null  float64
 9   UNK     25331 non-null  float64
dtypes: float64(9), object(1)
memory usage: 1.9+ MB


In [69]:
# Summarise the metadata table; the non-null counts reveal which columns have
# missing values.
df_train_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25331 entries, 0 to 25330
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   image                25331 non-null  object 
 1   age_approx           24894 non-null  float64
 2   anatom_site_general  22700 non-null  object 
 3   lesion_id            23247 non-null  object 
 4   sex                  24947 non-null  object 
dtypes: float64(1), object(4)
memory usage: 989.6+ KB


In [71]:
# Combine metadata and ground-truth labels by matching on the shared `image`
# column instead of gluing the tables together by row position. A positional
# concat silently attaches labels to the wrong image if the two CSVs are ever
# ordered differently. validate='one_to_one' makes pandas raise if an image id
# is repeated in either table; the inner join keeps the row order of the
# metadata table and yields a single `image` column.
df_train = pd.merge(
    df_train_meta,
    df_train_truth,
    on='image',
    how='inner',
    validate='one_to_one',
)
# With unique keys on both sides, equal lengths mean every image was matched.
assert len(df_train) == len(df_train_meta) == len(df_train_truth), (
    'metadata and ground-truth tables do not cover the same set of images'
)

In [86]:
# Both source tables carry an `image` column. Flag every column whose name has
# already appeared further to the left and keep only the first occurrence.
repeated_column = df_train.columns.duplicated(keep='first')
df_train = df_train.iloc[:, ~repeated_column]

In [88]:
# Display the combined table (the notebook renders a truncated preview).
df_train

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
0,ISIC_0000000,55.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0000001,30.0,anterior torso,,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0000002,60.0,upper extremity,,female,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ISIC_0000003,30.0,upper extremity,,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0000004,80.0,posterior torso,,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
# Check na for useful columns
# For every column of interest, print the distinct values of its null mask.
# A `True` in the printed array means the column has at least one missing value.
columns_to_check = [
    'image', 'age_approx', 'anatom_site_general', 'sex',
    'MEL', 'NV', 'BCC', 'AK',
    'BKL', 'DF', 'VASC', 'SCC', 'UNK',
]

for column_name in columns_to_check:
    print(df_train[column_name].isna().unique())

[False]
[False  True]
[False  True]
[False  True]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]
[False]


In [100]:
# Values used to fill the gaps found above: the mean of the known ages for the
# numeric column, and the literal 'Unknown' for the two text columns.
age_mean = df_train['age_approx'].mean()
replacement_values = {
    'age_approx': age_mean,
    'anatom_site_general': 'Unknown',
    'sex': 'Unknown',
}

In [103]:
# Fill missing values column by column using the mapping built above. Columns
# that are not keys of `replacement_values` (e.g. lesion_id) are left untouched.
df_train = df_train.fillna(value=replacement_values)

In [105]:
# Keep the columns needed downstream: the metadata fields plus the
# ground-truth class columns. lesion_id is left out.
metadata_columns = ['image', 'age_approx', 'anatom_site_general', 'sex']
ground_truth_columns = ['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC', 'UNK']

df_required = df_train[metadata_columns + ground_truth_columns]
df_required

Unnamed: 0,image,age_approx,anatom_site_general,sex,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
0,ISIC_0000000,55.0,anterior torso,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0000001,30.0,anterior torso,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0000002,60.0,upper extremity,female,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ISIC_0000003,30.0,upper extremity,male,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0000004,80.0,posterior torso,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,female,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25327,ISIC_0073248,65.0,anterior torso,male,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25328,ISIC_0073249,70.0,lower extremity,male,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25329,ISIC_0073251,55.0,palms/soles,female,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
