## Create Experiment

In [None]:
%%capture
# Create an experiment comet_ml experiment
!pip install --upgrade comet_ml --quiet

In [None]:
import json
from comet_ml import Experiment

experiment = False
path_to_auth_file = '/content/drive/MyDrive/Colab Notebooks/_auth/comet.json'

In [None]:
with open(path_to_auth_file) as json_file:
    data = json.load(json_file)
    experiment = Experiment(
        api_key=data['api_key'],
        project_name=data['project_name'],
        workspace=data['workspace'],
        log_code=True,
        auto_histogram_tensorboard_logging=True,
        auto_histogram_weight_logging=True,
        auto_histogram_gradient_logging=True,
        auto_histogram_activation_logging=True,
    )
    experiment.add_tag('randomForest')

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/saschamet/master-thesis/096a5db70a8243f0b419c27c8c92c875



## Setup

In [1]:
import random
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split

In [2]:
SEED = 1
random.seed(SEED)
np.random.seed(SEED)

## Data Loading

In [3]:
%%capture
!wget https://raw.githubusercontent.com/SaschaMet/melanoma-classification/master/data/train.csv
!wget https://raw.githubusercontent.com/SaschaMet/melanoma-classification/master/data/test.csv

In [4]:
df_train = pd.read_csv("/content/train.csv")
df_test = pd.read_csv("/content/test.csv")

In [5]:
df_train.head(3)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0


In [6]:
df_train.dtypes

image_name                        object
patient_id                        object
sex                               object
age_approx                       float64
anatom_site_general_challenge     object
diagnosis                         object
benign_malignant                  object
target                             int64
dtype: object

In [7]:
df_test.head(3)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,width,height
0,ISIC_0052060,IP_3579794,male,70.0,,6000,4000
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity,6000,4000
2,ISIC_0058510,IP_7960270,female,55.0,torso,6000,4000


In [8]:
df_test.dtypes

image_name                        object
patient_id                        object
sex                               object
age_approx                       float64
anatom_site_general_challenge     object
width                              int64
height                             int64
dtype: object

## Data analysis + preparation

In [9]:
df = df_train

In [10]:
null_df = pd.DataFrame({'columns': df.columns, 
                        'percent_null': df.isnull().sum() * 100 / len(df), 
                        'percent_zero': df.isin([0]).sum() * 100 / len(df),
                        'total_zero': df.isnull().sum() * 100 / len(df) + df.isin([0]).sum() * 100 / len(df),
                        })
null_df

Unnamed: 0,columns,percent_null,percent_zero,total_zero
image_name,image_name,0.0,0.0,0.0
patient_id,patient_id,0.0,0.0,0.0
sex,sex,0.19622,0.0,0.19622
age_approx,age_approx,0.205277,0.006038,0.211314
anatom_site_general_challenge,anatom_site_general_challenge,1.590895,0.0,1.590895
diagnosis,diagnosis,0.0,0.0,0.0
benign_malignant,benign_malignant,0.0,0.0,0.0
target,target,0.0,98.237034,98.237034


In [11]:
# getting dummy variables for gender
sex_dummies = pd.get_dummies(df['sex'], prefix='sex', dtype="int")
df = pd.concat([df, sex_dummies], axis=1)

# getting dummy variables for anatom_site_general_challenge
anatom_dummies = pd.get_dummies(df['anatom_site_general_challenge'], prefix='anatom', dtype="int")
df = pd.concat([df, anatom_dummies], axis=1)

# dropping not useful columns
df.drop(['sex','diagnosis','benign_malignant','anatom_site_general_challenge', 'image_name', 'patient_id'], axis=1, inplace=True)

# replace missing age values wiht the mean age
df['age_approx'] = df['age_approx'].fillna(int(np.mean(df['age_approx'])))

# convert age to int
df['age_approx'] = df['age_approx'].astype('int')

In [12]:
null_df = pd.DataFrame({'columns': df.columns, 
                        'percent_null': df.isnull().sum() * 100 / len(df), 
                        'percent_zero': df.isin([0]).sum() * 100 / len(df),
                        'total_zero': df.isnull().sum() * 100 / len(df) + df.isin([0]).sum() * 100 / len(df),
                        })
null_df

Unnamed: 0,columns,percent_null,percent_zero,total_zero
age_approx,age_approx,0.0,0.006038,0.006038
target,target,0.0,98.237034,98.237034
sex_female,sex_female,0.0,51.756928,51.756928
sex_male,sex_male,0.0,48.439292,48.439292
anatom_head/neck,anatom_head/neck,0.0,94.400169,94.400169
anatom_lower extremity,anatom_lower extremity,0.0,74.590956,74.590956
anatom_oral/genital,anatom_oral/genital,0.0,99.625672,99.625672
anatom_palms/soles,anatom_palms/soles,0.0,98.867959,98.867959
anatom_torso,anatom_torso,0.0,49.148705,49.148705
anatom_upper extremity,anatom_upper extremity,0.0,84.957435,84.957435


In [13]:
# Scale age column
scaler = StandardScaler()
df[['age_approx']] = scaler.fit_transform(df[['age_approx']])

In [14]:
df.head(3)

Unnamed: 0,age_approx,target,sex_female,sex_male,anatom_head/neck,anatom_lower extremity,anatom_oral/genital,anatom_palms/soles,anatom_torso,anatom_upper extremity
0,-0.269274,0,0,1,1,0,0,0,0,0
1,-0.269274,0,1,0,0,0,0,0,0,1
2,0.078784,0,1,0,0,1,0,0,0,0


In [15]:
# Get the feature and target columns
feature_columns = ['age_approx', 'sex_female', 'sex_male', 'anatom_head/neck',
       'anatom_lower extremity', 'anatom_oral/genital', 'anatom_palms/soles',
       'anatom_torso', 'anatom_upper extremity']

target_columns = ['target']

In [16]:
# Get train and test datasets to train the rf classifier
df_train, df_test = train_test_split(df, test_size=0.20, random_state=SEED)

x_train = df_train[feature_columns]
y_train = df_train[target_columns]

x_test = df_test[feature_columns]
y_test = df_test[target_columns]

## Training

In [17]:
# Source
model = RandomForestClassifier(
    n_estimators=5000, 
    max_depth=5, 
    class_weight='balanced',
    n_jobs=-1, 
    random_state=SEED)

In [18]:
model.fit(x_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5000,
                       n_jobs=-1, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [19]:
predictions = model.predict(x_test)
print('roc_auc_score:', roc_auc_score(y_test, predictions))

roc_auc_score: 0.6724916259759866


In [22]:
dump(model, 'rf_model.joblib') 

['rf_model.joblib']

In [None]:
experiment.end()