# Setup

In [143]:
import random
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split

In [5]:
SEED = 1

random.seed(SEED)
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

# Data loading & preparation

In [26]:
df = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

In [27]:
df.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0


In [28]:
df.dtypes

image_name                        object
patient_id                        object
sex                               object
age_approx                       float64
anatom_site_general_challenge     object
diagnosis                         object
benign_malignant                  object
target                             int64
dtype: object

In [29]:
null_df = pd.DataFrame({'columns': df.columns, 
                        'percent_null': df.isnull().sum() * 100 / len(df), 
                        'percent_zero': df.isin([0]).sum() * 100 / len(df),
                        'total_zero': df.isnull().sum() * 100 / len(df) + df.isin([0]).sum() * 100 / len(df),
                        })
null_df

Unnamed: 0,columns,percent_null,percent_zero,total_zero
image_name,image_name,0.0,0.0,0.0
patient_id,patient_id,0.0,0.0,0.0
sex,sex,0.19622,0.0,0.19622
age_approx,age_approx,0.205277,0.006038,0.211314
anatom_site_general_challenge,anatom_site_general_challenge,1.590895,0.0,1.590895
diagnosis,diagnosis,0.0,0.0,0.0
benign_malignant,benign_malignant,0.0,0.0,0.0
target,target,0.0,98.237034,98.237034


In [30]:
# getting dummy variables for gender
sex_dummies = pd.get_dummies(df['sex'], prefix='sex', dtype="int")
df = pd.concat([df, sex_dummies], axis=1)

# getting dummy variables for anatom_site_general_challenge
anatom_dummies = pd.get_dummies(df['anatom_site_general_challenge'], prefix='anatom', dtype="int")
df = pd.concat([df, anatom_dummies], axis=1)

# dropping not useful columns
df.drop(['sex','diagnosis','benign_malignant','anatom_site_general_challenge', 'image_name', 'patient_id'], axis=1, inplace=True)

# replace missing age values wiht the mean age
df['age_approx'] = df['age_approx'].fillna(int(np.mean(df['age_approx'])))

# convert age to int
df['age_approx'] = df['age_approx'].astype('int')

In [31]:
null_df = pd.DataFrame({'columns': df.columns, 
                        'percent_null': df.isnull().sum() * 100 / len(df), 
                        'percent_zero': df.isin([0]).sum() * 100 / len(df),
                        'total_zero': df.isnull().sum() * 100 / len(df) + df.isin([0]).sum() * 100 / len(df),
                        })
null_df

Unnamed: 0,columns,percent_null,percent_zero,total_zero
age_approx,age_approx,0.0,0.006038,0.006038
target,target,0.0,98.237034,98.237034
sex_female,sex_female,0.0,51.756928,51.756928
sex_male,sex_male,0.0,48.439292,48.439292
anatom_head/neck,anatom_head/neck,0.0,94.400169,94.400169
anatom_lower extremity,anatom_lower extremity,0.0,74.590956,74.590956
anatom_oral/genital,anatom_oral/genital,0.0,99.625672,99.625672
anatom_palms/soles,anatom_palms/soles,0.0,98.867959,98.867959
anatom_torso,anatom_torso,0.0,49.148705,49.148705
anatom_upper extremity,anatom_upper extremity,0.0,84.957435,84.957435


In [37]:
# Scale age column
scaler = StandardScaler()
df[['age_approx']] = scaler.fit_transform(df[['age_approx']])

In [39]:
df.head(3)

Unnamed: 0,age_approx,target,sex_female,sex_male,anatom_head/neck,anatom_lower extremity,anatom_oral/genital,anatom_palms/soles,anatom_torso,anatom_upper extremity
0,-0.269274,0,0,1,1,0,0,0,0,0
1,-0.269274,0,1,0,0,0,0,0,0,1
2,0.078784,0,1,0,0,1,0,0,0,0


In [102]:
feature_columns = ['age_approx', 'sex_female', 'sex_male', 'anatom_head/neck',
       'anatom_lower extremity', 'anatom_oral/genital', 'anatom_palms/soles',
       'anatom_torso', 'anatom_upper extremity']

target_columns = ['target']

In [103]:
df_train, df_test = train_test_split(df, test_size=0.20, random_state=SEED)

x_train = df_train[feature_columns]
y_train = df_train[target_columns]

x_test = df_test[feature_columns]
y_test = df_test[target_columns]

In [147]:
# Source
# https://www.kaggle.com/teyang/melanoma-detection-using-effnet-and-meta-data
model = RandomForestClassifier(
    n_estimators=5000, 
    max_depth=5, 
    class_weight='balanced',
    n_jobs=-1, 
    random_state=SEED)

In [148]:
fold_no = 1
kf = StratifiedKFold(5, shuffle=True, random_state=SEED)

for train_indexes, test_index in kf.split(x_train, y_train):    
    
    x_train_fold = x_train.iloc[train_indexes]
    y_train_fold = list(y_train.iloc[train_indexes].loc[:,'target'])
    y_train_fold = [int(x) for x in y_train_fold]

    x_test_fold = x_train.iloc[test_index]
    y_test_fold = list(y_train.iloc[test_index].loc[:,'target'])
    y_test_fold = [int(x) for x in y_test_fold]


    model.fit(x_train_fold, y_train_fold)
    predictions = model.predict(x_test_fold)

    print('Fold',str(fold_no), 'roc_auc_score:', roc_auc_score(y_test_fold, predictions))
    fold_no += 1

Fold 1 roc_auc_score: 0.6772194465795542
Fold 2 roc_auc_score: 0.6191151037663336
Fold 3 roc_auc_score: 0.6493682744043043
Fold 4 roc_auc_score: 0.6086624071227261
Fold 5 roc_auc_score: 0.6220775685370228


In [149]:
dump(model, '../app/model-server/rf_model.joblib') 

['../app/model-server/rf_model.joblib']

In [150]:
predictions = model.predict(x_test)
print('roc_auc_score:', roc_auc_score(y_test, predictions))

roc_auc_score: 0.6746927558795084


In [151]:
model_2 = load('../app/model-server/rf_model.joblib')
predictions = model.predict(x_test)
print('roc_auc_score:', roc_auc_score(y_test, predictions))

roc_auc_score: 0.6746927558795084
