# Skin Cancer Classification

### Load Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import random, os
from glob import glob
from keras.preprocessing.image import load_img, img_to_array
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout
from keras.optimizers import Adam

In [None]:
# Readable lesion name for each diagnosis code found in the `dx` column.
lesion_type_dict = dict(
    nv='Melanocytic nevi',
    mel='Melanoma',
    bkl='Benign keratosis-like lesions ',
    bcc='Basal cell carcinoma',
    akiec='Actinic keratoses',
    vasc='Vascular lesions',
    df='Dermatofibroma',
)

# Index every JPEG located exactly one directory below the working
# directory by its file name without the extension.
jpg_pattern = os.path.join('*', '*.jpg')
image_path_dict = {
    os.path.splitext(os.path.basename(jpg_path))[0]: jpg_path
    for jpg_path in glob(jpg_pattern)
}

In [None]:
# Read the metadata table and preview its first five rows.
metadata_csv = "HAM10000_metadata.csv"
df = pd.read_csv(metadata_csv)
df.head(n=5)

In [None]:
# Derive two lookup columns: the readable lesion name for each diagnosis
# code, and the image file path for each image id. Keys missing from a
# lookup table yield None.
for new_column, key_column, lookup in (
    ('cell_type', 'dx', lesion_type_dict),
    ('path', 'image_id', image_path_dict),
):
    df[new_column] = df[key_column].map(lookup.get)

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Replace missing ages with the mean age. The result is assigned back to the
# column instead of calling fillna(inplace=True) on the `df['age']`
# selection: pandas deprecates that chained in-place pattern, and with
# copy-on-write enabled it leaves `df` unchanged.
df['age'] = df['age'].fillna(df['age'].mean())

In [None]:
def preprocess_image(path):
    """Load one image file as a single-item batch array.

    The file at ``path`` is read with ``load_img`` using a 150x150 target
    size, converted with ``img_to_array``, and returned with a new leading
    axis of length 1.
    """
    pixels = img_to_array(load_img(path, target_size=(150, 150)))
    return pixels[np.newaxis, ...]

In [None]:
# Collect one image array (X) and one diagnosis code (y) per metadata row,
# showing a progress bar while the files are read.
X = []
y = []

for image_path, label in tqdm(zip(df['path'], df['dx']), total=df.shape[0]):
    X.append(preprocess_image(image_path)[0])
    y.append(label)

### Visualization

In [None]:
# Bar chart of how many rows fall into each cell type.
plt.figure(figsize=(18,6))
# The column is passed as `x=`: recent seaborn releases no longer accept it
# as the first positional argument (that slot is `data`).
sns.countplot(x=df['cell_type'], label='Count')

In [None]:
# Bar chart of how many rows fall into each `dx_type` value.
plt.figure(figsize=(18,6))
# The column is passed as `x=`: recent seaborn releases no longer accept it
# as the first positional argument (that slot is `data`).
sns.countplot(x=df['dx_type'], label='Count')

In [None]:
# Bar chart of how many rows fall into each `localization` value.
plt.figure(figsize=(18,6))
# The column is passed as `x=`: recent seaborn releases no longer accept it
# as the first positional argument (that slot is `data`).
sns.countplot(x=df['localization'], label='Count')

In [None]:
# Bar chart of how many rows fall into each `sex` value.
plt.figure(figsize=(6,6))
# The column is passed as `x=`: recent seaborn releases no longer accept it
# as the first positional argument (that slot is `data`).
sns.countplot(x=df['sex'], label='Count')

In [None]:
# Row counts per localization, with one bar per sex inside each group.
plt.figure(figsize=(18, 6))
sns.countplot(data=df, x='localization', hue='sex')

In [None]:
# Distribution of the `age` column as a 40-bin histogram.
plt.figure(figsize=(18, 6))
sns.histplot(data=df['age'], bins=40)

In [None]:
# Grid of example images: one row of axes per cell type, `num` randomly
# sampled images in each row (fixed seed so the same images are drawn).
num = 5
fig, axs = plt.subplots(7, num, figsize=(4*num, 3*7))
groups_by_type = df.sort_values(['cell_type']).groupby('cell_type')
for row_axes, (type_name, type_rows) in zip(axs, groups_by_type):
    # Label each row on its first axes.
    row_axes[0].set_title(type_name)
    picks = type_rows.sample(num, random_state=2)
    for cell_ax, image_path in zip(row_axes, picks['path']):
        cell_ax.imshow(load_img(image_path))
        cell_ax.axis('off')


### Preprocess

In [None]:
# Shuffle images and labels together, then convert both to arrays.
# The shuffle is seeded so the sample order is identical on every run;
# an unseeded shuffle here would change which samples the seeded
# train/validation/test splits pick.
X, y = shuffle(X, y, random_state=2)
X = np.array(X)
y = np.array(y)

In [None]:
# Hold out 20% of the samples for testing, then 20% of the remainder for
# validation. Both splits are stratified on the labels so that every subset
# keeps the same class proportions, which a plain random split does not
# guarantee.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)

In [None]:
# Side-by-side label counts for the train, validation and test subsets.
plt.figure(figsize=(15,5))
splits = (('Train', y_train), ('Validation', y_val), ('Test', y_test))
for position, (title, labels) in enumerate(splits, start=1):
    plt.subplot(1, 3, position)
    sns.countplot(x=labels)
    plt.title(title)
plt.show()

In [None]:
# Learn the label-to-integer mapping from the training labels only, then
# apply that same mapping to all three subsets.
le = LabelEncoder().fit(y_train)
y_train, y_val, y_test = (
    le.transform(labels) for labels in (y_train, y_val, y_test)
)

In [None]:
# One-hot encode the integer labels of every subset over 7 classes.
num_classes = 7
y_train, y_val, y_test = [
    to_categorical(encoded, num_classes)
    for encoded in (y_train, y_val, y_test)
]

In [None]:
# Random augmentations applied on top of rescaling for the training generator.
augmentation_options = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)

# Rescale-only generator (pixel values multiplied by 1/255, no augmentation).
test_datagen = ImageDataGenerator(rescale=1./255)

In [None]:
# Batch iterators over the in-memory arrays. Only the training batches are
# augmented; validation and test batches use the rescale-only generator so
# that their metrics are measured on unmodified images.
train_generator = train_datagen.flow(X_train, y_train, batch_size=32)
validation_generator = test_datagen.flow(X_val, y_val, batch_size=32)
test_generator = test_datagen.flow(X_test, y_test, batch_size=32)

### Training

In [None]:
# Small CNN: two convolutional stages (32 then 64 filters), each followed by
# max pooling and dropout, then a dense classifier head.
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), padding='Same', activation='relu', input_shape=(150, 150, 3)))
model.add(Conv2D(32, kernel_size=(3, 3), padding='Same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.25))
model.add(Conv2D(64, (3, 3), padding='Same', activation='relu'))
model.add(Conv2D(64, (3, 3), padding='Same', activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.40))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# Seven output units with softmax: one probability per class.
model.add(Dense(7, activation='softmax'))

optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999)

# The output is a 7-way softmax trained against one-hot labels, so the loss
# must be categorical cross-entropy. Binary cross-entropy would score each
# output unit as an independent yes/no problem and make the reported
# 'accuracy' a per-unit binary accuracy rather than class accuracy.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Number of batches needed to see every sample once. Rounding up (instead of
# floor division) includes the final, smaller batch, so no training or
# validation samples are left out of an epoch.
train_steps = int(np.ceil(train_generator.n / train_generator.batch_size))
val_steps = int(np.ceil(validation_generator.n / validation_generator.batch_size))

# Train for 20 epochs, validating after each one.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=train_steps,
    validation_data=validation_generator,
    validation_steps=val_steps,
    epochs=20)

### Evaluation

In [None]:
# Depending on the Keras version, the accuracy metric is recorded in the
# history as 'acc' or as 'accuracy'; use whichever key is present so the
# plot does not fail with a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# Training vs. validation accuracy per epoch.
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()

# Training vs. validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()

In [None]:
# Evaluate on the complete test set. The step count is rounded up so the
# final, smaller batch is included; floor division would skip those samples.
test_steps = int(np.ceil(test_generator.n / test_generator.batch_size))
model.evaluate_generator(test_generator, steps=test_steps)

In [None]:
# Bare string literal used as a note: it credits the Kaggle notebook listed
# below as inspiration. It is not assigned to anything.
'''
Inspiration
1. https://www.kaggle.com/sid321axn/step-wise-approach-cnn-model-77-0344-accuracy/notebook
'''