In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import zipfile

# Unzip the dataset into ./archive_file.
# The context manager closes the archive handle even if extraction fails
# part-way through, which the explicit close() call did not guarantee.
local_zip = 'archive.zip'
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall('archive_file')

In [None]:
import os
from glob import glob
from PIL import Image
# Locations (relative to the notebook) of the two extracted image folders and
# of the metadata CSV. os.path.join with a single argument returns that
# argument unchanged, so plain string literals yield the same values.
HAM10000_images_part_1 = './archive_file/HAM10000_images_part_1'
HAM10000_images_part_2 = './archive_file/HAM10000_images_part_2'
HAM10000_metadata = './archive_file/HAM10000_metadata.csv'

In [None]:
# Echo the part-1 image directory path to confirm it was set.
HAM10000_images_part_1

In [None]:
# Collect the file names found in each image folder.
# os.listdir returns entries in arbitrary, filesystem-dependent order, so the
# listings are sorted to make them (and anything derived from them) reproducible.
HAM10000_images_part_1_name = sorted(os.listdir(HAM10000_images_part_1))
print(HAM10000_images_part_1_name[:10])
HAM10000_images_part_2_name = sorted(os.listdir(HAM10000_images_part_2))

In [None]:
# Turn the bare file names of part 1 into paths relative to the notebook.
# Taking the basename first keeps the cell idempotent: running it a second time
# does not stack the directory prefix onto already-prefixed entries.
HAM10000_images_part_1_name = [
    HAM10000_images_part_1 + '/' + os.path.basename(file_name)
    for file_name in HAM10000_images_part_1_name
]

In [None]:
# Turn the bare file names of part 2 into paths relative to the notebook.
# Taking the basename first keeps the cell idempotent: running it a second time
# does not stack the directory prefix onto already-prefixed entries.
HAM10000_images_part_2_name = [
    HAM10000_images_part_2 + '/' + os.path.basename(file_name)
    for file_name in HAM10000_images_part_2_name
]

In [None]:
# Single list holding every image path: part 1 entries first, then part 2.
HAM10000_images_part_merged = [*HAM10000_images_part_1_name, *HAM10000_images_part_2_name]

In [None]:
# Sanity check: the two numbers shown should match, i.e. the merged list is as
# long as the two part lists combined.
len(HAM10000_images_part_1_name)+len(HAM10000_images_part_2_name),len(HAM10000_images_part_merged)

In [None]:
# Load the metadata CSV into a DataFrame; later cells add derived columns to it.
df = pd.read_csv(HAM10000_metadata)

In [None]:
# Display the metadata table as loaded.
df

In [None]:
# Preview only the first few merged paths; echoing the complete list floods the
# cell output without adding information.
HAM10000_images_part_merged[:10]

In [None]:
# Key every image path by its file stem (file name without extension).
# Zipping the metadata rows with the directory listing pairs them purely by
# position, which attaches images to the wrong rows (and therefore the wrong
# labels) whenever the CSV order and the listing order differ.
# Assumes image_id equals the file stem (e.g. ISIC_0027419 <-> ISIC_0027419.jpg),
# as in the published HAM10000 layout; the check below fails loudly otherwise.
image_mapping = {
    os.path.splitext(os.path.basename(image_path))[0]: image_path
    for image_path in HAM10000_images_part_merged
}

# Add a new column to the DataFrame with image addresses using the mapping
df['path'] = df['image_id'].map(image_mapping)

# Surface unmatched rows here rather than as an obscure failure when loading images.
missing_paths = int(df['path'].isna().sum())
if missing_paths:
    raise FileNotFoundError(f'{missing_paths} metadata rows have no matching image file')

In [None]:
# Display the table again to inspect the 'path' column added by the previous cell.
df

In [None]:
def _load_resized(image_path):
    """Open the image at image_path, resize it to 96x96 and return it as a NumPy array."""
    picture = Image.open(image_path)
    return np.asarray(picture.resize((96, 96)))


# Load every image referenced in the 'path' column into a new 'image' column.
df['image'] = df['path'].map(_load_resized)

In [None]:
# Number of rows per diagnosis code (dx), i.e. the class distribution.
print(df['dx'].value_counts())

In [None]:
# Maps each short diagnosis code in the 'dx' column to a readable lesion name.
# 'mel' is melanoma; it was previously labelled 'dermatofibroma', which
# duplicated (in different case) the name that belongs to 'df'.
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

In [None]:
# Translate each diagnosis code into its readable name; dict.get returns None
# for any code that is missing from lesion_type_dict.
df['cell_type'] = df['dx'].map(lesion_type_dict.get)

In [None]:
# Encode the readable class names as integer category codes; this column is the
# label that later cells one-hot encode.
df['cell_type_idx'] = pd.Categorical(df['cell_type']).codes

In [None]:
# List the distinct class codes that were produced.
df['cell_type_idx'].unique()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Replace missing ages with the median age.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# df['age'] selection is chained assignment, which newer pandas warns about and
# which does not update df once copy-on-write is active.
df['age'] = df['age'].fillna(df['age'].median())

In [None]:
# Tally the array shapes of the loaded images to check whether they are uniform.
df['image'].map(lambda x: x.shape).value_counts()

In [None]:
# Divide every image array by 255 (presumably 8-bit pixel values, giving a
# 0-1 range -- TODO confirm the dtype produced by the loader).
# NOTE(review): this overwrites its own input, so running the cell twice
# divides by 255 twice; re-run the image-loading cell first if you repeat it.
df['image']=df['image']/255

In [None]:
# Preview the first rows of the processed table.
df.head()

In [None]:
from tensorflow.keras.utils import to_categorical
# Features: every column except the integer label.
X = df.drop(columns=['cell_type_idx'])
# Target: the integer label, one-hot encoded over the 7 lesion classes.
y = to_categorical(df['cell_type_idx'], num_classes=7)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

# Stack the per-row image arrays of each split into a single array.
X_train_i = np.asarray(X_train['image'].tolist())
X_test_i = np.asarray(X_test['image'].tolist())

# Standardise both splits with the TRAINING mean and standard deviation.
# Using the test split's own statistics would transform it differently from the
# data the model is fitted on and would use information from the held-out set.
train_mean = np.mean(X_train_i)
train_std = np.std(X_train_i)
X_train_norm = (X_train_i - train_mean)/train_std
X_test_norm = (X_test_i - train_mean)/train_std

In [None]:
#let's create a balanced dataset as well

In [None]:
# Number of rows drawn for every class of the balanced data set.
desired_rows_per_class = 600

grouped = df.groupby('cell_type_idx')

# Sample the same number of rows from each class. replace=True lets classes with
# fewer than desired_rows_per_class rows be oversampled, so the same image can
# appear in several rows of the result.
sampled_groups = [
    data.sample(n=desired_rows_per_class, replace=True, random_state=42)
    for _, data in grouped
]

# Concatenate once: DataFrame.append was removed in pandas 2.0, and growing a
# frame inside a loop re-copies it on every iteration.
balanced_df = pd.concat(sampled_groups).reset_index(drop=True)

In [None]:
# Preview the first rows of the balanced table.
balanced_df.head()

In [None]:
# Balanced features: every column except the integer label.
X_sample = balanced_df.drop(columns=['cell_type_idx'])
# Balanced target: the integer label, one-hot encoded over the 7 lesion classes.
y_sample = to_categorical(balanced_df['cell_type_idx'], num_classes=7)

In [None]:
# Stack the per-row image arrays of the balanced set into one NumPy array
# (presumably shaped (rows, 96, 96, 3) -- TODO confirm against the shape tally).
X_sample_i = np.asarray(X_sample['image'].tolist())

In [None]:
# Standardise the balanced images with their global mean and standard deviation.
# NOTE(review): the statistics are computed before the train/test split below,
# so they include the samples that end up in the test split.
X_sample_norm = (X_sample_i - np.mean(X_sample_i))/np.std(X_sample_i)

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# balanced_df was sampled with replacement, so the same image can occupy several
# rows. A plain random split would put copies of one image on both sides and
# inflate the validation/test scores. Splitting by image_id keeps every copy of
# an image on one side only.
# test_size is the share of distinct images, so the row share is only about 20%.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_rows, test_rows = next(
    splitter.split(X_sample_norm, groups=balanced_df['image_id'])
)
X_train_s, X_test_s = X_sample_norm[train_rows], X_sample_norm[test_rows]
y_train_s, y_test_s = y_sample[train_rows], y_sample[test_rows]

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import  Conv2D,MaxPool2D, Dropout, Flatten,  BatchNormalization

In [None]:
#model 1
#suggested by autokeras
# Three Conv -> MaxPool -> Dropout stages followed by a small dense classifier.
num_classes = 7

model = Sequential()
model.add(Conv2D(256, (3, 3), activation="relu", input_shape=(96, 96, 3)))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Dropout(0.3))

model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Dropout(0.3))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Dropout(0.3))
model.add(Flatten())

# The hidden dense layer needs a non-linearity: without one it is a purely
# linear map that merges with the output layer into a single linear layer.
model.add(Dense(32, activation='relu'))
# Output width follows num_classes instead of repeating the literal 7.
model.add(Dense(num_classes, activation='softmax'))
model.summary()

In [None]:
# Configure training: categorical cross-entropy (the targets are one-hot
# vectors), the Adam optimizer, and accuracy reported under the key 'acc'.
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])

In [None]:
# Train model 1 on the balanced training split.
# verbose=2 prints one summary line per epoch, matching the model 2 run; Keras
# documents only 0, 1 and 2 (plus "auto") for this argument, so the previous
# value of 10 was not a supported setting.
# NOTE(review): the split passed as validation_data is also the one used for the
# final evaluation, so that score is not an untouched hold-out estimate.
history = model.fit(
    X_train_s, y_train_s,
    epochs=50,
    batch_size = 40,
    validation_data=(X_test_s, y_test_s),
    verbose=2)

In [None]:
# evaluate returns [loss, accuracy] for a model compiled with a single metric,
# so report the two numbers separately instead of printing the whole list under
# the label "Test accuracy".
score = model.evaluate(X_test_s, y_test_s)
test_loss, test_acc = score
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

In [None]:
# Loss curves: one point per epoch for the training and the validation data.
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
for series, fmt, name in [(loss, 'y', 'Training loss'), (val_loss, 'r', 'Validation loss')]:
    plt.plot(epochs, series, fmt, label=name)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
# Accuracy curves: one point per epoch for the training and the validation data.
# Reuses the `epochs` range defined by the loss-plot cell.
acc = history.history['acc']
val_acc = history.history['val_acc']
for series, fmt, name in [(acc, 'y', 'Training acc'), (val_acc, 'r', 'Validation acc')]:
    plt.plot(epochs, series, fmt, label=name)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training and validation accuracy')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
# Class probabilities predicted for the held-out samples
y_pred = model.predict(X_test_s)
# Collapse each probability row to the index of its most likely class
y_pred_classes = y_pred.argmax(axis=1)
# Collapse the one-hot test labels back to class indices
y_true = y_test_s.argmax(axis=1)
# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_true, y_pred_classes)
cm

In [None]:
# model2 
from keras.applications import MobileNetV2
from tensorflow.keras.applications.vgg16 import VGG16

In [None]:
# Pre-trained MobileNetV2 feature extractor without its ImageNet classification head.
base_model=MobileNetV2(include_top=False, weights="imagenet", input_shape=(96,96,3))

model2 = Sequential()
model2.add(base_model)
# MobileNetV2 downsamples by a factor of 32, so a 96x96 input leaves a 3x3
# feature map. 'same' padding keeps it 3x3; an unpadded 3x3 convolution would
# shrink it to 1x1, and the 2x2 pooling below could then not be applied.
model2.add(Conv2D(64, (3, 3), activation = 'relu', padding = 'same'))
# MaxPool2D is the pooling layer this notebook imports; MaxPooling2D was never
# imported and raised a NameError here.
model2.add(MaxPool2D(pool_size = (2, 2)))
model2.add(Dropout(0.40))
model2.add(Flatten())
model2.add(Dense(128,activation='relu'))
model2.add(Dropout(0.4))
model2.add(Dense(7, activation='softmax'))

In [None]:
# Configure training for model 2 with the same loss, optimizer and metric as model 1.
model2.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])

In [None]:
# Train model 2 on the balanced training split for 40 epochs in batches of 50,
# logging one line per epoch (verbose=2) and keeping the metrics in history2.
# NOTE(review): validation uses the X_test_s / y_test_s split, the same data the
# notebook treats as its test set.
history2 = model2.fit(
    X_train_s, y_train_s,
    epochs=40,
    batch_size = 50,
    validation_data=(X_test_s, y_test_s),
    verbose=2)