In [2]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn import svm
from sklearn.metrics import accuracy_score
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

In [3]:
# Read csv file
metadata = pd.read_csv('ddi/ddi_metadata.csv')

In [4]:
metadata.head()

Unnamed: 0.1,Unnamed: 0,DDI_ID,DDI_file,skin_tone,malignant,disease
0,0,1,000001.png,56,True,melanoma-in-situ
1,1,2,000002.png,56,True,melanoma-in-situ
2,2,3,000003.png,56,True,mycosis-fungoides
3,3,4,000004.png,56,True,squamous-cell-carcinoma-in-situ
4,4,5,000005.png,12,True,basal-cell-carcinoma


In [5]:
# Create a new column
metadata["skin_tone_category"] = metadata["skin_tone"].astype("category")

In [6]:
# Get first five lines to see
metadata.head()

Unnamed: 0.1,Unnamed: 0,DDI_ID,DDI_file,skin_tone,malignant,disease,skin_tone_category
0,0,1,000001.png,56,True,melanoma-in-situ,56
1,1,2,000002.png,56,True,melanoma-in-situ,56
2,2,3,000003.png,56,True,mycosis-fungoides,56
3,3,4,000004.png,56,True,squamous-cell-carcinoma-in-situ,56
4,4,5,000005.png,12,True,basal-cell-carcinoma,12


In [7]:
# skin_tone_category column will be separated by three categories
new_categories = ["light", "medium", "dark"]
metadata["skin_tone_category"] = metadata["skin_tone_category"].cat.rename_categories(new_categories)

In [8]:
metadata.head()

Unnamed: 0.1,Unnamed: 0,DDI_ID,DDI_file,skin_tone,malignant,disease,skin_tone_category
0,0,1,000001.png,56,True,melanoma-in-situ,dark
1,1,2,000002.png,56,True,melanoma-in-situ,dark
2,2,3,000003.png,56,True,mycosis-fungoides,dark
3,3,4,000004.png,56,True,squamous-cell-carcinoma-in-situ,dark
4,4,5,000005.png,12,True,basal-cell-carcinoma,light


In [9]:
# Sorting skin_tone_category column
sorted_metadata = metadata.sort_values(by="skin_tone_category")

In [10]:
# Check how many rows does each category have
metadata["skin_tone_category"].value_counts()

skin_tone_category
medium    241
light     208
dark      207
Name: count, dtype: int64

In [11]:
# See the updated Dataframe
sorted_metadata

Unnamed: 0.1,Unnamed: 0,DDI_ID,DDI_file,skin_tone,malignant,disease,skin_tone_category
327,327,328,000328.png,12,False,melanocytic-nevi,light
303,303,304,000304.png,12,False,seborrheic-keratosis-irritated,light
302,302,303,000303.png,12,False,verruca-vulgaris,light
301,301,302,000302.png,12,True,squamous-cell-carcinoma-in-situ,light
300,300,301,000301.png,12,True,squamous-cell-carcinoma-in-situ,light
...,...,...,...,...,...,...,...
166,166,167,000167.png,56,False,folliculitis,dark
165,165,166,000166.png,56,True,squamous-cell-carcinoma-in-situ,dark
164,164,165,000165.png,56,True,metastatic-carcinoma,dark
175,175,176,000176.png,56,False,acrochordon,dark


In [43]:
sorted_metadata.to_csv("sorted_metadata.csv", index=False)

In [12]:
# Create a label column and it shows melanoma when disease column is melanoma-in-situ, otherwise not-melanoma
melanoma_types = ["melanoma", "melanoma-acral-lentiginous", "melanoma-in-situ", "nodular-melanoma-(nm)"]
sorted_metadata["label"] = np.where(sorted_metadata["disease"].isin(melanoma_types), "melanoma", "not-melanoma")

In [13]:
sorted_metadata["label"].value_counts()

label
not-melanoma    635
melanoma         21
Name: count, dtype: int64

In [14]:
# Create three Dataframes which includes only light, medium or dark rows
sorted_metadata_light = sorted_metadata[0:208]
sorted_metadata_medium = sorted_metadata[208:449]
sorted_metadata_dark = sorted_metadata[449:656]

In [15]:
# Double check the amount of rows in light Dataframe has same number as the count of light rows in the original Dataframe
sorted_metadata_light.value_counts()

Unnamed: 0  DDI_ID  DDI_file    skin_tone  malignant  disease                                  skin_tone_category  label       
4           5       000005.png  12         True       basal-cell-carcinoma                     light               not-melanoma    1
36          37      000037.png  12         True       basal-cell-carcinoma                     light               not-melanoma    1
37          38      000038.png  12         True       squamous-cell-carcinoma-keratoacanthoma  light               not-melanoma    1
38          39      000039.png  12         True       squamous-cell-carcinoma                  light               not-melanoma    1
39          40      000040.png  12         True       squamous-cell-carcinoma-in-situ          light               not-melanoma    1
                                                                                                                                  ..
435         436     000436.png  12         False      seborrheic-keratosis

In [16]:
sorted_metadata_medium.value_counts()

Unnamed: 0  DDI_ID  DDI_file    skin_tone  malignant  disease                           skin_tone_category  label       
9           10      000010.png  34         True       basal-cell-carcinoma-superficial  medium              not-melanoma    1
35          36      000036.png  34         False      melanocytic-nevi                  medium              not-melanoma    1
56          57      000057.png  34         False      foreign-body-granuloma            medium              not-melanoma    1
61          62      000062.png  34         False      seborrheic-keratosis-irritated    medium              not-melanoma    1
63          64      000064.png  34         False      melanocytic-nevi                  medium              not-melanoma    1
                                                                                                                           ..
651         652     000652.png  34         False      pyogenic-granuloma                medium              not-melanoma   

In [17]:
sorted_metadata_dark.value_counts()

Unnamed: 0  DDI_ID  DDI_file    skin_tone  malignant  disease                                      skin_tone_category  label       
0           1       000001.png  56         True       melanoma-in-situ                             dark                melanoma        1
1           2       000002.png  56         True       melanoma-in-situ                             dark                melanoma        1
2           3       000003.png  56         True       mycosis-fungoides                            dark                not-melanoma    1
3           4       000004.png  56         True       squamous-cell-carcinoma-in-situ              dark                not-melanoma    1
5           6       000006.png  56         True       squamous-cell-carcinoma                      dark                not-melanoma    1
                                                                                                                                      ..
246         247     000247.png  56         Fal

In [18]:
sorted_metadata_light

Unnamed: 0.1,Unnamed: 0,DDI_ID,DDI_file,skin_tone,malignant,disease,skin_tone_category,label
327,327,328,000328.png,12,False,melanocytic-nevi,light,not-melanoma
303,303,304,000304.png,12,False,seborrheic-keratosis-irritated,light,not-melanoma
302,302,303,000303.png,12,False,verruca-vulgaris,light,not-melanoma
301,301,302,000302.png,12,True,squamous-cell-carcinoma-in-situ,light,not-melanoma
300,300,301,000301.png,12,True,squamous-cell-carcinoma-in-situ,light,not-melanoma
...,...,...,...,...,...,...,...,...
4,4,5,000005.png,12,True,basal-cell-carcinoma,light,not-melanoma
121,121,122,000122.png,12,False,lipoma,light,not-melanoma
71,71,72,000072.png,12,True,basal-cell-carcinoma-superficial,light,not-melanoma
70,70,71,000071.png,12,True,basal-cell-carcinoma-nodular,light,not-melanoma


In [19]:
import PIL
import cv2

In [20]:
def image_resize(image):
    original_size = image.size
    target_size = (224, 224)
    ratio = min(target_size[0] / original_size[0], target_size[1] / original_size[1])
    new_size = (int(original_size[0] * ratio), int(original_size[1] * ratio))
    resized_image = image.resize(new_size, Image.Resampling.LANCZOS)

    new_image = Image.new("RGB", target_size, (255, 255, 255))
    paste_position = ((target_size[0] - new_size[0]) // 2, (target_size[1] - new_size[1]) // 2)
    new_image.paste(resized_image, paste_position)

    return new_image

In [21]:
folder = "ddi/ddidiversedermatologyimages"
file_names = sorted_metadata_light["DDI_file"]
images = []
for file in file_names:
    path = os.path.join(folder, file)
    image = PIL.Image.open(path)
    print(image.size) # Returns (width, height)
    images.append(np.array(image_resize(image)).flatten())
X = np.stack(images)

(934, 684)
(523, 700)
(532, 699)
(792, 418)
(1306, 878)
(1250, 886)
(431, 486)
(515, 697)
(517, 690)
(710, 699)
(514, 699)
(517, 693)
(537, 395)
(526, 707)
(517, 700)
(323, 244)
(242, 155)
(531, 708)
(279, 200)
(416, 357)
(699, 703)
(532, 519)
(247, 237)
(391, 422)
(805, 634)
(593, 445)
(930, 683)
(799, 638)
(918, 803)
(791, 639)
(788, 641)
(465, 574)
(500, 680)
(799, 642)
(792, 631)
(798, 629)
(511, 638)
(500, 604)
(902, 670)
(787, 641)
(523, 702)
(526, 712)
(330, 226)
(299, 455)
(607, 807)
(530, 704)
(531, 698)
(453, 483)
(525, 392)
(533, 700)
(463, 280)
(566, 412)
(447, 1066)
(942, 688)
(932, 640)
(496, 560)
(695, 599)
(894, 1242)
(295, 261)
(820, 924)
(924, 1258)
(538, 707)
(1206, 894)
(1634, 1212)
(930, 1244)
(514, 474)
(437, 466)
(523, 705)
(497, 353)
(546, 447)
(514, 249)
(504, 693)
(525, 701)
(445, 330)
(730, 701)
(380, 322)
(391, 282)
(519, 700)
(520, 486)
(547, 701)
(499, 467)
(519, 699)
(409, 278)
(949, 697)
(539, 702)
(715, 706)
(437, 547)
(355, 441)
(324, 318)
(531, 699)
(

In [22]:
metadata

Unnamed: 0.1,Unnamed: 0,DDI_ID,DDI_file,skin_tone,malignant,disease,skin_tone_category
0,0,1,000001.png,56,True,melanoma-in-situ,dark
1,1,2,000002.png,56,True,melanoma-in-situ,dark
2,2,3,000003.png,56,True,mycosis-fungoides,dark
3,3,4,000004.png,56,True,squamous-cell-carcinoma-in-situ,dark
4,4,5,000005.png,12,True,basal-cell-carcinoma,light
...,...,...,...,...,...,...,...
651,651,652,000652.png,34,False,pyogenic-granuloma,medium
652,652,653,000653.png,34,False,melanocytic-nevi,medium
653,653,654,000654.png,34,False,acral-melanotic-macule,medium
654,654,655,000655.png,34,True,squamous-cell-carcinoma,medium


In [23]:
pd.set_option("display.max_rows", 100)
print(metadata["disease"].value_counts())

disease
melanocytic-nevi                                119
seborrheic-keratosis                             58
verruca-vulgaris                                 50
basal-cell-carcinoma                             41
epidermal-cyst                                   35
mycosis-fungoides                                32
squamous-cell-carcinoma-in-situ                  28
dermatofibroma                                   22
acrochordon                                      19
squamous-cell-carcinoma                          17
dysplastic-nevus                                 16
pyogenic-granuloma                               14
seborrheic-keratosis-irritated                   14
neurofibroma                                     12
angioma                                          11
eccrine-poroma                                   10
squamous-cell-carcinoma-keratoacanthoma           8
melanoma                                          7
melanoma-acral-lentiginous                        7
mela

In [24]:
y = sorted_metadata_light["label"]

In [25]:
y

327    not-melanoma
303    not-melanoma
302    not-melanoma
301    not-melanoma
300    not-melanoma
           ...     
4      not-melanoma
121    not-melanoma
71     not-melanoma
70     not-melanoma
46     not-melanoma
Name: label, Length: 208, dtype: object

In [26]:
y = np.asarray(y)

In [27]:
y

array(['not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
       'not-melanoma', 'not-melanoma', 'not-melanoma', 'not-melanoma',
      

In [28]:
sorted_metadata["disease"].value_counts()

disease
melanocytic-nevi                                119
seborrheic-keratosis                             58
verruca-vulgaris                                 50
basal-cell-carcinoma                             41
epidermal-cyst                                   35
mycosis-fungoides                                32
squamous-cell-carcinoma-in-situ                  28
dermatofibroma                                   22
acrochordon                                      19
squamous-cell-carcinoma                          17
dysplastic-nevus                                 16
pyogenic-granuloma                               14
seborrheic-keratosis-irritated                   14
neurofibroma                                     12
angioma                                          11
eccrine-poroma                                   10
squamous-cell-carcinoma-keratoacanthoma           8
melanoma                                          7
melanoma-acral-lentiginous                        7
nevu

In [29]:
print(sorted_metadata["label"].value_counts())

label
not-melanoma    635
melanoma         21
Name: count, dtype: int64


In [30]:
print(sorted_metadata_light["label"].value_counts())

label
not-melanoma    201
melanoma          7
Name: count, dtype: int64


In [31]:
print(sorted_metadata_medium["label"].value_counts())

label
not-melanoma    234
melanoma          7
Name: count, dtype: int64


In [32]:
print(sorted_metadata_dark["label"].value_counts())

label
not-melanoma    200
melanoma          7
Name: count, dtype: int64


In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
clf = svm.SVC()

In [36]:
clf.fit(X_train, y_train)

In [37]:
y_pred = clf.predict(X_test)

In [38]:
accuracy = accuracy_score(y_test, y_pred)

In [39]:
accuracy

0.9523809523809523

In [40]:
folder = "ddi/ddidiversedermatologyimages"
file_names = sorted_metadata_medium["DDI_file"]
images = []
for file in file_names:
    path = os.path.join(folder, file)
    image = PIL.Image.open(path)
    #print(image.size) # Returns (width, height)
    images.append(np.array(image_resize(image)).flatten())
X_medium = np.stack(images)

y_medium = sorted_metadata_medium["label"]

X_medium_train, X_medium_test, y_medium_train, y_medium_test = train_test_split(X_medium, y_medium, test_size=0.2, random_state=42)

clf_medium = svm.SVC()

clf_medium.fit(X_medium_train, y_medium_train)

y_medium_pred = clf_medium.predict(X_medium_test)

accuracy_medium = accuracy_score(y_medium_test, y_medium_pred)

print(accuracy_medium)

0.9795918367346939


In [41]:
folder = "ddi/ddidiversedermatologyimages"
file_names = sorted_metadata_dark["DDI_file"]
images = []
for file in file_names:
    path = os.path.join(folder, file)
    image = PIL.Image.open(path)
    #print(image.size) # Returns (width, height)
    images.append(np.array(image_resize(image)).flatten())
X_dark = np.stack(images)

y_dark = sorted_metadata_dark["label"]

X_dark_train, X_dark_test, y_dark_train, y_dark_test = train_test_split(X_dark, y_dark, test_size=0.2, random_state=42)

clf_dark = svm.SVC()

clf_dark.fit(X_dark_train, y_dark_train)

y_dark_pred = clf_dark.predict(X_dark_test)

accuracy_dark = accuracy_score(y_dark_test, y_dark_pred)

print(accuracy_dark)

0.9761904761904762


In [42]:
folder = "ddi/ddidiversedermatologyimages"
file_names = sorted_metadata["DDI_file"]
images = []
for file in file_names:
    path = os.path.join(folder, file)
    image = PIL.Image.open(path)
    #print(image.size) # Returns (width, height)
    images.append(np.array(image_resize(image)).flatten())
X_total = np.stack(images)

y_total = sorted_metadata["label"]

X_total_train, X_total_test, y_total_train, y_total_test = train_test_split(X_total, y_total, test_size=0.2, random_state=42)

clf_total = svm.SVC()

clf_total.fit(X_total_train, y_total_train)

y_total_pred = clf_total.predict(X_total_test)

accuracy_total = accuracy_score(y_total_test, y_total_pred)

print(accuracy_total)

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import precision_score

In [None]:
count = np.sum(y_pred == 'melanoma')
count_medium = np.sum(y_medium_pred == 'melanoma')
count_dark = np.sum(y_dark_pred == 'melanoma')
count_total = np.sum(y_total_pred == 'melanoma')
print(count)
print(count_medium)
print(count_dark)
print(count_total)

0
0
0
0


In [None]:
light_precision = precision_score(y_test, y_pred, labels = ['melanoma', 'not-melanoma'], pos_label = 'melanoma')
medium_precision = precision_score(y_medium_test, y_medium_pred, labels = ['melanoma', 'not-melanoma'], pos_label = 'melanoma')
dark_precision = precision_score(y_dark_test, y_dark_pred, labels = ['melanoma', 'not-melanoma'], pos_label = 'melanoma')
total_precision = precision_score(y_total_test, y_total_pred, labels = ['melanoma', 'not-melanoma'], pos_label = 'melanoma')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
print(type(X_medium_train))                  # should be <class 'numpy.ndarray'>
print(X_medium_train.dtype)                  # should be float
print(type(X_medium_train[0]))               # should be <class 'numpy.ndarray'> or <class 'numpy.float64'>
print(type(X_medium_train[0][0]))            # should be <class 'numpy.float64'>

<class 'numpy.ndarray'>
uint8
<class 'numpy.ndarray'>
<class 'numpy.uint8'>


In [None]:
svc_model = svm.SVC(probability=True)
svc_model.fit(X_medium_train, y_medium_train)

: 

In [None]:
background = shap.sample(X_medium_train, 10, random_state=42)

explainer = shap.KernelExplainer(svc_model.predict_proba, background)
shap_values = explainer.shap_values(X_medium_test)

# Select a test sample (e.g., index 0)
index = 0
image = X_medium_test[index].reshape(8, 8)
shap_vals = shap_values[y_medium_test[index]][index].reshape(8, 8)
 
plt.figure(figsize=(10, 4))
 
# Original Image
plt.subplot(1, 2, 1)
plt.imshow(image, cmap='gray')
plt.title(f"True Label: {y_medium_test[index]}")
plt.axis('off')
 
# SHAP Value Heatmap
plt.subplot(1, 2, 2)
plt.imshow(shap_vals, cmap='coolwarm')
plt.title("SHAP Explanation")
plt.axis('off')
 
plt.tight_layout()
plt.show()

  0%|          | 0/49 [00:00<?, ?it/s]