In [6]:
import numpy as np
import pandas as pd
import os
from glob import glob
import seaborn as sns
from PIL import Image



image_dir = "/home/sajedhamdan/Desktop/skin-cancer-detection/images"

# Load the lesion metadata table.
df = pd.read_csv('HAM10000_metadata')
df.head()

# Index every JPEG found at any depth below image_dir by its file name
# without the extension, so that metadata rows can be matched to files.
# If the same file name occurs in several folders, the last match wins.
image_id_path = {}
for jpg_path in glob(os.path.join(image_dir, '**', '*.jpg'), recursive=True):
    file_stem = os.path.splitext(os.path.basename(jpg_path))[0]
    image_id_path[file_stem] = jpg_path


# Full lesion names keyed by the short diagnosis code stored in df["dx"].
lesion_types = dict(
    nv='melanocytic_nevi',
    mel='melanoma',
    bkl='benign_keratosis',
    bcc='basal_cell_carcinoma',
    akiec='actinic_keratoses',
    vasc='vascular_lesions',
    df='dermatofibroma',
)

# Binary flag per diagnosis code: 1 = malignant (cancerous), 0 = benign.
malignant_codes = {'mel', 'bcc', 'akiec'}
lesion_is_dangerous = {code: int(code in malignant_codes) for code in lesion_types}


# Attach the derived columns to the metadata frame.
diagnosis_codes = df["dx"]
df["path"] = df["image_id"].map(lambda image_id: image_id_path.get(image_id))
df["cell_type"] = diagnosis_codes.map(lambda code: lesion_types.get(code))
df["Malignant"] = diagnosis_codes.map(lambda code: lesion_is_dangerous.get(code))
# Integer class label: the category code of each lesion name.
df["cell_type_idx"] = df["cell_type"].astype("category").cat.codes



from skimage.io import imread

# Decode every image file into a pixel array and keep it next to its metadata.
# (imageio.imread was tried earlier as an alternative reader.)
df["image"] = df["path"].map(imread)

# Peek at the first decoded image.
df["image"].iloc[0]

# Count how many images there are of each array shape.
df["image"].map(np.shape).value_counts()




# Build a frame holding the mean value of each colour channel per picture.
# The decoded arrays store their channels in Red, Green, Blue order along the
# last axis, so the labels must be listed in that same order; otherwise the
# green mean would be stored under "Blue_mean" and vice versa.
mean_columns = ["{}_mean".format(channel) for channel in ("Red", "Green", "Blue")]
image_rgb_df = pd.DataFrame(
    # np.mean over axes (0, 1) averages over rows and columns of the image,
    # leaving one value per channel; zip pairs them with the three labels
    # (any extra channel, e.g. alpha, is ignored).
    [dict(zip(mean_columns, np.mean(image, axis=(0, 1)))) for image in df["image"]],
    index=df.index,
    columns=mean_columns,
)

# Overall brightness of each picture: the mean of its three channel means.
gray_scale_color_vector = image_rgb_df.mean(axis=1)

# Express every channel mean relative to the picture's overall brightness.
image_rgb_df = image_rgb_df.div(gray_scale_color_vector, axis=0)
image_rgb_df["Gray_mean"] = gray_scale_color_vector
image_rgb_df.sample(3)

# Copy the colour features onto the main frame (by position, not by index).
for c_col in image_rgb_df.columns:
    df[c_col] = image_rgb_df[c_col].values



def _flatten_rgb_64x64(image_path):
    """Open an image, shrink it to 64x64, force RGB and return it as a flat vector."""
    thumbnail = Image.open(image_path).resize((64, 64), resample=Image.LANCZOS)
    return np.asarray(thumbnail.convert("RGB")).ravel()


# One flat pixel vector per image, stacked into a (num_images, num_values) matrix.
reshaped_images = df["path"].map(_flatten_rgb_64x64)
output_vector = np.stack(reshaped_images.tolist(), axis=0)

# One row per image: the pixel values followed by the integer class label.
output_df = pd.DataFrame(output_vector).assign(label=df["cell_type_idx"])

out_path = "hmnist_64_64_RBG.csv"
output_df.to_csv(out_path, index=False)



# Inspect the size of the first image. The context manager releases the file
# handle, which Image.open would otherwise keep open until the pixel data is
# read; .iloc[0] picks the first row regardless of the frame's index labels.
with Image.open(df["path"].iloc[0]) as original_image:
    original_image.size

# Create the output folder in the working directory. Unlike `!mkdir`, this does
# not fail when the folder already exists, so the cell can be re-run safely.
os.makedirs("skin_lesion_types", exist_ok=True)
df["cell_type"].unique()

array(['benign_keratosis', 'melanocytic_nevi', 'dermatofibroma',
       'melanoma', 'vascular_lesions', 'basal_cell_carcinoma',
       'actinic_keratoses'], dtype=object)

In [11]:
# for index in df.index.values.tolist():
#     path = df.iloc[index]["path"]
#     cell_type_idx = df.iloc[index]["cell_type"]
#     original_image_id = df.iloc[index]["image_id"]
#     newpath = f"/home/sajedhamdan/Desktop/skin_cancer/skin_lesion_types/{cell_type_idx}/{original_image_id}.jpg"
#     original_image = Image.open(path)
#     original_image = original_image.resize((299, 299), resample=Image.LANCZOS)
#     original_image.save(newpath)



# Export a 299x299 copy of every image into one folder per lesion type.
# Each class folder is created on demand, so the export does not depend on
# the directory tree existing beforehand.
export_root = "/home/sajedhamdan/Desktop/skin-cancer-detection/skin_lesion_types"

# Iterate the three needed columns directly instead of rebuilding a full row
# with df.iloc[...] three times per image (which also mixed index labels with
# positional lookups).
for path, cell_type, original_image_id in zip(df["path"], df["cell_type"], df["image_id"]):
    # new path where the image will be saved
    newpath = f"{export_root}/{cell_type}/{original_image_id}.jpg"

    # ensure dir exists
    os.makedirs(os.path.dirname(newpath), exist_ok=True)

    # Close each source file as soon as it has been processed, and force RGB
    # so that images in other modes can still be written as JPEG.
    with Image.open(path) as original_image:
        resized_image = original_image.convert("RGB").resize((299, 299), resample=Image.LANCZOS)
    resized_image.save(newpath)




def _load_rgb_256x192(image_path):
    """Open an image, resize it to 256 wide by 192 high and return it as an RGB array."""
    resized = Image.open(image_path).resize((256, 192), resample=Image.LANCZOS)
    return np.asarray(resized.convert("RGB"))


# One RGB pixel array per metadata row, in the same order as df.
reshaped_images = df["path"].map(_load_rgb_256x192)
                                                    


In [12]:
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

num_images = len(reshaped_images)
# .iloc[0] selects the first image by position; plain [0] is a label lookup
# and would fail if the Series index does not contain the label 0.
image_shape = reshaped_images.iloc[0].shape

# disk-backed array to avoid memory overload
output_vector = np.memmap("images_memmap.dat", dtype='float32', mode='w+', shape=(num_images, *image_shape))

# Scale pixel values from [0, 255] to [0, 1] while copying them into the memmap.
print(f"Normalizing and writing {num_images} images to memmap file...")
for i, img in enumerate(tqdm(reshaped_images)):
    output_vector[i] = img.astype('float32') / 255.0

# flush changes to disk
output_vector.flush()

labels = df["cell_type_idx"].values

# first split, for Train & Test set.
# The split is done on row positions (not on the pixel data) so that only the
# selected rows are read back from the memmap. stratify keeps the class
# proportions the same in both parts, which matters when some classes are rare.
X_train_orig_idx, X_test_idx, y_train_orig, y_test = train_test_split(
    np.arange(num_images), labels, test_size=0.1, random_state=0, stratify=labels
)

# test set
np.save("images_test_256x192.npy", output_vector[X_test_idx])
np.save("test_labels.npy", y_test)

# second split, Train & Val set (again stratified by class label)
X_train_idx, X_val_idx, y_train, y_val = train_test_split(
    X_train_orig_idx, y_train_orig, test_size=0.1, random_state=1, stratify=y_train_orig
)

# validation and training sets
np.save("images_val_256x192.npy", output_vector[X_val_idx])
np.save("val_labels.npy", y_val)

np.save("images_train_256x192.npy", output_vector[X_train_idx])
np.save("train_labels.npy", y_train)


Normalizing and writing 10015 images to memmap file...


100%|██████████| 10015/10015 [00:44<00:00, 225.44it/s]
