In [2]:
# %matplotlib inline
# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from glob import glob
import seaborn as sns
from PIL import Image



image_dir = "/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/ISBI2016_ISIC_Part3B_Training_Data"
df = pd.read_csv('/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/ISBI2016_ISIC_Part3B_Training_GroundTruth.csv')
df.head()


image_id_path = {
    os.path.splitext(os.path.basename(x))[0]: x
    for x in glob(os.path.join(image_dir, '*.jpg'))
}

print(image_id_path)

lesion_types = {
    'melanocytic_nevi': 0,
    'melanoma': 1,
    'benign_keratosis': 2,
    'basal_cell_carcinoma': 3,
    'actinic_keratoses': 4,
    'vascular_lesions': 5,
    'dermatofibroma': 6
}


# 0 for Benign, 1 for Malignant
lesion_is_dangerous = {
    'melanocytic_nevi': 0,
    'melanoma': 1, 
    'benign_keratosis': 0, 
    'basal_cell_carcinoma': 1, 
    'actinic_keratoses': 1,
    'vascular_lesions': 0,
    'dermatofibroma': 0
}



# setting up data frame, by mapping image and its metadata
df["path"] = df["image_id"].map(image_id_path.get) # map image_id to the path of that image
df["cell_type"] = df["dx"].map(lesion_types.get) # map dx to type of lesion
df["Malignant"] = df["dx"].map(lesion_is_dangerous.get)
df["cell_type_idx"] = pd.Categorical(df["cell_type"]).codes # give each cell type a category id

{'ISIC_0010017': '/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/ISBI2016_ISIC_Part3B_Training_Data/ISIC_0010017.jpg', 'ISIC_0000075': '/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/ISBI2016_ISIC_Part3B_Training_Data/ISIC_0000075.jpg', 'ISIC_0010324': '/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/ISBI2016_ISIC_Part3B_Training_Data/ISIC_0010324.jpg', 'ISIC_0000505': '/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/ISBI2016_ISIC_Part3B_Training_Data/ISIC_0000505.jpg', 'ISIC_0001152': '/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/ISBI2016_ISIC_Part3B_Training_Data/ISIC_0001152.jpg', 'ISIC_0009968': '/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/ISBI2016_ISIC_Part3B_Training_Data/ISIC_0009968.jpg', 'ISIC_0000140': '/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/ISBI2016_ISIC_Part3B_Training_Data/ISIC_0000140.jpg', 'ISIC_0002439': '/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/ISBI2016_ISIC_Part3B_

In [4]:




from skimage.io import imread
df["image"] = df["path"].map(imread) # read the image to array values
df.iloc[0]["image"] # here is a sample
# let's see what is the shape of each value in the image column
df["image"].map(lambda x: x.shape).value_counts() 




# setting dataframe to store mean value of Red, Blue and Green for each picture
# this is the main change implemented, rather than reading images through subdirectories(folders)
image_rgb_df = df.apply(lambda x: pd.Series({'{}_mean'.format(k): v for k, v 
                                                 in zip(["Red", "Blue", "Green"], 
                                                        np.mean(x["image"], (0, 1)))}), 1)


gray_scale_color_vector = image_rgb_df.apply(lambda x: np.mean(x), 1) # take the mean value across columns of image_rgb_df
for c_col in image_rgb_df.columns:
    image_rgb_df[c_col] = image_rgb_df[c_col]/gray_scale_color_vector 
image_rgb_df["Gray_mean"] = gray_scale_color_vector
image_rgb_df.sample(3)

for c_col in image_rgb_df.columns:
    df[c_col] = image_rgb_df[c_col].values



reshaped_images = df["path"].map(lambda x: np.asarray(Image.open(x).resize((64,64), resample=Image.LANCZOS).\
                                                          convert("RGB")).ravel())
output_vector = np.stack(reshaped_images, 0)
output_df = pd.DataFrame(output_vector)
output_df["label"] = df["cell_type_idx"]

out_path = "hmnist_64_64_RBG.csv"
output_df.to_csv(out_path, index=False)



original_image = Image.open(df["path"][0])
original_image.size
!mkdir skin_lesion_types
df["cell_type"].unique()

mkdir: cannot create directory ‘skin_lesion_types’: File exists


array([4, 5, 2, 3, 0, 1])

In [5]:
df.head()

Unnamed: 0,image_id,cancerous,dx,path,cell_type,Malignant,cell_type_idx,image,Red_mean,Blue_mean,Green_mean,Gray_mean
0,ISIC_0000000,benign,actinic_keratoses,/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/...,4,1,4,"[[[249, 255, 255], [231, 241, 251], [241, 255,...",0.877362,0.987095,1.135542,150.324133
1,ISIC_0000001,benign,actinic_keratoses,/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/...,4,1,4,"[[[255, 254, 255], [229, 228, 236], [255, 254,...",1.009812,0.984288,1.0059,164.697057
2,ISIC_0000002,malignant,vascular_lesions,/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/...,5,0,5,"[[[255, 255, 255], [239, 239, 239], [255, 255,...",0.860371,0.953482,1.186147,158.165772
3,ISIC_0000004,malignant,actinic_keratoses,/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/...,4,1,4,"[[[255, 255, 255], [244, 244, 244], [255, 255,...",1.13632,0.858805,1.004875,79.599645
4,ISIC_0000006,benign,benign_keratosis,/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/...,2,0,2,"[[[255, 255, 255], [241, 241, 241], [255, 255,...",0.895381,0.984328,1.120291,149.780657


In [6]:
df["path"][0]

'/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/ISBI2016_ISIC_Part3B_Training_Data/ISIC_0000000.jpg'

In [7]:
# for index in df.index.values.tolist():
#     path = df.iloc[index]["path"]
#     cell_type_idx = df.iloc[index]["cell_type"]
#     original_image_id = df.iloc[index]["image_id"]
#     newpath = f"/home/sajedhamdan/Desktop/skin_cancer/skin_lesion_types/{cell_type_idx}/{original_image_id}.jpg"
#     original_image = Image.open(path)
#     original_image = original_image.resize((299, 299), resample=Image.LANCZOS)
#     original_image.save(newpath)



# previous code gave me an issue on the dir, resolved by creating new dir at first 
for index in df.index.values.tolist():
    path = df.iloc[index]["path"]
    cell_type_idx = df.iloc[index]["cell_type"]
    original_image_id = df.iloc[index]["image_id"]
    
    # Define the new path where the image will be saved
    newpath = f"/home/sajedhamdan/Desktop/skin_cancer/CNN+TDA/new_dataset/skin_lesion_types/{cell_type_idx}/{original_image_id}.jpg"
    
    # Ensure the directory exists
    directory = os.path.dirname(newpath)  # Extract the directory path
    os.makedirs(directory, exist_ok=True)  # Create the directory if it doesn't exist

    # Open, resize, and save the image
    original_image = Image.open(path)
    original_image = original_image.resize((299, 299), resample=Image.LANCZOS)
    original_image.save(newpath)




reshaped_images = df["path"].map(lambda x: np.asarray(Image.open(x).resize((256,192), resample=Image.LANCZOS).\
                                                          convert("RGB")))
                                                    


In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Assume reshaped_images is a list of np arrays with equal shape
num_images = len(reshaped_images)
image_shape = reshaped_images[0].shape  # e.g., (256, 192, 3)

# Prepare disk-backed array to avoid memory overload
output_vector = np.memmap("images_memmap.dat", dtype='float32', mode='w+', shape=(num_images, *image_shape))

print(f"Normalizing and writing {num_images} images to memmap file...")
for i, img in enumerate(tqdm(reshaped_images)):
    output_vector[i] = img.astype('float32') / 255.0

# Flush changes to disk
output_vector.flush()

# Get labels
labels = df["cell_type_idx"].values

# First split: Train/Test
X_train_orig_idx, X_test_idx, y_train_orig, y_test = train_test_split(
    np.arange(num_images), labels, test_size=0.1, random_state=0
)

# Save Test set
np.save("images_test_256x192.npy", output_vector[X_test_idx])
np.save("test_labels.npy", y_test)

# Second split: Train/Val from Train_Orig
X_train_idx, X_val_idx, y_train, y_val = train_test_split(
    X_train_orig_idx, y_train_orig, test_size=0.1, random_state=1
)

# Save Val and Train sets
np.save("images_val_256x192.npy", output_vector[X_val_idx])
np.save("val_labels.npy", y_val)

np.save("images_train_256x192.npy", output_vector[X_train_idx])
np.save("train_labels.npy", y_train)


Normalizing and writing 900 images to memmap file...


100%|██████████| 900/900 [00:01<00:00, 884.36it/s] 
