In [10]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
import kagglehub

# Download the dataset (kagglehub caches it locally) and keep the directory it
# returns, so the rest of the script does not depend on one user's home folder.
path = kagglehub.dataset_download("hasnainjaved/melanoma-skin-cancer-dataset-of-10000-images")

# Settings
img_size = 50  # every image is resized to img_size x img_size pixels
# The images live in the "melanoma_cancer_dataset" sub-folder of the download.
base_path = os.path.join(path, "melanoma_cancer_dataset")

# Folder paths: one directory per (split, class) pair
ben_training_folder = os.path.join(base_path, "train/benign/")
mal_training_folder = os.path.join(base_path, "train/malignant/")
ben_testing_folder = os.path.join(base_path, "test/benign/")
mal_testing_folder = os.path.join(base_path, "test/malignant/")

# Initialize data lists; each entry will be an [image_array, one_hot_label] pair
ben_training_data = []
mal_training_data = []
ben_testing_data = []
mal_testing_data = []

def _load_labeled_images(folder, label):
    """Load every readable image in *folder* as a resized grayscale sample.

    Args:
        folder: Directory whose entries are read with ``cv2.imread``.
        label: One-hot class label (sequence of ints) attached to each sample.

    Returns:
        A list of ``[image_array, label_array]`` pairs, where ``image_array``
        is the grayscale image resized to ``img_size`` x ``img_size`` and
        ``label_array`` is a fresh ``np.array`` built from *label*.

    Entries that OpenCV cannot decode are skipped (best effort), but the
    number of skipped entries is reported instead of being silently dropped.
    """
    samples = []
    skipped = 0
    # os.listdir returns entries in arbitrary order; sort so the sample order
    # (and therefore any later truncation) is the same on every run.
    for filename in sorted(os.listdir(folder)):
        image_path = os.path.join(folder, filename)
        try:
            img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread signals an unreadable file by returning None
            # rather than raising, so check before resizing.
            if img is not None:
                img = cv2.resize(img, (img_size, img_size))
        except cv2.error:
            img = None
        if img is None:
            skipped += 1
            continue
        samples.append([img, np.array(label)])
    if skipped:
        print(f"Skipped {skipped} unreadable file(s) in {folder}")
    return samples


# Load benign training images (label [1, 0])
ben_training_data.extend(_load_labeled_images(ben_training_folder, [1, 0]))

# Load malignant training images (label [0, 1])
mal_training_data.extend(_load_labeled_images(mal_training_folder, [0, 1]))

# Load benign testing images
ben_testing_data.extend(_load_labeled_images(ben_testing_folder, [1, 0]))

# Load malignant testing images
mal_testing_data.extend(_load_labeled_images(mal_testing_folder, [0, 1]))

# Balance the training classes: truncate BOTH lists to the size of the smaller
# one, so the result is balanced whichever class happens to be larger.
samples_per_class = min(len(ben_training_data), len(mal_training_data))
ben_training_data = ben_training_data[:samples_per_class]
mal_training_data = mal_training_data[:samples_per_class]

# Print dataset sizes
print(f"Benign training count: {len(ben_training_data)}")
print(f"Malignant training count: {len(mal_training_data)}")
print(f"Benign testing count: {len(ben_testing_data)}")
print(f"Malignant testing count: {len(mal_testing_data)}")

# Combine the two classes, then shuffle in place so the classes are interleaved.
# Each entry is an [image, label] pair, so an image stays with its label.
training_data = ben_training_data + mal_training_data
testing_data = ben_testing_data + mal_testing_data
np.random.shuffle(training_data)
np.random.shuffle(testing_data)

# Split the shuffled pairs into parallel X (images) and y (labels) arrays
X_train = np.array([image for image, _ in training_data])
y_train = np.array([label for _, label in training_data])
X_test = np.array([image for image, _ in testing_data])
y_test = np.array([label for _, label in testing_data])

# Save arrays to .npy files in the current working directory
np.save("melanoma_X_train.npy", X_train)
np.save("melanoma_y_train.npy", y_train)
np.save("melanoma_X_test.npy", X_test)
np.save("melanoma_y_test.npy", y_test)

Benign training count: 4605
Malignant training count: 4605
Benign testing count: 500
Malignant testing count: 500
