## loading images

In [None]:
import numpy as np  
from PIL import Image
import os
import cv2


***In this section we define the `getData` function, which loads all images into arrays `X_train`, `y_train`, `X_test`, `y_test`. Note the loop `for img in sorted(os.listdir(path+data_name+'/'+typ),key=natural_sort_key)`: it reads the images in natural filename order (for example `k_2` before `k_10`), the same order in which the files are sorted by name in their folders. This pays off later, when the images are exported back to files, because it lets us check that they were exported correctly. `natural_sort_key` works directly on the names of the original files: it splits each name into text and number parts so that the numbers are compared numerically.***

In [None]:
import re
def natural_sort_key(s):
    """Build a sort key that compares the digit runs inside ``s`` as integers.

    ``s`` is split on runs of the digits 0-9 (the runs themselves are kept).
    Digit pieces become ``int`` values and every other piece is lower-cased,
    so ``"k_2"`` sorts before ``"k_10"``.
    """
    key = []
    for text in re.split('([0-9]+)', s):
        if text.isdigit():
            key.append(int(text))
        else:
            key.append(text.lower())
    return key
# Quick check: with the natural key, "k_10" is placed after "k_2".
sorted_list = ["k_1", "k_2", "k_10"]
sorted_list.sort(key=natural_sort_key)
print(sorted_list)


In [None]:
def getData(path):
    """Load an image dataset laid out as ``<path>/<split>/<class>/<image file>``.

    Parameters
    ----------
    path : str
        Dataset root directory. A trailing separator is optional.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        Images and labels. Images under the ``train`` split directory go to the
        training arrays; images under any other split directory go to the test
        arrays. A label is 1 when the class directory is named ``malignant``
        and 0 otherwise.

    Notes
    -----
    Split and class directories are visited in sorted order and image files in
    natural filename order, so the result does not depend on the order in
    which the operating system lists directory entries. Entries that are not
    directories (stray files next to the split or class folders) are skipped
    instead of crashing the directory listing.
    """
    X_train, y_train, X_test, y_test = [], [], [], []
    for data_name in sorted(os.listdir(path)):
        split_dir = os.path.join(path, data_name)
        if not os.path.isdir(split_dir):
            continue
        # Everything that is not the 'train' split is collected as test data.
        if data_name == 'train':
            images, labels = X_train, y_train
        else:
            images, labels = X_test, y_test
        for typ in sorted(os.listdir(split_dir)):
            class_dir = os.path.join(split_dir, typ)
            if not os.path.isdir(class_dir):
                continue
            label = 1 if typ == 'malignant' else 0
            for img in sorted(os.listdir(class_dir), key=natural_sort_key):
                # Copy the pixels into an array before the file is closed.
                with Image.open(os.path.join(class_dir, img)) as image:
                    images.append(np.array(image))
                labels.append(label)

    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)

In [None]:
# Load the original dataset into image and 0/1 label arrays for the train and test splits.
X_train,y_train,X_test,y_test=getData(r'data/melanoma_cancer_dataset/')

## preprocess images

***We preprocess the images by converting them to grayscale, scaling the pixel values and resizing. The straightforward approach is to loop over the loaded `X_train` and `X_test` in one go, but that is very resource intensive, so we process the images in batches of 100 at a time and then save all the processed images into new folders.***

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
def scaleImage(img):
    """Min-max scale all values of ``img`` together and return them in ``img``'s shape.

    The array is reshaped to a single column before it is handed to a
    ``MinMaxScaler`` with default settings, so one minimum and one maximum are
    computed over every element rather than per image column.
    """
    as_column = img.reshape(-1, 1)
    return MinMaxScaler().fit_transform(as_column).reshape(img.shape)


In [None]:
# Preprocess the training images batch by batch: grayscale -> min-max scale -> resize.
batch_size = 100
num_images = len(X_train)
processed_X_train = []
for start in range(0, num_images, batch_size):
    end = min(start + batch_size, num_images)
    batch = X_train[start:end]
    # The arrays were loaded with PIL, which returns colour images in RGB order
    # (OpenCV's own loader would give BGR), so the RGB conversion code is the
    # one that applies the luminance weights to the correct channels.
    batch_gray = [cv2.cvtColor(image_array, cv2.COLOR_RGB2GRAY) for image_array in batch]
    # cv2.resize takes the target size as (width, height).
    batch_processed = [cv2.resize(scaleImage(image), (200, 300)) for image in batch_gray]
    processed_X_train.extend(batch_processed)

processed_X_train = np.array(processed_X_train)


In [None]:
# Preprocess the test images batch by batch: grayscale -> min-max scale -> resize.
batch_size = 100
num_images = len(X_test)
processed_images_X_test = []
for start in range(0, num_images, batch_size):
    end = min(start + batch_size, num_images)
    batch = X_test[start:end]
    # The arrays were loaded with PIL, which returns colour images in RGB order
    # (OpenCV's own loader would give BGR), so the RGB conversion code is the
    # one that applies the luminance weights to the correct channels.
    batch_gray = [cv2.cvtColor(image_array, cv2.COLOR_RGB2GRAY) for image_array in batch]
    # cv2.resize takes the target size as (width, height).
    batch_processed = [cv2.resize(scaleImage(image), (200, 300)) for image in batch_gray]
    processed_images_X_test.extend(batch_processed)

processed_X_test = np.array(processed_images_X_test)


In [None]:
import matplotlib.pyplot as plt
# Show the first preprocessed training image; cmap='gray' renders it in gray levels.
img=plt.imshow(processed_X_train[0],cmap='gray')
plt.show()

In [None]:
# This version saves with cmap='gray', unlike the saveToFile defined later in the notebook.
import matplotlib.pyplot as plt
def saveToFile(compressed_img, name):
    """Write ``compressed_img`` to the image file ``name`` using the 'gray' colormap.

    ``name`` is the output path and must include the file extension.
    """
    plt.imsave(fname=name, arr=compressed_img, cmap='gray')

In [None]:
path = "data/melanoma_cancer_dataset/"
newDir = "dataCreated/preproceed/"
# Need to shuffle? Do it later in processing, so that this code keeps working.

# Make sure every split/class output folder exists.
for folder in ("train/benign", "train/malignant", "test/benign", "test/malignant"):
    os.makedirs(newDir + folder, exist_ok=True)

# Each file is named after the image's index within its split, so the
# original file names are not kept.
for i, img in enumerate(processed_X_train):
    class_dir = "train/malignant/" if y_train[i] == 1 else "train/benign/"
    saveToFile(img, newDir + class_dir + str(i) + ".jpg")

for i, img in enumerate(processed_X_test):
    class_dir = "test/malignant/" if y_test[i] == 1 else "test/benign/"
    saveToFile(img, newDir + class_dir + str(i) + ".jpg")


***Important: to display a grayscale image with `plt`, you must pass `cmap='gray'`.***

## reducing images size

### reducing using clustering algorithm
(note: the preprocessing step, with its scaling and resizing, has already reduced the size somewhat)

***Reload the newly preprocessed data (overwriting the original data to save memory).***

In [None]:
# Drop the in-memory preprocessed arrays, then reload the saved preprocessed
# images over the original dataset variables.
del processed_X_test, processed_X_train
X_train, y_train, X_test, y_test = getData(r'dataCreated/preproceed/')

In [None]:
import matplotlib.pyplot as plt
# Show the first image of the reloaded (preprocessed) training set.
img=plt.imshow(X_train[0])
plt.show()

In [None]:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
def imageCompressKMean(Org_image,n_colors):#image if !png should be scaled by 255
    """Quantize an image to ``n_colors`` colours with k-means clustering.

    Parameters
    ----------
    Org_image : numpy.ndarray
        Image of shape (height, width, channels) or (height, width). Per the
        original author's note, images that are not PNG should be divided by
        255 before being passed in.
    n_colors : int
        Number of clusters, i.e. the number of distinct colours in the result.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``Org_image`` in which every pixel is replaced
        by the centre of the cluster it was assigned to.
    """
    # One row per pixel, one column per channel. The channel count is read from
    # the image instead of being fixed at 3, so 2-D (single-channel) and RGBA
    # inputs work too.
    n_channels = Org_image.shape[-1] if Org_image.ndim >= 3 else 1
    pixels = Org_image.reshape(-1, n_channels)
    kmeans = KMeans(n_clusters=n_colors, random_state=42, n_init=10)
    # fit_predict is documented as equivalent to fit(X) followed by predict(X).
    labels = kmeans.fit_predict(pixels)
    return kmeans.cluster_centers_[labels].reshape(Org_image.shape)
def saveToFile(compressed_img, name):
    """Write ``compressed_img`` to the image file ``name`` with ``plt.imsave``.

    ``name`` is the output path and must include the file extension. No
    colormap is passed here, unlike the earlier ``saveToFile`` in this
    notebook, which this definition replaces once the cell has run.
    """
    plt.imsave(fname=name, arr=compressed_img)
    
  



In [None]:
# Try both helpers on one training image and compare it with its quantized version.
quantized_image = imageCompressKMean(X_train[70] / 255, 8)
saveToFile(quantized_image, "new_data.jpg")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
panels = (
    (ax1, X_train[70], "Original Image"),
    (ax2, quantized_image, "Quantized Image "),
)
for axis, picture, caption in panels:
    axis.imshow(picture)
    axis.set_title(caption)
    axis.axis("off")
plt.show()

***We now apply the previous method to a set of images to see whether it actually reduces the storage size (and therefore the memory needed when loading). We try it on 500 images (here `test/benign`), which take about 5 MB.
Note: this could also be done with the images already loaded in memory, but reading them from disk keeps it simpler!
***

In [None]:
path="data/melanoma_cancer_dataset/test/benign/"
newDir="dataCreated/testKmean/"
os.makedirs(newDir, exist_ok=True)#to not raise error if exist

# Quantize every image in the folder to 8 colours and save it under the same file name.
for i,data_name in enumerate(os.listdir(path)):#enumerate just to say progress
    with Image.open(path+data_name) as image:
        # Divide by 255 before clustering, as the other cells do: the result is
        # written with plt.imsave, and matplotlib expects float RGB values in
        # the 0..1 range.
        quantized_image=imageCompressKMean(np.array(image)/255,8)
    saveToFile(quantized_image,newDir+str(data_name))
    if (i%20==0):print("progress "+str(i))

In [None]:
# Collapse the element-wise check "label == 0" over the first 500 test labels into a set.
set(y_test[:500]==0)
# A result of {True} means that the first 500 images in y_test are all labelled 0,
# i.e. they come from the benign folder.

In [None]:
# Optimized version of the cell above: quantize the images already held in
# X_test, 50 at a time, instead of reading each file from disk.
newDir = "dataCreated/testKmean/"
os.makedirs(newDir, exist_ok=True)  # no error when the folder already exists

batch_size = 50
num_images = len(X_test[:500])
name_idx = 0
for start in range(0, num_images, batch_size):
    end = min(start + batch_size, num_images)
    for image in X_test[start:end]:
        name_idx += 1  # output files are numbered from 1
        target = newDir + str(name_idx) + '.jpg'
        saveToFile(imageCompressKMean(image / 255, 8), target)
    print("progress " + str(start + batch_size))

***After running the last cell and comparing the sizes (simply using Windows), this method turns out not to work well here. This may be because
the number of colors in the images is not large, so reducing that number does not compress them (the small difference may come from loading the JPEGs and re-saving them). We'll try another
method: PCA.***

### reducing using dimensionality reduction (Feature reduction using PCA)

In [None]:
# Reload the preprocessed images and labels from disk for the PCA experiment.
X_train,y_train,X_test,y_test=getData(r'dataCreated/preproceed/')

In [None]:
# Alias the reloaded arrays, then flatten every image into a single row of values.
processed_X_train = X_train
processed_X_test = X_test
X_train_flattened = processed_X_train.reshape(len(processed_X_train), -1)
X_test_flattened = processed_X_test.reshape(len(processed_X_test), -1)

In [None]:
from sklearn.decomposition import IncrementalPCA

num_components = 100
batch_size = 200
ipca = IncrementalPCA(n_components=num_components)

# Learn the components from the TRAINING split only. The test split is merely
# transformed with them in the next cell, so no information from the test
# images leaks into the learned basis.
n_fit_samples = len(X_train_flattened)
fit_starts = list(range(0, n_fit_samples, batch_size))
# A partial_fit call with fewer rows than n_components can be rejected by
# scikit-learn, so a short final batch is folded into the batch before it.
if len(fit_starts) > 1 and n_fit_samples - fit_starts[-1] < num_components:
    fit_starts.pop()

for k, i in enumerate(fit_starts):
    stop = fit_starts[k + 1] if k + 1 < len(fit_starts) else n_fit_samples
    ipca.partial_fit(X_train_flattened[i:stop])
    print(f"proceed : {i}")


In [None]:
# Project both splits onto the fitted components one batch at a time, then
# stack the per-batch results back into one array per split.
train_chunks = []
for i in range(0, len(X_train_flattened), batch_size):
    train_chunks.append(ipca.transform(X_train_flattened[i:i+batch_size]))
X_train_pca = np.vstack(train_chunks)

test_chunks = []
for i in range(0, len(X_test_flattened), batch_size):
    test_chunks.append(ipca.transform(X_test_flattened[i:i+batch_size]))
X_test_pca = np.vstack(test_chunks)

In [None]:
os.makedirs("PCAParts/", exist_ok=True)

# Map each batch of PCA coordinates back to pixel space and store it in its own
# file, named after the batch's start index.
for i in range(0, len(X_train_pca), batch_size):
    np.save(f'PCAParts/reconstructed_batch_X_train_{i}',
            ipca.inverse_transform(X_train_pca[i:i+batch_size]))

for i in range(0, len(X_test_pca), batch_size):
    np.save(f'PCAParts/reconstructed_batch_X_test_{i}',
            ipca.inverse_transform(X_test_pca[i:i+batch_size]))

In [None]:
# Read the per-batch reconstruction files back in start-index order and join
# them into one array per split.
train_parts = [
    np.load(f'PCAParts/reconstructed_batch_X_train_{i}.npy')
    for i in range(0, len(X_train_pca), batch_size)
]
X_train_reconstructed = np.concatenate(train_parts, axis=0)

test_parts = [
    np.load(f'PCAParts/reconstructed_batch_X_test_{i}.npy')
    for i in range(0, len(X_test_pca), batch_size)
]
X_test_reconstructed = np.concatenate(test_parts, axis=0)


## Training models on the preprocessed images

### NN algorithm

In [87]:
# Reload the preprocessed images and labels from disk for the dense-network experiment.
X_train,y_train,X_test,y_test=getData(r'dataCreated/preproceed/')

In [90]:
# Shape of the training label array (one label per training image).
y_train.shape

(9605,)

In [89]:
import random

# Shuffle images and labels together with a single seeded index permutation.
# The fixed seed makes the order reproducible across runs, and indexing the
# arrays avoids building a Python list of (image, label) pairs.
shuffled_order = list(range(len(X_train)))
random.Random(42).shuffle(shuffled_order)
X_train = np.asarray(X_train)[shuffled_order]
y_train = np.asarray(y_train)[shuffled_order]

In [95]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
# Flatten and Dense are used below; importing them here lets the cell run on a
# fresh kernel instead of relying on the CNN cell having been run first.
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import BinaryCrossentropy

lambada=.1  # L2 regularisation factor applied to every Dense layer

# Fully connected baseline: flatten the image, three hidden layers, one logit.
model = Sequential()

model.add(Flatten(input_shape=X_train.shape[1:]))
model.add(Dense(128, activation='relu',kernel_regularizer=l2(lambada)))
model.add(Dense(128, activation='relu',kernel_regularizer=l2(lambada)))
model.add(Dense(128, activation='relu',kernel_regularizer=l2(lambada)))
# Linear output: the loss below is told to expect logits.
model.add(Dense(1, activation='linear',kernel_regularizer=l2(lambada)))


model.compile(optimizer='adam', loss=BinaryCrossentropy(from_logits=True), metrics=['accuracy'])

# NOTE(review): X_train is passed exactly as getData returned it, with no
# rescaling here; presumably these are 0-255 pixel values. Confirm whether
# normalising them is intended, given the recorded test accuracy of 0.5.

batch_size = 10
epochs = 10

# `workers` only applies to generator / Sequence inputs and newer Keras
# releases no longer accept it, so it is not passed for these in-memory arrays.
model.fit(X_train,y_train, batch_size=batch_size, epochs=epochs)


test_loss, test_accuracy = model.evaluate(X_test,y_test, verbose=0)
print(f"Test accuracy: {test_accuracy}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.5


In [97]:
# Save the dense network to an HDF5 file.
model.save('NNmodel.h5')# poor result: test accuracy was only 0.5

### CNN algorithm

In [None]:
# Reload the preprocessed images and labels from disk for the CNN experiment.
X_train,y_train,X_test,y_test=getData(r'dataCreated/preproceed/')

In [None]:
import random

# Shuffle images and labels together with a single seeded index permutation.
# The fixed seed makes the order reproducible across runs, and indexing the
# arrays avoids building a Python list of (image, label) pairs.
shuffled_order = list(range(len(X_train)))
random.Random(42).shuffle(shuffled_order)
X_train = np.asarray(X_train)[shuffled_order]
y_train = np.asarray(y_train)[shuffled_order]

In [104]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.regularizers import l2
from tensorflow.keras.losses import BinaryCrossentropy

lambada=.1  # L2 regularisation factor applied to every Conv2D and Dense layer

model = Sequential()

# Three convolution + max-pooling stages with 32, 64 and 128 filters.
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=X_train.shape[1:],kernel_regularizer=l2(lambada)))
model.add(MaxPooling2D((2, 2)))

model.add(Conv2D(64, (3, 3), activation='relu',kernel_regularizer=l2(lambada)))
model.add(MaxPooling2D((2, 2)))

model.add(Conv2D(128, (3, 3), activation='relu',kernel_regularizer=l2(lambada)))
model.add(MaxPooling2D((2, 2)))

model.add(Flatten())

# Dense classifier head ending in a single logit.
model.add(Dense(256, activation='relu',kernel_regularizer=l2(lambada)))
model.add(Dense(256, activation='relu',kernel_regularizer=l2(lambada)))
model.add(Dense(128, activation='relu',kernel_regularizer=l2(lambada)))
# Linear output: the loss below is told to expect logits.
model.add(Dense(1, activation='linear',kernel_regularizer=l2(lambada)))


model.compile(optimizer='adam', loss=BinaryCrossentropy(from_logits=True), metrics=['accuracy'])


#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


batch_size = 10
epochs = 10

# `workers` only applies to generator / Sequence inputs and newer Keras
# releases no longer accept it, so it is not passed for these in-memory arrays.
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs)


test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test accuracy: {test_accuracy}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.8330000042915344


In [106]:
# Save the CNN to an HDF5 file.
model.save('CNN.h5') # accuracy 0.8330000042915344, loss 0.6077840328216553 (from 16)

### get best model and deploy to h5

## Visualizations

In [None]:
# Reload the preprocessed images and labels from disk for the visualizations.
X_train,y_train,X_test,y_test=getData(r'dataCreated/preproceed/')

In [None]:
import matplotlib.pyplot as plt


# Example visualization: Histogram of the target variable (y_train)
plt.hist(y_train, bins=10)
plt.xlabel('Target Variable')
plt.ylabel('Frequency')
plt.title('Histogram of Target Variable (y_train)')
plt.show()

# Example visualization: Scatter plot of two features in X_train
# X_train holds whole images, so X_train[:, 0] would select the first row of
# every image rather than one feature. Flatten each image first so that a
# "feature" is a single value per sample.
flat_features = X_train.reshape(len(X_train), -1)
feature1 = flat_features[:, 0]
feature2 = flat_features[:, 1]
plt.scatter(feature1, feature2)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Scatter Plot of Feature 1 vs Feature 2')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np


# Bar chart of the class counts: first the training labels, then the testing labels.
for labels, chart_title in (
    (y_train, 'Training Data Distribution'),
    (y_test, 'Testing Data Distribution'),
):
    unique_classes, class_counts = np.unique(labels, return_counts=True)

    plt.figure(figsize=(8, 6))
    plt.bar(unique_classes, class_counts)
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.title(chart_title)
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np


# Show the first training image on its own.
image = X_train[0]
plt.imshow(image)
plt.title('Image')
plt.axis('off')
plt.show()

# Example visualization: Displaying a grid of multiple images
num_rows = 2  # rows in the grid
num_cols = 3  # columns in the grid

fig, axes = plt.subplots(num_rows, num_cols, figsize=(10, 6))

# Fill the grid row by row with the first num_rows * num_cols training images.
for index in range(num_rows * num_cols):
    i, j = divmod(index, num_cols)
    image = X_train[index]
    axes[i, j].imshow(image)
    axes[i, j].axis('off')

plt.tight_layout()
plt.show()