This project builds a deep learning model that uses histopathologic images to determine whether or not a patient has cancer.

The training set includes 220,025 images, and the test set includes 57,458 images. Each image is 96x96 pixels; however, only the center 32x32-pixel region is labeled.

In [5]:
#Load necessary package
import pandas as pd
from tensorflow.keras.preprocessing.image import load_img,img_to_array
import numpy as np
import os
from PIL import Image
from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from keras.models import Sequential

In [11]:
# Dataset locations on the local machine
train_dir = r'D:\ML\DeepLearning\Week3\train'
test_dir = r'D:\ML\DeepLearning\Week3\test'

# Read the label table and build an {image id -> label} lookup
labels_df = pd.read_csv(r'D:\ML\DeepLearning\Week3\train_labels.csv')
labels_dict = labels_df.set_index('id')['label'].to_dict()


In [15]:
# Display one sample training image
# (intended as a cancer-positive example; its label is not checked here)
image_file = '0000d563d5cfafc4e68acb7c9829258a298d9b6a.tif'
image_path = os.path.join(train_dir, image_file)

image = Image.open(image_path)  # read the file from the training folder
image.show()                    # hand it to the system's default image viewer

In [None]:
# Keep only the central square of an image (32x32 by default)
def crop_center(image, crop_size=32):
    """Return the centred ``crop_size`` x ``crop_size`` region of ``image``.

    ``image`` must expose ``size`` as ``(width, height)`` and a ``crop``
    method taking a ``(left, top, right, bottom)`` box.
    """
    width, height = image.size
    # Offset of the crop window from the top-left corner on each axis.
    x0, y0 = ((extent - crop_size) // 2 for extent in (width, height))
    return image.crop((x0, y0, x0 + crop_size, y0 + crop_size))
    


In [None]:
# Convert the images in a folder to a normalised array
def load_and_preprocess_images(folder, labels_dic=None, crop=True,
                               extensions=('.tif',)):
    """Load the images in ``folder`` into one stacked NumPy array.

    Files are visited in ``os.listdir`` order. Entries whose names do not end
    with one of ``extensions`` are skipped, so stray files in the folder
    cannot break image decoding or the label lookup.

    Args:
        folder: Directory containing the image files.
        labels_dic: Optional mapping from image id (the file name without its
            extension) to its label. Pass ``None`` for unlabelled data.
        crop: When true, keep only the centre patch returned by
            ``crop_center``.
        extensions: File-name suffix, or iterable of suffixes, to load.
            Matching is case-sensitive.

    Returns:
        A tuple ``(images, labels)``. ``images`` is a float32 array holding
        the pixel values divided by 255. ``labels`` is an array with one
        entry per loaded image, or ``None`` when ``labels_dic`` is ``None``.

    Raises:
        KeyError: If a loaded image id is missing from ``labels_dic``.
    """
    # str.endswith accepts a str or a tuple of str, nothing else.
    if isinstance(extensions, str):
        extensions = (extensions,)
    else:
        extensions = tuple(extensions)

    # An empty mapping still means "labelled data", so test against None
    # rather than relying on truthiness.
    has_labels = labels_dic is not None

    images, labels = [], []
    for file in os.listdir(folder):
        if not file.endswith(extensions):
            continue

        img_id = os.path.splitext(file)[0]  # image ID as a string
        img_path = os.path.join(folder, file)

        # The context manager releases the file handle as soon as the pixels
        # have been copied into the array.
        with Image.open(img_path) as raw_image:
            patch = crop_center(raw_image) if crop else raw_image
            # float32 halves the memory needed compared with NumPy's default
            # float64 result of the division.
            image_array = np.asarray(patch, dtype=np.float32) / 255.0
        images.append(image_array)

        if has_labels:
            labels.append(labels_dic[img_id])  # img_id is the string key

    return (
        np.array(images, dtype=np.float32),
        np.array(labels) if has_labels else None,
    )

# Labelled training data: centre-cropped images plus their labels
X_train, y_train = load_and_preprocess_images(train_dir, labels_dic=labels_dict)

# Unlabelled test data: the second return value is None, so it is discarded
X_test, _ = load_and_preprocess_images(folder=test_dir)
print(f"Number of images loaded: {len(X_test)}")



In [None]:
# Build the model: two convolution/pooling stages, then a dense classifier
model = Sequential()
model.add(Input(shape=(32, 32, 3)))  # 32x32 three-channel input patches

# Convolutional feature extractor
model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))

# Classifier head
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit: binary output

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# Train the model, holding out 20% of the training data for validation
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

# Predict the test dataset and write the submission file

# The ids come from a fresh directory listing, so they only line up with the
# rows of X_test if both listings cover the same files in the same order.
# Check the counts up front so a mismatch fails with a clear message before
# any prediction work is done.
test_image_files = [f for f in os.listdir(test_dir) if f.endswith('.tif')]
if len(test_image_files) != len(X_test):
    raise ValueError(
        f"Found {len(test_image_files)} .tif files in {test_dir} but X_test "
        f"holds {len(X_test)} images; ids and predictions would not line up."
    )

predictions = model.predict(X_test)
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
# NOTE(review): if the submission is scored on a ranking metric such as
# ROC AUC, raw probabilities may be preferable - confirm the target metric.
predicted_labels = (predictions > 0.5).astype(int)

submission_df = pd.DataFrame({
    # Image id is the file name without its extension.
    'id': [os.path.splitext(file)[0] for file in test_image_files],
    # Flatten the predictions to one value per image.
    'label': predicted_labels.flatten(),
})

# Create the output directory if it is missing, so the write cannot fail on a
# machine where it does not already exist.
submission_path = '/kaggle/working/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_df.to_csv(submission_path, index=False)