In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Load the necessary packages
import pandas as pd
from tensorflow.keras.preprocessing.image import load_img,img_to_array
import numpy as np
import os
from PIL import Image
from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from keras.models import Sequential

In [2]:
# Read the training labels and build an {image id: label} lookup table
labels_df = pd.read_csv('/kaggle/input/histopathologic-cancer-detection/train_labels.csv')
labels_dict = labels_df.set_index('id')['label'].to_dict()

# Folders holding the training and test images
train_dir = '/kaggle/input/histopathologic-cancer-detection/train/'
test_dir = '/kaggle/input/histopathologic-cancer-detection/test/'
# Crop the central crop_size x crop_size region of the image (32 x 32 by default)
def crop_center(image, crop_size=32):
    """Return the centered ``crop_size`` x ``crop_size`` region of ``image``.

    Args:
        image: Object exposing a ``(width, height)`` ``size`` attribute and a
            ``crop(box)`` method taking ``(left, top, right, bottom)``.
        crop_size: Side length of the square crop window, in pixels.

    Returns:
        Whatever ``image.crop`` returns for the centered box.
    """
    # Offset of the window's top-left corner; floor division keeps the
    # window on integer pixel coordinates when the margin is odd.
    x0, y0 = ((extent - crop_size) // 2 for extent in image.size)
    return image.crop((x0, y0, x0 + crop_size, y0 + crop_size))
    
# Load every image in a folder, optionally center-crop it, and scale pixel values by 1/255
def load_and_preprocess_images(folder, labels_dic=None, crop=True, return_files=False):
    """Load every file in ``folder`` as an image array with values divided by 255.

    Files are visited in sorted filename order. ``os.listdir`` on its own
    returns entries in arbitrary order, so sorting keeps the row order
    deterministic between runs.

    Args:
        folder: Directory to read; every entry in it is opened as an image.
        labels_dic: Optional mapping from image id (filename without its
            extension) to label. When given and non-empty, a label is looked
            up for every file; an id missing from the mapping raises
            ``KeyError``.
        crop: When true, each image is passed through ``crop_center`` (with
            its default crop size) before conversion.
        return_files: When true, additionally return the list of filenames in
            the same order as the image rows, so callers can pair outputs
            with files without listing the directory a second time.

    Returns:
        ``(images, labels)`` by default, or ``(images, labels, files)`` when
        ``return_files`` is true. ``images`` is the stacked array of pixel
        data divided by 255.0; ``labels`` is an array of the looked-up labels,
        or ``None`` when ``labels_dic`` is missing or empty.
    """
    images, labels, files = [], [], []
    use_labels = bool(labels_dic)

    for file in sorted(os.listdir(folder)):
        img_id = os.path.splitext(file)[0]  # Image id = filename without extension
        img_path = os.path.join(folder, file)

        # The context manager releases the file handle even if decoding fails;
        # the pixel data is copied into a NumPy array before the file is closed.
        with Image.open(img_path) as raw_image:
            image = crop_center(raw_image) if crop else raw_image
            image_array = np.array(image) / 255.0

        images.append(image_array)
        files.append(file)

        # Append label if in training set
        if use_labels:
            labels.append(labels_dic[img_id])  # img_id is used as a string key

    image_stack = np.array(images)
    label_array = np.array(labels) if use_labels else None
    if return_files:
        return image_stack, label_array, files
    return image_stack, label_array

X_train, y_train = load_and_preprocess_images(train_dir, labels_dict)
# Take the test filenames from the same pass that built X_test, so row i of the
# predictions is guaranteed to belong to test_image_files[i]. Listing the
# directory again later (with a different filter) gives no such guarantee.
X_test, _, test_image_files = load_and_preprocess_images(test_dir, return_files=True)
print(f"Number of images loaded: {len(X_test)}")

# Build the model: two convolution/pooling stages followed by a dense head
model = Sequential([
    Input(shape=(32, 32, 3)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary classification
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model. Keras takes the validation split from the last 20% of the
# samples, before shuffling.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Predict the test dataset and threshold the sigmoid output at 0.5
predictions = model.predict(X_test)
predicted_labels = (predictions > 0.5).astype(int)

# Pair each prediction with the id of the file it was computed from
submission_df = pd.DataFrame({
    'id': [os.path.splitext(file)[0] for file in test_image_files],  # Image id without the extension
    'label': predicted_labels.flatten()  # (n, 1) predictions -> 1-D column
})

submission_df.to_csv('/kaggle/working/submission.csv', index=False)

Number of images loaded: 57458


Epoch 1/10


I0000 00:00:1733434932.226999      91 service.cc:145] XLA service 0x7d362c003ad0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733434932.227054      91 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m  73/5501[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m11s[0m 2ms/step - accuracy: 0.5776 - loss: 0.6717 

I0000 00:00:1733434935.528880      91 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.7447 - loss: 0.5291 - val_accuracy: 0.7787 - val_loss: 0.4799
Epoch 2/10
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.7862 - loss: 0.4714 - val_accuracy: 0.8008 - val_loss: 0.4436
Epoch 3/10
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.7983 - loss: 0.4513 - val_accuracy: 0.7729 - val_loss: 0.4791
Epoch 4/10
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.8024 - loss: 0.4411 - val_accuracy: 0.8104 - val_loss: 0.4275
Epoch 5/10
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.8083 - loss: 0.4336 - val_accuracy: 0.7828 - val_loss: 0.4682
Epoch 6/10
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - accuracy: 0.8090 - loss: 0.4291 - val_accuracy: 0.8096 - val_loss: 0.4286
Epoch 7/10
[1m5501/5