<a href="https://colab.research.google.com/github/Niroth36/Convolutional_Neural_Networks_CRCslides_dataset/blob/main/CRC_slides_cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the libraries we need

In [9]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras

## Mounting Google Drive to access the CRC_slides dataset

In [10]:
from google.colab import drive
drive.mount('/content/drive')
!cp '/content/drive/MyDrive/datasets/CRC_slides.tar.gz' .
!tar -xvzf 'CRC_slides.tar.gz'
data_dir = '/content/CRC_slides'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cp: cannot stat '/content/drive/MyDrive/datasets/CRC_slides.tar.gz': No such file or directory
tar (child): CRC_slides.tar.gz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now


In [15]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

def load_dataset(data_dir, train_pct=0.6, val_pct=0.2, test_pct=0.2, batch_size=64, img_size=(224, 224), seed=None):
    """Build batched image datasets from a ``<data_dir>/<class_name>/<image>`` tree.

    Image paths are listed once in sorted order, labelled with the name of
    their parent directory, integer-encoded, and split into train / validation /
    test subsets. Every subset is decoded, resized to ``img_size`` and
    standardized per image; only the training subset is augmented and shuffled.

    Args:
        data_dir: Root directory containing one sub-directory per class.
        train_pct: Fraction of all images used for training.
        val_pct: Fraction of all images used for validation.
        test_pct: Fraction of all images used for testing.
        batch_size: Number of images per batch in every returned dataset.
        img_size: ``(height, width)`` every image is resized to.
        seed: Optional integer seed for the splits and the training shuffle.
            ``None`` (the default) keeps the splits random on every call.

    Returns:
        A tuple ``(devel_ds, train_ds, val_ds, test_ds, unique_labels)``. The
        datasets yield ``(image_batch, label_batch)`` pairs with integer labels;
        ``unique_labels`` is the sorted list of class names, so
        ``unique_labels[i]`` is the class encoded as ``i``.

    Raises:
        ValueError: If a fraction is not strictly between 0 and 1, if the
            fractions sum to more than 1, or if no image file is found.
    """
    # Validate before doing any work so mistakes surface with a clear message.
    for name, fraction in (('train_pct', train_pct), ('val_pct', val_pct), ('test_pct', test_pct)):
        if not 0 < fraction < 1:
            raise ValueError(f"{name} must be strictly between 0 and 1, got {fraction!r}")
    total_pct = train_pct + val_pct + test_pct
    if total_pct > 1 + 1e-9:  # small tolerance for floating-point round-off
        raise ValueError(
            f"train_pct + val_pct + test_pct must sum to at most 1, got {total_pct!r}"
        )

    # List the files once, in sorted order. Dataset.list_files reshuffles on
    # every iteration, so deriving labels from one pass and data from another
    # would pair images with the wrong classes.
    image_paths = sorted(
        path for path in glob.glob(os.path.join(data_dir, '*', '*')) if os.path.isfile(path)
    )
    if not image_paths:
        raise ValueError(
            f"No image files found under {data_dir!r}; expected <data_dir>/<class_name>/<image>."
        )

    # The class name is the parent directory. Sorting the unique names keeps
    # the class -> index mapping identical from run to run.
    class_names = [os.path.basename(os.path.dirname(path)) for path in image_paths]
    unique_labels = sorted(set(class_names))
    label_to_index = {label: index for index, label in enumerate(unique_labels)}
    # Encode labels as integers here, in plain Python: a dict lookup cannot be
    # performed on the symbolic tensors seen inside Dataset.map.
    labels = [label_to_index[name] for name in class_names]

    # First split: training vs. everything held out (validation + test).
    # Passing only test_pct here would silently discard val_pct of the data.
    holdout_pct = val_pct + test_pct
    train_data, holdout_data, train_label, holdout_label = train_test_split(
        image_paths, labels, train_size=train_pct, test_size=holdout_pct, random_state=seed
    )
    # Second split: divide the held-out part between validation and test.
    # Only train_size is given so the two sizes cannot exceed 1 by round-off.
    val_data, test_data, val_label, test_label = train_test_split(
        holdout_data, holdout_label, train_size=val_pct / holdout_pct, random_state=seed
    )

    def _load_image(path, label):
        # channels=3 gives every image a static (H, W, 3) shape.
        image = tf.image.decode_jpeg(tf.io.read_file(path), channels=3)
        return tf.image.resize(image, img_size), label

    def _augment(image, label):
        # NOTE(review): the per-image standardization applied afterwards
        # removes a uniform brightness shift, so random_brightness has little
        # net effect in this order -- confirm the intended order.
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_brightness(image, max_delta=0.5)
        image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
        return image, label

    def _standardize(image, label):
        return tf.image.per_image_standardization(image), label

    def _build(paths, split_labels, training=False):
        # One pipeline for every split; only training shuffles and augments.
        ds = tf.data.Dataset.from_tensor_slices((paths, split_labels))
        if training:
            # Shuffle the (cheap) path strings rather than decoded images so
            # the buffer can cover the whole training set.
            ds = ds.shuffle(buffer_size=len(paths), seed=seed)
        ds = ds.map(_load_image, num_parallel_calls=tf.data.AUTOTUNE)
        if training:
            ds = ds.map(_augment, num_parallel_calls=tf.data.AUTOTUNE)
        ds = ds.map(_standardize, num_parallel_calls=tf.data.AUTOTUNE)
        return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    train_ds = _build(train_data, train_label, training=True)
    val_ds = _build(val_data, val_label)
    test_ds = _build(test_data, test_label)
    # NOTE(review): devel_ds is built from the validation split, exactly as in
    # the previous implementation. If "devel" was meant to be train + val,
    # change the inputs here -- confirm with the notebook author.
    devel_ds = _build(val_data, val_label)

    # Return the datasets
    return devel_ds, train_ds, val_ds, test_ds, unique_labels


In [None]:
# Build every split with a 60/20/20 partition, batches of 64 and 224x224 images.
devel_ds, train_ds, val_ds, test_ds, classes = load_dataset(
    data_dir,
    train_pct=0.6,
    val_pct=0.2,
    test_pct=0.2,
    batch_size=64,
    img_size=(224, 224),
)

In [None]:
# Collect the integer labels of every batch in the development split.
# The name `y` is kept in case later cells reuse it.
y = np.concatenate([batch_labels.numpy() for _, batch_labels in devel_ds])

n_classes = len(classes)
fig, ax = plt.subplots()
# Bin edges at k - 0.5 centre each bar on its integer class index, so the
# tick for class k sits under its own bar; the counts are unchanged.
ax.hist(y, bins=np.arange(n_classes + 1) - 0.5, rwidth=0.9)
ax.set_xticks(range(n_classes))
ax.set_xticklabels(classes, rotation=45, ha='right')
ax.set(
    title='Class distribution of the development split',
    xlabel='Class',
    ylabel='Number of images',
)
plt.show()