In [None]:
# Pin Keras to the release this notebook was executed with, so a re-run builds
# the same environment. %pip (rather than !pip) installs into the environment
# of the running kernel.
%pip install --upgrade keras==3.5.0

Collecting keras
  Downloading keras-3.5.0-py3-none-any.whl.metadata (5.8 kB)
Downloading keras-3.5.0-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 3.4.1
    Uninstalling keras-3.4.1:
      Successfully uninstalled keras-3.4.1
Successfully installed keras-3.5.0


In [None]:
import os
from glob import glob

import keras
from keras import layers
from keras import ops

import numpy as np
import cv2
from scipy.io import loadmat
import matplotlib.pyplot as plt

# For data preprocessing
from tensorflow import cast as tf_cast
from tensorflow import image as tf_image
from tensorflow import data as tf_data
from tensorflow import io as tf_io

In [None]:
# Fetch the dataset archive only when it is not already on disk, then extract it.
# `unzip -n` never overwrites existing files, so re-running this cell neither
# re-downloads the archive nor stops at an interactive overwrite prompt.
!test -f instance-level-human-parsing.zip || gdown "1B9A9UCJYMwTL4oBEo4RZfbMZMaZhKJaz&confirm=t"
!unzip -q -n instance-level-human-parsing.zip

Downloading...
From: https://drive.google.com/uc?id=1B9A9UCJYMwTL4oBEo4RZfbMZMaZhKJaz&confirm=t
To: /content/instance-level-human-parsing.zip
100% 2.91G/2.91G [00:29<00:00, 98.4MB/s]


In [None]:
# Input resolution, batching and dataset configuration.
IMAGE_SIZE = 512
BATCH_SIZE = 4
NUM_CLASSES = 20
DATA_DIR = "./instance-level_human_parsing/instance-level_human_parsing/Training"
NUM_TRAIN_IMAGES = 1000
NUM_VAL_IMAGES = 50

# Only a subset of the original dataset is used: the first NUM_TRAIN_IMAGES
# files (in sorted order) for training and the next NUM_VAL_IMAGES for
# validation. Images and masks are paired purely by their sorted position.
_image_paths = sorted(glob(os.path.join(DATA_DIR, "Images/*")))
_mask_paths = sorted(glob(os.path.join(DATA_DIR, "Category_ids/*")))
_val_end = NUM_TRAIN_IMAGES + NUM_VAL_IMAGES

train_images = _image_paths[:NUM_TRAIN_IMAGES]
train_masks = _mask_paths[:NUM_TRAIN_IMAGES]
val_images = _image_paths[NUM_TRAIN_IMAGES:_val_end]
val_masks = _mask_paths[NUM_TRAIN_IMAGES:_val_end]

def read_image(image_path, mask=False):
  """Read one image file and resize it to IMAGE_SIZE x IMAGE_SIZE.

  Args:
    image_path: Path of the file to read.
    mask: If True, decode the file as a single-channel label mask;
      otherwise decode it as a 3-channel image.

  Returns:
    A float32 tensor of shape (IMAGE_SIZE, IMAGE_SIZE, 1) for masks or
    (IMAGE_SIZE, IMAGE_SIZE, 3) for images.
  """
  channels = 1 if mask else 3
  image = tf_image.decode_png(tf_io.read_file(image_path), channels=channels)
  image.set_shape([None, None, channels])
  target_size = [IMAGE_SIZE, IMAGE_SIZE]

  if mask:
    # Mask pixels are class ids, not intensities. Bilinear resizing would
    # average neighbouring ids at region borders and invent labels that are
    # not present there (e.g. halfway between classes 2 and 10), so masks
    # are resized with nearest-neighbour sampling instead.
    image = tf_image.resize(images=image, size=target_size, method="nearest")
    # Nearest-neighbour keeps the decoded integer dtype; cast back to float32
    # so the dataset element spec stays the same as before.
    return tf_cast(image, "float32")

  return tf_image.resize(images=image, size=target_size)

def load_data(image_list, mask_list):
  """Load one (image, mask) pair.

  Despite the parameter names, each argument is used as a single file path:
  the first is read as an image, the second as a label mask.

  Returns:
    A tuple `(image, mask)` of the two tensors produced by `read_image`.
  """
  return read_image(image_list), read_image(mask_list, mask=True)

def data_generator(image_list, mask_list):
  """Build a batched dataset of (image, mask) tensors from two path lists.

  The paths are paired element-wise, loaded with `load_data` in parallel,
  and grouped into batches of BATCH_SIZE; a final incomplete batch is
  dropped.
  """
  return (
    tf_data.Dataset.from_tensor_slices((image_list, mask_list))
    .map(load_data, num_parallel_calls=tf_data.AUTOTUNE)
    .batch(BATCH_SIZE, drop_remainder=True)
  )

# Build the training and validation input pipelines and show their element specs.
train_dataset, val_dataset = (
  data_generator(train_images, train_masks),
  data_generator(val_images, val_masks),
)

print("Train Dataset:", train_dataset)
print("Val Dataset:", val_dataset)

Train Dataset: <_BatchDataset element_spec=(TensorSpec(shape=(4, 512, 512, 3), dtype=tf.float32, name=None), TensorSpec(shape=(4, 512, 512, 1), dtype=tf.float32, name=None))>
Val Dataset: <_BatchDataset element_spec=(TensorSpec(shape=(4, 512, 512, 3), dtype=tf.float32, name=None), TensorSpec(shape=(4, 512, 512, 1), dtype=tf.float32, name=None))>


In [None]:
# DeepLabV3+ relies on dilated (atrous) convolutions: the taps of a filter
# (e.g. 3x3) are spread apart by the dilation rate, so the filter covers a
# larger area without adding trainable parameters. This brings in more context
# while the feature map keeps its resolution. In this notebook the encoder is
# the backbone followed by the dilated pyramid below; the decoder then merges
# the encoder output with low-level backbone features (a skip connection),
# applies two more convolution blocks and upsamples to produce the prediction.
def convolution_block(
    block_input,
    num_filters=256,
    kernel_size=3,
    dilation_rate=1,
    use_bias=False,
):
  """Apply Conv2D -> BatchNormalization -> ReLU to `block_input`.

  Args:
    block_input: Tensor to convolve.
    num_filters: Number of output filters of the convolution.
    kernel_size: Size of the convolution kernel.
    dilation_rate: Dilation rate of the convolution (1 means not dilated).
    use_bias: Whether the convolution has a bias term.

  Returns:
    The activated output tensor.
  """
  # "same" padding keeps the spatial size regardless of the dilation rate.
  conv_layer = layers.Conv2D(
    filters=num_filters,
    kernel_size=kernel_size,
    dilation_rate=dilation_rate,
    padding="same",
    use_bias=use_bias,
    kernel_initializer=keras.initializers.HeNormal(),
  )
  batch_norm = layers.BatchNormalization()
  return ops.nn.relu(batch_norm(conv_layer(block_input)))

def DilatedSpatialPyramidPooling(dspp_input):
  """Combine an image-level pooled branch with four parallel conv branches.

  Args:
    dspp_input: Feature map to process; its height and width are read from
      the third-to-last and second-to-last dimensions.

  Returns:
    The feature map obtained by concatenating all five branches along the
    channel axis and fusing them with a 1x1 convolution block.
  """
  height, width = dspp_input.shape[-3], dspp_input.shape[-2]

  # Image-level branch: average over the whole feature map, pass the result
  # through a 1x1 convolution block (default 256 filters), then stretch it
  # back to the input's height and width with bilinear interpolation.
  pooled = layers.AveragePooling2D(pool_size=(height, width))(dspp_input)
  pooled = convolution_block(pooled, kernel_size=1, use_bias=True)
  out_pool = layers.UpSampling2D(
    size=(height // pooled.shape[1], width // pooled.shape[2]),
    interpolation="bilinear",
  )(pooled)

  # Parallel branches on the same input as (kernel_size, dilation_rate):
  # one plain 1x1 convolution and three 3x3 convolutions with growing dilation.
  branch_specs = ((1, 1), (3, 6), (3, 12), (3, 18))
  conv_branches = [
    convolution_block(dspp_input, kernel_size=size, dilation_rate=rate)
    for size, rate in branch_specs
  ]

  # Stack every branch along the channel axis and fuse them with a 1x1 block.
  merged = layers.Concatenate(axis=-1)([out_pool, *conv_branches])
  return convolution_block(merged, kernel_size=1)

In [None]:
def DeeplabV3Plus(image_size, num_classes):
  """Build a DeepLabV3+ style segmentation model on a ResNet50 backbone.

  Args:
    image_size: Height and width of the square RGB input.
    num_classes: Number of output channels, one per class.

  Returns:
    A `keras.Model` mapping an (image_size, image_size, 3) input to an
    output with `num_classes` channels. The final convolution has no
    activation, so the outputs are raw per-class scores.
  """
  model_input = keras.Input(shape=(image_size, image_size, 3))

  # The backbone is an ImageNet-pretrained ResNet50 without its classifier
  # head, fed with inputs preprocessed the way ResNet50 expects.
  backbone = keras.applications.ResNet50(
    weights="imagenet",
    include_top=False,
    input_tensor=keras.applications.resnet50.preprocess_input(model_input),
  )

  # Encoder path: features from a later backbone stage go through the dilated
  # pyramid and are upsampled to a quarter of the input resolution.
  encoded = DilatedSpatialPyramidPooling(
    backbone.get_layer("conv4_block6_2_relu").output
  )
  quarter_size = image_size // 4
  input_a = layers.UpSampling2D(
    size=(quarter_size // encoded.shape[1], quarter_size // encoded.shape[2]),
    interpolation="bilinear",
  )(encoded)

  # Skip path: low-level features from an earlier backbone stage, reduced to
  # 48 channels with a 1x1 convolution block.
  low_level = backbone.get_layer("conv2_block3_2_relu").output
  input_b = convolution_block(low_level, num_filters=48, kernel_size=1)

  # Decoder: merge both paths, refine with two convolution blocks, upsample
  # back to the input resolution and project to one channel per class.
  decoded = layers.Concatenate(axis=-1)([input_a, input_b])
  for _ in range(2):
    decoded = convolution_block(decoded)
  decoded = layers.UpSampling2D(
    size=(image_size // decoded.shape[1], image_size // decoded.shape[2]),
    interpolation="bilinear",
  )(decoded)
  model_output = layers.Conv2D(num_classes, kernel_size=(1, 1), padding="same")(decoded)
  return keras.Model(inputs=model_input, outputs=model_output)

# Instantiate the network for the configured input size and class count,
# then print its layer-by-layer summary.
model = DeeplabV3Plus(IMAGE_SIZE, NUM_CLASSES)
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# The model emits raw scores and the masks hold class indices rather than
# one-hot vectors, hence sparse categorical cross-entropy with from_logits=True.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
  optimizer=keras.optimizers.Adam(learning_rate=0.001),
  loss=loss,
  metrics=["accuracy"],
)

history = model.fit(train_dataset, validation_data=val_dataset, epochs=25)

# Draw one figure per recorded metric. Each entry is (history key, figure
# title); the history key doubles as the y-axis label.
for metric_key, figure_title in (
  ("loss", "Training Loss"),
  ("accuracy", "Training Accuracy"),
  ("val_loss", "Validation Loss"),
  ("val_accuracy", "Validation Accuracy"),
):
  plt.plot(history.history[metric_key])
  plt.title(figure_title)
  plt.ylabel(metric_key)
  plt.xlabel("epoch")
  plt.show()

Epoch 1/25
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 437ms/step - accuracy: 0.6044 - loss: 1.3980 - val_accuracy: 0.5636 - val_loss: 1.5805
Epoch 2/25
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 435ms/step - accuracy: 0.7014 - loss: 0.9549 - val_accuracy: 0.6682 - val_loss: 1.0555
Epoch 3/25
[1m 23/250[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:36[0m 425ms/step - accuracy: 0.7466 - loss: 0.8396