In [1]:
# Mount the Google Drive directory that holds the data (IMPORTANT: add the shared folder to your own Drive first!)
# Comment out the Colab-specific lines of this cell when running locally.
# NOTE: `os` is imported here and is used by later cells, so keep `import os` even when running locally.
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
main_path = "/content/drive/MyDrive/Segmentation/Practica 2"

In [3]:
os.listdir(main_path)

['generateTestSet.ipynb', 'CarvanaDataset']

In [4]:
import shutil
import random

# **Segmentación semántica en PyTorch: Caso práctico**

## **Máster IA3: Aprendizaje Profundo II**

### *Antonio Martínez González (antonio.martinez-gonzalez@uv.es)*

En este notebook se implementará la extracción de un número determinado de muestras del conjunto de entrenamiento asociado al dataset Carvana, con el objetivo de disponer de un nuevo conjunto de datos (*test set*) sobre el que analizar la calidad del modelo obtenido tras las etapas de entrenamiento y validación.

Brian Shaler, DanGill, Maggie, Mark McDonald, Patricia, Will Cukierski. (2017). Carvana Image Masking Challenge. Kaggle. https://kaggle.com/competitions/carvana-image-masking-challenge

Para ello utilizaremos la siguiente función:


In [5]:
def _require_existing_dir(path, param_name, contents):
    """Raise ValueError unless ``path`` is an existing directory."""
    if not os.path.exists(path):
        raise ValueError(
            f"The value passed to the parameter {param_name} must be the path to a directory "
            f"containing {contents}. Received {path}"
        )
    if not os.path.isdir(path):
        raise ValueError(
            f"The value passed to the parameter {param_name} must be the path to an existing "
            f"directory. Received {path}"
        )


def _ensure_output_dir(path, param_name):
    """Create ``path`` if missing; raise ValueError if it exists but is not a directory."""
    if os.path.exists(path) and not os.path.isdir(path):
        raise ValueError(
            f"The value passed to the parameter {param_name} must be the path to an existing "
            f"or non-existing directory. Received {path}"
        )
    os.makedirs(path, exist_ok=True)


def extractCarvanaSamples(image_dir, mask_dir, new_image_dir, new_mask_dir, ratio, shuffle=True, seed=None):
    """Move a fraction of the Carvana image/mask pairs into new directories.

    Every ``<code>.jpg`` file in ``image_dir`` is a candidate; its mask is expected at
    ``<code>_mask.gif`` in ``mask_dir``. ``int(n_candidates * ratio)`` pairs are copied to
    ``new_image_dir`` / ``new_mask_dir`` and then deleted from the source directories.

    Args:
        image_dir: Existing directory holding the ``.jpg`` images.
        mask_dir: Existing directory holding the ``_mask.gif`` masks.
        new_image_dir: Destination directory for the extracted images (created if missing).
        new_mask_dir: Destination directory for the extracted masks (created if missing).
        ratio: Fraction of the images to extract; must satisfy ``0 < ratio < 1``.
        shuffle: If True, pick the samples at random; otherwise take the first ones in
            sorted filename order.
        seed: Optional seed for the shuffle. When given, the selection is reproducible and
            the global ``random`` state is left untouched; when None, the global ``random``
            generator is used.

    Raises:
        ValueError: If a directory argument is invalid or ``ratio`` is out of range.
        FileNotFoundError: If any selected image has no matching mask. This is checked
            before any file is copied or removed.
    """
    _require_existing_dir(image_dir, "image_dir", "Carvana images")
    _require_existing_dir(mask_dir, "mask_dir", "Carvana masks")

    # Validate the ratio before creating anything on disk, so a bad call has no side effects.
    if not 0.0 < ratio < 1.0:
        raise ValueError(f"Ratio must be a float value between 0 and 1. Received {ratio}")

    _ensure_output_dir(new_image_dir, "new_image_dir")
    _ensure_output_dir(new_mask_dir, "new_mask_dir")

    # Only real image files are candidates; stray entries (e.g. ".DS_Store") are ignored.
    # Sorting removes the dependence on os.listdir's arbitrary order.
    suffix = ".jpg"
    codes = sorted(
        name[:-len(suffix)] for name in os.listdir(image_dir) if name.endswith(suffix)
    )
    if shuffle:
        rng = random.Random(seed) if seed is not None else random
        rng.shuffle(codes)

    n_elements_to_extract = int(len(codes) * ratio)
    to_extract = codes[:n_elements_to_extract]

    # Fail before touching any file: otherwise a missing mask would leave an image
    # copied into the new set while it still exists in the original one.
    missing_masks = [
        code + "_mask.gif"
        for code in to_extract
        if not os.path.isfile(os.path.join(mask_dir, code + "_mask.gif"))
    ]
    if missing_masks:
        raise FileNotFoundError(
            f"No mask found in {mask_dir} for {len(missing_masks)} selected image(s), "
            f"e.g. {missing_masks[0]}. No files were moved."
        )

    for code in to_extract:
        image_pathname = code + ".jpg"
        mask_pathname = code + "_mask.gif"

        source_image = os.path.join(image_dir, image_pathname)
        source_mask = os.path.join(mask_dir, mask_pathname)

        shutil.copy(source_image, os.path.join(new_image_dir, image_pathname))
        shutil.copy(source_mask, os.path.join(new_mask_dir, mask_pathname))

        # Delete the sources only after both copies have succeeded.
        os.remove(source_image)
        os.remove(source_mask)

Número de muestras del conjunto de entrenamiento antes de la extracción:

In [7]:
len(os.listdir(os.path.join(main_path, "CarvanaDataset", "train", "images")))

4579

In [8]:
len(os.listdir(os.path.join(main_path, "CarvanaDataset", "train", "masks")))

4579

El conjunto de test será almacenado en el directorio *CarvanaDataset/test* (dentro de `main_path`), con las imágenes en *test/images* y las máscaras en *test/masks*.

In [9]:
# Move 4% of the training images (and their masks) from train/ into test/.
# NOTE: the selected files are removed from the training directories, so this cell is
# not idempotent: re-running it extracts a further 4% of whatever remains in train/.
extractCarvanaSamples(
    image_dir=os.path.join(main_path, "CarvanaDataset", "train", "images"),
    mask_dir=os.path.join(main_path, "CarvanaDataset", "train", "masks"),
    new_image_dir=os.path.join(main_path, "CarvanaDataset", "test", "images"),
    new_mask_dir=os.path.join(main_path, "CarvanaDataset", "test", "masks"),
    ratio=0.04,
    shuffle=True
)

Comprobamos que el número de muestras del conjunto de entrenamiento se ha reducido según la proporción especificada en el parámetro *ratio* de la función *extractCarvanaSamples*:

In [10]:
len(os.listdir(os.path.join(main_path, "CarvanaDataset", "train", "images")))

4396

In [11]:
len(os.listdir(os.path.join(main_path, "CarvanaDataset", "train", "masks")))

4396

Conjunto de test:

In [13]:
sorted(os.listdir(os.path.join(main_path, "CarvanaDataset", "test", "images")))[:10]

['00087a6bd4dc_11.jpg',
 '00087a6bd4dc_15.jpg',
 '02159e548029_09.jpg',
 '03a857ce842d_09.jpg',
 '0495dcf27283_12.jpg',
 '0495dcf27283_15.jpg',
 '04bd942b463b_01.jpg',
 '04bd942b463b_03.jpg',
 '0789bed99cb8_14.jpg',
 '0789bed99cb8_15.jpg']

In [14]:
sorted(os.listdir(os.path.join(main_path, "CarvanaDataset", "test", "masks")))[:10]

['00087a6bd4dc_11_mask.gif',
 '00087a6bd4dc_15_mask.gif',
 '02159e548029_09_mask.gif',
 '03a857ce842d_09_mask.gif',
 '0495dcf27283_12_mask.gif',
 '0495dcf27283_15_mask.gif',
 '04bd942b463b_01_mask.gif',
 '04bd942b463b_03_mask.gif',
 '0789bed99cb8_14_mask.gif',
 '0789bed99cb8_15_mask.gif']