# Self-Correction Human Parsing

**Notebook Authors:** 
- **Segato Pietro** (2122209)  
- **Vezzosi Giacomo** (2104369)  
- **Vitali Giovanni** (2119998)

This notebook implements **Self-Correction Human Parsing (SCHP)** for clothing segmentation, which is crucial for virtual try-on applications such as VITON-HD. The SCHP model refines human parsing predictions iteratively, enhancing segmentation accuracy. The code is based on the repository by Peike Li ([Self-Correction-Human-Parsing](https://github.com/PeikeLi/Self-Correction-Human-Parsing)).

## Execution Environment  
This notebook is designed to be executed on **Google Colab** to ensure compatibility with the necessary dependencies and GPU acceleration. Before running the code, ensure that the runtime environment is set to **GPU** (Runtime → Change runtime type → GPU).

In [1]:
import os
import sys
from google.colab import files
import zipfile
import shutil
import gdown
import cv2
import numpy as np
from PIL import Image

# Repository and dependencies

In [2]:
!pip install ninja

Collecting ninja
  Downloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Downloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (422 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/422.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m422.9/422.9 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.11.1.3


In [3]:
!git clone https://github.com/PeikeLi/Self-Correction-Human-Parsing
%cd Self-Correction-Human-Parsing
!mkdir checkpoints
!mkdir inputs
!mkdir outputs

Cloning into 'Self-Correction-Human-Parsing'...
remote: Enumerating objects: 722, done.[K
remote: Counting objects: 100% (175/175), done.[K
remote: Compressing objects: 100% (110/110), done.[K
remote: Total 722 (delta 74), reused 64 (delta 64), pack-reused 547 (from 1)[K
Receiving objects: 100% (722/722), 3.88 MiB | 8.23 MiB/s, done.
Resolving deltas: 100% (150/150), done.
/content/Self-Correction-Human-Parsing


In [4]:
dataset = 'atr'

if dataset == 'lip':
    url = 'https://drive.google.com/uc?id=1k4dllHpu0bdx38J7H28rVVLpU-kOHmnH'
elif dataset == 'atr':
    url = 'https://drive.google.com/uc?id=1ruJg4lqR_jgQPj-9K0PP-L2vJERYOxLP'
elif dataset == 'pascal':
    url = 'https://drive.google.com/uc?id=1E5YwNKW2VOEayK9mWCS3Kpsxf-3z04ZE'

output = 'checkpoints/final.pth'
gdown.download(url, output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1ruJg4lqR_jgQPj-9K0PP-L2vJERYOxLP
From (redirected): https://drive.google.com/uc?id=1ruJg4lqR_jgQPj-9K0PP-L2vJERYOxLP&confirm=t&uuid=cb7ceebb-4d30-48e1-a4cf-4ec94500d437
To: /content/Self-Correction-Human-Parsing/checkpoints/final.pth
100%|██████████| 267M/267M [00:05<00:00, 48.3MB/s]


'checkpoints/final.pth'

# Dataset

In [None]:
files.upload() #NOTE: a personal Kaggle API Key is to be uploaded here

!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d tinkukalluri/zalando-hd-resized

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/tinkukalluri/zalando-hd-resized
License(s): MIT
Downloading zalando-hd-resized.zip to /content/Self-Correction-Human-Parsing
100% 4.54G/4.54G [02:04<00:00, 43.0MB/s]
100% 4.54G/4.54G [02:04<00:00, 39.0MB/s]


In [6]:
# Paths
zip_path = '/content/Self-Correction-Human-Parsing/zalando-hd-resized.zip'
images_dest = '/content/Self-Correction-Human-Parsing/datasets/VITONHD/image'
seg_dest = '/content/Self-Correction-Human-Parsing/datasets/VITONHD/image-parse-v3'

os.makedirs(images_dest, exist_ok=True)
os.makedirs(seg_dest, exist_ok=True)

# Open zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    for file in zip_ref.namelist():
        # Getting images
        if file.startswith('test/image/') and (file.endswith('.jpg') or file.endswith('.png')):
            dest_file = os.path.join(images_dest, os.path.basename(file))
            with zip_ref.open(file) as source, open(dest_file, 'wb') as target:
                shutil.copyfileobj(source, target)

        # getting original segmentation image files
        elif file.startswith('test/image-parse-v3/') and (file.endswith('.jpg') or file.endswith('.png')):
            dest_file = os.path.join(seg_dest, os.path.basename(file))
            with zip_ref.open(file) as source, open(dest_file, 'wb') as target:
                shutil.copyfileobj(source, target)

print("Images ok")
print("Seg imgs ok")

Images ok
Seg imgs ok


In [7]:
def copy_subset_images(image_dir, parse_dir, indices): # function to create a custom subset of images and parsing maps to process

    # first we create the custom folders _test
    image_test_dir = image_dir + "_test"
    parse_test_dir = parse_dir + "_test"
    os.makedirs(image_test_dir, exist_ok=True)
    os.makedirs(parse_test_dir, exist_ok=True)

    image_files = sorted(os.listdir(image_dir))
    parse_files = sorted(os.listdir(parse_dir))

    for idx in indices:
        if idx < len(image_files) and idx < len(parse_files):
            shutil.copy(os.path.join(image_dir, image_files[idx]), image_test_dir)
            shutil.copy(os.path.join(parse_dir, parse_files[idx]), parse_test_dir)
            print(f"Copied: {image_files[idx]} e {parse_files[idx]}")
        else:
            print(f"Index {idx} out of range")

In [8]:
# Define custom indeces list (e.g. first 100 couples)
indices = [i for i in range(100)]

image_path = "datasets/VITONHD/image"
parse_path = "datasets/VITONHD/image-parse-v3"
copy_subset_images(image_path, parse_path, indices)

Copied: 00006_00.jpg e 00006_00.png
Copied: 00008_00.jpg e 00008_00.png
Copied: 00013_00.jpg e 00013_00.png
Copied: 00017_00.jpg e 00017_00.png
Copied: 00034_00.jpg e 00034_00.png
Copied: 00035_00.jpg e 00035_00.png
Copied: 00055_00.jpg e 00055_00.png
Copied: 00057_00.jpg e 00057_00.png
Copied: 00064_00.jpg e 00064_00.png
Copied: 00067_00.jpg e 00067_00.png
Copied: 00069_00.jpg e 00069_00.png
Copied: 00071_00.jpg e 00071_00.png
Copied: 00074_00.jpg e 00074_00.png
Copied: 00075_00.jpg e 00075_00.png
Copied: 00084_00.jpg e 00084_00.png
Copied: 00094_00.jpg e 00094_00.png
Copied: 00095_00.jpg e 00095_00.png
Copied: 00096_00.jpg e 00096_00.png
Copied: 00110_00.jpg e 00110_00.png
Copied: 00112_00.jpg e 00112_00.png
Copied: 00121_00.jpg e 00121_00.png
Copied: 00126_00.jpg e 00126_00.png
Copied: 00127_00.jpg e 00127_00.png
Copied: 00135_00.jpg e 00135_00.png
Copied: 00145_00.jpg e 00145_00.png
Copied: 00151_00.jpg e 00151_00.png
Copied: 00158_00.jpg e 00158_00.png
Copied: 00176_00.jpg e 00176

## Run Inference
Here, we use the trained SCHP model to segment clothing and body regions from input images.

**Image Preprocessing**: Convert images to the required input format. <br>
**Forward Pass**: Feed images through the SCHP network to obtain parsing maps. <br>
**Post-processing**: Convert model output into a usable segmentation mask. <br>

The output consists of per-pixel class labels representing different clothing categories.

In [9]:
# define input dir and output dir
input_dir = "datasets/VITONHD/image_test"
output_dir = "results/image-parse-v3"
os.makedirs(output_dir, exist_ok=True)

In [10]:
!python3 simple_extractor.py --dataset 'atr' --model-restore 'checkpoints/final.pth' --input-dir "datasets/VITONHD/image_test" --output-dir "results/image-parse-v3"

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Evaluating total class number 18 with ['Background', 'Hat', 'Hair', 'Sunglasses', 'Upper-clothes', 'Skirt', 'Pants', 'Dress', 'Belt', 'Left-shoe', 'Right-shoe', 'Face', 'Left-leg', 'Right-leg', 'Left-arm', 'Right-arm', 'Bag', 'Scarf']
  state_dict = torch.load(args.model_restore)['state_dict']
100% 100/100 [00:39<00:00,  2.53it/s]


# Matching the parsing map
Since SCHP uses a different label format than VITON-HD, we remap the parsed segmentation masks to the required format.

In [11]:
# ATR → CIHP custom mapping
ATR_TO_CIHP = {
    0: 0,    # Background
    1: 1,    # Hat
    2: 2,    # Hair
    3: 4,   # Sunglasses
    4: 5,    # Upper-clothes
    5: 12,    # Skirt
    6: 9,    # Pants
    7: 6,   # Dress
    8: -1,   # Belt (Noise)
    9: 18,   # Left shoe
    10: 19,  # Right shoe
    11: 13,  # Face
    12: 16,   # Left leg
    13: 17,  # Right leg
    14: 14,   # Left arm
    15: 15,   # Right arm
    16: -1,  # Bag (Noise)
    17: 11   # Scarf
}

In [12]:
def map_atr_to_cihp(input_img, output_img): # function to apply the custom mapping

    # Load mapping
    atr_map = np.array(Image.open(input_img))

    # Default mapping
    cihp_map = np.zeros_like(atr_map)

    # ATR → CIHP
    for atr_class, cihp_class in ATR_TO_CIHP.items():
        if cihp_class == -1:
            continue  # Ignore noise classes
        cihp_map[atr_map == atr_class] = cihp_class

    cihp_image = Image.fromarray(cihp_map.astype(np.uint8))
    cihp_image.save(output_img)

In [13]:
def process_segmentation_images(input_path, output_path): # function to apply map_atr_to_cihp to all processed images
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    images = [f for f in os.listdir(input_path) if f.endswith(('.png', '.jpg'))]

    for img_name in images:
        input_img = os.path.join(input_path, img_name)
        output_img = os.path.join(output_path, img_name)

        map_atr_to_cihp(input_img, output_img)

    print(f"Processing completed. Mapped images are saved in {output_path}")


In [14]:
input_path = "results/image-parse-v3"
output_path = "results/image-parse-v3-mapped"
os.makedirs(output_path, exist_ok=True)

In [15]:
process_segmentation_images(input_path, output_path)

Processing completed. Mapped images are saved in results/image-parse-v3-mapped


# Metrics
- **(relative) Intersection over Union (IoU)**: Measures the overlap between the predicted and "ground truth" (relative to VITONHD outputs) segmentation masks. In particular we computed the per-parts values and the overall value. Higher is better.
- **(relative) Dice Coefficient** (F1 Score for segmentation): This metric is similar to IoU but gives more weight to correctly predicted pixels. A Dice score closer to 1 means higher segmentation quality.

In [41]:
CIHP_CLASS_NAMES = {
    0: "Background",
    1: "Hat",
    2: "Hair",
    4: "Sunglasses",
    5: "Upper-clothes",
    6: "Dress",
    9: "Pants",
    11: "Scarf",
    12: "Skirt",
    13: "Face",
    14: "Left arm",
    15: "Right arm",
    16: "Left leg",
    17: "Right leg",
    18: "Left shoe",
    19: "Right shoe"
}

In [49]:
# def load_mask(image_path): #binary mask convertion
#     mask = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
#     return mask / 255

def iou_score(y_true, y_pred): #get IoU between original mask and generated
    intersection = np.logical_and(y_true, y_pred).sum()
    union = np.logical_or(y_true, y_pred).sum()
    return intersection / union if union > 0 else 1.0

def dice_coefficient(y_true, y_pred): #Dice coefficient
    intersection = np.logical_and(y_true, y_pred).sum()
    return (2. * intersection) / (y_true.sum() + y_pred.sum()) if (y_true.sum() + y_pred.sum()) > 0 else 1.0

def mean_iou(y_true, y_pred, num_classes): # mean IoU for multiple classes masks
    ious = []
    for cls in range(num_classes):
        iou = iou_score(y_true == cls, y_pred == cls)
        ious.append(iou)
    return np.mean(ious)

In [50]:
def compute_metrics(mapped_dir, original_dir):
    mapped_files = set(os.listdir(mapped_dir))
    original_files = set(os.listdir(original_dir))
    common_files = mapped_files.intersection(original_files)

    iou_scores = []
    dice_scores = []
    mean_ious = []

    # Dictionary for each body part
    per_class_ious = {cls: [] for cls in set(ATR_TO_CIHP.values()) if cls != -1}

    for file in common_files:
        img_true = Image.open(os.path.join(original_dir, file))
        img_pred = Image.open(os.path.join(mapped_dir, file))
        y_true = np.array(img_true)
        y_pred = np.array(img_pred)

        iou_scores.append(iou_score(y_true, y_pred))
        dice_scores.append(dice_coefficient(y_true, y_pred))
        mean_ious.append(mean_iou(y_true, y_pred, 18))

        # Get IoU for class
        for atr_cls, cihp_cls in ATR_TO_CIHP.items():
            if cihp_cls == -1:
                continue  # Skip noise

            true_mask = (y_true == cihp_cls)
            pred_mask = (y_pred == cihp_cls)

            # assign nan for non-matching classes
            if np.sum(true_mask) == 0 and np.sum(pred_mask) == 0:
                iou = np.nan
            else:
                iou = iou_score(true_mask, pred_mask)

            per_class_ious[cihp_cls].append(iou)

    # Get mIoU for class
    per_class_mean_iou = {
        cls: np.nanmean(iou_list) if len(iou_list) > 0 else 0
        for cls, iou_list in per_class_ious.items()
    }

    # Print overall results
    print("Mean IoU:", np.nanmean(iou_scores))
    print("Mean Dice Coefficient:", np.nanmean(dice_scores))

    # Print mIoU for each class
    print("\nMean IoU for each body part:")
    for cls, m_iou in per_class_mean_iou.items():
        body_part = CIHP_CLASS_NAMES.get(cls, f"Classe {cls}")
        print(f"{body_part}: {m_iou:.4f}")


In [51]:
mapped_path = "results/image-parse-v3-mapped"
original_path = "datasets/VITONHD/image-parse-v3_test"
compute_metrics(mapped_path, original_path)

Mean IoU: 0.9743288739239847
Mean Dice Coefficient: 0.12905259760735505

Mean IoU for each body part:
Background: 0.9808
Hat: 0.0000
Hair: 0.8288
Sunglasses: nan
Upper-clothes: 0.9011
Dress: 0.1269
Pants: 0.8130
Scarf: 0.0000
Skirt: 0.4309
Face: 0.6564
Left arm: 0.8218
Right arm: 0.7829
Left leg: 0.5529
Right leg: 0.4640
Left shoe: nan
Right shoe: nan


  cls: np.nanmean(iou_list) if len(iou_list) > 0 else 0
