In [1]:
# Import necessary libraries
import os
import json
import torch
import numpy as np
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn as nn
from torchvision import models
import torch.optim as optim
import segmentation_models_pytorch as smp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:

# Paths
base_path = "../DeepFashionData"
img_dir = os.path.join(base_path, "images")
text_desc_path = os.path.join(base_path, "captions.json")
shape_label_path = os.path.join(base_path, "labels", "shape", "shape_anno_all.txt")
segm_dir = os.path.join(base_path, "segm")

# Check paths
def check_paths():
    paths = {
        "Images Directory": img_dir,
        "Captions File": text_desc_path,
        "Shape Labels": shape_label_path,
        "Segmentation Directory": segm_dir
    }
    
    for name, path in paths.items():
        if os.path.exists(path):
            print(f"✓ {name} exists: {path}")
        else:
            print(f"✗ {name} does not exist: {path}")

check_paths()

✓ Images Directory exists: ../DeepFashionData\images
✓ Captions File exists: ../DeepFashionData\captions.json
✓ Shape Labels exists: ../DeepFashionData\labels\shape\shape_anno_all.txt
✓ Segmentation Directory exists: ../DeepFashionData\segm


In [3]:
#Function to get all image paths and ensure they are valid
valid_extensions = ['.jpg', '.jpeg', '.png', '.bmp']
image_paths = [
    os.path.join(img_dir, fname) 
    for fname in os.listdir(img_dir) 
    if any(fname.endswith(ext) for ext in valid_extensions)
]

print(f"Found {len(image_paths)} valid image files.")

Found 44096 valid image files.


In [4]:
#Parse shape_anno_all.txt to get labels
labels = []
with open(shape_label_path, 'r') as f:
    for line in f:
        parts = line.strip().split()
        image_name = parts[0]  # Image filename
        label = list(map(int, parts[1:]))  # Convert the rest to integers (or any appropriate label format)
        
        # Ensure labels match the image paths (image name in path should match)
        image_path = os.path.join(img_dir, image_name)
        if image_path in image_paths:
            labels.append(label)

print(f"Loaded {len(labels)} labels.")

Loaded 42544 labels.


In [5]:
# Function to load shape annotations
def load_shape_labels(label_path):
    with open(label_path, 'r') as f:
        labels = f.readlines()
    # Process labels (assuming each line corresponds to an image and contains 12 attributes)
    labels = [list(map(int, label.strip().split())) for label in labels]
    return labels
class ClothingSegmDataset(Dataset):
    def __init__(self, image_paths, labels, device, segmentation_model):
        self.image_paths = image_paths
        self.labels = labels
        self.device = device
        self.segmentation_model = segmentation_model  # Assuming segmentation model is passed

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert('RGB')

        # Ensure labels are a tensor and move them to the device
        label = torch.tensor(self.labels[idx], dtype=torch.float32)

        # Generate segmentation mask using the pre-trained model
        image_tensor = preprocess_image_and_mask(image)
        image_tensor = image_tensor.unsqueeze(0).to(self.device)

        # Generate segmentation mask from the model
        with torch.no_grad():
            seg_out, _ = self.segmentation_model(image_tensor)  # Assuming model returns the mask as output

        # Assuming the mask is already in the correct format, you can return it
        return image_tensor.squeeze(0), seg_out.squeeze(0), label  # Return image, mask, and label


In [6]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [7]:
# Pre-trained Segmentation Model (best_model.pth)
segmentation_model = smp.Unet(
    encoder_name="resnet34",    # Or try 'mobilenet_v2'
    encoder_weights="imagenet", # Pretrained on ImageNet
    in_channels=3,              # RGB images
    classes=1,                  # Binary segmentation (1 class + background)
).to(device)

In [8]:
# Load the pre-trained model weights
segmentation_model.load_state_dict(torch.load("best_model.pth"))
segmentation_model.eval()  # Set model to evaluation mode

Unet(
  (encoder): ResNetEncoder(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track

In [9]:
# Preprocessing function
def preprocess_image(image, image_size=(256, 256)):
    # Define transformations
    image_transform = transforms.Compose([
        transforms.Resize(image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # Normalize to [-1, 1]
    ])
    
    image_tensor = image_transform(image)
    return image_tensor

In [10]:
# Create DataLoader
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])

In [11]:
dataset = ClothingSegmDataset(image_paths=image_paths, labels=labels, device=device, segmentation_model=segmentation_model)
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

In [12]:
# Define Classification Model for Shape Labels
class ShapeClassificationModel(nn.Module):
    def __init__(self, num_classes):
        super(ShapeClassificationModel, self).__init__()
        self.segmentation = smp.Unet(
            encoder_name="resnet34",
            encoder_weights="imagenet",
            in_channels=3,
            classes=1  # Segmentation output (binary mask)
        )
        self.classification_base = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        self.classification_base.fc = nn.Identity()  # Removing the final classification layer

        # Add custom head for shape classification
        self.shape_head = nn.Linear(512, num_classes)

    def forward(self, x):
        # Get segmentation output
        seg_out = self.segmentation(x)  # Segmentation model might return only the mask (1 output)
        
        # Extract features using ResNet18
        features = self.classification_base(x)
        
        # Shape classification output
        shape_out = self.shape_head(features)
        
        return seg_out, shape_out  # Ensure both segmentation mask and shape classification are returned

In [13]:
# Initialize model, loss function, and optimizer
model = ShapeClassificationModel(num_classes=12).to(device)
criterion_cls = nn.BCEWithLogitsLoss()  # Binary Cross Entropy for multi-label classification
criterion_seg = nn.BCEWithLogitsLoss()  # Loss for segmentation
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [14]:
import torchvision.transforms as transforms

def preprocess_image_and_mask(image):
    """
    Preprocess the image: resize, convert to tensor, and normalize.
    """
    # Define the transformations
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize to a fixed size (e.g., 224x224)
        transforms.ToTensor(),  # Convert image to tensor
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize (for pre-trained models like ResNet)
    ])

    # Apply transformations to the image
    image_tensor = preprocess(image)
    
    return image_tensor


In [15]:
# Training Loop
for epoch in range(5):  # You can increase the number of epochs
    model.train()
    total_loss = 0

    for images, generated_masks, labels in train_loader:
        images = images.to(device)

        # Ensure labels are a tensor and move them to the device
        labels = torch.tensor(labels, dtype=torch.float32).to(device)  # Convert labels to float32 for BCEWithLogitsLoss

        optimizer.zero_grad()

        # Use the pre-trained segmentation model to generate the mask (no need to load from dataset)
        with torch.no_grad():
            output = self.segmentation_model(image_tensor)  # Returns a dictionary
            seg_out = output['out']

        # Classification output from the current model
        _, shape_out = model(images)

        # Compute the segmentation loss
        loss_seg = criterion_seg(seg_out, generated_masks)

        # Compute the classification loss
        loss_cls = criterion_cls(shape_out, labels)

        # Total loss (segmentation + classification)
        loss = loss_seg + loss_cls
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


ValueError: not enough values to unpack (expected 2, got 1)