In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms
import pandas as pd
import cv2
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import tqdm
import numpy as np

from warnings import filterwarnings
filterwarnings("ignore")

In [3]:
# import torch
# if torch.backends.mps.is_available():
#     mps_device = torch.device("mps")
#     x = torch.ones(1, device=mps_device)
#     print (x)
# else:
#     print ("MPS device not found.")

In [11]:
# Global image directory path
IMG_DIR = "/Users/susanketsarkar/Desktop/Code/Meesho/data/train_images"

# Custom transform for cropping margins
class CenterCropWithMargin:
    def __init__(self, margin_percent=0.1):
        self.margin_percent = margin_percent
    
    def __call__(self, img):
        width, height = img.size
        crop_margin_w = int(width * self.margin_percent)
        crop_margin_h = int(height * self.margin_percent)
        
        cropped_img = img.crop((
            crop_margin_w, crop_margin_h,
            width - crop_margin_w, height - crop_margin_h
        ))
        return cropped_img

# Custom dataset
class CustomImageDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_path = self.dataframe.iloc[idx]['image_path']
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        if self.transform:
            image = self.transform(image)

        label = self.dataframe.iloc[idx]['label']
        return image, label

def load_data(csv_path, attr_to_predict):
    df = pd.read_csv(csv_path)
    df = df[['id', 'Category', attr_to_predict]]
    l1 = len(df)
    df.dropna(subset=[attr_to_predict], inplace=True)
    print("Number of nan objects: ", l1 - len(df))
    df['image_path'] = df['id'].apply(lambda x: os.path.join(IMG_DIR, f"{str(x).zfill(6)}.jpg"))
    return df

def preprocess_data(df, attr_to_predict):
    le = LabelEncoder()
    df['label'] = le.fit_transform(df[attr_to_predict])
    return df, le

class CommonBlock(nn.Module):
    def __init__(self):
        super(CommonBlock, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        x = self.relu1(self.conv1(x))
        x = self.pool(x)
        x = self.relu2(self.conv2(x))
        x = self.pool(x)
        return x

class SpecificBlock(nn.Module):
    def __init__(self, num_classes):
        super(SpecificBlock, self).__init__()
        self.conv1 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
        self.relu2 = nn.ReLU()
        self.conv3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.relu3 = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(256 * 16 * 16, 128)
        self.fc_relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.relu1(self.conv1(x))
        x = self.pool(x)
        x = self.relu2(self.conv2(x))
        x = self.pool(x)
        x = self.relu3(self.conv3(x))
        x = self.pool(x)
        x = self.flatten(x)
        x = self.fc_relu1(self.fc1(x))
        x = self.fc2(x)
        return x

class MultiAttributeCNN(nn.Module):
    def __init__(self, num_classes_image, num_classes_length):
        super(MultiAttributeCNN, self).__init__()
        
        # Common Convolutional Block
        self.common_block = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(32),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        
        # Specific Block for Image Attributes
        self.image_specific_block = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64 * 16 * 16*4, 128),  # Ensure correct flatten size based on the output dimensions
            nn.ReLU(),
            nn.Linear(128, num_classes_image)
        )
        
        # Specific Block for Length Attributes
        self.length_specific_block = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64 * 16 * 16 * 4, 128),  # Ensure correct flatten size based on the output dimensions
            nn.ReLU(),
            nn.Linear(128, num_classes_length)
        )

    def forward(self, x):
        x = self.common_block(x)
        image_output = self.image_specific_block(x)
        length_output = self.length_specific_block(x)
        return image_output, length_output

def train_cnn_on_multiple_attributes(csv_path, image_attrs, length_attrs, epochs=10, batch_size=4):
    # Load and prepare data
    df_image = load_data(csv_path, image_attrs[0])  # Load data for image attributes
    df_length = load_data(csv_path, length_attrs[0])  # Load data for length attributes
    
    df_image, le_image = preprocess_data(df_image, image_attrs[0])
    df_length, le_length = preprocess_data(df_length, length_attrs[0])

    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
    ])
    
    dataset_image = CustomImageDataset(dataframe=df_image, transform=transform)
    dataset_length = CustomImageDataset(dataframe=df_length, transform=transform)

    # Combine datasets (for simplicity; adjust based on actual requirements)
    dataset = dataset_image  # Assuming both datasets are the same for this example

    # Split the dataset
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Build CNN model
    print("Building the model...")
    model = MultiAttributeCNN(num_classes_image=len(le_image.classes_), num_classes_length=len(le_length.classes_))
    
    # Move model to GPU if available
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    model.to(device)

    # Loss and optimizer
    criterion_image = nn.CrossEntropyLoss()
    criterion_length = nn.CrossEntropyLoss()  # Assuming same loss for both outputs
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train the model
    print(f"Training the model on {train_size} data points...")
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch} running...")
        for images, labels in tqdm.tqdm(train_loader):
            images = images.to(device)

            # Forward pass
            image_outputs, length_outputs = model(images)

            # Calculate loss for both outputs
            loss_image = criterion_image(image_outputs, labels.to(device))
            loss_length = criterion_length(length_outputs, labels.to(device))  # Adjust this based on labels
            loss = loss_image + loss_length

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluate the model
    print("Evaluating the model...")
    model.eval()
    all_image_preds = []
    all_length_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            image_outputs, length_outputs = model(images)
            _, image_preds = torch.max(image_outputs, 1)
            _, length_preds = torch.max(length_outputs, 1)
            all_image_preds.append(image_preds.cpu().numpy())
            all_length_preds.append(length_preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_image_preds = np.concatenate(all_image_preds)
    all_length_preds = np.concatenate(all_length_preds)
    all_labels = np.concatenate(all_labels)

    # Print metrics for both attributes
    print("Classification Report for Image Attributes:")
    print(classification_report(all_labels, all_image_preds, target_names=le_image.classes_))
    print(f"Accuracy (Image Attributes): {accuracy_score(all_labels, all_image_preds) * 100:.2f}%")
    
    print("Classification Report for Length Attributes:")
    print(classification_report(all_labels, all_length_preds, target_names=le_length.classes_))
    print(f"Accuracy (Length Attributes): {accuracy_score(all_labels, all_length_preds) * 100:.2f}%")

In [12]:
csv_path = "../data/cat_wise_csv/Women_Tops_&_Tunics_data.csv"  
attr_to_predict = 'color'  
train_cnn_on_multiple_attributes(csv_path, ["color"], ["length"], epochs=10, batch_size=32)

Number of nan objects:  4946
Number of nan objects:  5938
Building the model...
Training the model on 11246 data points...
Epoch 0 running...


100%|██████████| 352/352 [01:22<00:00,  4.27it/s]


Epoch 1 running...


 43%|████▎     | 150/352 [00:37<01:17,  2.59it/s]