# Deep Learning Clothes Classification

This notebook requires Pytorch, torchtext, and timm libraries. Training the models requires at least 6 GB of GPU memory. I trained the models on my laptop, but the code should also run on Google Colab with a GPU setting.

GPU on my laptop: NVIDIA GeForce RTX 3060

The code expects the following directory structure:
- code.ipynb
- models
- dataset
    - noisy-images
        - 3257.jpg
        - ...
    - train.csv
    - test.csv

## Import

In [2]:
import pandas as pd
import numpy as np
import re
import math
from PIL import Image

# pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
from torchvision import transforms
from torchtext.vocab import build_vocab_from_iterator
from torchtext.vocab import Vocab

# pytorch image models
import timm

# ensemble
from scipy.stats import mode

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Preprocess

Preprocess the noisy text description in the train and test dataset. Remove prunctuations and rare words. Appends the categorical data to the front of the noisy text description. Write the cleaned description to new dataframes.

In [4]:
# train data
train_data = pd.read_csv('./dataset/train.csv')

# words in categorical data
common_words = {c.lower() for c in train_data['baseColour']}
common_words.update({use.lower() for use in train_data['usage']})
common_words.update({s.lower() for s in train_data['season']})
common_words = list(common_words)
common_words.extend(['women', 'woman', 'womens', 'men', 'man', 'mens'
                     'unisex', 'girls', 'girl', 'boys', 'boy'])

# rare words in description
occurrenecs = dict()
for i in range(len(train_data)):
    description = train_data.iloc[i]['noisyTextDescription'].lower()
    tokens = description.lower().split()
    for token in tokens:
        if token not in occurrenecs:
            occurrenecs[token] = 1
        else:
            occurrenecs[token] += 1
rare_words = [key for key, value in occurrenecs.items() if value <= 2]

In [5]:
def clean_tokens(text: str, common_words: list, rare_words: list) -> list:
    """
    Removes punctuations, rare words and common words from a single text description.
    Splits the text description into a list of tokens.
    
    :param text: A single noisy text description
    :param common_words: A list of words that occur in the categorical data,
        will remove them from the noisy text description
    :param rare_words: A list of rare words in the noisy text description,
        will remove them from the noisy text description
    :return: A list of tokens corresponding to a single text description
    """
    text = text.lower()
    # remove rare words and any words that contain r"[0-9\/\=\*\"\n]"
    new_tokens = []
    for token in text.split():
        if (token not in rare_words and
            not re.search(r"[0-9\/\=\*\"\n]", token)):
            new_tokens.append(token)
    text = " ".join(new_tokens)
    # remove "'s"
    text = re.sub(r"\'s", " ", text)
    # remove "\&"
    text = re.sub(r"\&+", " ", text)
    # remove "\-"
    text = re.sub(r"\-+", " ", text)
    # remove "\+"
    text = re.sub(r"\++", " ", text)
    # remove words that appear in categorical data
    tokens = []
    for token in text.split():
        if token not in common_words:
            tokens.append(token)
    return tokens

In [6]:
def clean_data(dataframe: pd.DataFrame, common_words: list, rare_words: list) -> tuple:
    """
    Cleans the noisy text description column in a dataframe and
    puts the cleaned desciption in a new dataframe.
    Removes punctuations, rare words and common words from the noisy text description.
    Appends categorical data to the front of text description.

    :param dataframe: A single dataframe that contains the columns
        'gender', 'season', 'usage', 'baseColour', and 'noisyTextDescription'
    :param common_words: A list of words that occur in the categorical data,
        will remove them from the noisy text description
    :param rare_words: A list of rare words in the noisy text description,
        will remove them from the noisy text description
    :return: Tuple of the following:
    * A new dataframe with a column 'description' added to the original dataframe
    * Maximum number of tokens in a single entry under the column 'description'
    """
    description = []
    max_seq = 0
    for i in range(len(dataframe)):
        gender = dataframe.iloc[i]['gender'].lower()
        season = dataframe.iloc[i]['season'].lower()
        usage = dataframe.iloc[i]['usage'].lower()
        colour = dataframe.iloc[i]['baseColour'].lower()
        text = dataframe.iloc[i]['noisyTextDescription'].lower()
        new_tokens = clean_tokens(text, common_words, rare_words)
        # append words in categorical data
        tokens = [gender, season, usage, colour]
        tokens.extend(new_tokens)
        max_seq = max(max_seq, len(tokens))
        text = " ".join(tokens)
        description.append(text)
    dataframe['description'] = description
    return dataframe, max_seq

In [7]:
# clean train data
train_data, max_seq= clean_data(train_data, common_words, rare_words)
print('Train data max sequence:', max_seq)

# clean test data
test_data = pd.read_csv('./dataset/test.csv')
test_data, max_seq = clean_data(test_data, common_words, rare_words)
print('Test data max sequence:', max_seq)

# save
train_ratio = 0.8
train_len = int(len(train_data) * train_ratio)
train_data.iloc[:train_len].to_csv('./dataset/train_cleaned.csv', index=False)
train_data.iloc[train_len:].to_csv('./dataset/val_cleaned.csv', index=False)
test_data.to_csv('./dataset/test_cleaned.csv', index=False)

Train data max sequence: 5
Test data max sequence: 9


## Build vocabulary

In [8]:
def yield_description_tokens(dataframe: pd.DataFrame):
    """
    Generates lists of tokens for each description entry in a dataframe.

    :param dataframe: Training data that contains column 'description'
    """
    for i in range(len(dataframe)):
        description = dataframe.iloc[i]['description']
        yield description.split()

def yield_category_tokens(dataframe: pd.DataFrame):
    """
    Generates category for each target entry in a dataframe.

    :param dataframe: Training data that contains column 'target'
    """
    for i in range(len(dataframe)):
        target = dataframe.iloc[i]['category']
        yield [target]

From training data, build vocabulary, i.e. mapping from tokens to indices.

In [9]:
### run this locally, for older version of torchtext
# builds vocabulary for text description from training data
description_vocab = build_vocab_from_iterator(yield_description_tokens(train_data))
# builds vocubulary
category_vocab = build_vocab_from_iterator(yield_category_tokens(train_data))
### run this on Colab, for new version of torchtext
# # builds vocabulary for text description from training data
# description_vocab = build_vocab_from_iterator(yield_description_tokens(train_data), specials=['<unk>', '<pad>'])
# description_vocab.set_default_index(0)
# # builds vocubulary
# category_vocab = build_vocab_from_iterator(yield_category_tokens(train_data), specials=[])

In [10]:
print('The first few tokens in the text description vocabulary:')
print(description_vocab.get_itos()[:10])
print('The number of tokens in the text description vocabulary:', len(description_vocab.get_itos()))
print('\nAll tokens in the target category vocabulary:')
print(category_vocab.get_itos())
print('The number of tokens in the target category vocabulary:', len(category_vocab.get_itos()))

The first few tokens in the text description vocabulary:
['casual', 'summer', 'men', 'blue', 'women', 'watch', 'black', 'fall', 'formal', 'green']
The number of tokens in the text description vocabulary: 24

All tokens in the target category vocabulary:
['Topwear', 'Bottomwear', 'Sandal', 'Shoes', 'Fragrance', 'Innerwear', 'Loungewear and Nightwear', 'Ties', 'Watches']
The number of tokens in the target category vocabulary: 9


## Dataset Class

In [11]:
# custom dataset, loads images, text descriptions, and categories
# apply resize, random horizontal flip and random noise to images

class ImageTextDataset(Dataset):
    def __init__(self, dataframe_path: str, image_root: str,
                 text_vocab: Vocab, category_vocab: Vocab,
                 max_seq: int, noise_prob:float, transform, train=True):
        self.dataframe = pd.read_csv(dataframe_path)
        self.text_vocab = text_vocab
        self.texts = torch.ones(len(self.dataframe), max_seq, dtype=torch.long)
        self.image_root = image_root
        self.transform_image = transform
        self.category_vocab = category_vocab
        self.max_seq = max_seq
        self.noise = None
        self.noise_prob = noise_prob
        self.train = train
        for i in range(len(self.dataframe)):
            # text description
            description = self.dataframe.iloc[i]['description']
            tokens = description.split()
            for j, token in enumerate(tokens):
                self.texts[i][j] = text_vocab[token]
    
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        # image
        image_idx = int(self.dataframe.iloc[idx]['id'])
        image_path = f'{self.image_root}/{image_idx}.jpg'
        image = Image.open(image_path)
        image = self.transform_image(image)
        self.noise = image
        prob = np.random.uniform(0, 1, 1)
        if prob < self.noise_prob and self.noise is not None:
            image_transformed = 0.95 * image + 0.05 * self.noise
        else:
            image_transformed = image
        # text description
        text = self.texts[idx]
        if not self.train:
            return image_transformed, text
        # category
        category = self.dataframe.iloc[idx]['category']
        category = self.category_vocab[category]

        return image_transformed, text, category

## Model Architecture

### Transformer

Transformer model for text data, only uses encoding layers but does not use decoding layers. Added a fully connected classification layer.

In [12]:
# transformer positional embedding
class PositionalEncoding(nn.Module):
    """
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    """

    def __init__(self, d_model: int, max_length: int, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_length, d_model)
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)

In [13]:
# Transformer text classifier
class Transformer(nn.Module):
    def __init__(
        self,
        input_size: int,
        d_model: int,
        output_size: int,
        max_length: int,
        nhead: int = 8,
        dim_feedforward: int = 512,
        num_layers: int = 6,
        dropout: float = 0.1
    ):
        super().__init__()
        assert d_model % nhead == 0, "nheads must divide evenly into d_model"

        self.embedding = nn.Embedding(input_size, d_model)
        self.pos_encoder = PositionalEncoding(
            d_model=d_model, max_length=max_length, dropout=dropout
        )
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers
        )
        self.fc = nn.Linear(d_model, output_size)
        self.d_model = d_model
        self.num_features = d_model
        self.num_classes = output_size

    def forward(self, x):
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        x = self.fc(x)
        return x
    
    def forward_features(self, x):
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = x.mean(dim=1)
        return x

### Image Models

A wrapper class for the image models (Resnet, Efficientnet, etc.) in timm.

In [14]:
class ImageModel(nn.Module):
    def __init__(self, backbone, output_size=27):
        super(ImageModel, self).__init__()
        self.backbone = backbone
        self.fc1 = nn.Linear(1000, 256)
        self.fc2 = nn.Linear(256, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        output = self.backbone(x)
        output = self.relu(self.fc1(output))
        return self.fc2(output)

### Image Text Models

Combine image models with text models. Concatenate the embeddings of image and text models. Add a classifier to the embeddings.

In [15]:
# image text model
class ImageTextClassifier(nn.Module):
    def __init__(self, image_model, text_model, hidden_size: int,
                 num_classes: int, dropout: float = 0.1):
        super().__init__()
        self.image_model = image_model
        self.text_model = text_model
        self.num_features = image_model.num_features + text_model.num_features
        self.fc1 = nn.Linear(self.num_features, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size, 128)
        self.fc3 = nn.Linear(128, num_classes)

    def forward(self, image, text):
        x1 = self.image_model.forward_features(image)
        x1 = self.image_model.forward_head(x1, pre_logits=True).flatten(start_dim=1)
        x2 = self.text_model.forward_features(text)
        x = torch.concat((x1, x2), dim=1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

## Training Loop

In [16]:
def train(model, dataloaders, criterion, optimizer, n_epoch, checkpoint_path,
          use_image=True, use_text=True):
    """
    Train loop. Performs both training and validation for each epoch.
    Saves the best model according to validation loss.

    :param model: Pytorch model
    :param dataloaders: A dictionary of two dataloaders that load
        image, text, and target category;
        training dataloader is at key 'Train',
        validation dataloader is at key 'Validation'
    :param criterion: Classification loss function
    :param n_epoch: Number of epochs
    :param checkpoint_path: Output checkpoint path including filename
    :param use_image: Whether to input image to the model in the forward step
    :param use_text: Whether to input text description to the model in the forward step
    :return: Tuple of the following
    * A list of train accuracies
    * A list of validation accuracies
    """
    # best validation accuracy over all epochs
    best_accuracy = 0.0
    # train/validation accuracy for each epoch
    accuracy_dict = {'Train': [], 'Validation': []}
    # Each epoch consists of train and validation
    phases = ['Train', 'Validation']
    for epoch in range(n_epoch):
        print('-'*10)
        print(f'Epoch {epoch + 1}/{n_epoch}:')
        for phase in phases:
            if phase == 'Train':
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            running_correct = 0
            running_total = 0
            for image, text, target in dataloaders[phase]:
                image, text, target = image.to(device), text.to(device), target.to(device)
                optimizer.zero_grad()
                if use_image and use_text: # image text model
                    output = model(image, text)
                elif use_image:            # image model
                    output = model(image)
                else:                      # text model
                    output = model(text)
                loss = criterion(output, target)
                if phase == 'Train':
                    loss.backward()
                    optimizer.step()
                preds = torch.argmax(output, dim=1)
                running_loss += loss.item()
                running_correct += (preds == target).sum().item()
                running_total += target.size(0)
            epoch_loss = running_loss / len(dataloaders[phase])
            epoch_accuracy = running_correct / running_total
            accuracy_dict[phase].append(epoch_accuracy)
            print(f'{phase} loss:', round(epoch_loss, 4),
                  f'\t{phase} accuracy:', round(epoch_accuracy, 4))
            if phase == 'Validation' and epoch_accuracy > best_accuracy:
                print('Validation accuracy increases from', round(best_accuracy, 4),
                      'to', round(epoch_accuracy, 4))
                best_accuracy = epoch_accuracy
                # save best model according to validation loss
                torch.save(
                    {'epoch': epoch,
                     'model_state_dict': model.state_dict(),
                     'optimizer_state_dict': optimizer.state_dict(),
                     'loss': epoch_loss},
                    checkpoint_path
                )
    return accuracy_dict['Train'], accuracy_dict['Validation']

## Inference Loop

In [17]:
def predict(model, dataloader, use_image=True, use_text=True, val=True):
    """
    Inference loop. Returns the predicted category indices from a single model.
    
    :param model: Pytorch model.
    :param dataloader: A validation or test dataloader.
    :param use_image: Whether to input image to the model in the forward step.
    :param use_text: Whether to input text description to the model in the forward step.
    :param val: Whether to compare predictions with targets and print accuracy.
    :return: A numpy array of predicted category indices.
    """
    model.eval()
    running_correct = 0
    running_total = 0
    pred_list = []
    for batch in dataloader:
        image, text = batch[0].to(device), batch[1].to(device)
        if use_image and use_text:
            output = model(image, text)
        elif use_image:
            output = model(image)
        else:
            output = model(text)
        preds = torch.argmax(output, dim=1)
        preds = preds.data.cpu().numpy()
        if val:
            target = batch[2].data.cpu().numpy()
            running_correct += (preds == target).sum()
            running_total += target.shape[0]
        pred_list.append(preds)
    pred_list = np.concatenate(pred_list)
    if val:
        accuracy = running_correct / running_total
        print('Accuracy:', round(accuracy, 4))
    return pred_list

In [18]:
def predict_voting(models, dataloader, use_image=True, use_text=True,
                   val=False, weights=None):
    """
    Inference loop. Perform voting with a number of base models.
    Weights are optional. If weights are given, then the predictions are 
    the categories with the maximum weighted sum of logits from base models.
    Otherwise, the predictions are the mode of the categories predicted by base models.
    
    :param models: A list of Pytorch models.
    :param dataloader: A validation or test dataloader.
    :param use_image: Whether to input image to the model in the forward step.
    :param use_text: Whether to input text description to the model in the forward step.
    :param val: Whether to compare predictions with targets and print accuracy.
    :param weights: Optional list of weights applied to each model when voting.
    :return: A numpy array of predicted category indices.
    """
    for model in models:
        model.eval()
    running_correct = 0
    running_total = 0
    pred_list = []
    with torch.no_grad():
        for batch in dataloader:
            image, text = batch[0].to(device), batch[1].to(device)
            preds = []
            if weights is None:
                for i, model in enumerate(models):
                    if use_image and use_text:
                        output = model(image, text)
                    elif use_image:
                        output = model(image)
                    else:
                        output = model(text)
                    pred = torch.argmax(output, dim=1)
                    preds.append(pred.data.cpu())
                preds = torch.stack(preds)
                preds = mode(preds, axis=0, keepdims=False)[0]
            else:
                for i, model in enumerate(models):
                    if use_image and use_text:
                        output = model(image, text)
                    elif use_image:
                        output = model(image)
                    else:
                        output = model(text)
                    output = output * weights[i]
                    preds.append(output.data.cpu().numpy())
                preds = np.stack(preds)
                preds = np.argmax(np.sum(preds, axis=0), axis=1)
            if val:
                target = batch[2].data.cpu().numpy()
                running_correct += (preds == target).sum()
                running_total += target.shape[0]
            pred_list.append(preds)

        pred_list = np.concatenate(pred_list)
        if val:
            accuracy = running_correct / running_total
            print('Accuracy:', round(accuracy, 4))
    return pred_list

In [19]:
### run this locally, for older version of torchtext
def index_to_category(pred_list, category_vocab):
    """
    Converts category indices to category names.

    :param pred_list: A list or numpy array of predicted category indices.
    :param category_vocab: A torchtext Vocab object containing all tokens.
    :return: A list of category names.
    """
    categories = category_vocab.itos
    pred_list = [categories[int(idx)] for idx in pred_list]
    return pred_list


### run this on Colab, for new version of torchtext
# def index_to_category(pred_list, category_vocab):
#     categories = category_vocab.get_itos()
#     pred_list = [categories[int(idx)] for idx in pred_list]
#     return pred_list

## Configuration

In [20]:
# configure dataset
train_dataframe_path = './dataset/train_cleaned.csv'
val_dataframe_path = './dataset/val_cleaned.csv'
image_root = './dataset/noisy-images'
max_seq = 16
batch_size = 48
train_noise_prob = 0.5
val_noise_prob = 0.0

In [21]:
train_transform = transforms.Compose([transforms.Resize(90),
                                      transforms.RandomResizedCrop(80), 
                                      transforms.RandomHorizontalFlip(), 
                                      transforms.ToTensor(),
                                      transforms.Normalize((0.8193, 0.8041, 0.7969), (0.2224, 0.2341, 0.2369))])
train_dataset = ImageTextDataset(train_dataframe_path, image_root,
                           text_vocab=description_vocab,
                           category_vocab=category_vocab,
                           max_seq=max_seq, noise_prob=train_noise_prob,
                           transform=train_transform, train=True)
val_dataset = ImageTextDataset(val_dataframe_path, image_root,
                           text_vocab=description_vocab,
                           category_vocab=category_vocab,
                           max_seq=max_seq, noise_prob=val_noise_prob,
                           transform=train_transform, train=True)

train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, drop_last=True
)

val_dataloader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, drop_last=False
)

train_dataloaders = {'Train': train_dataloader, 'Validation': val_dataloader}

In [22]:
test_dataframe_path = './dataset/test_cleaned.csv'
image_root = './dataset/noisy-images/noisy-images'
max_seq = 16
batch_size = 48
test_noise_prob = 0.0

In [23]:
test_transform = transforms.Compose([transforms.Resize(90),
                                     transforms.CenterCrop(80),
                                     transforms.ToTensor(),
                                     transforms.Normalize((0.8193, 0.8041, 0.7969), (0.2224, 0.2341, 0.2369))])
test_dataset = ImageTextDataset(test_dataframe_path, image_root,
                                text_vocab=description_vocab,
                                category_vocab=category_vocab,
                                max_seq=max_seq, noise_prob=test_noise_prob,
                                transform=test_transform, train=False)

test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, drop_last=False
)

: 

In [None]:
# sample train data
image, text, target = next(iter(train_dataloaders['Train']))
print(image[0])
print('Image shape:', image.shape)
print(text[0])
print('Text shape:', text.shape)
print('Target:', target[0])

In [None]:
# loss function
criterion = nn.CrossEntropyLoss()

## Training

### Training Resnet

In [None]:
resnet34 = timm.create_model('resnet34', num_classes=27, pretrained=True, drop_rate=0.7)
resnet34 = resnet34.to(device)
optimizer = torch.optim.SGD(resnet34.parameters(), lr=3e-4, momentum=0.9)

# checkpoint = torch.load('./models/resnet34.pth')
# resnet34.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# train resnet34, takes an hour
n_epoch = 60  # can modify
checkpoint_path = './models/resnet34.pth'
resnet34_train_accuracy, resnet34_val_accuracy = train(
    resnet34, train_dataloaders, criterion, optimizer, n_epoch=n_epoch,
    checkpoint_path=checkpoint_path, use_image=True, use_text=False
)

### Training Efficientnet

In [None]:
efficientnet = timm.create_model('efficientnetv2_rw_t', num_classes=27, pretrained=True, drop_rate=0.7)
efficientnet = efficientnet.to(device)
optimizer = torch.optim.SGD(efficientnet.parameters(), lr=3e-4, momentum=0.9)

# checkpoint = torch.load('./models/efficientnetv2.pth')
# efficientnet.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# train efficientnet v2, takes an hour
n_epoch = 60  # can modify
checkpoint_path = './models/efficientnetv2.pth'
efficientnet_train_accuracy, efficientnet_val_accuracy = train(
    efficientnet, train_dataloaders, criterion, optimizer, n_epoch=n_epoch,
    checkpoint_path=checkpoint_path, use_image=True, use_text=False
)

### Training Convnext

In [None]:
convnext = timm.create_model('convnext_small_in22k', num_classes=27, pretrained=True, drop_rate=0.7)
convnext = convnext.to(device)
optimizer = torch.optim.SGD(convnext.parameters(), lr=3e-4, momentum=0.9)

# checkpoint = torch.load('./models/convnext_small.pth')
# convnext.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# train convnext small, takes over an hour
n_epoch = 60  # can modify
checkpoint_path = './models/convnext_small.pth'
convnext_train_accuracy, convnext_val_accuracy = train(
    convnext, train_dataloaders, criterion, optimizer, n_epoch=n_epoch,
    checkpoint_path=checkpoint_path, use_image=True, use_text=False
)

### Training Text Transformer

In [None]:
### run this locally, for older version of torchtext
input_size = len(description_vocab.itos)

### run this on Colab, for new version of torchtext
# input_size = len(description_vocab.get_itos())

In [None]:
d_model = 512
output_size = 27
max_length = 16
nhead = 8
dim_feedforward = 512
num_layers = 6
dropout = 0.4

transformer = Transformer(
    input_size=input_size,
    d_model=d_model,
    output_size=output_size,
    max_length=max_length,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    num_layers=num_layers,
    dropout=dropout)
transformer.to(device)
optimizer = torch.optim.SGD(transformer.parameters(), lr=3e-4, momentum=0.9)

# checkpoint = torch.load('./models/transformer.pth')
# transformer.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# train transformer, takes an hour
n_epoch = 80  # can modify
checkpoint_path = './models/transformer.pth'
transformer_train_accuracy, transformer_val_accuracy = train(
    transformer, train_dataloaders, criterion, optimizer, n_epoch=n_epoch,
    checkpoint_path=checkpoint_path, use_image=False, use_text=True
)

### Training Resnet + Transformer

In [None]:
# create resnet34 and load checkpoint, no grad
resnet34 = timm.create_model('resnet34', num_classes=27, pretrained=True, drop_rate=0.7)
resnet34 = resnet34.to(device)
checkpoint = torch.load('./models/resnet34.pth')
resnet34.load_state_dict(checkpoint['model_state_dict'])
for param in resnet34.parameters():
    param.requires_grad = False

# create transformer and load checkpoint, no grad
# input_size defined before
d_model = 512
output_size = 27
max_length = 16
nhead = 8
dim_feedforward = 512
num_layers = 6
dropout = 0.4

transformer = Transformer(
    input_size=input_size,
    d_model=d_model,
    output_size=output_size,
    max_length=max_length,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    num_layers=num_layers,
    dropout=dropout)
transformer.to(device)
checkpoint = torch.load('./models/transformer.pth')
transformer.load_state_dict(checkpoint['model_state_dict'])
for param in transformer.parameters():
    param.requires_grad = False

# create resnet34 + transformer model and load checkpoint
resnet34_transformer = ImageTextClassifier(resnet34, transformer, hidden_size=256, num_classes=27, dropout=0.3)
resnet34_transformer.to(device)
optimizer = torch.optim.SGD(resnet34_transformer.parameters(), lr=3e-4, momentum=0.9)

# checkpoint = torch.load('./models/resnet34_transformer.pth')
# resnet34_transformer.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# train, takes 30 min
n_epoch = 60  # can modify
checkpoint_path = './models/resnet34_transformer.pth'
resnet34_transformer_train_accuracy, resnet34_transformer_val_accuracy = train(
    resnet34_transformer, train_dataloaders, criterion, optimizer, n_epoch=n_epoch,
    checkpoint_path=checkpoint_path, use_image=True, use_text=True
)

### Training Efficientnet + Transformer

In [None]:
# create efficientnet v2 and load checkpoint, no grad
efficientnet = timm.create_model('efficientnetv2_rw_t', num_classes=27, pretrained=True, drop_rate=0.7)
efficientnet = efficientnet.to(device)
checkpoint = torch.load('./models/efficientnetv2.pth')
efficientnet.load_state_dict(checkpoint['model_state_dict'])
for param in efficientnet.parameters():
    param.requires_grad = False

# create transformer and load checkpoint, no grad
# input_size defined before
d_model = 512
output_size = 27
max_length = 16
nhead = 8
dim_feedforward = 512
num_layers = 6
dropout = 0.4

transformer = Transformer(
    input_size=input_size,
    d_model=d_model,
    output_size=output_size,
    max_length=max_length,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    num_layers=num_layers,
    dropout=dropout)
transformer.to(device)
checkpoint = torch.load('./models/transformer.pth')
transformer.load_state_dict(checkpoint['model_state_dict'])
for param in transformer.parameters():
    param.requires_grad = False

# create efficientnet v2 + transformer and load checkpoint
efficientnet_transformer = ImageTextClassifier(efficientnet, transformer, hidden_size=256, num_classes=27, dropout=0.3)
efficientnet_transformer.to(device)
optimizer = torch.optim.SGD(efficientnet_transformer.parameters(), lr=3e-4, momentum=0.9)

# checkpoint = torch.load('./models/efficientnetv2_transformer.pth')
# efficientnet_transformer.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# train, takes 30 min
n_epoch = 60  # can modify
checkpoint_path = './models/efficientnetv2_transformer.pth'
efficientnet_transformer_train_accuracy, efficientnet_transformer_val_accuracy = train(
    efficientnet_transformer, train_dataloaders, criterion, optimizer, n_epoch=n_epoch,
    checkpoint_path=checkpoint_path, use_image=True, use_text=True
)

### Training Convnext + Transformer

In [None]:
# create convnext small and load checkpoint, no grad
convnext = timm.create_model('convnext_small_in22k', num_classes=27, pretrained=True, drop_rate=0.7)
convnext = convnext.to(device)
checkpoint = torch.load('./models/convnext_small.pth')
convnext.load_state_dict(checkpoint['model_state_dict'])
for param in convnext.parameters():
    param.requires_grad = False

# create transformer and load checkpoint, no grad
# input_size defined before
d_model = 512
output_size = 27
max_length = 16
nhead = 8
dim_feedforward = 512
num_layers = 6
dropout = 0.4

transformer = Transformer(
    input_size=input_size,
    d_model=d_model,
    output_size=output_size,
    max_length=max_length,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    num_layers=num_layers,
    dropout=dropout)
transformer.to(device)
checkpoint = torch.load('./models/transformer.pth')
transformer.load_state_dict(checkpoint['model_state_dict'])
for param in transformer.parameters():
    param.requires_grad = False

# create convnext small + transformer model and load checkpoint
convnext_transformer = ImageTextClassifier(convnext, transformer, hidden_size=256, num_classes=27, dropout=0.3)
convnext_transformer.to(device)
optimizer = torch.optim.SGD(convnext_transformer.parameters(), lr=3e-4, momentum=0.9)

# checkpoint = torch.load('./models/convnext_small_transformer.pth')
# convnext_transformer.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [None]:
# train, takes 30 min
n_epoch = 60  # can modify
checkpoint_path = './models/convnext_small_transformer.pth'
convnext_transformer_train_accuracy, convnext_transformer_val_accuracy = train(
    convnext_transformer, train_dataloaders, criterion, optimizer, n_epoch=n_epoch,
    checkpoint_path=checkpoint_path, use_image=True, use_text=True
)

## Inference

Voting ensemble using the three base models: Resnet34 + Transformer, Efficient V2 + Ensemble, and Convnext Small + Ensemble.

In [None]:
### run this locally, for older version of torchtext
input_size = len(description_vocab.itos)

### run this on Colab, for new version of torchtext
# input_size = len(description_vocab.get_itos())

In [None]:
# create resnet34 and load checkpoint
resnet34 = timm.create_model('resnet34', num_classes=27, pretrained=True, drop_rate=0.7)
resnet34 = resnet34.to(device)

# create transformer and load checkpoint
# input_size defined above
d_model = 512
output_size = 27
max_length = 16
nhead = 8
dim_feedforward = 512
num_layers = 6
dropout = 0.4

transformer = Transformer(
    input_size=input_size,
    d_model=d_model,
    output_size=output_size,
    max_length=max_length,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    num_layers=num_layers,
    dropout=dropout)
transformer.to(device)

# create resnet34 + transformer model and load checkpoint
resnet34_transformer = ImageTextClassifier(resnet34, transformer, hidden_size=256, num_classes=27, dropout=0.3)
resnet34_transformer.to(device)
checkpoint = torch.load('./models/resnet34_transformer.pth')
resnet34_transformer.load_state_dict(checkpoint['model_state_dict'])

In [None]:
# create resnet34 and load checkpoint
efficientnet = timm.create_model('efficientnetv2_rw_t', num_classes=27, pretrained=True, drop_rate=0.7)
efficientnet = efficientnet.to(device)

# create transformer and load checkpoint
# input_size defined above
d_model = 512
output_size = 27
max_length = 16
nhead = 8
dim_feedforward = 512
num_layers = 6
dropout = 0.4

transformer = Transformer(
    input_size=input_size,
    d_model=d_model,
    output_size=output_size,
    max_length=max_length,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    num_layers=num_layers,
    dropout=dropout)
transformer.to(device)

# create efficientnet v2 + transformer model and load checkpoint
efficientnet_transformer = ImageTextClassifier(efficientnet, transformer, hidden_size=256, num_classes=27, dropout=0.3)
efficientnet_transformer.to(device)
checkpoint = torch.load('./models/efficientnetv2_transformer.pth')
efficientnet_transformer.load_state_dict(checkpoint['model_state_dict'])

In [None]:
# create resnet34 and load checkpoint
convnext = timm.create_model('convnext_small_in22k', num_classes=27, pretrained=True, drop_rate=0.7)
convnext = convnext.to(device)

# create transformer and load checkpoint
# input_size defined above
d_model = 512
output_size = 27
max_length = 16
nhead = 8
dim_feedforward = 512
num_layers = 6
dropout = 0.4

transformer = Transformer(
    input_size=input_size,
    d_model=d_model,
    output_size=output_size,
    max_length=max_length,
    nhead=nhead,
    dim_feedforward=dim_feedforward,
    num_layers=num_layers,
    dropout=dropout)
transformer.to(device)

# create convnext small + transformer model and load checkpoint
convnext_transformer = ImageTextClassifier(convnext, transformer, hidden_size=256, num_classes=27, dropout=0.3)
convnext_transformer.to(device)
checkpoint = torch.load('./models/convnext_small_transformer.pth')
convnext_transformer.load_state_dict(checkpoint['model_state_dict'])

In [None]:
# base models validation
print('Validation started...')
print('Resnet34 + Transformer ', end='')
pred_list = predict(resnet34_transformer, val_dataloader, use_image=True, use_text=True, val=True)
print('Efficient V2 + Transformer ', end='')
pred_list = predict(efficientnet_transformer, val_dataloader, use_image=True, use_text=True, val=True)
print('Convnext Small + Transformer ', end='')
pred_list = predict(convnext_transformer, val_dataloader, use_image=True, use_text=True, val=True)
print('Validation completed')

In [None]:
# weighted voting ensemble, use validation accuracies as weights
models = [resnet34_transformer, efficientnet_transformer, convnext_transformer]
weights = [0.9036, 0.9008, 0.9115]

# weighted voting ensemble validation
print('Validation started...')
print('Voting ', end='')
pred_list = predict_voting(models, val_dataloader, use_image=True, use_text=True, val=True, weights=weights)
print('Validation completed')

In [None]:
# weighted voting ensemble testing
print('Inference started...')
pred_list = predict_voting(models, test_dataloader, use_image=True, use_text=True, val=False, weights=weights)
print('Inference completed')

print('Converting category indices to category names...')
pred_list = index_to_category(pred_list, category_vocab)

In [None]:
# prediction file to be submitted
test_data = pd.read_csv('./dataset/test_cleaned.csv')
pred_data = test_data[['id']]
pred_data.insert(1, 'category', pred_list)
pred_data.to_csv('./dataset/predict.csv', index=False)