# COMP5329 CNN-Ttransformer Classifier

## 1. Data Preparation and Package Import

### 1.1 Clone data and images from GitHub

In [1]:
# All the data were saved in my GitHub, it's easy to get them all by running this code.
!git clone https://github.com/ShuXin79/5329ASS2.git

Cloning into '5329ASS2'...
remote: Enumerating objects: 40008, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 40008 (delta 0), reused 3 (delta 0), pack-reused 40005[K
Receiving objects: 100% (40008/40008), 393.00 MiB | 36.21 MiB/s, done.
Resolving deltas: 100% (2/2), done.
Updating files: 100% (40003/40003), done.


### 1.2 Libraries install and import

In [2]:
# Make sure your environment contains these libraries.
!pip install torch pandas pillow
!pip install torchvision
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.2 M

In [3]:
import os
import pandas as pd
import numpy as np
import ast
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
import torchvision
# We used the pre-trained model BERT for our NLP task
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
# sklearn is only used for splitting the train-validation sets as well as calculating the f1-score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

## 2. Data Reading Class

In [4]:
class ImageTextDataset(Dataset):
  # This class is to convert the input data into the format we want to see.
  
  def __init__(self, df, img_dir, transform=None, max_length=128):
    # df: the dataframe of our input data
    # img_dir: saved the path to the images
    # transform: determine the format of input images data
    # max_lenth: determine the max lenth of input captions data
    self.df = df
    self.img_dir = img_dir
    self.transform = transform
    # We use bert-base-uncased for tokenizing the words
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.max_length = max_length

  def __len__(self):
    # The size of the data
    return len(self.df)

  def __getitem__(self, idx):
    # Get the images number
    row = self.df.iloc[idx]
    
    # Process image
    img_name = os.path.join(self.img_dir, row.ImageID)
    # For increasing the running speed and decrease the memory usage, we use only one color channel, not RGB
    image = Image.open(img_name).convert("L")
    # According to the format of 'transform', transform the images data into tensors
    if self.transform:
      image = self.transform(image)
    
    # Process text
    text = row.Caption
    # Tokenize the text
    inputs = self.tokenizer.encode_plus(
      text,
      None,
      add_special_tokens=True,
      max_length=self.max_length,
      padding='max_length',
      return_token_type_ids=True,
      truncation=True
    )
    # Get caption number
    ids = inputs['input_ids']
    # Get the mask
    mask = inputs['attention_mask']

    # Process labels
    # Initialize a list and save the multi-labels
    label_indices = list(map(int, row.Labels.strip('[]').split()))
    # One-hot the labels. It's very useful when dealing with multi-labels.
    # We have labels of 1-19, though we don't have class 0 and 12, we should new a 20-dimentional tensor for processing.
    labels = torch.zeros(20)
    # Convert labels into binary vectors
    labels[label_indices] = 1 

    # Processed data
    return {
      'ids': torch.tensor(ids, dtype=torch.long),
      'mask': torch.tensor(mask, dtype=torch.long),
      'image': image,
      'labels': labels
    }

## 3. Build the CNN-Transformer Fusion Model (Resnet and BERT) Class

In [5]:
class ImageTextModel(nn.Module):
  # This class is for combining the image model and the NLP model

  def __init__(self, bert_model_name='bert-base-uncased', num_classes=2):
    # Call initialization
    super(ImageTextModel, self).__init__()        
    # NLP text model: BERT
    self.text_model = BertModel.from_pretrained(bert_model_name)        
    # Image model: ResNet (pre-trained)
    self.image_model = torchvision.models.resnet50(pretrained=True)
    # change the first layer, since we changed the color channel into 1
    self.image_model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
    # Match the fully connected layer with dimension of BERT
    self.image_model.fc = nn.Linear(2048, 768)         
    # Classification head
    self.classifier = nn.Linear(768, num_classes)

  def forward(self, input_ids, attention_mask, image):
    # Forward propagation, passing data
    # Text (vectors)
    text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
    # CLS token
    text_features = text_outputs.last_hidden_state[:, 0] 

    # Images (vectors)
    image_features = self.image_model(image)
        
    # Concatenate features of text and images
    combined_features = text_features + image_features
    output = self.classifier(combined_features)
    return output

## 4. Define the Training and Testing Function

### 4.1 Training function

In [6]:
def train_loop(dataloader, model, loss_fn, optimizer):
  # dataloader: the data in dataloader form
  # model: our fusion model
  # loss_fn: loss functions for gradient descent
  # optimizer: add other optimization parameters, such as learning rate

  # Get the size of training set
  size = len(dataloader.dataset)
  # Start the training
  model.train()
    
  for batch, data in enumerate(dataloader):
    # Compute prediction vectors
    preds = model(data['ids'].to(device), data['mask'].to(device), data['image'].to(device))
    # Update the loss
    loss = loss_fn(preds, data['labels'].to(device))

    # Back propagation, update the weights and bias
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch % 100 == 0:
      # Print the loss
      loss, current = loss.item(), batch * len(data['image'])
      print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

### 4.2 Testing (predicting) function

In [35]:
def test_loop(dataloader, model, loss_fn, test=True):
  # dataloader: test data in dataloader form
  # model: our trained fusion model
  # loss_fn: loss function
  # test: run it on data with or without labels

  size = len(dataloader.dataset)
  # Initialization
  test_loss, correct = 0, 0
  all_preds = []
  all_labels = []
  # Switch to evaluation mode
  model.eval()

  # Traverse process
  with torch.no_grad():
    for data in dataloader:
      # Make predictions, results are given in vectors form
      preds = model(data['ids'].to(device), data['mask'].to(device), data['image'].to(device))
      for i in range(len(preds)):
        # If we have the labels, calculate the loss and accuracy as reference
        if test:
          test_loss += loss_fn(preds[i], data['labels'][i].to(device)).item()
          # preds[i] > 0 at where the labels are predicted. If all the labels are predicted correctly, it will return the tensor with all True value.
          if ((preds[i] > 0) == data['labels'][i].to(device)).all():
            correct += 1
          all_labels.append(data['labels'][i])
        # Transform the predictions result into one-hot form
        pred = torch.where(preds[i] < 0, torch.tensor(0), torch.tensor(1))
        # Return the data to cpu since they are saved in gpu currently
        all_preds.append(pred.cpu())

    # Calculate the final results
    test_loss /= size
    correct /= size
    # Print the loss and accuracy (0 are shown if we don't have true labels)
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    # Return the predictions and true labels for reference
    return all_preds, all_labels

## 5. Main Process: Training

### 5.1 Transfer model training to GPU

In [8]:
#The amount of tasks is too large, so we have to try to perform calculations on the GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 5.2 Prepare the data

In [9]:
# Load the whole training data, drop the bad lines
data_given = pd.read_csv("/content/5329ASS2/train.csv", error_bad_lines=False)

# Split the training data for evaluating the model in my environment
train_df, test_df = train_test_split(data_given, test_size = 0.2)

# Define the transformations, which transform the images data into vectors features
transform = transforms.Compose([
  # Resize the images
  transforms.Resize((256, 256)), 
  # Convert PIL image to tensors
  transforms.ToTensor(), 
  # Normalize the images
  transforms.Normalize(mean=[0.5], std=[0.5]), 
])

# Train on the whole training set
train_data = ImageTextDataset(data_given, "/content/5329ASS2/data", transform=transform)

# Train on the 80% of the training set, and test on the other 20% training set
#train_data = ImageTextDataset(train_df, "/content/5329ASS2/data", transform=transform)
#test_data = ImageTextDataset(test_df, "/content/5329ASS2/data", transform=transform)

# Create the dataloaders
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)

# Create the model, loss function and optimizer
model = ImageTextModel(num_classes=20).to(device)
# Use BCEWithLogitsLoss for multi-label classification because it's better for multi-labels task
loss_fn = nn.BCEWithLogitsLoss() 
# Use the optim.Adam. Learning rate: 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)




  data_given = pd.read_csv("/content/5329ASS2/train.csv", error_bad_lines=False)
Skipping line 9086: expected 3 fields, saw 4
Skipping line 9510: expected 3 fields, saw 4
Skipping line 18114: expected 3 fields, saw 4
Skipping line 27169: expected 3 fields, saw 4



Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M

### 5.3 Training

In [10]:
# Start training
epochs = 50
for t in range(epochs):
  print(f"Epoch {t+1}\n-------------------------------")
  train_loop(train_dataloader, model, loss_fn, optimizer)
print("Model training completed")

Epoch 1
-------------------------------
loss: 0.695406  [    0/29996]
loss: 0.062018  [ 6400/29996]
loss: 0.094769  [12800/29996]
loss: 0.073205  [19200/29996]
loss: 0.065463  [25600/29996]
Epoch 2
-------------------------------
loss: 0.068415  [    0/29996]
loss: 0.091977  [ 6400/29996]
loss: 0.063320  [12800/29996]
loss: 0.057112  [19200/29996]
loss: 0.094133  [25600/29996]
Epoch 3
-------------------------------
loss: 0.042415  [    0/29996]
loss: 0.044104  [ 6400/29996]
loss: 0.038299  [12800/29996]
loss: 0.073202  [19200/29996]
loss: 0.054632  [25600/29996]
Epoch 4
-------------------------------
loss: 0.039425  [    0/29996]
loss: 0.035325  [ 6400/29996]
loss: 0.034018  [12800/29996]
loss: 0.043947  [19200/29996]
loss: 0.040127  [25600/29996]
Epoch 5
-------------------------------
loss: 0.031572  [    0/29996]
loss: 0.019866  [ 6400/29996]
loss: 0.013573  [12800/29996]
loss: 0.020196  [19200/29996]
loss: 0.027719  [25600/29996]
Epoch 6
-------------------------------
loss: 0.01

## 6. Evaluation

### 6.1 Prepare the test data

In [96]:
# Load the test data, drop the bad lines temporarily

test_file = pd.read_csv('/content/5329ASS2/test.csv', error_bad_lines = False)
#test_file = data_given[18000:24000]
# New a column 'Labels' for avoiding bug
test_file['Labels'] = '0'
test_file



  test_file = pd.read_csv('/content/5329ASS2/test.csv', error_bad_lines = False)
Skipping line 6891: expected 2 fields, saw 3



Unnamed: 0,ImageID,Caption,Labels
0,30000.jpg,A little girl waring a krispy kreme hat holdin...,0
1,30001.jpg,A beautiful young woman holding an orange fris...,0
2,30002.jpg,A group of people sitting on couch next to a c...,0
3,30003.jpg,A person on a snowboard rides on the hill.,0
4,30004.jpg,A man riding a skateboard with a helmet on in ...,0
...,...,...,...
9994,39995.jpg,A group of men riding surfboards riding a mass...,0
9995,39996.jpg,A motorcycle parked next to a car in a parking...,0
9996,39997.jpg,a little boy that is playing with a wii,0
9997,39998.jpg,group of kids play Frisbee golf in the middle ...,0


In [97]:
# Fill bad lines
pd_arr1 = test_file[:6889]
pd_arr2 = test_file[6889:]
pd_arr1.loc[6889] = ['36889.jpg', 'Stop sign with added war" annotation at an intersection."', '0']
test_file = pd_arr1.append(pd_arr2, ignore_index=True)
test_file

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pd_arr1.loc[6889] = ['36889.jpg', 'Stop sign with added war" annotation at an intersection."', '0']
  test_file = pd_arr1.append(pd_arr2, ignore_index=True)


Unnamed: 0,ImageID,Caption,Labels
0,30000.jpg,A little girl waring a krispy kreme hat holdin...,0
1,30001.jpg,A beautiful young woman holding an orange fris...,0
2,30002.jpg,A group of people sitting on couch next to a c...,0
3,30003.jpg,A person on a snowboard rides on the hill.,0
4,30004.jpg,A man riding a skateboard with a helmet on in ...,0
...,...,...,...
9995,39995.jpg,A group of men riding surfboards riding a mass...,0
9996,39996.jpg,A motorcycle parked next to a car in a parking...,0
9997,39997.jpg,a little boy that is playing with a wii,0
9998,39998.jpg,group of kids play Frisbee golf in the middle ...,0


In [98]:
# Process the test data as well as create the dataloaders
test_data = ImageTextDataset(test_file, "/content/5329ASS2/data", transform=transform)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

### 6.2 Make prediction

In [99]:
# Acc and loss will be 0 if we don't have true labels of test data
pred, rightlabel = test_loop(test_dataloader, model, loss_fn, test=False)
print(len(pred))

Test Error: 
 Accuracy: 0.0%, Avg loss: 0.000000 

10000


In [100]:
print(len(rightlabel))

0


In [101]:
# Transform into numpy form
y_pred = [t.numpy() for t in pred]

### 6.3 Calculate the f1 score (if we have true labels)

In [92]:
y_true = [t.numpy() for t in rightlabel]
f1 = f1_score(y_true, y_pred, average='micro')
f1

0.9906047516198704

## 7. Export output

### 7.1 Restore labels format

In [102]:
# Since we save the predict labels as one-hot format, we should transform them to original format
y_pred_indices = [np.where(arr==1)[0] for arr in y_pred]
y_pred_strings = [' '.join(map(str, arr)) for arr in y_pred_indices]
y_pred_strings[:10]

['1', '1', '17', '1', '1', '1', '1', '1', '1', '1']

### 7.2 Export .csv file that saves the labels of the test set

In [107]:
predict_output = pd.DataFrame({'ImageID': test_file['ImageID'], 'Labels': y_pred_strings})
predict_output.to_csv('predict_output.csv', index=False)
predict_output

Unnamed: 0,ImageID,Labels
0,30000.jpg,1
1,30001.jpg,1
2,30002.jpg,17
3,30003.jpg,1
4,30004.jpg,1
...,...,...
9995,39995.jpg,1
9996,39996.jpg,1 3
9997,39997.jpg,1 6
9998,39998.jpg,1


### 7.3 Export the model

In [19]:
# Model compression
torch.save(model, 'original_model')
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
torch.save(quantized_model, 'mymodel')