# Models Comparison and Ablation Experiment

### The code in this part is only used for my model performance comparison and ablation experiments. **The purpose of uploading this code is only to prove that we have done these things. For the model implementation and evaluation code of this assignment, please refer to CNN_Transformer_Xin.ipynb. Please DO NOT look at this part of the code first!** Please refer to the code in this section to see how we performed the ablation experiments and which models and hyperparameters we compared.

## 1. Data Preparation and Package Import

### 1.1 Clone data and images from GitHub

In [1]:
# All the data were saved in my GitHub, it's easy to get them all by running this code.
!git clone https://github.com/ShuXin79/5329ASS2.git

Cloning into '5329ASS2'...
remote: Enumerating objects: 40017, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 40017 (delta 4), reused 10 (delta 2), pack-reused 40005[K
Receiving objects: 100% (40017/40017), 393.03 MiB | 18.23 MiB/s, done.
Resolving deltas: 100% (6/6), done.
Updating files: 100% (40004/40004), done.


### 1.2 Libraries install and import

In [2]:
# Make sure your environment contains these libraries.
!pip install torch pandas pillow
!pip install torchvision
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m105.9 

In [3]:
import os
import pandas as pd
import numpy as np
import ast
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
import torchvision
# We used the pre-trained model BERT for our NLP task
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
# sklearn is only used for splitting the train-validation sets as well as calculating the f1-score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

## 2. Data Reading Class

In [4]:
class ImageTextDataset(Dataset):
  # This class is to convert the input data into the format we want to see.
  
  def __init__(self, df, img_dir, transform=None, max_length=128):
    # df: the dataframe of our input data
    # img_dir: saved the path to the images
    # transform: determine the format of input images data
    # max_lenth: determine the max lenth of input captions data
    self.df = df
    self.img_dir = img_dir
    self.transform = transform
    # We use bert-base-uncased for tokenizing the words
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.max_length = max_length

  def __len__(self):
    # The size of the data
    return len(self.df)

  def __getitem__(self, idx):
    # Get the images number
    row = self.df.iloc[idx]
    
    # Process image
    img_name = os.path.join(self.img_dir, row.ImageID)
    # For increasing the running speed and decrease the memory usage, we use only one color channel, not RGB
    image = Image.open(img_name).convert("RGB")
    # According to the format of 'transform', transform the images data into tensors
    if self.transform:
      image = self.transform(image)
    
    # Process text
    text = row.Caption
    # Tokenize the text
    inputs = self.tokenizer.encode_plus(
      text,
      None,
      add_special_tokens=True,
      max_length=self.max_length,
      padding='max_length',
      return_token_type_ids=True,
      truncation=True
    )
    # Get caption number
    ids = inputs['input_ids']
    # Get the mask
    mask = inputs['attention_mask']

    # Process labels
    # Initialize a list and save the multi-labels
    label_indices = list(map(int, row.Labels.strip('[]').split()))
    # One-hot the labels. It's very useful when dealing with multi-labels.
    # We have labels of 1-19, though we don't have class 0 and 12, we should new a 20-dimentional tensor for processing.
    labels = torch.zeros(20)
    # Convert labels into binary vectors
    labels[label_indices] = 1 

    # Processed data
    return {
      'ids': torch.tensor(ids, dtype=torch.long),
      'mask': torch.tensor(mask, dtype=torch.long),
      'image': image,
      'labels': labels,
      'image_names' : img_name
    }

# Part A. Find the best epochs and optimize the model structure and hyper parameters(pre-trained, learning rate, kernel size, numbers of neurons).

## 3. Build the CNN-Transformer Fusion Model (Resnet and BERT) Class

In [122]:
class ImageTextModel(nn.Module):
  # This class is for combining the image model and the NLP model

  def __init__(self, bert_model_name='bert-base-uncased', num_classes=2):
    # Call initialization
    super(ImageTextModel, self).__init__()        
    # NLP text model: BERT
    self.text_model = BertModel.from_pretrained(bert_model_name)        
    # Image model: ResNet (pre-trained)
    self.image_model = torchvision.models.resnet50(pretrained=True)
    # change the first layer, since we changed the color channel into 1
    self.image_model.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
    # Match the fully connected layer with dimension of BERT
    self.image_model.fc = nn.Linear(2048, 768)         
    # Classification head
    self.classifier = nn.Linear(768, num_classes)

  def forward(self, input_ids, attention_mask, image):
    # Forward propagation, passing data
    # Text (vectors)
    text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
    # CLS token
    text_features = text_outputs.last_hidden_state[:, 0] 

    # Images (vectors)
    image_features = self.image_model(image)
        
    # Concatenate features of text and images
    combined_features = text_features + image_features
    output = self.classifier(combined_features)
    # output = self.classifier(image_features)
    return output

## 4. Define the Training and Testing Function

### 4.1 Training function

In [123]:
def train_loop(dataloader, model, loss_fn, optimizer):
  # dataloader: the data in dataloader form
  # model: our fusion model
  # loss_fn: loss functions for gradient descent
  # optimizer: add other optimization parameters, such as learning rate

  # Get the size of training set
  size = len(dataloader.dataset)
  # Start the training
  model.train()
    
  for batch, data in enumerate(dataloader):
    # Compute prediction vectors
    preds = model(data['ids'].to(device), data['mask'].to(device), data['image'].to(device))
    # Update the loss
    loss = loss_fn(preds, data['labels'].to(device))

    # Back propagation, update the weights and bias
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch % 100 == 0:
      # Print the loss
      loss, current = loss.item(), batch * len(data['image'])
      print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

### 4.2 Testing (predicting) function

In [124]:
def test_loop(dataloader, model, loss_fn, test=True):
  # dataloader: test data in dataloader form
  # model: our trained fusion model
  # loss_fn: loss function
  # test: run it on data with or without labels

  size = len(dataloader.dataset)
  # Initialization
  test_loss, correct = 0, 0
  all_preds = []
  all_labels = []
  all_numbers = []
  # Switch to evaluation mode
  model.eval()

  # Traverse process
  with torch.no_grad():
    for data in dataloader:
      # Make predictions, results are given in vectors form
      preds = model(data['ids'].to(device), data['mask'].to(device), data['image'].to(device))
      all_numbers.append(data['image_names'])
      for i in range(len(preds)):
        # If we have the labels, calculate the loss and accuracy as reference
        if test:
          test_loss += loss_fn(preds[i], data['labels'][i].to(device)).item()
          # preds[i] > 0 at where the labels are predicted. If all the labels are predicted correctly, it will return the tensor with all True value.
          if ((preds[i] > 0) == data['labels'][i].to(device)).all():
            correct += 1
          all_labels.append(data['labels'][i])
        # Transform the predictions result into one-hot form
        pred = torch.where(preds[i] < 0, torch.tensor(0), torch.tensor(1))
        # Return the data to cpu since they are saved in gpu currently
        all_preds.append(pred.cpu())

    # Calculate the final results
    test_loss /= size
    correct /= size
    # Print the loss and accuracy (0 are shown if we don't have true labels)
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    # Return the predictions and true labels for reference
    return all_preds, all_labels, all_numbers

## 5. Main Process: Training

### 5.1 Transfer model training to GPU

In [8]:
#The amount of tasks is too large, so we have to try to perform calculations on the GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 5.2 Prepare the data

In [125]:
# Load the whole training data, drop the bad lines
data_given = pd.read_csv("/content/5329ASS2/train.csv", error_bad_lines=False)

# Split the training data for evaluating the model in my environment
train_df, test_df = train_test_split(data_given, test_size = 0.2)

# Define the transformations, which transform the images data into vectors features
transform = transforms.Compose([
    # Resize the images
    transforms.Resize((256, 256)),
    # Randomly apply horizontal flipping
    transforms.RandomHorizontalFlip(),
    # Randomly apply rotation
    transforms.RandomRotation(20),
    # Convert PIL image to tensors
    transforms.ToTensor(),
    # Normalize the images (specific values of mean and std are the means and standard deviations of the pytorch.ImageNet dataset)
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Train on the whole training set
# train_data = ImageTextDataset(data_given, "/content/5329ASS2/data", transform=transform)

# Train on the 80% of the training set, and test on the other 20% training set
train_data = ImageTextDataset(train_df, "/content/5329ASS2/data", transform=transform)
test_data = ImageTextDataset(test_df, "/content/5329ASS2/data", transform=transform)

# Create the dataloaders
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)

# Create the model, loss function and optimizer
model = ImageTextModel(num_classes=20).to(device)
# Use BCEWithLogitsLoss for multi-label classification because it's better for multi-labels task
loss_fn = nn.BCEWithLogitsLoss() 
# Use the optim.Adam. Learning rate: 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)




  data_given = pd.read_csv("/content/5329ASS2/train.csv", error_bad_lines=False)
Skipping line 9086: expected 3 fields, saw 4
Skipping line 9510: expected 3 fields, saw 4
Skipping line 18114: expected 3 fields, saw 4
Skipping line 27169: expected 3 fields, saw 4

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exac

### 5.3 Training

In [145]:
# Start training
epochs = 1
for t in range(epochs):
  print(f"Epoch {t+1}\n-------------------------------")
  train_loop(train_dataloader, model, loss_fn, optimizer)
print("Model training completed")

Epoch 1
-------------------------------
loss: 0.035200  [    0/23996]
loss: 0.056670  [ 6400/23996]
loss: 0.040780  [12800/23996]
loss: 0.048060  [19200/23996]
Model training completed


## 6. Evaluation

In [146]:
# Process the test data as well as create the dataloaders
# test_data = ImageTextDataset(test_file, "/content/5329ASS2/data", transform=transform)
# For validation only
test_data = ImageTextDataset(test_df, "/content/5329ASS2/data", transform=transform)

test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

### 6.2 Make prediction

In [147]:
# Acc and loss will be 0 if we don't have true labels of test data
pred, rightlabel, all_numbers = test_loop(test_dataloader, model, loss_fn, test=True)
print(len(pred))

Test Error: 
 Accuracy: 66.6%, Avg loss: 0.079560 

6000


In [148]:
# Transform the predicted labels into numpy form
y_pred = [t.numpy() for t in pred]

### 6.3 Calculate the f1 score if we have true labels (validation)

In [149]:
y_true = [t.numpy() for t in rightlabel]
f1 = f1_score(y_true, y_pred, average='micro')
f1

0.8320864037801654

# Part B. Use CNN model (ResNet) only

## 3. Build the CNN Model Resnet only

In [53]:
class ImageModel(nn.Module):


  def __init__(self, bert_model_name='bert-base-uncased', num_classes=2):
    # Call initialization
    super(ImageModel, self).__init__()               
    # Image model: ResNet (pre-trained)
    self.image_model = torchvision.models.resnet50(pretrained=True)
    # change the first conv1 layer
    self.image_model.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
    self.image_model.fc = nn.Linear(2048, 768)         
    # Classification head
    self.classifier = nn.Linear(768, num_classes)

  def forward(self, input_ids, attention_mask, image):
    # Images (vectors)
    image_features = self.image_model(image)
    output = self.classifier(image_features)
    return output

## 5. Main Process: Training

### 5.1 Transfer model training to GPU

### 5.2 Prepare the data

In [54]:
# Load the whole training data, drop the bad lines
data_given = pd.read_csv("/content/5329ASS2/train.csv", error_bad_lines=False)

# Split the training data for evaluating the model in my environment
train_df, test_df = train_test_split(data_given, test_size = 0.2)

# Define the transformations, which transform the images data into vectors features
transform = transforms.Compose([
    # Resize the images
    transforms.Resize((256, 256)),
    # Randomly apply horizontal flipping
    transforms.RandomHorizontalFlip(),
    # Randomly apply rotation
    transforms.RandomRotation(20),
    # Convert PIL image to tensors
    transforms.ToTensor(),
    # Normalize the images (specific values of mean and std are the means and standard deviations of the pytorch.ImageNet dataset)
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Train on the whole training set
# train_data = ImageTextDataset(data_given, "/content/5329ASS2/data", transform=transform)

# Train on the 80% of the training set, and test on the other 20% training set
train_data = ImageTextDataset(train_df, "/content/5329ASS2/data", transform=transform)
test_data = ImageTextDataset(test_df, "/content/5329ASS2/data", transform=transform)

# Create the dataloaders
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)

# Create the model, loss function and optimizer
model = ImageModel(num_classes=20).to(device)
# Use BCEWithLogitsLoss for multi-label classification because it's better for multi-labels task
loss_fn = nn.BCEWithLogitsLoss() 
# Use the optim.Adam. Learning rate: 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)




  data_given = pd.read_csv("/content/5329ASS2/train.csv", error_bad_lines=False)
Skipping line 9086: expected 3 fields, saw 4
Skipping line 9510: expected 3 fields, saw 4
Skipping line 18114: expected 3 fields, saw 4
Skipping line 27169: expected 3 fields, saw 4

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 329MB/s]


### 5.3 Training

In [86]:
# Start training
epochs = 10
for t in range(epochs):
  print(f"Epoch {t+1}\n-------------------------------")
  train_loop(train_dataloader, model, loss_fn, optimizer)
print("Model training completed")

Epoch 1
-------------------------------
loss: 0.028043  [    0/23996]
loss: 0.032276  [ 6400/23996]
loss: 0.047948  [12800/23996]
loss: 0.048229  [19200/23996]
Epoch 2
-------------------------------
loss: 0.032780  [    0/23996]
loss: 0.019022  [ 6400/23996]
loss: 0.036389  [12800/23996]
loss: 0.048085  [19200/23996]
Model training completed


## 6. Evaluation

In [87]:
# Process the test data as well as create the dataloaders
# test_data = ImageTextDataset(test_file, "/content/5329ASS2/data", transform=transform)
# For validation only
test_data = ImageTextDataset(test_df, "/content/5329ASS2/data", transform=transform)

test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

### 6.2 Make prediction

In [88]:
# Acc and loss will be 0 if we don't have true labels of test data
pred, rightlabel, all_numbers = test_loop(test_dataloader, model, loss_fn, test=True)
print(len(pred))

Test Error: 
 Accuracy: 57.5%, Avg loss: 0.123323 

6000


In [89]:
# Transform the predicted labels into numpy form
y_pred = [t.numpy() for t in pred]

### 6.3 Calculate the f1 score if we have true labels (validation)

In [90]:
y_true = [t.numpy() for t in rightlabel]
f1 = f1_score(y_true, y_pred, average='micro')
f1

0.7675891583452211

In [85]:
# Model compression
torch.save(model, 'original_model')
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
torch.save(quantized_model, 'quantized_model')

# Part C. SqueezeNet1.1 with BERT

## 3. Build the CNN-Transformer Fusion Model (SqueezeNet1.1 and BERT) Class

In [102]:
class ImageTextModel_new(nn.Module):
  # This class is for combining the image model and the NLP model

  def __init__(self, bert_model_name='bert-base-uncased', num_classes=2):
    # Call initialization
    super(ImageTextModel_new, self).__init__()        
    # NLP text model: BERT
    self.text_model = BertModel.from_pretrained(bert_model_name)        
    # Image model: SqueezeNet1_1 (pre-trained)
    self.image_model = torchvision.models.squeezenet1_1(pretrained=True)
    self.image_model.classifier[1] = torch.nn.Conv2d(512, 768, kernel_size=(1,1), stride=(1,1))

    # Reset final convolution layer's weights
    self.image_model.classifier[1].weight.data.normal_(0, 0.01)
    self.image_model.classifier[1].bias.data.zero_()
    
    # Classifier to reduce the combined features to the number of classes
    self.classifier = torch.nn.Linear(768*2, num_classes)

  def forward(self, input_ids, attention_mask, image):
    # Forward propagation, passing data
    # Text (vectors)
    text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
    # CLS token
    text_features = text_outputs.last_hidden_state[:, 0] 

    # Images (vectors)
    image_features = self.image_model(image)
    image_features = image_features.view(image_features.size(0), -1)  # Flattening

    # Concatenate features of text and images
    combined_features = torch.cat((text_features, image_features), dim=1)
    output = self.classifier(combined_features)
    return output

## 4. Define the Training and Testing Function

### 4.1 Training function

In [103]:
def train_loop(dataloader, model, loss_fn, optimizer):
  # dataloader: the data in dataloader form
  # model: our fusion model
  # loss_fn: loss functions for gradient descent
  # optimizer: add other optimization parameters, such as learning rate

  # Get the size of training set
  size = len(dataloader.dataset)
  # Start the training
  model.train()
    
  for batch, data in enumerate(dataloader):
    # Compute prediction vectors
    preds = model(data['ids'].to(device), data['mask'].to(device), data['image'].to(device))
    # Update the loss
    loss = loss_fn(preds, data['labels'].to(device))

    # Back propagation, update the weights and bias
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch % 100 == 0:
      # Print the loss
      loss, current = loss.item(), batch * len(data['image'])
      print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

### 4.2 Testing (predicting) function

In [104]:
def test_loop(dataloader, model, loss_fn, test=True):
  # dataloader: test data in dataloader form
  # model: our trained fusion model
  # loss_fn: loss function
  # test: run it on data with or without labels

  size = len(dataloader.dataset)
  # Initialization
  test_loss, correct = 0, 0
  all_preds = []
  all_labels = []
  all_numbers = []
  # Switch to evaluation mode
  model.eval()

  # Traverse process
  with torch.no_grad():
    for data in dataloader:
      # Make predictions, results are given in vectors form
      preds = model(data['ids'].to(device), data['mask'].to(device), data['image'].to(device))
      all_numbers.append(data['image_names'])
      for i in range(len(preds)):
        # If we have the labels, calculate the loss and accuracy as reference
        if test:
          test_loss += loss_fn(preds[i], data['labels'][i].to(device)).item()
          # preds[i] > 0 at where the labels are predicted. If all the labels are predicted correctly, it will return the tensor with all True value.
          if ((preds[i] > 0) == data['labels'][i].to(device)).all():
            correct += 1
          all_labels.append(data['labels'][i])
        # Transform the predictions result into one-hot form
        pred = torch.where(preds[i] < 0, torch.tensor(0), torch.tensor(1))
        # Return the data to cpu since they are saved in gpu currently
        all_preds.append(pred.cpu())

    # Calculate the final results
    test_loss /= size
    correct /= size
    # Print the loss and accuracy (0 are shown if we don't have true labels)
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    # Return the predictions and true labels for reference
    return all_preds, all_labels, all_numbers

## 5. Main Process: Training

### 5.2 Prepare the data

In [105]:
# Load the whole training data, drop the bad lines
data_given = pd.read_csv("/content/5329ASS2/train.csv", error_bad_lines=False)

# Split the training data for evaluating the model in my environment
train_df, test_df = train_test_split(data_given, test_size = 0.2)

# Define the transformations, which transform the images data into vectors features
transform = transforms.Compose([
    # Resize the images
    transforms.Resize((256, 256)),
    # Randomly apply horizontal flipping
    transforms.RandomHorizontalFlip(),
    # Randomly apply rotation
    transforms.RandomRotation(20),
    # Convert PIL image to tensors
    transforms.ToTensor(),
    # Normalize the images (specific values of mean and std are the means and standard deviations of the pytorch.ImageNet dataset)
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Train on the whole training set
# train_data = ImageTextDataset(data_given, "/content/5329ASS2/data", transform=transform)

# Train on the 80% of the training set, and test on the other 20% training set
train_data = ImageTextDataset(train_df, "/content/5329ASS2/data", transform=transform)
test_data = ImageTextDataset(test_df, "/content/5329ASS2/data", transform=transform)

# Create the dataloaders
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)

# Create the model, loss function and optimizer
model = ImageTextModel_new(num_classes=20).to(device)
# Use BCEWithLogitsLoss for multi-label classification because it's better for multi-labels task
loss_fn = nn.BCEWithLogitsLoss() 
# Use the optim.Adam. Learning rate: 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)




  data_given = pd.read_csv("/content/5329ASS2/train.csv", error_bad_lines=False)
Skipping line 9086: expected 3 fields, saw 4
Skipping line 9510: expected 3 fields, saw 4
Skipping line 18114: expected 3 fields, saw 4
Skipping line 27169: expected 3 fields, saw 4

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exac

### 5.3 Training

In [117]:
# Start training
epochs = 2
for t in range(epochs):
  print(f"Epoch {t+1}\n-------------------------------")
  train_loop(train_dataloader, model, loss_fn, optimizer)
print("Model training completed")

Epoch 1
-------------------------------
loss: 0.056411  [    0/23996]
loss: 0.036178  [ 6400/23996]
loss: 0.055326  [12800/23996]
loss: 0.056350  [19200/23996]
Epoch 2
-------------------------------
loss: 0.029216  [    0/23996]
loss: 0.021243  [ 6400/23996]
loss: 0.041047  [12800/23996]
loss: 0.053141  [19200/23996]
Model training completed


## 6. Evaluation

In [118]:
# Process the test data as well as create the dataloaders
# test_data = ImageTextDataset(test_file, "/content/5329ASS2/data", transform=transform)
# For validation only
test_data = ImageTextDataset(test_df, "/content/5329ASS2/data", transform=transform)

test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

### 6.2 Make prediction

In [119]:
# Acc and loss will be 0 if we don't have true labels of test data
pred, rightlabel, all_numbers = test_loop(test_dataloader, model, loss_fn, test=True)
print(len(pred))

Test Error: 
 Accuracy: 65.4%, Avg loss: 0.088095 

6000


In [120]:
# Transform the predicted labels into numpy form
y_pred = [t.numpy() for t in pred]

### 6.3 Calculate the f1 score if we have true labels (validation)

In [121]:
y_true = [t.numpy() for t in rightlabel]
f1 = f1_score(y_true, y_pred, average='micro')
f1

0.8175322117819164

In [116]:
# Model compression
torch.save(model, 'original_model')
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
torch.save(quantized_model, 'quantized_model')

# Part D. SqueezeNet1.1 only

In [160]:
class ImageModel_new(nn.Module):
  # This class is for combining the image model and the NLP model

  def __init__(self, bert_model_name='bert-base-uncased', num_classes=2):
    # Call initialization
    super(ImageModel_new, self).__init__()        
       
    # Image model: SqueezeNet1_1 (pre-trained)
    self.image_model = torchvision.models.squeezenet1_1(pretrained=True)
    self.image_model.classifier[1] = torch.nn.Conv2d(512, 768, kernel_size=(1,1), stride=(1,1))

    # Reset final convolution layer's weights
    self.image_model.classifier[1].weight.data.normal_(0, 0.01)
    self.image_model.classifier[1].bias.data.zero_()
    
    # Classifier to reduce the combined features to the number of classes
    self.classifier = nn.Linear(768, num_classes)

  def forward(self, input_ids, attention_mask, image):
    # Forward propagation, passing data


    # Images (vectors)
    image_features = self.image_model(image)
    # image_features = image_features.view(image_features.size(0), -1)  # Flattening


    output = self.classifier(image_features)
    return output

In [161]:
# Load the whole training data, drop the bad lines
data_given = pd.read_csv("/content/5329ASS2/train.csv", error_bad_lines=False)

# Split the training data for evaluating the model in my environment
train_df, test_df = train_test_split(data_given, test_size = 0.2)

# Define the transformations, which transform the images data into vectors features
transform = transforms.Compose([
    # Resize the images
    transforms.Resize((256, 256)),
    # Randomly apply horizontal flipping
    transforms.RandomHorizontalFlip(),
    # Randomly apply rotation
    transforms.RandomRotation(20),
    # Convert PIL image to tensors
    transforms.ToTensor(),
    # Normalize the images (specific values of mean and std are the means and standard deviations of the pytorch.ImageNet dataset)
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Train on the whole training set
# train_data = ImageTextDataset(data_given, "/content/5329ASS2/data", transform=transform)

# Train on the 80% of the training set, and test on the other 20% training set
train_data = ImageTextDataset(train_df, "/content/5329ASS2/data", transform=transform)
test_data = ImageTextDataset(test_df, "/content/5329ASS2/data", transform=transform)

# Create the dataloaders
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)

# Create the model, loss function and optimizer
model = ImageModel_new(num_classes=20).to(device)
# Use BCEWithLogitsLoss for multi-label classification because it's better for multi-labels task
loss_fn = nn.BCEWithLogitsLoss() 
# Use the optim.Adam. Learning rate: 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)




  data_given = pd.read_csv("/content/5329ASS2/train.csv", error_bad_lines=False)
Skipping line 9086: expected 3 fields, saw 4
Skipping line 9510: expected 3 fields, saw 4
Skipping line 18114: expected 3 fields, saw 4
Skipping line 27169: expected 3 fields, saw 4



In [162]:
# Start training
epochs = 2
for t in range(epochs):
  print(f"Epoch {t+1}\n-------------------------------")
  train_loop(train_dataloader, model, loss_fn, optimizer)
print("Model training completed")

Epoch 1
-------------------------------
loss: 0.708764  [    0/23996]
loss: 0.156727  [ 6400/23996]
loss: 0.143899  [12800/23996]
loss: 0.128263  [19200/23996]
Epoch 2
-------------------------------
loss: 0.092530  [    0/23996]
loss: 0.134718  [ 6400/23996]
loss: 0.091756  [12800/23996]
loss: 0.118144  [19200/23996]
Model training completed


In [163]:
# Process the test data as well as create the dataloaders
# test_data = ImageTextDataset(test_file, "/content/5329ASS2/data", transform=transform)
# For validation only
test_data = ImageTextDataset(test_df, "/content/5329ASS2/data", transform=transform)

test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)

In [164]:
# Acc and loss will be 0 if we don't have true labels of test data
pred, rightlabel, all_numbers = test_loop(test_dataloader, model, loss_fn, test=True)
print(len(pred))

Test Error: 
 Accuracy: 52.7%, Avg loss: 0.116757 

6000


In [165]:
# Transform the predicted labels into numpy form
y_pred = [t.numpy() for t in pred]

In [166]:
y_true = [t.numpy() for t in rightlabel]
f1 = f1_score(y_true, y_pred, average='micro')
f1

0.7043238270469181

In [167]:
# Model compression
torch.save(model, 'original_model')
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
torch.save(quantized_model, 'quantized_model')