# README
## Instruction for code runninng:
1. To start with running the cells in the 'Loading data from the google drive' to download the data. 
2. Run the cell in the 'Define fix parameter' to define the data path. 
3. The cells in 'Define Helper Functions' section need to be executed to make sure data preprocessing and training process run smoothly. 
4. The 'Define hyper parameters part' define the lstm layers, lstm hidden units, pretrain cnn model used to extract image, epoch to run and learning rate. User can adjust these paramters to achieve different performance. 
5. To experiment with the effect of pos weight for the loss function, go to 'Training process' section and 'Define model, loss function and optimizer' section, uncomment the code with pos_weight added.
6. After adjusting the hyper parameters, sequentially execute the following sections to train the model and make predictions on the test dataset

## !!!NOTE: 
This code is the version for running on colab. If you want to run on local device, you HAVE TO:

1. change the path of data and csv files in 'Define fix parameter'
2. comment the 'Loading data from the google drive' part

In [1]:
import pandas as pd 
import re
import numpy as np
from io import StringIO
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms, models
from PIL import Image
from sklearn.metrics import f1_score
import torch.optim as optim
import time
import copy
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import random
from sklearn import metrics
from gensim.models import KeyedVectors
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# Loading data from the google drive

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
id = '1aB6M6YLGXM4u2F8eyb8_iHRLCRzM-Vm4'
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('2021s1comp5329assignment2.zip')  

In [4]:
! unzip -qq 2021s1comp5329assignment2.zip 

# Define fix parameters

In [5]:
 # change your path here
img_dir = './COMP5329S1A2Dataset/data/'
test_csv_dir = './COMP5329S1A2Dataset/test.csv'
train_csv_dir = './COMP5329S1A2Dataset/train.csv'
seq_len = 49 # max sequence length for captions

# Define Helper Functions

In [6]:
# remove all the punctuation in the captions
def remove_punctuation_re(x):
    x = x.lower()
    x = re.sub(r'[^\w\s]','',x)
    return x

# changing the labels to one hot vector
def convert_label_to_vector(labels):
    label_arr = np.zeros(20)
    parsed_label = [int(x) for x in labels.split()]
    for index in parsed_label:
        label_arr[index] = 1
    return label_arr

In [7]:
# preprocessing for captions, this function will remove all the punctuations,
# tokenize the captions into list of words, and then lemmatize each word in the list.
def data_preprocess(data_path, img_dir, train=True):
    with open(data_path) as file:
        lines = [re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file]
    df = pd.read_csv(StringIO(''.join(lines)), escapechar="/")
    df['Image_Path'] = df['ImageID'].map(lambda x: img_dir + x)
    df['Caption'] = df['Caption'].map(lambda x: remove_punctuation_re(x))
    df['Caption'] = df['Caption'].map(lambda x: nltk.word_tokenize(x))
    lemmatizer = WordNetLemmatizer()
    # lemmatization process
    df['Caption'] = df['Caption'].map(lambda x: [lemmatizer.lemmatize(word) for word in x])
    if train:
        df['LabelsVector'] = df['Labels'].map(lambda x: convert_label_to_vector(x))
    return df 

In [8]:
# generate dictionary to convert words into one hot vector
def get_word_wrapper(df):
    word_set = set()
    for caption in df["Caption"]:
        for word in caption:
            word_set.add(word)
    word_set.add('[PAD]')
    word_set.add('[UNKNOWN]')
    word_list = list(word_set)
    word_index = {}
    ind = 0
    for word in word_list:
        word_index[word] = ind 
        ind += 1
    return word_index, word_list

In [9]:
# add padding so the vector size will be equal to seq_length
def encode_and_pad(caption_arr, seq_length, word_index):
    sent_encode = []
    if len(caption_arr) > seq_length:
        caption_arr = caption_arr[:seq_length]
    for word in caption_arr:
        try: 
            sent_encode.append(word_index[word])
        except:
            sent_encode.append(word_index["[UNKNOWN]"])
    if len(caption_arr) < seq_length:
        delta = seq_length - len(caption_arr)
        sent_encode.extend([word_index['[PAD]']] *  delta)
    return np.array(sent_encode)

In [10]:
# Write a customized dataset class
class ImageCaptionDatatset(Dataset):

    def __init__(self, dataframe, transform, seq_len,  word_index, is_train_data=True):
        self.data = dataframe
        self.is_train_data = is_train_data
        if is_train_data:
            self.targets = self.data.LabelsVector
        self.img_paths = self.data.Image_Path 
        self.transform = transform
        self.captions = dataframe.Caption
        self.seq_len = seq_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        img_path = self.img_paths[idx]
        image = Image.open(img_path)
        if self.transform:
            image = self.transform(image)
        caption = self.captions[idx]
        caption = encode_and_pad(caption, self.seq_len, word_index)
        caption = torch.from_numpy(caption)
        if self.is_train_data:
            targets = torch.from_numpy(self.targets[idx])
            return {
                'img': image,
                'caption': caption,
                'targets': targets
            }
        else:
            return {
                'img': image,
                'caption': caption
            }

In [11]:
# Define preprocessing for images data  
train_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.RandomHorizontalFlip(),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

test_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])



In [12]:
# get the labels when it is bigger than threshold
def output_to_pred(y_hat_prob, threshold):
    # y_hat_prob = torch.sigmoid(output)
    zero = torch.zeros_like(y_hat_prob)
    one = torch.ones_like(y_hat_prob)
    y_hat = torch.where(y_hat_prob < threshold, zero, y_hat_prob)
    y_hat = torch.where(y_hat >= threshold, one, y_hat)
    return y_hat

In [13]:
# change one hot encoding of the predictions to string
def vector_pred_to_string(arr):
    result = ""
    for i in range(20):
        if arr[i] > 0:
            result += str(i)
            result += " "
    return result.strip()

# Loading the pretrained w2v model

In [14]:
import gensim.downloader as api
word_emb_model = api.load("glove-wiki-gigaword-100")



# Model Structure

In [15]:
class CNN_LSTM(nn.Module):

    def __init__(self, vocab_size, emb_dim, emb_table, lstm_layers, lstm_hidden, num_class, pretrain_cnn='Resnet'):
        super(CNN_LSTM, self).__init__()
        # loadding the pre-trained w2v matrix
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.embed.weight.data.copy_(torch.from_numpy(emb_table))
        self.embed.weight.requires_grad = False
        if pretrain_cnn == 'Resnet':
            resnet_model = models.resnet50(pretrained=True)
            self.resnet = nn.Sequential(*list(resnet_model.children())[:-1]) # (batch,2048,1,1)
            for param in self.resnet.parameters():
                param.require_grad = False
            input_size = 2048
        if pretrain_cnn == 'VGG':
            self.vgg = models.vgg16_bn(pretrained=True)
            for param in self.vgg.parameters():
                param.require_grad = False
            input_size = 1000
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=lstm_hidden, num_layers=lstm_layers, bidirectional=True, batch_first=True)
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(input_size + lstm_hidden * 2)
        self.fc1 = nn.Linear(input_size + lstm_hidden * 2, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, num_class)


    def forward(self, caption, img):
        img_feature = self.resnet(img)
        img_feature = img_feature.squeeze(-1)
        img_feature = img_feature.squeeze(-1) #((batch,2048)
        caption_emb = self.embed(caption)
        ouput, (h_n, _) = self.lstm(caption_emb) # h_n(layer * direction, batch, lstm_hidden)
        hidden_out = torch.cat((h_n[0,:,:],h_n[1,:,:]),1) # (batch, lstm_hidden*2)
        x = torch.cat((img_feature, hidden_out), 1)
        x = self.bn1(x)
        x = self.fc1(x)
        x = self.relu(self.bn2(x))
        x = self.fc2(x)
        return x

# Define hyper parameters

In [16]:
lstm_layers = 1
lstm_hidden = 128
num_class = 20
batch_size = 32
num_epoch = 30
learning_rate = 5
num_epochs = 10
pretrain_cnn='Resnet'

# Loading the data

In [17]:
# pre_processing the data
train_df = data_preprocess(train_csv_dir, img_dir, True)
test_df = data_preprocess(test_csv_dir, img_dir, False)
word_index, word_list = get_word_wrapper(train_df)
train_dataset = ImageCaptionDatatset(train_df, train_transform, seq_len,  word_index, True)
test_dataset = ImageCaptionDatatset(test_df, test_transform, seq_len,  word_index, False)
train_dataloader = DataLoader(train_dataset, drop_last=True, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=1)
vocab_size = len(word_list)

# Generate embedding matrix 

In [18]:
#generate matrix for embedding layer
emb_dim = word_emb_model.vector_size
emb_table = []
for i, word in enumerate(word_list):
    if word in word_emb_model:
        emb_table.append(word_emb_model[word])
    else:
        emb_table.append([0]*emb_dim)
emb_table = np.array(emb_table)
print("embedding_dim:",emb_dim )

embedding_dim: 100


# Training process

## Define the training function

In [19]:
def train_model(model, dataloader, criterion, optimizer, threshold=0.5, num_epochs=25):
    since = time.time()

    val_f1_history = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_f1 = 0.0
    sigmoid = nn.Sigmoid()
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        running_loss = 0.0
        running_f1 = 0

        # Iterate over data.
        for data in dataloader:
            img = data['img'].to(device)
            caption = data['caption'].to(device)
            labels = data['targets'].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            # track history if only in train
            with torch.set_grad_enabled(True):
                outputs = model(caption, img)
                loss = criterion(outputs, labels)
                output_pred = sigmoid(outputs)
                preds = output_to_pred(output_pred, threshold)
                loss.backward()
                optimizer.step()

            # statistics
            running_loss += loss.item() * caption.size(0)
            running_f1 += metrics.f1_score(preds.cpu().detach().numpy(), labels.cpu().detach().numpy(), average='macro')

        epoch_loss = running_loss / len(dataloader)
        epoch_f1 = running_f1 / len(dataloader)
        print('Loss: {:.4f} f1: {:.4f}'.format(epoch_loss, epoch_f1))

            # deep copy the model
    if epoch_f1 > best_f1:
        best_f1 = epoch_f1
        best_model_wts = copy.deepcopy(model.state_dict())
    val_f1_history.append(epoch_f1)

    print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val f1: {:4f}'.format(best_f1))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, val_f1_history

In [None]:
# https://discuss.pytorch.org/t/weighted-binary-cross-entropy/51156/2
# calculate pos weight
total_sample = len(train_df)
positive_sample_distribution = np.zeros(20)
for label in train_df['LabelsVector']:
    positive_sample_distribution += label
negative_sample_distribution = total_sample - positive_sample_distribution
for i in range(len(positive_sample_distribution)):
    if positive_sample_distribution[i] == 0:
        # to avoid zero division
        positive_sample_distribution[i]  = 0.01
pos_weight = negative_sample_distribution / positive_sample_distribution
pos_weight = torch.from_numpy(pos_weight)
print(pos_weight)

tensor([2.9996e+06, 3.1596e-01, 2.4814e+01, 5.8735e+00, 2.2582e+01, 2.5545e+01,
        2.0518e+01, 2.3567e+01, 1.2573e+01, 2.7787e+01, 1.9392e+01, 4.8662e+01,
        2.9996e+06, 4.8580e+01, 1.1851e+02, 1.4510e+01, 2.6294e+01, 1.9976e+01,
        1.8670e+01, 2.8408e+01], dtype=torch.float64)


## Define model, loss function and optimizer
To add positional weight to the loss function, please the comment in the following cell

In [20]:
model = CNN_LSTM(vocab_size, emb_dim, emb_table, lstm_layers, lstm_hidden, num_class, pretrain_cnn)
optimizer = optim.Adadelta(model.parameters(),lr=learning_rate, rho=0.9, eps=1e-06, weight_decay=0)
# pos_weight = pos_weight.to(device)
# criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth


HBox(children=(FloatProgress(value=0.0, max=102502400.0), HTML(value='')))




## Start training

In [None]:
model,val_f1_history = train_model(model, train_dataloader, criterion, optimizer, num_epochs=num_epochs)

Epoch 0/29
----------


  average, "true nor predicted", 'F-score is', len(true_sum)


Loss: 2.2041 f1: 0.4567
Epoch 1/29
----------
Loss: 2.1515 f1: 0.4586
Epoch 2/29
----------
Loss: 2.0836 f1: 0.4642
Epoch 3/29
----------
Loss: 2.0307 f1: 0.4684
Epoch 4/29
----------
Loss: 1.9923 f1: 0.4713
Epoch 5/29
----------
Loss: 1.9676 f1: 0.4727
Epoch 6/29
----------
Loss: 1.9081 f1: 0.4777
Epoch 7/29
----------
Loss: 1.8716 f1: 0.4816
Epoch 8/29
----------
Loss: 1.8554 f1: 0.4820
Epoch 9/29
----------
Loss: 1.8124 f1: 0.4877
Epoch 10/29
----------
Loss: 1.7625 f1: 0.4915
Epoch 11/29
----------
Loss: 1.7436 f1: 0.4927
Epoch 12/29
----------
Loss: 1.7349 f1: 0.4938
Epoch 13/29
----------
Loss: 1.6711 f1: 0.4996
Epoch 14/29
----------
Loss: 1.6360 f1: 0.5041
Epoch 15/29
----------
Loss: 1.6161 f1: 0.5045
Epoch 16/29
----------
Loss: 1.6008 f1: 0.5079
Epoch 17/29
----------
Loss: 1.7133 f1: 0.4999
Epoch 18/29
----------
Loss: 1.7979 f1: 0.4889
Epoch 19/29
----------
Loss: 1.6742 f1: 0.5013
Epoch 20/29
----------
Loss: 1.6687 f1: 0.5012
Epoch 21/29
----------
Loss: 1.5565 f1: 0.508

## Save the model weight

In [None]:
torch.save(model.state_dict(), "lstm_cnn_weight")

# Generate prediction for testing data 

In [None]:
result = []
sigmoid = nn.Sigmoid()
model.eval()
for data in test_dataloader: 
    img = data['img'].to(device)
    caption = data['caption'].to(device)
    output = model(caption, img)
    y_hat = output_to_pred(sigmoid(output), threshold=0.5)
    y_hat = y_hat.cpu().detach().numpy()
    y_hat = y_hat.squeeze(0)
    y_hat = vector_pred_to_string(y_hat)
    result.append(y_hat)

In [None]:
# save the result 
test_df["Labels"] = result
test_df = test_df.drop(columns=["Caption", "Image_Path"])
test_df.to_csv('prediction.csv', index=False)  
print("finish writing")

finish writing


In [None]:
from google.colab import files
files.download('prediction.csv') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
print('csv downloaded')

csv downloaded


# Model Size

In [22]:
model_size = sum(p.numel() for p in model.parameters())  * 4/ (1024*1024)
print('model size is ', model_size, "MB")

model size is  97.49566650390625 MB
