In [27]:
import os
import sys
import numpy as np
from numpy import asarray,zeros
import pandas as pd 
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
import copy
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
import transformers
from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW, get_linear_schedule_with_warmup
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import timm

In [34]:
# if torch.cuda.is_available():
#     device = torch.device("cuda")
# else:
#     device = torch.device("cpu")
device = torch.device("cpu") # Force CPU
print("Using device", device)

Using device cpu


In [2]:
# Load the data
img_data = np.load("../data/image_array.npy")
txt_data = np.load("../data/text_array.npy")
labels_data = np.load("../data/labels.npy")
ids_data = np.load("../data/ids.npy")
# Printing the shapes
print(img_data.shape)
print(txt_data.shape)
print(labels_data.shape)
print(ids_data.shape)

(11766, 2, 224, 224, 3)
(11766, 2)
(11766, 1)
(11766, 1)


In [8]:
# Reshape image to -> num_images, sources, num_channels, width, heigth
num_images, sources, width, height, num_channels = img_data.shape
img_data_reshape = np.reshape(img_data, newshape=(num_images, sources, num_channels, width, height))
print('New Shape', img_data_reshape.shape)

New Shape (11766, 2, 3, 224, 224)


In [16]:
# Loading Pre-Trained ResNet model
base_model = timm.create_model('resnet50', pretrained=True)
print(base_model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act2): ReLU(inplace=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act3): ReLU(inplace=True)
      (downsample): Sequen

In [20]:
# Model to extract penultimate layer features
class ResNet50Bottom(nn.Module):
    def __init__(self, original_model):
        super(ResNet50Bottom, self).__init__()
        self.features = nn.Sequential(*list(original_model.children())[:-1])
        
    def forward(self, x):
        x = self.features(x)
        return x

In [21]:
res50_conv2 = ResNet50Bottom(base_model)
print(res50_conv2) # New model

ResNet50Bottom(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act1): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act2): ReLU(inplace=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act3): R

In [25]:
with torch.no_grad():
    out = res50_conv2(torch.tensor(img_data_reshape[0:32,0,:,:,:]))
    print(out)
    print(out.shape)
    print('Max value', out.max())
    print('Min value', out.min())

tensor([[5.8605e-02, 5.1225e-02, 0.0000e+00,  ..., 6.6340e-03, 7.7736e-02,
         8.3336e-03],
        [9.3709e-03, 6.3617e-04, 1.4821e-02,  ..., 2.0285e-02, 5.5894e-02,
         3.4557e-01],
        [9.3709e-03, 6.3617e-04, 1.4821e-02,  ..., 2.0285e-02, 5.5894e-02,
         3.4557e-01],
        ...,
        [1.4405e-01, 1.7607e-03, 9.8179e-04,  ..., 5.3647e-02, 1.7863e-01,
         1.7842e-01],
        [0.0000e+00, 2.6843e-04, 7.3658e-02,  ..., 4.0140e-03, 2.0584e-02,
         2.4735e-01],
        [0.0000e+00, 2.6843e-04, 7.3658e-02,  ..., 4.0140e-03, 2.0584e-02,
         2.4735e-01]])
torch.Size([32, 2048])
Max value tensor(2.4431)
Min value tensor(0.)


In [26]:
print(txt_data[0]) # Sample text

['fifteen homeless people in chicago were found dead on the street because of record low temperatures that hit the city in late january'
 'Fifteen homeless people in Chicago were reportedly found dead on the street as a result of the record temperatures that hit the city in late January when the devastating winter storm swept through Chicago in late January, perhaps worried social media users have fallen into error with a meme that he inflated the budget of the storm on the city the post that originated on twitter on january said homeless in chicago are frozen dead take a moment and think how lucky you are right now forget the trump wall the moves of the furry the jordan you want the new purse you want for a moment think about how lucky and blessed you are how blessed we are as the tweet was viewed multiple times with more likes the accompanying photo depicting a homeless man sleeping on a snowy street was actually taken in January and posted in the Canadian newspaper national post als

In [45]:
#TODO: Build BERT based model for feature extraction
class BERTModel(nn.Module):
    def __init__(self, bert_model="bert-base-uncased", freeze_bert=False):
        super(BERTModel, self).__init__()
        self.model_name = bert_model
        #  Instantiating BERT-based model object
        self.config = AutoConfig.from_pretrained(bert_model, output_hidden_states=False)
        self.bert_layer = AutoModel.from_pretrained(bert_model, config = self.config)
        # Freeze bert layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        '''
        hidden_state  = self.bert_layer(input_ids, attn_masks, token_type_ids)
        pooler_output = hidden_state[0][:,0]

        return pooler_output


def get_transformer_model(modelname):
    trans_tokenizer = AutoTokenizer.from_pretrained(modelname, do_lower_case = True)
    trans_model = BERTModel(modelname)
    print(trans_tokenizer)
    print(trans_model)
    return trans_tokenizer, trans_model

In [46]:
trans_model_name = 'bert-base-uncased'
trans_tokenizer, trans_model = get_transformer_model(trans_model_name)

PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
BERTModel(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
   

In [47]:
################ Tokenizer ####################
###############################################
def tokenize(data_list, tokenizer, MAX_LEN):
	print('Tokenizing')
	# add special tokens for BERT to work properly
	sentences = ["[CLS] " + data_list[i] + " [SEP]" for i in range(0,len(data_list))]
	tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
	# print ("Tokenize the first sentence:")
	# print (tokenized_texts[0])
	# Pad our input tokens
	input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
	                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
	# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
	input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
	input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
	# Create attention masks
	attention_masks = []
	# Create a mask of 1s for each token followed by 0s for padding
	for seq in input_ids:
	  seq_mask = [float(i>0) for i in seq]
	  attention_masks.append(seq_mask)

	# Printing the input_ids
	# print('Input_ids[0]', input_ids[0])
	# print('Input_ids[0] elements')
	# for i in input_ids[0]:
	# 	print(i, type(i), end = " ")

	token_type_ids = []
	for seq in input_ids:
		type_id = []
		condition = 'sent1'
		for i in seq:
			if condition == 'sent1':
				type_id.append(0)
				if i == 102:
					condition = 'sent2'
			elif condition == 'sent2':
				type_id.append(1)
		token_type_ids.append(type_id)
	# Finally convert this into torch tensors
	data_inputs = torch.tensor(input_ids, device =device)
	data_masks = torch.tensor(attention_masks, device =device)
	data_token_ids = torch.tensor(token_type_ids, device = device)
	return data_inputs, data_masks, data_token_ids

In [48]:
MAX_LEN = 512
text_inputs, text_masks, text_token_ids = tokenize(txt_data[:,0], trans_tokenizer, MAX_LEN)

Tokenizing


In [49]:
print(text_inputs.shape)
print(text_masks.shape)
print(text_token_ids.shape)

torch.Size([11766, 512])
torch.Size([11766, 512])
torch.Size([11766, 512])


In [50]:
text_model_output = trans_model(text_inputs[0:32,:], text_masks[0:32,:], text_token_ids[0:32,:])

In [52]:
print(text_model_output)
print('Max value', text_model_output.max())
print('Min value', text_model_output.min())
print(text_model_output.shape)

tensor([[ 0.0605, -0.2929,  0.0140,  ..., -0.2325,  0.4449,  0.3719],
        [-0.4321, -0.1234, -0.2324,  ...,  0.0157,  0.4601,  0.1857],
        [-0.4321, -0.1234, -0.2324,  ...,  0.0157,  0.4601,  0.1857],
        ...,
        [-0.0718,  0.0297, -0.1426,  ..., -0.3676,  0.5061,  0.1015],
        [-0.3934,  0.0486, -0.1581,  ..., -0.0803,  0.1111,  0.4811],
        [-0.3934,  0.0486, -0.1581,  ..., -0.0803,  0.1111,  0.4811]],
       grad_fn=<SelectBackward>)
Max value tensor(3.7036, grad_fn=<MaxBackward1>)
Min value tensor(-7.2204, grad_fn=<MinBackward1>)
torch.Size([32, 768])


In [59]:
#TODO: Multimoda (Image+Text) model
class MultimodalHead(nn.Module):
    def __init__(self):
        super(MultimodalHead, self).__init__()
        self.vision_base_model = timm.create_model('resnet50', pretrained=True)
        self.vision_model_head = ResNet50Bottom(self.vision_base_model)
        self.text_head = BERTModel('bert-base-uncased')
        self.normalize = nn.LayerNorm(2816)
    def forward(self, img_features, txt_features):
        img_out = self.vision_model_head(img_features)
        txt_out = self.text_head(txt_features[0], txt_features[1], txt_features[2])
        multimodal_concat = self.normalize(torch.cat((img_out, txt_out), 1))
        return multimodal_concat

In [60]:
#TODO: Multimodal forward pass
multimodal_model = MultimodalHead().to(device)

In [61]:
image_input = torch.tensor(img_data_reshape[0:32,0,:,:,:]).to(device)
text_input = text_inputs[0:32,:].to(device), text_masks[0:32,:].to(device), text_token_ids[0:32,:].to(device)
multimodal_out = multimodal_model(image_input, text_input)

In [62]:
print(multimodal_out)
print(multimodal_out.shape)

tensor([[ 0.0059, -0.0186, -0.1889,  ..., -0.9620,  1.2906,  1.0479],
        [-0.1504, -0.1821, -0.1306,  ..., -0.1274,  1.4854,  0.4896],
        [-0.1504, -0.1821, -0.1306,  ..., -0.1274,  1.4854,  0.4896],
        ...,
        [ 0.1474, -0.3220, -0.3246,  ..., -1.5404,  1.3419,  0.0071],
        [-0.2155, -0.2146,  0.0403,  ..., -0.4945,  0.1705,  1.4553],
        [-0.2155, -0.2146,  0.0403,  ..., -0.4945,  0.1705,  1.4553]],
       grad_fn=<NativeLayerNormBackward>)
torch.Size([32, 2816])
