# **Imports**

In [106]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [107]:
import torch
import numpy as np
import pandas as pd
import transformers
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from PIL import Image
from torch import nn
from tqdm import tqdm

# **Data**

In [108]:
class VQA_Dataset(Dataset):
    def __init__(self, df, vocab, is_test=False, gen_path=''):
        self.df = df
        self.vocab = vocab
        self.is_test = is_test
        self.gen_path = gen_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        answer = self.df['answer'][idx].split(',')[0]
        answer = self.vocab.index(answer)
        question = self.df['question'][idx]
        path = f"{self.gen_path}/{self.df['image_id'][idx]}.jpg"
        image = Image.open(path)

        if not self.is_test:
            return image, question, answer
        else:
            return image, question

# **Model**

In [109]:
from transformers import CLIPProcessor, CLIPVisionModel
class Visual_Encoder(nn.Module):
    def __init__(self):
        super(Visual_Encoder, self).__init__()
        self.clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch16")
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

    def forward(self, image, device='cuda'):
        inputs = self.clip_processor(images=image, return_tensors="pt")
        pixel_values = inputs["pixel_values"].to(device)
        clip_outputs = self.clip_model(pixel_values=pixel_values)
        pooler_output = clip_outputs.pooler_output
        return pooler_output

In [110]:
from transformers import AutoTokenizer, RobertaModel
class Text_Encoder(nn.Module):
    def __init__(self):
        super(Text_Encoder, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("roberta-base")
        self.roberta_model = RobertaModel.from_pretrained("roberta-base")
    
    def forward(self, text, device='cuda'):
        inputs = self.tokenizer(text, return_tensors="pt")
        outputs = self.roberta_model(**inputs.to(device))
        pooler_output = outputs.pooler_output
        return pooler_output

In [111]:
class Classifier(nn.Module):
    def __init__(self, input_size=768*2, output_size=582, hidden_size=1024, dropout_prob=0.35):
        super(Classifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc1 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        lstm_out = self.dropout(lstm_out)
        logits = self.fc1(lstm_out)
        return logits

In [112]:
class VQA_Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.visual_encoder = Visual_Encoder()
        self.textual_encoder = Text_Encoder()
        self.classifier = Classifier()
    
    def forward(self, image, answer, device='cuda'):
        text_out = self.textual_encoder(answer).to(device)
        image_out = self.visual_encoder(image).to(device)
        x = torch.cat((text_out, image_out), dim=1)
        x = self.classifier(x)
        return x
    
    def freeze(self, visual=True, textual=False, clas=False):
        if visual:
            for n, p in self.visual_encoder.named_parameters():
                p.requires_grad = False
        if textual:
            for n, p in self.textual_encoder.named_parameters():
                p.requires_grad = False
        if clas:
            for n, p in self.classifier.named_parameters():
                p.requires_grad = False

# **Train/Valid func**

In [113]:
def train_one_epoch(model, train_dataloader, optimizer, loss_fn, epoch, device, verbose=False):
    model.train()
    running_loss = 0
    
    print(f"Number of batches: {len(train_dataloader)}")
    prog_bar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
    
    for batch, (image, question, answer) in prog_bar:
        optimizer.zero_grad()
        answer = torch.tensor([answer]).to(device)
        preds = model(image, question)
        loss = loss_fn(preds, answer)
        loss.backward()
        optimizer.step()
        loss_item = loss.item()
        
        if batch >= 129:
            break
        
        running_loss += loss_item
        prog_bar.set_description(f"loss: {loss_item:.4f}")
        
        if verbose and batch % 20 == 0:
            print(f"Batch: {batch}, Loss: {loss_item}")
    
    avg_loss = running_loss / min(len(train_dataloader), 30)
    
    return avg_loss


In [114]:
@torch.no_grad()
def valid_one_epoch(model, valid_dataloader, loss_fn, epoch, device, log_wandb=True, verbose=False):
    model.eval()
    running_loss = 0
    prog_bar = tqdm(enumerate(valid_dataloader), total=len(valid_dataloader))
    for batch, (image, quetion, answer) in prog_bar:
        answer = torch.tensor([answer]).to(device)
        preds = model(image,quetion)
        loss = loss_fn(preds, answer)
        
        loss_item = loss.item()
        running_loss += loss_item
        
        prog_bar.set_description(f"val_loss: {loss_item:.4f}")        
        if verbose == True and batch % 10 == 0:
            print(f"Batch: {batch}, Loss: {loss_item}")
        
        # stop iterating if we have reached the end of the test set
        if batch >= 10:
            break
    
    avg_val_loss = running_loss / len(valid_dataloader)
    
    return avg_val_loss


# **Main**

In [115]:
train_df = pd.read_csv('./data_train.csv')
test_df = pd.read_csv('./data_eval.csv')

# Set the path to the images directory
gen_path = 'path/to/Final'

# Read the vocabulary file
with open('./answer_space.txt') as f:
    vocab = f.read().splitlines()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [116]:
trainset = VQA_Dataset(train_df,vocab,gen_path=gen_path)
testset = VQA_Dataset(test_df,vocab,gen_path=gen_path)

In [117]:
model = VQA_Model()
model = model.to('cuda')

Some weights of the model checkpoint at openai/clip-vit-base-patch16 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.1.self_att

In [118]:
optimizer = transformers.AdamW(model.parameters(),lr=1e-5)
criterion = nn.CrossEntropyLoss()



In [119]:
model.freeze(visual=True,textual=True)

In [120]:
for epoch in range(10):
    train_loss = train_one_epoch(model, trainset, optimizer, criterion, epoch=1, device='cuda')
    print(f'train_loss - {train_loss}')
    valid_loss = valid_one_epoch(model, testset, criterion, epoch=1, device='cuda')
    print(f'valid_loss - {valid_loss}')
    torch.save(model.state_dict(), './my_model.pth')


Number of batches: 130


loss: 6.4595:  99%|█████████▉| 129/130 [00:28<00:00,  4.45it/s]


train_loss - 27.230798625946044


val_loss: 6.1053:  91%|█████████ | 10/11 [00:01<00:00,  5.88it/s]


valid_loss - 6.152606443925337
Number of batches: 130


loss: 6.1862:  99%|█████████▉| 129/130 [00:17<00:00,  7.47it/s]


train_loss - 26.49994799296061


val_loss: 5.9021:  91%|█████████ | 10/11 [00:01<00:00,  7.74it/s]


valid_loss - 5.905248988758434
Number of batches: 130


loss: 5.9839:  99%|█████████▉| 129/130 [00:17<00:00,  7.22it/s]


train_loss - 25.567242495218913


val_loss: 5.6014:  91%|█████████ | 10/11 [00:02<00:00,  4.89it/s]


valid_loss - 5.562411698428067
Number of batches: 130


loss: 5.8147:  99%|█████████▉| 129/130 [00:17<00:00,  7.24it/s]


train_loss - 24.191878652572633


val_loss: 5.1869:  91%|█████████ | 10/11 [00:01<00:00,  7.46it/s]


valid_loss - 5.105645699934526
Number of batches: 130


loss: 5.2681:  99%|█████████▉| 129/130 [00:17<00:00,  7.50it/s]


train_loss - 22.514169001579283


val_loss: 4.7153:  91%|█████████ | 10/11 [00:01<00:00,  5.93it/s]


valid_loss - 4.5740460699254815
Number of batches: 130


loss: 4.9797:  99%|█████████▉| 129/130 [00:17<00:00,  7.27it/s]


train_loss - 20.62461661497752


val_loss: 4.2737:  91%|█████████ | 10/11 [00:01<00:00,  7.52it/s]


valid_loss - 4.067878398028287
Number of batches: 130


loss: 4.5955:  99%|█████████▉| 129/130 [00:17<00:00,  7.38it/s]


train_loss - 18.982038990656534


val_loss: 3.9514:  91%|█████████ | 10/11 [00:01<00:00,  7.15it/s]


valid_loss - 3.6897918527776543
Number of batches: 130


loss: 4.1186:  99%|█████████▉| 129/130 [00:17<00:00,  7.30it/s]


train_loss - 17.94798128604889


val_loss: 3.7359:  91%|█████████ | 10/11 [00:01<00:00,  7.36it/s]


valid_loss - 3.4509987180883233
Number of batches: 130


loss: 4.2976:  99%|█████████▉| 129/130 [00:17<00:00,  7.18it/s]


train_loss - 17.17803544998169


val_loss: 3.6183:  91%|█████████ | 10/11 [00:01<00:00,  7.15it/s]


valid_loss - 3.3052017472007056
Number of batches: 130


loss: 4.0449:  99%|█████████▉| 129/130 [00:17<00:00,  7.49it/s]


train_loss - 16.91317607561747


val_loss: 3.5541:  91%|█████████ | 10/11 [00:01<00:00,  7.92it/s]


valid_loss - 3.210336370901628
Number of batches: 130


loss: 3.9040:  99%|█████████▉| 129/130 [00:17<00:00,  7.26it/s]


train_loss - 16.400334417819977


val_loss: 3.4964:  91%|█████████ | 10/11 [00:01<00:00,  7.05it/s]


valid_loss - 3.143911209973422
Number of batches: 130


loss: 3.8081:  99%|█████████▉| 129/130 [00:18<00:00,  7.00it/s]


train_loss - 16.217218232154845


val_loss: 3.4614:  91%|█████████ | 10/11 [00:01<00:00,  7.49it/s]


valid_loss - 3.091057235544378
Number of batches: 130


loss: 3.2504:  99%|█████████▉| 129/130 [00:17<00:00,  7.19it/s]


train_loss - 15.978131858507792


val_loss: 3.4468:  91%|█████████ | 10/11 [00:01<00:00,  7.06it/s]


valid_loss - 3.0472665374929253
Number of batches: 130


loss: 3.4634:  99%|█████████▉| 129/130 [00:18<00:00,  6.91it/s]


train_loss - 15.543312366803487


val_loss: 3.4250:  91%|█████████ | 10/11 [00:01<00:00,  6.69it/s]


valid_loss - 3.015853426673196
Number of batches: 130


loss: 3.2922:  99%|█████████▉| 129/130 [00:17<00:00,  7.32it/s]


train_loss - 15.684847176074982


val_loss: 3.4086:  91%|█████████ | 10/11 [00:01<00:00,  7.47it/s]


valid_loss - 2.9888033216649834
Number of batches: 130


loss: 3.2175:  99%|█████████▉| 129/130 [00:17<00:00,  7.21it/s]


train_loss - 15.345218996206919


val_loss: 3.4054:  91%|█████████ | 10/11 [00:01<00:00,  5.05it/s]


valid_loss - 2.965437889099121
Number of batches: 130


loss: 3.1501:  99%|█████████▉| 129/130 [00:17<00:00,  7.36it/s]


train_loss - 15.29057796796163


val_loss: 3.3749:  91%|█████████ | 10/11 [00:01<00:00,  7.56it/s]


valid_loss - 2.939955061132258
Number of batches: 130


loss: 2.9031:  99%|█████████▉| 129/130 [00:18<00:00,  7.06it/s]


train_loss - 15.133690937360127


val_loss: 3.3597:  91%|█████████ | 10/11 [00:01<00:00,  5.02it/s]


valid_loss - 2.9183449203317817
Number of batches: 130


loss: 2.8743:  99%|█████████▉| 129/130 [00:18<00:00,  7.13it/s]


train_loss - 14.98233227332433


val_loss: 3.3572:  91%|█████████ | 10/11 [00:01<00:00,  6.71it/s]


valid_loss - 2.903102094476873
Number of batches: 130


loss: 2.7423:  99%|█████████▉| 129/130 [00:18<00:00,  6.95it/s]


train_loss - 14.881067125002543


val_loss: 3.3516:  91%|█████████ | 10/11 [00:01<00:00,  5.50it/s]


valid_loss - 2.8879667737267236


In [121]:
valid_loss = valid_one_epoch(model, testset, criterion, epoch=10, device='cuda')

val_loss: 3.3516:  91%|█████████ | 10/11 [00:01<00:00,  8.41it/s]


In [122]:
valid_loss

2.8879667737267236

In [123]:
preds = []
gt = []
prog_bar = tqdm(enumerate(testset), total=len(testset))
for batch, (image,quetion,answer) in prog_bar:
    if batch >= 10:
        break
    answer = torch.tensor([answer]).to('cuda')
    gt += [answer]
    preds += [model(image,quetion).argmax(dim=-1).to('cpu').flatten().numpy()]

 91%|█████████ | 10/11 [00:01<00:00,  9.93it/s]


In [124]:
ggt = []
for i in gt:
    ggt += i.tolist()

In [125]:
ggt

[12, 22, 12, 4, 0, 43, 24, 8, 25, 4]

In [126]:
pp = []
for i in preds:
    pp += list(i)

In [127]:
pp

[22, 22, 22, 4, 4, 4, 4, 4, 4, 4]

In [128]:
from sklearn.metrics import accuracy_score, f1_score
f1_score(ggt, pp, average='weighted'), accuracy_score(ggt,pp)

(0.1388888888888889, 0.3)