In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#%cd /content/drive/MyDrive/teses/tese_MECD/implementation
import sys
import os
sys.path.append(os.path.abspath('/content/drive/MyDrive/teses/tese_MECD/implementation'))

In [3]:
%cd '/content/drive/MyDrive/teses/tese_MECD/implementation'

/content/drive/MyDrive/teses/tese_MECD/implementation


In [4]:
from PIL import Image
import requests
requests.__version__
!pip install requests==2.27.1



In [6]:
url = "./data/flickr30k_images/flickr30k_images/5897297135.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

MissingSchema: ignored

In [4]:
!pip install transformers
!pip install wget
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import LxmertTokenizer, LxmertConfig, LxmertModel
from modeling_frcnn import GeneralizedRCNN
import utils
from processing_image import Preprocess
from transformers import AdamW
from torch.utils.data import DataLoader

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 33.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 6.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.8 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 55.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 67.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found e

In [5]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, hypothesis, images, labels):
        self.hypothesis = hypothesis
        self.images = images
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {'text':self.hypothesis[idx],'img':self.images[idx],
                'label': self.labels[idx]}
        return item

    def __len__(self):
        return len(self.labels)

In [9]:
class MyTrainer():
    def __init__(self,model,device='cpu'):
        self.device = device
        self.train = self.read_dataset(data_path ='./data/', dataset_path='esnlive_train.csv',
                                       img_path='flickr30k_images/flickr30k_images/')
        self.test = self.read_dataset(data_path ='./data/', dataset_path='esnlive_test.csv',
                                       img_path='flickr30k_images/flickr30k_images/')
        self.train_dataset = MyDataset(self.train['hypothesis'].values,
                                 self.train['image'].values,
                                 self.train['label'].values)
        self.test_dataset = MyDataset(self.test['hypothesis'].values,
                                 self.test['image'].values,
                                 self.test['label'].values)
        self.lxmert_tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
        self.rcnn_cfg = utils.Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
        self.rcnn_cfg.MODEL.DEVICE = self.device
        self.rcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=self.rcnn_cfg)
        self.image_preprocess = Preprocess(self.rcnn_cfg)
        self.model = model.to(self.device)
    
    def get_visual_features(self,img):
        #preprocess image
        images, sizes, scales_yx = self.image_preprocess(img)
        output_dict = self.rcnn(
            images, 
            sizes, 
            scales_yx=scales_yx, 
            padding="max_detections",
            max_detections=self.rcnn_cfg.max_detections,
            return_tensors="pt"
        )
        
        #Very important that the boxes are normalized
        normalized_boxes = output_dict.get("normalized_boxes")
        features = output_dict.get("roi_features")
        return normalized_boxes, features
    
    def get_text_features(self,text): 
        #preprocess text
        inputs = self.lxmert_tokenizer(
            text,
            padding="max_length",
            max_length=20,
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )
        return inputs
      
    def read_dataset(self,data_path=None,dataset_path=None,img_path=None):
        data = pd.read_csv(data_path+dataset_path)
        labels_encoding = {'contradiction':0,'neutral': 1,
                           'entailment':2}
        data = data[['hypothesis','Flickr30kID','gold_label']]
        data['gold_label']=data['gold_label'].apply(lambda label: labels_encoding[label])
        data['Flickr30kID'] = data['Flickr30kID'].apply(lambda x: data_path+img_path+x)
        data.rename(columns={ data.columns[0]: "hypothesis", data.columns[1]: "image",
                              data.columns[2]: "label" }, inplace = True)
        return data
                
    def train_model(self,epochs=None):
        optim = AdamW(self.model.parameters(), lr=5e-5)
        train_loader = DataLoader(self.train_dataset, batch_size=8, shuffle=True)
        for epoch in range(epochs):
            for item in train_loader:
                text = item['text']
                img = item['img']
                label = item['label']
                inputs = self.get_text_features(text)
                normalized_boxes, features = self.get_visual_features(img)
                inputs = inputs.to(self.device)
                normalized_boxes = normalized_boxes.to(self.device)
                features = features.to(self.device)
                optim.zero_grad()
                outputs = model.forward(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    visual_feats=features,
                    visual_pos=normalized_boxes,
                    token_type_ids=inputs.token_type_ids,
                )
                loss = outputs.loss
                loss.backward()
                optim.step()
                break
        self.model.eval()
        return

In [10]:
class Lxmert(LxmertModel):
    def __init__(self,numb_labels=3):
        super().__init__(LxmertConfig.from_pretrained("unc-nlp/lxmert-base-uncased"))
        self.config.problem_type = "single_label_classification"
        self.classification = torch.nn.Linear(self.config.hidden_size, numb_labels)
        self.num_labels = numb_labels
        if self.config.problem_type == "single_label_classification":
          self.loss_fct = torch.nn.CrossEntropyLoss()
          self.output_loss = lambda output,labels : self.loss_fct(output.logits.view(-1, self.num_labels), labels.view(-1)) 
        elif self.config.problem_type == "regression":
          self.loss_fct = torch.nn.MSELoss()
          if self.num_labels == 1: self.output_loss = lambda output,labels : self.loss_fct(output.logits.squeeze(), labels.squeeze())
          else: self.output_loss =  lambda output,labels : self.loss_fct(output.logits, labels)
        elif self.config.problem_type == "multi_label_classification":
          self.loss_fct = torch.nn.BCEWithLogitsLoss()
          self.output_loss = lambda output,labels : self.loss_fct(output.logits, labels)
        # don't forget to init the weights for the new layers
        self.init_weights()
        
    def forward(self,input_ids,attention_mask,token_type_ids,features,normalized_boxes,label):
        output = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            visual_feats=features,
            visual_pos=normalized_boxes,
            token_type_ids=token_type_ids,
            return_dict=True,
            output_attentions=False,
        )
                
        aux = self.classification(output.pooled_output)
        output.logits = aux
        output.loss = self.output_loss(output, label)
        return output
    
    def save_model(self,path):
        torch.save(self.state_dict(), path)
        
    def load_model(self,path):
        self.load_state_dict(torch.load(path))
        self.eval()
        
    def run(self):
        data_path = './e-ViL/data/'
        train = pd.read_csv(data_path+'esnlive_train.csv')
        labels_encoding = {'contradiction':0,'neutral': 1,
                           'entailment':2}
        train['gold_label']=train['gold_label'].apply(lambda label: labels_encoding[label])
        img_path1 = data_path+'flickr30k_images/flickr30k_images/'+ train.loc[50,'Flickr30kID']#"32542645.jpg"
        question1 = train.loc[50,'hypothesis'] #"How many people are in the image?"
        label1 = train.loc[50,'gold_label']
        print('SAMPLE1')
        print(img_path1,question1,label1)
        item1 = {'text':[question1], 'img':[img_path1], 'label':torch.LongTensor([label1])}
        output = self.forward(item1)
        print(output.logits)
        m = torch.nn.Softmax(dim=1)
        probs = m(output.logits)
        print(probs)
        img_path2 = data_path+'flickr30k_images/flickr30k_images/'+ train.loc[51,'Flickr30kID']#"32542645.jpg"
        question2 = train.loc[51,'hypothesis'] #"How many people are in the image?"
        label2 = train.loc[51,'gold_label']
        print('SAMPLE2')
        print(img_path2,question2,label2)
        item2 = {'text':[question1,question2], 'img':[img_path1,img_path2], 
                'label':torch.LongTensor([label1,label2])}
        output = self.forward(item2)
        print(output.logits)
        m = torch.nn.Softmax(dim=1)
        probs = m(output.logits)
        print(probs)
        
        return output

In [11]:
#device = "cpu"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
task = 'train'
device

device(type='cuda')

In [12]:
if task =='train':
    model = Lxmert()
    trainer = MyTrainer(model,device=device)
    trainer.train_model(epochs=1)
    model.save_model("/content/drive/MyDrive/teses/tese_MECD/implementation/my_model")
elif task =='test':
    model = Lxmert()
    model.load_model("/content/drive/MyDrive/teses/tese_MECD/implementation/my_model")
    output = model.run()

Downloading:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/153 [00:00<?, ?B/s]

%s not found in cache or force_download set to True, downloading to %s https://s3.amazonaws.com/models.huggingface.co/bert/unc-nlp/frcnn-vg-finetuned/config.yaml /root/.cache/torch/transformers/tmpj234f184


Downloading:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

loading configuration file cache
%s not found in cache or force_download set to True, downloading to %s https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin /root/.cache/torch/transformers/tmp_ln59yjy


Downloading:   0%|          | 0.00/262M [00:00<?, ?B/s]

loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /root/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.




MissingSchema: ignored