In [2]:
#from PIL import Image
#import requests
#requests.__version__
#!pip install requests==2.27.1
!pip install transformers
!pip install wget
!pip install lmdb



In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the project paths used in this notebook
# all live below that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import sys
import os


def add_to_sys_path(directory):
    """Append ``directory`` to ``sys.path`` unless it is already listed.

    Keeping the operation idempotent means re-running the cell does not pile
    up duplicate entries on ``sys.path``.

    Args:
        directory: Path of the folder that holds importable modules.

    Returns:
        The absolute form of ``directory`` that is present on ``sys.path``.
    """
    resolved = os.path.abspath(directory)
    if resolved not in sys.path:
        sys.path.append(resolved)
    return resolved


# Presumably this is where the notebook's local helper modules live; the
# result is assigned so the cell does not echo the path as output.
PROJECT_DIR = add_to_sys_path('/content/drive/MyDrive/teses/tese_MECD/implementation')

In [5]:
# Switch the kernel's working directory to the project folder on the mounted Drive.
%cd '/content/drive/MyDrive/teses/tese_MECD/implementation'

/content/drive/MyDrive/teses/tese_MECD/implementation


In [6]:
#url = "./data/flickr30k_images/flickr30k_images/5897297135.jpg"
#os.path.isfile(url)
#os.path.isfile('./data/flickr30k_images/flickr30k_images/4852389235.jpg')
#image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

In [29]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import LxmertTokenizer, LxmertConfig, LxmertModel
from modeling_frcnn import GeneralizedRCNN
import utils
from processing_image import Preprocess
from torch.optim import AdamW
from torch.utils.data import DataLoader
import lmdb
import pickle

In [20]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset backed by a read-only LMDB database of pickled examples.

    Every example is looked up under the key ``str(idx).encode()`` and must
    unpickle to a dict holding the fields converted in ``deserializeItem``.
    The instance can be used as a context manager (``with MyDataset(p) as ds``)
    so the LMDB environment is released deterministically.

    NOTE(review): ``len(self)`` is the LMDB entry count, so iteration presumably
    relies on the keys being exactly ``0 .. len-1`` -- confirm against the
    code that wrote the database.
    """

    # (field name, tensor constructor) pairs applied by ``deserializeItem``.
    _TENSOR_FIELDS = (
        ('input_ids', torch.IntTensor),
        ('attention_mask', torch.IntTensor),
        ('token_type_ids', torch.IntTensor),
        ('normalized_boxes', torch.FloatTensor),
        ('features', torch.FloatTensor),
    )

    def __init__(self, path):
        """Open the LMDB database at ``path`` read-only and start a read transaction."""
        self.path = path
        self.env = lmdb.open(
            path, readonly=True, create=False, readahead=True
        )
        # buffers=True makes ``get`` hand back buffer objects instead of bytes copies.
        self.txn = self.env.begin(buffers=True)
        self.size = self.getSize()

    def getSize(self):
        """Return the number of entries stored in the database.

        The already-open environment is reused when there is one, so the same
        LMDB path is never opened twice at once from this object; otherwise a
        short-lived handle is opened and always closed again.
        """
        env = getattr(self, 'env', None)
        if env is not None:
            return env.stat()['entries']
        env = lmdb.open(self.path, readonly=True)
        try:
            return env.stat()['entries']
        finally:
            env.close()

    def deserializeItem(self, item):
        """Unpickle one stored record and convert its array fields to tensors.

        Only element ``[0]`` of each listed field is kept (presumably a leading
        batch axis of size one added when the features were extracted -- TODO
        confirm). Fields that are not listed, such as ``label``, pass through
        unchanged.
        """
        # NOTE: pickle can execute arbitrary code while loading; only open
        # databases that come from a trusted source.
        item = pickle.loads(item)
        for field, to_tensor in self._TENSOR_FIELDS:
            item[field] = to_tensor(item[field][0])
        return item

    def __getitem__(self, idx):
        """Return the deserialized example stored under ``idx``.

        Raises:
            IndexError: If the database has no entry for ``idx``.
        """
        raw = self.txn.get(str(idx).encode())
        if raw is None:
            # Without this guard a missing key reaches pickle.loads(None) and
            # surfaces as an unrelated TypeError.
            raise IndexError('no entry stored for index {!r}'.format(idx))
        return self.deserializeItem(raw)

    def __len__(self):
        return self.size

    def close(self):
        """Close the LMDB environment; safe to call more than once."""
        env = getattr(self, 'env', None)
        self.env = None
        self.txn = None
        if env is not None:
            env.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        # The arguments default to None so the historical zero-argument call
        # ``dataset.__exit__()`` keeps working.
        self.close()

In [15]:
class MyTrainer():
    """Minimal supervised training loop around an AdamW optimizer.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids, features, normalized_boxes, label)`` whose result
            exposes a ``loss`` tensor.
        train: Dataset whose items are dicts holding the keys listed in
            ``_BATCH_KEYS``. Optional so a trainer can be constructed before
            data is available; ``train_model`` refuses to run without it.
        device: Target passed to ``Tensor.to`` for every field of each batch.
    """

    # Batch fields, in the positional order expected by the model.
    _BATCH_KEYS = ('input_ids', 'attention_mask', 'token_type_ids',
                   'features', 'normalized_boxes', 'label')

    def __init__(self, model, train=None, device=None):
        self.device = device
        self.model = model
        self.train = train

    def train_model(self, batch_size=None, lr=None, epochs=None):
        """Train ``self.model`` on ``self.train``.

        Args:
            batch_size: Forwarded to ``DataLoader``.
            lr: Learning rate for AdamW (required).
            epochs: Number of passes over the dataset (required).

        Returns:
            A list with the mean batch loss of every epoch (``nan`` for an
            epoch that produced no batches).

        Raises:
            ValueError: If no training dataset was given, or ``lr`` / ``epochs``
                is missing.
        """
        if self.train is None:
            raise ValueError('MyTrainer was created without a training dataset')
        if lr is None or epochs is None:
            raise ValueError('lr and epochs must both be provided')

        optimizer = AdamW(self.model.parameters(), lr=lr)
        train_loader = DataLoader(self.train, batch_size=batch_size, shuffle=True)
        # Make sure dropout & co. are in training mode even if the model was
        # switched to eval earlier.
        self.model.train()

        epoch_losses = []
        for _ in range(epochs):
            running_loss, steps = 0.0, 0
            for item in train_loader:
                batch = [item[key].to(self.device) for key in self._BATCH_KEYS]
                optimizer.zero_grad()
                # Call the module itself (not .forward) so registered hooks run.
                outputs = self.model(*batch)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                steps += 1
            epoch_losses.append(running_loss / steps if steps else float('nan'))
        return epoch_losses

In [12]:
class MyEvaluator():
    """Computes the classification accuracy of a model.

    Args:
        test: Held-out dataset; stored on the instance as ``self.test``.
        model: Object exposing ``eval()``, ``train(mode)`` and ``predict(X)``
            (for example a ``torch.nn.Module`` with a ``predict`` method).
            Required by ``evaluate``.
    """

    def __init__(self, test, model=None):
        self.test = test
        self.model = model

    def evaluate(self, X, y):
        """Return the fraction of examples whose prediction equals the gold label.

        X: inputs in whatever form ``self.model.predict`` accepts
        y (n_examples): gold labels

        The model is put in eval mode for the prediction and its previous
        train/eval mode is restored afterwards. ``nan`` is returned when ``y``
        is empty.

        Raises:
            ValueError: If the evaluator was created without a model.
        """
        if self.model is None:
            raise ValueError('MyEvaluator needs a model to evaluate')

        was_training = getattr(self.model, 'training', True)
        self.model.eval()
        try:
            with torch.no_grad():
                y_hat = self.model.predict(X)
        finally:
            self.model.train(was_training)

        n_possible = float(y.shape[0])
        if n_possible == 0:
            return float('nan')
        # Compare on the device of the predictions to avoid a CPU/GPU mismatch.
        n_correct = (y.to(y_hat.device) == y_hat).sum().item()
        return n_correct / n_possible

In [13]:
class Lxmert(LxmertModel):
    """LXMERT encoder with a linear classification head on its pooled output.

    NOTE(review): the backbone is constructed from the *configuration* of
    "unc-nlp/lxmert-base-uncased" only, so its weights start randomly
    initialised; the pretrained checkpoint is not loaded here. Confirm whether
    fine-tuning from the pretrained weights was intended.

    Args:
        numb_labels: Size of the classification head's output.
        problem_type: One of ``"single_label_classification"`` (default),
            ``"regression"`` or ``"multi_label_classification"``; selects the
            loss function.
    """

    _LOSSES = {
        "single_label_classification": torch.nn.CrossEntropyLoss,
        "regression": torch.nn.MSELoss,
        "multi_label_classification": torch.nn.BCEWithLogitsLoss,
    }

    def __init__(self, numb_labels=3, problem_type="single_label_classification"):
        if problem_type not in self._LOSSES:
            raise ValueError("unsupported problem_type: {!r}".format(problem_type))
        super().__init__(LxmertConfig.from_pretrained("unc-nlp/lxmert-base-uncased"))
        self.config.problem_type = problem_type
        self.num_labels = numb_labels
        self.classification = torch.nn.Linear(self.config.hidden_size, self.num_labels)
        self.loss_fct = self._LOSSES[problem_type]()
        # don't forget to init the weights for the new layers
        self.init_weights()

    def output_loss(self, output, labels):
        """Compute the loss of ``output.logits`` against ``labels``.

        Implemented as a method (rather than a lambda stored on the instance)
        so the module stays picklable.
        """
        problem_type = self.config.problem_type
        if problem_type == "single_label_classification":
            return self.loss_fct(output.logits.view(-1, self.num_labels), labels.view(-1))
        if problem_type == "regression" and self.num_labels == 1:
            return self.loss_fct(output.logits.squeeze(), labels.squeeze())
        return self.loss_fct(output.logits, labels)

    def forward(self, input_ids, attention_mask, token_type_ids, features,
                normalized_boxes, label=None):
        """Encode text and visual inputs and classify the pooled representation.

        Returns the base model's output object with two extra attributes:
        ``logits`` (classification head output) and ``loss`` (``None`` when no
        ``label`` is given).
        """
        output = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            visual_feats=features,
            visual_pos=normalized_boxes,
            token_type_ids=token_type_ids,
            return_dict=True,
            output_attentions=False,
        )

        output.logits = self.classification(output.pooled_output)
        output.loss = None if label is None else self.output_loss(output, label)
        return output

    def predict(self, X):
        """Return the predicted class index for every example in ``X``.

        X: mapping with the keys ``input_ids``, ``attention_mask``,
           ``token_type_ids``, ``features`` and ``normalized_boxes`` (a
           ``label`` entry, if present, is ignored).

        Runs in eval mode without gradient tracking and restores the previous
        train/eval mode before returning.
        """
        target = self.device
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                output = self(
                    X['input_ids'].to(target),
                    X['attention_mask'].to(target),
                    X['token_type_ids'].to(target),
                    X['features'].to(target),
                    X['normalized_boxes'].to(target),
                )
        finally:
            self.train(was_training)
        return output.logits.argmax(dim=-1)  # (n_examples)

    def save_model(self, path):
        """Write the model's ``state_dict`` to ``path``."""
        torch.save(self.state_dict(), path)

    def load_model(self, path, map_location="cpu"):
        """Load a ``state_dict`` written by ``save_model``.

        ``map_location`` defaults to CPU so a checkpoint saved from a GPU run
        can be restored on a machine without CUDA; ``load_state_dict`` then
        copies the values into the parameters wherever they currently live.
        """
        self.load_state_dict(torch.load(path, map_location=map_location))

In [14]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Selects which branch of the driver cell below is executed.
task = 'train'
device

device(type='cuda')

In [21]:
# Locations on the mounted Drive used by both branches.
TRAIN_DB_PATH = "/content/drive/MyDrive/teses/tese_MECD/implementation/data/my_train_db"
MODEL_PATH = "/content/drive/MyDrive/teses/tese_MECD/implementation/my_model"

if task == 'train':
    model = Lxmert()
    model = model.to(device)
    train = MyDataset(TRAIN_DB_PATH)
    trainer = MyTrainer(model, train, device=device)
    print("-----Training Model-----")
    # NOTE(review): lr=1e-2 is unusually large for AdamW on a transformer
    # encoder -- confirm this value is intended.
    trainer.train_model(epochs=1, batch_size=8, lr=1e-2)
    print('----Training finished-----')
    model.save_model(MODEL_PATH)
elif task == 'test':
    model = Lxmert()
    model.load_model(MODEL_PATH)
    # run_example takes the model only and is defined in a later cell, so that
    # cell has to be executed before this branch can run.
    output = run_example(model)
else:
    # Fail loudly instead of silently doing nothing on a mistyped task name.
    raise ValueError(f"unknown task {task!r}; expected 'train' or 'test'")

-----Training Model-----
----Training finished-----


In [None]:
#%reset
#import gc
#gc.collect()

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


0

In [26]:
def run_example(model, index=50,
                data_path='/content/drive/MyDrive/teses/tese_MECD/implementation/data/'):
    """Run ``model`` on one row of ``esnlive_train.csv`` and print its class probabilities.

    The row's image goes through the Faster R-CNN feature extractor and its
    hypothesis through the LXMERT tokenizer; the resulting tensors are moved to
    the device the model lives on before the forward pass. Inference runs in
    eval mode without gradient tracking, and the model's previous train/eval
    mode is restored afterwards.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids, features, normalized_boxes, label)`` whose result
            exposes ``logits``.
        index: Row label (``DataFrame.loc``) of the example to run.
        data_path: Folder (with trailing slash) holding the CSV and the
            ``flickr30k_images/flickr30k_images/`` image directory.

    Returns:
        The output object produced by ``model``.
    """
    train = pd.read_csv(data_path + 'esnlive_train.csv')
    labels_encoding = {'contradiction': 0, 'neutral': 1,
                       'entailment': 2}
    img_path = data_path + 'flickr30k_images/flickr30k_images/' + train.loc[index, 'Flickr30kID']
    question = train.loc[index, 'hypothesis']
    # Encode only the label that is needed, not the whole column.
    label = labels_encoding[train.loc[index, 'gold_label']]

    lxmert_tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
    rcnn_cfg = utils.Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
    rcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=rcnn_cfg)
    image_preprocess = Preprocess(rcnn_cfg)

    images, sizes, scales_yx = image_preprocess(img_path)

    # preprocess text
    inputs = lxmert_tokenizer(
        question,
        padding="max_length",
        max_length=20,
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )

    # The model may live on the GPU while everything built here is on the CPU.
    target = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # preprocess image
            output_dict = rcnn(
                images,
                sizes,
                scales_yx=scales_yx,
                padding="max_detections",
                max_detections=rcnn_cfg.max_detections,
                return_tensors="pt"
            )
            # Very important that the boxes are normalized
            normalized_boxes = output_dict.get("normalized_boxes").to(target)
            features = output_dict.get("roi_features").to(target)

            output = model(
                inputs['input_ids'].to(target),
                inputs['attention_mask'].to(target),
                inputs['token_type_ids'].to(target),
                features,
                normalized_boxes,
                torch.LongTensor([label]).to(target),
            )
    finally:
        model.train(was_training)

    probs = torch.nn.Softmax(dim=1)(output.logits)
    print(img_path)
    print(question)
    print(label)
    print(probs)
    return output

# Run the single-example check on the model currently bound to `model`.
run_example(model)