# Setup

In [None]:
!pip install transformers==4.17 datasets
from google.colab import drive
drive.mount('/content/drive')

Collecting transformers==4.17
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.6 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 37.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 18.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 33.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 39.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 k

In [None]:
import requests
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from transformers import BertConfig, ViTConfig
from transformers import CLIPProcessor, CLIPTokenizer, CLIPModel, CLIPTextModel, CLIPVisionModel
from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel
from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor
from PIL import Image

processor = CLIPProcessor.from_pretrained("flax-community/clip-rsicd-v2")
clip_model = CLIPModel.from_pretrained("flax-community/clip-rsicd-v2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class MyVisionTextModel(CLIPModel):
  def __init__(self, config, num_labels):
    super().__init__(clip_model.config)
    self.new_encoder_layer = torch.nn.TransformerEncoderLayer(d_model=512, nhead=8)
    self.new_transformer_encoder = torch.nn.TransformerEncoder(self.new_encoder_layer, num_layers=3)
    self.classification = torch.nn.Linear(512, num_labels, bias=True)
    self.num_labels = num_labels

  def forward(self, input_ids=None, pixel_values=None, attention_mask=None, position_ids=None, return_loss=None, output_attentions=None, output_hidden_states=None, labels=None):
    output = super().forward(input_ids, pixel_values, attention_mask, position_ids, return_loss, output_attentions, output_hidden_states, return_dict=True)
    
    aux_vision = output.vision_model_output[0]
    aux_vision = self.visual_projection(aux_vision) 
    aux_text = output.text_model_output[0]
    aux_text = self.text_projection(aux_text)

    #print("input_ids:", input_ids, "inputs_id size:", input_ids.size())
    #print("aux_text:", aux_text, "aux_text size:", aux_text.size())  
    #print("vision projection size:", aux_vision.size(), "/ text projection size:", aux_text.size()) 


    aux = torch.cat((aux_vision, aux_text), dim=1)
    #print("initial multi modal tensor size:", aux.size())
    #print("multi modal tensor size needs to be (sequence length, number of batches, feature number)", "(", aux.size()[1], ",", aux.size()[0], ",", aux.size()[2] , ")")
    aux = aux.reshape((aux.size()[1], aux.size()[0], aux.size()[2]))
    #print("after reshape multi modal tensor:", aux, "multi modal tensor size:", aux.size())
    vision_mask = torch.ones((aux_vision.size()[0], aux_vision.size()[1])).to(device)
    #print("text mask", attention_mask, "text size:", attention_mask.size())
    #print("vision mask:", vision_mask, "vision mask size:", vision_mask.size())

    #print("text_projection_mask:", text_projection_mask, "text_projection_mask size:", text_projection_mask.size())
    multi_modal_mask = torch.cat((vision_mask, attention_mask), dim=1).to(device)

    #print("multi_modal_mask tensor:", multi_modal_mask, "multi_modal_mask size:", multi_modal_mask.size())
    aux = self.new_transformer_encoder(aux, src_key_padding_mask=multi_modal_mask) 
    aux = aux.reshape((aux.size()[1], aux.size()[0], aux.size()[2])) #change back shape to (batch size, sequence length, features)
    #print("trasnformer encoder output:", aux, "transformer encoder output size:", aux.size())
    
    #multi_modal_mask = multi_modal_mask.unsqueeze(2).expand(-1,-1, aux.size()[2])
    
    #TODO experimentar a mask inicial e ver se ele faz as contas bem na mesma, se nao deixar ficar assim 
    #print("expanded multi_modal_mask tensor:", multi_modal_mask, "expanded multi_modal_mask size:", multi_modal_mask.size())
    aux = torch.sum(aux * multi_modal_mask, 1) / torch.clamp(multi_modal_mask.sum(1), min=1e-9)
    aux = self.classification(aux)

    #print("classification:", aux, "classification size:", aux.size())
    output.logits = aux
    output.loss = None
    #print("labels:", labels)
    #print("forward output with no loss:", output)
    if labels is not None:
      if self.config.problem_type is None:
        if self.num_labels == 1: self.config.problem_type = "regression"
        elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" 
        else: self.config.problem_type = "multi_label_classification" # cenários com várias respostas possíveis
      if self.config.problem_type == "regression":
        loss_fct = torch.nn.MSELoss()
        if self.num_labels == 1: loss = loss_fct(output.logits.squeeze(), labels.squeeze())
        else: output.loss = loss_fct(output.logits, labels)
      elif self.config.problem_type == "single_label_classification":
        loss_fct = CrossEntropyLoss()
        output.loss = loss_fct(output.logits.view(-1, self.num_labels), labels.view(-1))
      elif self.config.problem_type == "multi_label_classification":
        loss_fct = torch.nn.BCEWithLogitsLoss()
        output.loss = loss_fct(output.logits, labels)
    #print("problem type:", self.config.problem_type)
    return output
    



Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/939k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.


Downloading:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/577M [00:00<?, ?B/s]

# Train with PyTorch


In [None]:
import pandas
import datasets
import os
batch_size = 64
vqaLR_dataset = datasets.load_from_disk("/content/drive/My Drive/Thesis Datasets/RSVQA_LR/Processed/dataset")

vqaLR_dataset["train"] = vqaLR_dataset["train"]#.shard(num_shards=100, index=0)
vqaLR_dataset["test"] = vqaLR_dataset["test"]#.shard(num_shards=100, index=0)
vqaLR_dataset["validation"] = vqaLR_dataset["validation"]#.shard(num_shards=100, index=0)
labels = list(set(vqaLR_dataset["train"]["answer"]))
vqaLR_dataset

DatasetDict({
    train: Dataset({
        features: ['img_id', 'question', 'answer'],
        num_rows: 57223
    })
    test: Dataset({
        features: ['img_id', 'question', 'answer'],
        num_rows: 10004
    })
    validation: Dataset({
        features: ['img_id', 'question', 'answer'],
        num_rows: 10005
    })
})

In [None]:
#generate label mapping
label2id = {}
id2label = {}

count = 0
for label in labels:
  label2id[label] = count  
  id2label[count] = label
  count += 1
print("label2id",label2id)
print("id2label", id2label)


trainloader = torch.utils.data.DataLoader(vqaLR_dataset["train"], batch_size=batch_size,
                                          shuffle=True, num_workers=2)

testloader = torch.utils.data.DataLoader(vqaLR_dataset["test"], batch_size=batch_size,
                                         shuffle=False, num_workers=2)
num_labels = len(list(label2id.keys()))

label2id {'no': 0, 'yes': 1, 'rural': 2, 'urban': 3, 'more than 1000': 4, 'between 1 and 10': 5, '0': 6, 'between 101 and 1000': 7, 'between 11 and 100': 8}
id2label {0: 'no', 1: 'yes', 2: 'rural', 3: 'urban', 4: 'more than 1000', 5: 'between 1 and 10', 6: '0', 7: 'between 101 and 1000', 8: 'between 11 and 100'}


In [None]:
model = MyVisionTextModel(clip_model.config, num_labels=num_labels)
model.text_model = clip_model.text_model
model.vision_model = clip_model.vision_model
model.visual_projection = clip_model.visual_projection
model.text_projection = clip_model.text_projection
model.logit_scale = clip_model.logit_scale
#model.save_pretrained("my-model")
#model = MyVisionTextModel.from_pretrained("my-model", num_labels)
model.to(device)

MyVisionTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0): CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [None]:
from transformers import get_scheduler
from tqdm.auto import tqdm
from datasets import load_metric

optimizer = optim.AdamW(model.parameters(), lr=1e-5)
num_epochs = 5
num_training_steps = num_epochs * len(trainloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

img_list = os.listdir("/content/drive/My Drive/Thesis Datasets/RSVQA_LR/Processed/Images_LR")

def prepare_batch(batch):
  encoded_imgs = {}
  img_names = []
  for img_id in batch["img_id"].tolist():
    if str(img_id) + ".tif" in img_list:
      img_names.append(str(img_id) + ".tif")
  imgs_to_encode = [Image.open("/content/drive/My Drive/Thesis Datasets/RSVQA_LR/Processed/Images_LR/" +img) for img in img_names]
  #process the entire batch at once with padding for dynamic padding
  processed_batch = processor(text=batch["question"], images=imgs_to_encode, padding=True, return_tensors="pt")
  del imgs_to_encode # free up memory from imgs
  processed_input = {**{"labels": torch.tensor([label2id[label] for label in batch["answer"]])}, **dict(processed_batch)}
  #send tensors to GPU
  for key in processed_input:
    processed_input[key] = processed_input[key].to(device)
  return processed_input

In [None]:
num_test_steps = len(testloader)
progress_bar = tqdm(range(num_test_steps))

metric = load_metric("accuracy")
model.eval()
for batch in testloader:
    batch = prepare_batch(batch)
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
    progress_bar.update(1)
print("accuracy before training:")
metric.compute()

  0%|          | 0/157 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

accuracy before training:


{'accuracy': 0.06427429028388644}

In [None]:
progress_bar = tqdm(range(num_training_steps))

for epoch in range(5):  # loop over the dataset multiple times
    running_loss = 0.0
    for batch in trainloader:
        # encode batch and feed it to model
        batch = prepare_batch(batch)
        output = model(**batch)
        #print("model output", output)
        output.loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

print('Finished Training')

  0%|          | 0/4475 [00:00<?, ?it/s]

Finished Training


In [None]:
from datasets import load_metric

num_test_steps = len(testloader)
progress_bar = tqdm(range(num_test_steps))

metric = load_metric("accuracy")
model.eval()
for batch in testloader:
    batch = prepare_batch(batch)
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
    progress_bar.update(1)

print("accuracy after training:")
metric.compute()

  0%|          | 0/157 [00:00<?, ?it/s]

accuracy after training:


{'accuracy': 0.806077568972411}

In [None]:
PATH = '/content/drive/My Drive/torch-save/my-model.pth'
torch.save(model.state_dict(), PATH)

In [None]:
PATH = '/content/drive/My Drive/hugging-face-save/my-model'
model.save_pretrained(PATH)

# Archive

In [None]:
"""inputs = {1: {"url": "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg", "labels": ["text", "hand-written text"]},
          2: {"url": "https://raw.githubusercontent.com/arampacha/CLIP-rsicd/master/data/stadium_1.jpg", "labels": ["residential area", "playground", "stadium", "forest", "airport", "baseball stadium"]}
          }

# quando for para treinar é suposto usar todas as respostas possíveis indepentdemente da imagem, ou apenas usar as respostas possíveis para essa imagem ? 

plt.figure(figsize=(16, 16))

for id in inputs:
  # clip training example
  print("\n\ngenerating image from:", inputs[id]["url"])
  inputs[id]["image"] =  Image.open(requests.get(inputs[id]["url"], stream=True).raw).convert("RGB")
  print("image saved...")
  print("processing input...")
  inputs[id]["processed_inputs"] = processor(text=[f"a photo of a {l}" for l in inputs[id]["labels"]], images=inputs[id]["image"], return_tensors="pt", padding=True) # alterar aqui a parte do text para passar a ser somente a pergunta
  print("input_ids size:", inputs[id]["processed_inputs"]["input_ids"].size(), "pixel_value size:", inputs[id]["processed_inputs"]["pixel_values"].size())
  print("applying model...")
  #print(inputs[id]["processed_inputs"])
  #inputs[id]["processed_inputs"]["labels"] = inputs[id]["processed_inputs"]["input_ids"]
  inputs[id]["outputs"] =  model(**inputs[id]["processed_inputs"]) # predict probabilities 
  print("computing results...")
  logits_per_image = inputs[id]["outputs"].logits_per_image # this is the image-text similarity score
  probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
  result = {}
  for l, p in zip(inputs[id]["labels"], probs[0]):
    result[l] = f"{p:.4f}"
  inputs[id]["results"] = result
  sorted_labels, sorted_probs  = zip(*[[k, float(v)] for k, v in sorted(result.items(), key=lambda item: item[1])])
  print("results:", sorted_labels, sorted_probs)
  print("final prediction:", sorted_labels[np.argmax(sorted_probs)])
  fig, (image_viz, label_viz) = plt.subplots(1,2)
  image_viz.set_title("a photo of a " + sorted_labels[np.argmax(sorted_probs)])
  image_viz.imshow(inputs[id]["image"])
  image_viz.axis("off")
  label_viz.barh(sorted_labels, sorted_probs)
  label_viz.set_title("probability")

plt.subplots_adjust(wspace=0.7)
plt.show()
"""



## Train


In [None]:
import pandas
import datasets
import os
vqaLR_dataset = datasets.load_from_disk("/content/drive/My Drive/Thesis Datasets/RSVQA_LR/Processed/dataset")

vqaLR_dataset["train"] = vqaLR_dataset["train"]#.shard(num_shards=100, index=0)
vqaLR_dataset["test"] = vqaLR_dataset["test"]#.shard(num_shards=100, index=0)
vqaLR_dataset["validation"] = vqaLR_dataset["validation"]#.shard(num_shards=100, index=0)
vqaLR_dataset

DatasetDict({
    train: Dataset({
        features: ['img_id', 'question', 'answer'],
        num_rows: 57223
    })
    test: Dataset({
        features: ['img_id', 'question', 'answer'],
        num_rows: 10004
    })
    validation: Dataset({
        features: ['img_id', 'question', 'answer'],
        num_rows: 10005
    })
})

In [None]:
#encode the labels
label2id = {}
id2label = {}
all_labels = list(set(vqaLR_dataset["train"]["answer"]))

count = 0
for label in all_labels:
  label2id[label] = count  
  id2label[count] = label
  count += 1
print("label2id",label2id)
print("id2label", id2label)

vqaLR_dataset = vqaLR_dataset.map(lambda row: {"labels": label2id[row["answer"]]}, remove_columns=["answer"])
vqaLR_dataset

label2id {'between 11 and 100': 0, 'rural': 1, 'between 1 and 10': 2, '0': 3, 'urban': 4, 'more than 1000': 5, 'yes': 6, 'between 101 and 1000': 7, 'no': 8}
id2label {0: 'between 11 and 100', 1: 'rural', 2: 'between 1 and 10', 3: '0', 4: 'urban', 5: 'more than 1000', 6: 'yes', 7: 'between 101 and 1000', 8: 'no'}


  0%|          | 0/57223 [00:00<?, ?ex/s]

  0%|          | 0/10004 [00:00<?, ?ex/s]

  0%|          | 0/10005 [00:00<?, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['img_id', 'question', 'labels'],
        num_rows: 57223
    })
    test: Dataset({
        features: ['img_id', 'question', 'labels'],
        num_rows: 10004
    })
    validation: Dataset({
        features: ['img_id', 'question', 'labels'],
        num_rows: 10005
    })
})

In [None]:
#encode the questions
encoded_questions = {}
#get all unique questions
all_questions = set()
for split in vqaLR_dataset:
  for question in vqaLR_dataset[split]["question"]:
    all_questions.add(question)

  encoded_questions = processor(text=list(all_questions), return_tensors="pt")
encoded_questions
#encoded_questions["train"] = processor(vqaLR_dataset["train"]["question"], return_tensors="pt")
#encoded_questions["test"] = processor(vqaLR_dataset["test"]["question"], return_tensors="pt")
#encoded_questions["validation"] = processor(vqaLR_dataset["validation"]["question"], return_tensors="pt")
#encoded_questions

ValueError: ignored

In [None]:
#encode the images
needed_imgs = set()
for split in vqaLR_dataset:
  for record in vqaLR_dataset[split]:
    needed_imgs.add(record["img_id"])
encoded_imgs = {}
img_list = os.listdir("/content/drive/My Drive/Thesis Datasets/RSVQA_LR/Processed/Images_LR")
for img in img_list:
  img_id = img.split(".")[0]
  extension = img.split(".")[1]
  if int(img_id) in needed_imgs:
    if extension == "tif":
      encoded_imgs[img_id] = processor(images=Image.open("/content/drive/My Drive/Thesis Datasets/RSVQA_LR/Processed/Images_LR/" +img), return_tensors="pt").pixel_values
      processed_img_size = encoded_imgs[img_id].size()
      encoded_imgs[img_id] = encoded_imgs[img_id].reshape(processed_img_size[1], processed_img_size[2], processed_img_size[3])
encoded_imgs

In [None]:
#map encoded data
vqaLR_dataset = vqaLR_dataset.map(lambda example: {"pixel_values": encoded_imgs[str(example["img_id"])]})
vqaLR_dataset["train"] = vqaLR_dataset["train"].map(lambda example, idx: {"input_ids": encoded_questions["train"]["input_ids"][idx], "attention_mask": encoded_questions["train"]["attention_mask"][idx]}, with_indices=True)
vqaLR_dataset["test"] = vqaLR_dataset["test"].map(lambda example, idx: {"input_ids": encoded_questions["test"]["input_ids"][idx], "attention_mask": encoded_questions["test"]["attention_mask"][idx]}, with_indices=True)
vqaLR_dataset["validation"] = vqaLR_dataset["validation"].map(lambda example, idx: {"input_ids": encoded_questions["validation"]["input_ids"][idx], "attention_mask": encoded_questions["validation"]["attention_mask"][idx]}, with_indices=True)
vqaLR_dataset

In [None]:
#compute class weights
vqaLR_dataset_df = vqaLR_dataset["train"].to_pandas()
class_weights = (1 - (vqaLR_dataset_df["labels"].value_counts(normalize=True).sort_index())/ len(vqaLR_dataset_df)).values
class_weights = torch.from_numpy(class_weights).float()
print(class_weights)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [None]:
from transformers import Trainer

class MyTrainer(Trainer):
  def compute_loss(self, model, inputs, return_outputs=False):
    outputs = model(**inputs)
    #print("model output:", outputs)
    logits = outputs.logits
    labels = inputs.get("labels")
    #print("output logits:", logits)
    """print("computed logits:", logits)
    loss_func = CrossEntropyLoss(weight=class_weights)
    loss = loss_func(logits, labels)"""
    loss = outputs.loss
    #print("output loss:", loss)
    return (loss, outputs) if return_outputs else loss

In [None]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels,preds, average="weighted")
  return {"f1": f1}

In [None]:
vqaLR_dataset

DatasetDict({
    train: Dataset({
        features: ['img_id', 'question', 'labels'],
        num_rows: 57223
    })
    test: Dataset({
        features: ['img_id', 'question', 'labels'],
        num_rows: 10004
    })
    validation: Dataset({
        features: ['img_id', 'question', 'labels'],
        num_rows: 10005
    })
})

In [None]:
from transformers import TrainingArguments
from transformers import DataCollatorWithPadding
batch_size = 64
logging_steps = len(vqaLR_dataset["train"]) // batch_size

output_dir = "/content/drive/My Drive/Training-Checkpoints/RSVQA_LR/training-1"
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=5,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  logging_steps=logging_steps)

In [None]:
num_labels = len(list(label2id.keys()))
model = MyVisionTextModel(clip_model.config, num_labels=num_labels)
model.text_model = clip_model.text_model
model.vision_model = clip_model.vision_model
model.visual_projection = clip_model.visual_projection
model.text_projection = clip_model.text_projection
model.logit_scale = clip_model.logit_scale
model.save_pretrained("my-model")
model = MyVisionTextModel.from_pretrained("my-model", num_labels)
data_collator = DataCollatorWithPadding(processor)

trainer = MyTrainer(model=model,
                      args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=vqaLR_dataset["train"],
                      eval_dataset=vqaLR_dataset["test"],
                      data_collator=data_collator)

In [None]:
trainer.train()


The following columns in the training set  don't have a corresponding argument in `MyVisionTextModel.forward` and have been ignored: question, img_id, mode, category. If question, img_id, mode, category are not expected by `MyVisionTextModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 57223
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 4475


Epoch,Training Loss,Validation Loss


Saving model checkpoint to test1_output/checkpoint-500
Configuration saved in test1_output/checkpoint-500/config.json
Model weights saved in test1_output/checkpoint-500/pytorch_model.bin


KeyboardInterrupt: ignored

In [None]:
!zip -r training-checkpoint-500.zip test1_output/checkpoint-500/

  adding: test1_output/checkpoint-500/ (stored 0%)
  adding: test1_output/checkpoint-500/scheduler.pt (deflated 49%)
  adding: test1_output/checkpoint-500/rng_state.pth (deflated 23%)
  adding: test1_output/checkpoint-500/optimizer.pt (deflated 22%)
  adding: test1_output/checkpoint-500/config.json
  adding: test1_output/checkpoint-500/training_args.bin
  adding: test1_output/checkpoint-500/pytorch_model.bin
  adding: test1_output/checkpoint-500/trainer_state.json

  files/entries read:  4 (1.1G bytes)  skipped:  4 (625M bytes)


## Sample

In [None]:
import pandas
import datasets
import os
vqaLR_dataset = datasets.load_from_disk("/content/drive/My Drive/Thesis Datasets/RSVQA_LR/Processed/dataset")

sample_batch = vqaLR_dataset["train"].shard(num_shards=895, index=0)
sample_batch

Dataset({
    features: ['img_id', 'question', 'answer'],
    num_rows: 64
})

In [None]:
all_labels = list(set(sample_batch["answer"]))
all_labels

['between 1 and 10',
 'more than 1000',
 'between 11 and 100',
 'urban',
 'between 101 and 1000',
 'no',
 'yes',
 '0',
 'rural']

In [None]:
label2id = {}
id2label = {}
count = 0
for label in all_labels:
  label2id[label] = count  
  id2label[count] = label
  count += 1
print("label2id",label2id)
print("id2label", id2label)


label2id {'between 1 and 10': 0, 'more than 1000': 1, 'between 11 and 100': 2, 'urban': 3, 'between 101 and 1000': 4, 'no': 5, 'yes': 6, '0': 7, 'rural': 8}
id2label {0: 'between 1 and 10', 1: 'more than 1000', 2: 'between 11 and 100', 3: 'urban', 4: 'between 101 and 1000', 5: 'no', 6: 'yes', 7: '0', 8: 'rural'}


In [None]:
#encode the labels
sample_batch = sample_batch.map(lambda row: {"labels": label2id[row["answer"]]}, remove_columns=["answer"])
sample_batch

Loading cached processed dataset at /content/drive/My Drive/Thesis Datasets/RSVQA_LR/Processed/dataset/train/cache-ab84ae02a2f6574d.arrow


Dataset({
    features: ['img_id', 'question', 'labels'],
    num_rows: 64
})

In [None]:
processed_input = processor(text=sample_batch["question"], return_tensors="pt", padding=True)

print(processed_input.input_ids.size(), processed_input.attention_mask.size())
processed_input

torch.Size([64, 18]) torch.Size([64, 18])


{'input_ids': tensor([[49406,   533,   585,  ..., 49407, 49407, 49407],
        [49406,   631,   997,  ..., 49407, 49407, 49407],
        [49406,   533,   518,  ..., 49407, 49407, 49407],
        ...,
        [49406,   533,   320,  ..., 49407, 49407, 49407],
        [49406,   533,   320,  ..., 49407, 49407, 49407],
        [49406,   533,   320,  ..., 49407, 49407, 49407]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
sample_batch = sample_batch.map(lambda example, idx: {"input_ids": processed_input["input_ids"][idx], 
                                                      "attention_mask": processed_input["attention_mask"][idx]}, with_indices=True)
sample_batch

  0%|          | 0/64 [00:00<?, ?ex/s]

Dataset({
    features: ['img_id', 'question', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 64
})

In [None]:
needed_imgs = set()
for record in sample_batch:
    needed_imgs.add(record["img_id"])
needed_imgs = list(needed_imgs)
print(len(needed_imgs), needed_imgs)
print(15 in needed_imgs)

64 [0, 386, 134, 647, 8, 395, 143, 656, 17, 404, 152, 665, 26, 413, 161, 674, 35, 422, 169, 683, 44, 431, 178, 692, 53, 439, 187, 701, 62, 575, 448, 196, 709, 71, 584, 457, 332, 205, 718, 80, 593, 466, 341, 214, 727, 89, 602, 350, 223, 736, 98, 611, 359, 745, 107, 620, 368, 754, 116, 629, 377, 763, 125, 638]
False


In [None]:
processed_imgs = {}
img_list = os.listdir("/content/drive/My Drive/Thesis Datasets/RSVQA_LR/Processed/Images_LR")
extension = ".tif"
for img_id in needed_imgs:
    processed_imgs[img_id] = processor(images=Image.open("/content/drive/My Drive/Thesis Datasets/RSVQA_LR/Processed/Images_LR/" + str(img_id) + extension), return_tensors="pt").pixel_values
    processed_img_size = processed_imgs[img_id].size()
    processed_imgs[img_id] = processed_imgs[img_id].reshape(1, processed_img_size[1], processed_img_size[2], processed_img_size[3])
processed_imgs

{0: tensor([[[[-1.0769, -1.0915, -1.0769,  ..., -0.8288, -0.6536, -0.7120],
           [-1.0915, -1.1061, -1.0915,  ..., -0.8726, -0.6536, -0.5368],
           [-1.0915, -1.1061, -1.0769,  ..., -0.8434, -0.7996, -0.6098],
           ...,
           [-0.4346, -0.9164, -1.2083,  ..., -1.3397, -1.3689, -1.3835],
           [-0.5514, -0.9164, -1.0185,  ..., -1.2229, -1.3251, -1.4127],
           [-0.8580, -1.0477, -0.7996,  ..., -1.0769, -1.1791, -1.3543]],
 
          [[-0.7616, -0.7616, -0.7466,  ..., -0.6715, -0.5365, -0.6415],
           [-0.7466, -0.7616, -0.7466,  ..., -0.7466, -0.5815, -0.5365],
           [-0.7466, -0.7616, -0.7466,  ..., -0.8066, -0.7016, -0.5365],
           ...,
           [-0.4314, -0.7316, -0.8366,  ..., -1.0317, -1.1068, -1.0767],
           [-0.4614, -0.6715, -0.7466,  ..., -0.8516, -0.9867, -1.0918],
           [-0.7016, -0.7166, -0.5215,  ..., -0.7016, -0.8516, -1.0317]],
 
          [[-0.4137, -0.3995, -0.3711,  ..., -0.2573, -0.1720, -0.1293],
          

In [None]:
sample_input = sample_batch[0]
sample_input = {"attention_mask": torch.tensor(sample_input["attention_mask"]), 
                "input_ids": torch.tensor(sample_input["input_ids"]),
                "pixel_values": processed_imgs[sample_input["img_id"]],
                "labels": sample_input["labels"]}
sample_input = {"attention_mask": sample_input["attention_mask"].reshape(1, sample_input["attention_mask"].size()[0]), 
                "input_ids": sample_input["input_ids"].reshape(1, sample_input["input_ids"].size()[0]),
                "pixel_values": sample_input["pixel_values"],
                "labels": torch.tensor(sample_input["labels"])}
sample_input

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'input_ids': tensor([[49406,   533,   585,   320,  8737,   541,   550,  5800,  2445, 49407,
          49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407]]),
 'labels': tensor(3),
 'pixel_values': tensor([[[[-1.0769, -1.0915, -1.0769,  ..., -0.8288, -0.6536, -0.7120],
           [-1.0915, -1.1061, -1.0915,  ..., -0.8726, -0.6536, -0.5368],
           [-1.0915, -1.1061, -1.0769,  ..., -0.8434, -0.7996, -0.6098],
           ...,
           [-0.4346, -0.9164, -1.2083,  ..., -1.3397, -1.3689, -1.3835],
           [-0.5514, -0.9164, -1.0185,  ..., -1.2229, -1.3251, -1.4127],
           [-0.8580, -1.0477, -0.7996,  ..., -1.0769, -1.1791, -1.3543]],
 
          [[-0.7616, -0.7616, -0.7466,  ..., -0.6715, -0.5365, -0.6415],
           [-0.7466, -0.7616, -0.7466,  ..., -0.7466, -0.5815, -0.5365],
           [-0.7466, -0.7616, -0.7466,  ..., -0.8066, -0.7016, -0.5365],
           ...,
           [-0.4314, -0

In [None]:
model = MyVisionTextModel(clip_model.config, num_labels=len(list(label2id.keys())))
model.text_model = clip_model.text_model
model.vision_model = clip_model.vision_model
model.visual_projection = clip_model.visual_projection
model.text_projection = clip_model.text_projection
model.logit_scale = clip_model.logit_scale
model.save_pretrained("my-model")
model = MyVisionTextModel.from_pretrained("my-model", len(list(label2id.keys())))
model(**sample_input)

input_ids: tensor([[49406,   533,   585,   320,  8737,   541,   550,  5800,  2445, 49407,
         49407, 49407, 49407, 49407, 49407, 49407, 49407, 49407]]) inputs_id size: torch.Size([1, 18])
attention mask tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]) attention_mask size: torch.Size([1, 18])
aux_text: tensor([[[ 0.0237,  0.0539, -0.1067,  ...,  0.1818,  0.0401,  0.0029],
         [-0.1636, -0.1299, -0.0642,  ..., -0.2770, -0.2325, -0.2860],
         [-0.3945, -0.0763,  0.0336,  ..., -0.1719, -0.3835, -0.2537],
         ...,
         [ 0.1504, -0.0471,  0.2638,  ...,  0.1059, -0.0130, -0.3284],
         [ 0.1624, -0.0154,  0.1465,  ...,  0.1503,  0.0404, -0.2761],
         [ 0.1666, -0.0095,  0.1196,  ...,  0.1578,  0.0493, -0.2728]]],
       grad_fn=<UnsafeViewBackward0>) aux_text size: torch.Size([1, 18, 512])
new attention_mask reshaped tensor([[[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         

CLIPOutput([('logits_per_image',
             tensor([[23.7425]], grad_fn=<PermuteBackward0>)),
            ('logits_per_text', tensor([[23.7425]], grad_fn=<MulBackward0>)),
            ('text_embeds',
             tensor([[ 1.4644e-02, -2.4322e-03,  2.5274e-02, -1.1557e-02, -1.0524e-02,
                       1.6268e-02,  9.0458e-03, -1.1153e-01,  1.5282e-02,  1.3832e-02,
                      -1.2850e-02,  1.1531e-02, -5.8912e-03, -3.9218e-02, -7.4436e-04,
                       3.3390e-03, -6.2851e-03, -6.2516e-03, -3.8136e-02,  3.7205e-02,
                       3.1658e-02, -2.3932e-03,  1.0764e-02,  2.7085e-03, -6.7932e-03,
                      -2.6345e-04,  3.3795e-03, -1.3837e-02, -1.2379e-02, -1.4089e-02,
                       1.4093e-02, -3.2567e-02,  5.5877e-03, -7.6793e-03, -1.4707e-02,
                       1.1630e-02,  1.9111e-02,  1.5721e-02, -8.1963e-03,  1.1288e-02,
                      -4.0399e-03, -7.8520e-03,  2.5494e-02,  4.4198e-04, -9.1945e-03,
               