In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be read through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q datasets transformers evaluate pynvml accelerate

In [None]:
import torch
import numpy as np
from torch import nn
import json
from torchvision.transforms import ColorJitter
from functools import reduce
from typing import Union
import math
import torch.nn.functional as F
import pickle
import os
import psutil
from huggingface_hub import hf_hub_download
from pynvml import *
from datasets import load_dataset, load_from_disk
import evaluate
from transformers import SegformerFeatureExtractor
from transformers import TrainingArguments, Trainer, logging
from transformers import AutoModelForSemanticSegmentation
from transformers import SegformerForSemanticSegmentation, SegformerConfig

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor:
    """Build an ``[n, n]`` matrix with ``-inf`` on the diagonal and 0 elsewhere.

    Added to the column-wise energies of the Criss-Cross, it removes each
    position's own entry from that branch, so the element where the row
    and the column overlap is not counted twice.
    """
    diagonal = torch.full((n,), float('-inf'), device=device)
    return torch.diag(diagonal)


class Scale(nn.Module):
    """Multiply the input by a single learnable factor.

    The module owns one float parameter, ``scale``, and multiplies it with
    the input; broadcasting lets the input have any shape.

    Args:
        scale (float): Value the factor starts from. Default: 1.0
    """

    def __init__(self, scale: float = 1.0):
        super().__init__()
        initial_value = torch.tensor(scale, dtype=torch.float)
        self.scale = nn.Parameter(initial_value)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` multiplied by the learnable factor."""
        scaled = x * self.scale
        return scaled


class CrissCrossAttention(nn.Module):
    """Criss-Cross Attention: every position attends to its row and column.

    .. note::
        Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch
        to a pure PyTorch and equivalent implementation. For more
        details, please refer to https://github.com/open-mmlab/mmcv/pull/1201.

        Speed comparison for one forward pass

        - Input size: [2,512,97,97]
        - Device: 1 NVIDIA GeForce RTX 2080 Ti

        +-----------------------+---------------+------------+---------------+
        |                       |PyTorch version|CUDA version|Relative speed |
        +=======================+===============+============+===============+
        |with torch.no_grad()   |0.00554402 s   |0.0299619 s |5.4x           |
        +-----------------------+---------------+------------+---------------+
        |no with torch.no_grad()|0.00562803 s   |0.0301349 s |5.4x           |
        +-----------------------+---------------+------------+---------------+

    Args:
        in_channels (int): Channels of the input feature map.
    """

    def __init__(self, in_channels: int) -> None:
        super().__init__()
        # Queries and keys are projected to in_channels // 8 channels;
        # values keep the full channel count.
        reduced_channels = in_channels // 8
        self.query_conv = nn.Conv2d(in_channels, reduced_channels, 1)
        self.key_conv = nn.Conv2d(in_channels, reduced_channels, 1)
        self.value_conv = nn.Conv2d(in_channels, in_channels, 1)
        # The residual weight starts at 0, so the attention branch
        # contributes nothing until the factor is learned.
        self.gamma = Scale(0.)
        self.in_channels = in_channels

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply criss-cross attention and add the result to the input.

        Args:
            x (torch.Tensor): Input feature with the shape of
                (batch_size, in_channels, height, width).

        Returns:
            torch.Tensor: Contiguous tensor with the shape of
            (batch_size, in_channels, height, width).
        """
        _, _, height, _ = x.size()
        q = self.query_conv(x)
        k = self.key_conv(x)
        v = self.value_conv(x)

        # Column branch: similarity of (h, w) with every (i, w) -> [B, W, H, H].
        # The -inf diagonal masks i == h, so a position's own entry is
        # only scored once (in the row branch).
        column_energy = torch.einsum('bchw,bciw->bwhi', q, k)
        column_energy = column_energy + NEG_INF_DIAG(height, q.device)
        column_energy = column_energy.transpose(1, 2)  # [B, H, W, H]

        # Row branch: similarity of (h, w) with every (h, j) -> [B, H, W, W].
        row_energy = torch.einsum('bchw,bchj->bhwj', q, k)

        # One softmax over the concatenated column and row scores.
        joint_energy = torch.cat([column_energy, row_energy], dim=-1)
        attn = F.softmax(joint_energy, dim=-1)  # [B,H,W,(H+W)]
        column_attn = attn[..., :height]
        row_attn = attn[..., height:]

        # Weighted sums of the values along the column, then along the row.
        context = torch.einsum('bciw,bhwi->bchw', v, column_attn)
        context += torch.einsum('bchj,bhwj->bchw', v, row_attn)

        return (self.gamma(context) + x).contiguous()

    def __repr__(self) -> str:
        return f'{self.__class__.__name__}(in_channels={self.in_channels})'

In [None]:
class MySegformerAttention(nn.Module):
    """Attention block that runs Criss-Cross Attention over a token sequence.

    The call signature (``hidden_states, height, width, output_attentions``)
    and the one-element tuple return suggest it is meant as a replacement
    for a SegFormer attention block — NOTE(review): confirm against the
    code that installs it.

    Args:
        config: Accepted for signature compatibility; not used here.
        hidden_size (int): Channel width of the tokens, forwarded to
            :class:`CrissCrossAttention` as ``in_channels``.
    """

    def __init__(self, config, hidden_size):
        super().__init__()
        self.cca = CrissCrossAttention(hidden_size)

    def forward(self, hidden_states, height, width, output_attentions=False):
        """Apply criss-cross attention to a flattened feature map.

        Args:
            hidden_states (torch.Tensor): Tokens with the shape of
                (batch_size, height * width, channels).
            height (int): Height of the feature map the tokens come from.
            width (int): Width of the feature map the tokens come from.
            output_attentions (bool): Accepted for signature compatibility;
                attention maps are never returned.

        Returns:
            tuple[torch.Tensor]: One-element tuple holding the attended
            tokens with the shape of (batch_size, height * width, channels).
        """
        batch_size, _, num_channels = hidden_states.shape

        # Tokens are laid out (B, N, C). The channel axis has to be moved in
        # front of the spatial axis BEFORE unflattening N into (H, W); a plain
        # reshape to (B, C, H, W) would interleave channels with positions and
        # the attention would run over a scrambled map.
        # NOTE: weights trained with the former plain-reshape layout were fitted
        # to that scrambled map and should be retrained / re-evaluated.
        feature_map = hidden_states.permute(0, 2, 1).reshape(
            batch_size, num_channels, height, width)

        attended = self.cca(feature_map)

        # Exact inverse of the conversion above: (B, C, H, W) -> (B, N, C).
        tokens = attended.flatten(2).permute(0, 2, 1).contiguous()

        return (tokens,)



In [None]:
# Load the "scene_parse_150" dataset. No split is requested here, so `ds`
# is indexed by split name in the cells below.
ds = load_dataset("scene_parse_150")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Keep a handle on the validation split; it is the one evaluated below.
validation_ds = ds["validation"]

In [None]:
# Fetch the ADE20K id -> label mapping file from the Hugging Face hub.
repo_id = "huggingface/label-files"
filename = "ade20k-id2label.json"
label_file = hf_hub_download(repo_id, filename, repo_type="dataset")
# Read it inside a context manager so the file handle is closed right away
# instead of being left to the garbage collector.
with open(label_file, "r", encoding="utf-8") as fp:
    id2label = json.load(fp)
# JSON object keys are always strings; turn them back into integer ids.
id2label = {int(k): v for k, v in id2label.items()}

# Reverse lookup (label name -> id) and the number of classes.
label2id = {v: k for k, v in id2label.items()}
num_labels = len(id2label)

In [None]:
# Directory on the mounted Drive that holds the saved checkpoints.
path='/content/drive/MyDrive/AML/models'
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code — only load checkpoints you trust.
# NOTE(review): on recent PyTorch releases torch.load defaults to
# weights_only=True; a checkpoint that stores a whole pickled model (as this
# one appears to) then needs weights_only=False — confirm for your version.
# map_location puts the checkpoint on the device selected earlier, so it also
# loads on a CPU-only runtime and matches the device the inputs are sent to.
model = torch.load(path + '/very_final_train/epoch8', map_location=device)

In [None]:
# It works; let's see whether the RAM holds up.
from tqdm import tqdm
feature_extractor = SegformerFeatureExtractor()
metric = evaluate.load("mean_iou")
validation_ds.set_format('pytorch')
data = validation_ds
# Evaluate at most 2000 images without indexing past the end of the split.
n = min(2000, len(data))
mious = []

# Inputs are sent to `device`, so the model has to live there as well.
model.to(device)
# Evaluation mode: layers such as dropout must not stay in training behavior.
model.eval()

# No gradients are needed for scoring; without this an autograd graph is
# built for every forward pass, which wastes memory.
with torch.no_grad():
    for i in tqdm(range(n)):
        # Index the dataset once per image; every access decodes the sample again.
        sample = data[i]
        img = sample['image']
        mask = sample['annotation']

        pixel_values = feature_extractor(img, return_tensors="pt").pixel_values
        outputs = model(pixel_values.to(device))
        logits = outputs.logits

        # Resize the logits to the image's spatial size (all dims but the last).
        upsampled_logits = nn.functional.interpolate(
            logits,
            size=img.shape[:-1],
            mode="bilinear",
            align_corners=False,
        )
        # Per-pixel class ids, moved to the CPU before they reach the metric.
        pred_seg = upsampled_logits.argmax(dim=1)[0].to(dtype=torch.uint8).cpu()

        # NOTE(review): `mask` is the raw dataset annotation (it does not go
        # through the feature extractor) and reduce_labels is False; if the
        # checkpoint was trained on reduced labels, predictions are offset by
        # one class relative to `mask` — confirm against the training setup.
        metrics = metric.compute(predictions=[pred_seg], references=[mask],
                                 num_labels=num_labels, ignore_index=255,
                                 reduce_labels=False)
        # One mean-IoU value per image; they are averaged in the next cell.
        mious.append(metrics['mean_iou'])

  iou = total_area_intersect / total_area_union
  acc = total_area_intersect / total_area_label
100%|██████████| 2000/2000 [51:03<00:00,  1.53s/it]


In [None]:
# Average of the per-image mean-IoU values collected in `mious`
# (an average of per-image scores, not a dataset-level mIoU).
np.mean(mious)

0.0994517111424355