In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
!pip install git+https://github.com/openai/clip
!pip install ftfy
!pip install regex
!pip install tqdm
!pip install pillow
!pip install pytorch
!pip install torchvision
!pip install opencv-contrib-python
!pip install einops
!pip install grad-cam

Collecting git+https://github.com/openai/clip
  Cloning https://github.com/openai/clip to /tmp/pip-req-build-hwzj9zu_
  Running command git clone --filter=blob:none --quiet https://github.com/openai/clip /tmp/pip-req-build-hwzj9zu_
  Resolved https://github.com/openai/clip to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25ldone
[?25h  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369490 sha256=583f13c77da2191b9db5b8f124ff81d0cf27102721c76cec0c1693a5300e2f17
  Stored in directory: /tmp/pip-ephem-wheel-cache-05svqc5h/wheels/1f/79/1c/1fd0db79e903aa56e8aff0effc18abcca

In [4]:
# AuraSR: GAN-based Super-Resolution for real-world, a reproduction of the GigaGAN* paper. Implementation is
# based on the unofficial lucidrains/gigagan-pytorch repository. Heavily modified from there.
#
# https://mingukkang.github.io/GigaGAN/
from math import log2, ceil
from functools import partial
from typing import Any, Optional, List, Iterable

import torch
from torchvision import transforms
from PIL import Image
from torch import nn, einsum, Tensor
import torch.nn.functional as F

from einops import rearrange, repeat, reduce
from einops.layers.torch import Rearrange
from torchvision.utils import save_image
import math


def get_same_padding(size, kernel, dilation, stride):
    return ((size - 1) * (stride - 1) + dilation * (kernel - 1)) // 2


class AdaptiveConv2DMod(nn.Module):
    def __init__(
        self,
        dim,
        dim_out,
        kernel,
        *,
        demod=True,
        stride=1,
        dilation=1,
        eps=1e-8,
        num_conv_kernels=1,  # set this to be greater than 1 for adaptive
    ):
        super().__init__()
        self.eps = eps

        self.dim_out = dim_out

        self.kernel = kernel
        self.stride = stride
        self.dilation = dilation
        self.adaptive = num_conv_kernels > 1

        self.weights = nn.Parameter(
            torch.randn((num_conv_kernels, dim_out, dim, kernel, kernel))
        )

        self.demod = demod

        nn.init.kaiming_normal_(
            self.weights, a=0, mode="fan_in", nonlinearity="leaky_relu"
        )

    def forward(
        self, fmap, mod: Optional[Tensor] = None, kernel_mod: Optional[Tensor] = None
    ):
        """
        notation

        b - batch
        n - convs
        o - output
        i - input
        k - kernel
        """

        b, h = fmap.shape[0], fmap.shape[-2]

        # account for feature map that has been expanded by the scale in the first dimension
        # due to multiscale inputs and outputs

        if mod.shape[0] != b:
            mod = repeat(mod, "b ... -> (s b) ...", s=b // mod.shape[0])

        if exists(kernel_mod):
            kernel_mod_has_el = kernel_mod.numel() > 0

            assert self.adaptive or not kernel_mod_has_el

            if kernel_mod_has_el and kernel_mod.shape[0] != b:
                kernel_mod = repeat(
                    kernel_mod, "b ... -> (s b) ...", s=b // kernel_mod.shape[0]
                )

        # prepare weights for modulation

        weights = self.weights

        if self.adaptive:
            weights = repeat(weights, "... -> b ...", b=b)

            # determine an adaptive weight and 'select' the kernel to use with softmax

            assert exists(kernel_mod) and kernel_mod.numel() > 0

            kernel_attn = kernel_mod.softmax(dim=-1)
            kernel_attn = rearrange(kernel_attn, "b n -> b n 1 1 1 1")

            weights = reduce(weights * kernel_attn, "b n ... -> b ...", "sum")

        # do the modulation, demodulation, as done in stylegan2

        mod = rearrange(mod, "b i -> b 1 i 1 1")

        weights = weights * (mod + 1)

        if self.demod:
            inv_norm = (
                reduce(weights**2, "b o i k1 k2 -> b o 1 1 1", "sum")
                .clamp(min=self.eps)
                .rsqrt()
            )
            weights = weights * inv_norm

        fmap = rearrange(fmap, "b c h w -> 1 (b c) h w")

        weights = rearrange(weights, "b o ... -> (b o) ...")

        padding = get_same_padding(h, self.kernel, self.dilation, self.stride)
        fmap = F.conv2d(fmap, weights, padding=padding, groups=b)

        return rearrange(fmap, "1 (b o) ... -> b o ...", b=b)


class Attend(nn.Module):
    def __init__(self, dropout=0.0, flash=False):
        super().__init__()
        self.dropout = dropout
        self.attn_dropout = nn.Dropout(dropout)
        self.scale = nn.Parameter(torch.randn(1))
        self.flash = flash

    def flash_attn(self, q, k, v):
        q, k, v = map(lambda t: t.contiguous(), (q, k, v))
        out = F.scaled_dot_product_attention(
            q, k, v, dropout_p=self.dropout if self.training else 0.0
        )
        return out

    def forward(self, q, k, v):
        if self.flash:
            return self.flash_attn(q, k, v)

        scale = q.shape[-1] ** -0.5

        # similarity
        sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale

        # attention
        attn = sim.softmax(dim=-1)
        attn = self.attn_dropout(attn)

        # aggregate values
        out = einsum("b h i j, b h j d -> b h i d", attn, v)

        return out


def exists(x):
    return x is not None


def default(val, d):
    if exists(val):
        return val
    return d() if callable(d) else d


def cast_tuple(t, length=1):
    if isinstance(t, tuple):
        return t
    return (t,) * length


def identity(t, *args, **kwargs):
    return t


def is_power_of_two(n):
    return log2(n).is_integer()


def null_iterator():
    while True:
        yield None

def Downsample(dim, dim_out=None):
    return nn.Sequential(
        Rearrange("b c (h p1) (w p2) -> b (c p1 p2) h w", p1=2, p2=2),
        nn.Conv2d(dim * 4, default(dim_out, dim), 1),
    )


class RMSNorm(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
        self.eps = 1e-4

    def forward(self, x):
        return F.normalize(x, dim=1) * self.g * (x.shape[1] ** 0.5)


# building block modules


class Block(nn.Module):
    def __init__(self, dim, dim_out, groups=8, num_conv_kernels=0):
        super().__init__()
        self.proj = AdaptiveConv2DMod(
            dim, dim_out, kernel=3, num_conv_kernels=num_conv_kernels
        )
        self.kernel = 3
        self.dilation = 1
        self.stride = 1

        self.act = nn.SiLU()

    def forward(self, x, conv_mods_iter: Optional[Iterable] = None):
        conv_mods_iter = conv_mods_iter

        x = self.proj(x, mod=next(conv_mods_iter), kernel_mod=next(conv_mods_iter))

        x = self.act(x)
        return x


class ResnetBlock(nn.Module):
    def __init__(
        self, dim, dim_out, *, groups=8, num_conv_kernels=0, style_dims: List = []
    ):
        super().__init__()
        style_dims.extend([dim, num_conv_kernels, dim_out, num_conv_kernels])

        self.block1 = Block(
            dim, dim_out, groups=groups, num_conv_kernels=num_conv_kernels
        )
        self.block2 = Block(
            dim_out, dim_out, groups=groups, num_conv_kernels=num_conv_kernels
        )
        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()

    def forward(self, x, conv_mods_iter: Optional[Iterable] = None):
        h = self.block1(x, conv_mods_iter=conv_mods_iter)
        h = self.block2(h, conv_mods_iter=conv_mods_iter)

        return h + self.res_conv(x)


class LinearAttention(nn.Module):
    def __init__(self, dim, heads=4, dim_head=32):
        super().__init__()
        self.scale = dim_head**-0.5
        self.heads = heads
        hidden_dim = dim_head * heads

        self.norm = RMSNorm(dim)
        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)

        self.to_out = nn.Sequential(nn.Conv2d(hidden_dim, dim, 1), RMSNorm(dim))

    def forward(self, x):
        b, c, h, w = x.shape

        x = self.norm(x)

        qkv = self.to_qkv(x).chunk(3, dim=1)
        q, k, v = map(
            lambda t: rearrange(t, "b (h c) x y -> b h c (x y)", h=self.heads), qkv
        )

        q = q.softmax(dim=-2)
        k = k.softmax(dim=-1)

        q = q * self.scale

        context = torch.einsum("b h d n, b h e n -> b h d e", k, v)

        out = torch.einsum("b h d e, b h d n -> b h e n", context, q)
        out = rearrange(out, "b h c (x y) -> b (h c) x y", h=self.heads, x=h, y=w)
        return self.to_out(out)


class Attention(nn.Module):
    def __init__(self, dim, heads=4, dim_head=32, flash=False):
        super().__init__()
        self.heads = heads
        hidden_dim = dim_head * heads

        self.norm = RMSNorm(dim)

        self.attend = Attend(flash=flash)
        self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
        self.to_out = nn.Conv2d(hidden_dim, dim, 1)

    def forward(self, x):
        b, c, h, w = x.shape
        x = self.norm(x)
        qkv = self.to_qkv(x).chunk(3, dim=1)

        q, k, v = map(
            lambda t: rearrange(t, "b (h c) x y -> b h (x y) c", h=self.heads), qkv
        )

        out = self.attend(q, k, v)
        out = rearrange(out, "b h (x y) d -> b (h d) x y", x=h, y=w)

        return self.to_out(out)


# feedforward
def FeedForward(dim, mult=4):
    return nn.Sequential(
        RMSNorm(dim),
        nn.Conv2d(dim, dim * mult, 1),
        nn.GELU(),
        nn.Conv2d(dim * mult, dim, 1),
    )


# transformers
class Transformer(nn.Module):
    def __init__(self, dim, dim_head=64, heads=8, depth=1, flash_attn=True, ff_mult=4):
        super().__init__()
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        Attention(
                            dim=dim, dim_head=dim_head, heads=heads, flash=flash_attn
                        ),
                        FeedForward(dim=dim, mult=ff_mult),
                    ]
                )
            )

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x

        return x


class LinearTransformer(nn.Module):
    def __init__(self, dim, dim_head=64, heads=8, depth=1, ff_mult=4):
        super().__init__()
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        LinearAttention(dim=dim, dim_head=dim_head, heads=heads),
                        FeedForward(dim=dim, mult=ff_mult),
                    ]
                )
            )

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x

        return x


class NearestNeighborhoodUpsample(nn.Module):
    def __init__(self, dim, dim_out=None):
        super().__init__()
        dim_out = default(dim_out, dim)
        self.conv = nn.Conv2d(dim, dim_out, kernel_size=3, stride=1, padding=1)

    def forward(self, x):

        if x.shape[0] >= 64:
            x = x.contiguous()

        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
        x = self.conv(x)

        return x

class EqualLinear(nn.Module):
    def __init__(self, dim, dim_out, lr_mul=1, bias=True):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(dim_out, dim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(dim_out))

        self.lr_mul = lr_mul

    def forward(self, input):
        return F.linear(input, self.weight * self.lr_mul, bias=self.bias * self.lr_mul)


class StyleGanNetwork(nn.Module):
    def __init__(self, dim_in=128, dim_out=512, depth=8, lr_mul=0.1, dim_text_latent=0):
        super().__init__()
        self.dim_in = dim_in
        self.dim_out = dim_out
        self.dim_text_latent = dim_text_latent

        layers = []
        for i in range(depth):
            is_first = i == 0

            if is_first:
                dim_in_layer = dim_in + dim_text_latent
            else:
                dim_in_layer = dim_out

            dim_out_layer = dim_out

            layers.extend(
                [EqualLinear(dim_in_layer, dim_out_layer, lr_mul), nn.LeakyReLU(0.2)]
            )

        self.net = nn.Sequential(*layers)

    def forward(self, x, text_latent=None):
        x = F.normalize(x, dim=1)
        if self.dim_text_latent > 0:
            assert exists(text_latent)
            x = torch.cat((x, text_latent), dim=-1)
        return self.net(x)


class UnetUpsampler(torch.nn.Module):

    def __init__(
        self,
        dim: int,
        *,
        image_size: int,
        input_image_size: int,
        init_dim: Optional[int] = None,
        out_dim: Optional[int] = None,
        style_network: Optional[dict] = None,
        up_dim_mults: tuple = (1, 2, 4, 8, 16),
        down_dim_mults: tuple = (4, 8, 16),
        channels: int = 3,
        resnet_block_groups: int = 8,
        full_attn: tuple = (False, False, False, True, True),
        flash_attn: bool = True,
        self_attn_dim_head: int = 64,
        self_attn_heads: int = 8,
        attn_depths: tuple = (2, 2, 2, 2, 4),
        mid_attn_depth: int = 4,
        num_conv_kernels: int = 4,
        resize_mode: str = "bilinear",
        unconditional: bool = True,
        skip_connect_scale: Optional[float] = None,
    ):
        super().__init__()
        self.style_network = style_network = StyleGanNetwork(**style_network)
        self.unconditional = unconditional
        assert not (
            unconditional
            and exists(style_network)
            and style_network.dim_text_latent > 0
        )

        assert is_power_of_two(image_size) and is_power_of_two(
            input_image_size
        ), "both output image size and input image size must be power of 2"
        assert (
            input_image_size < image_size
        ), "input image size must be smaller than the output image size, thus upsampling"

        self.image_size = image_size
        self.input_image_size = input_image_size

        style_embed_split_dims = []

        self.channels = channels
        input_channels = channels

        init_dim = default(init_dim, dim)

        up_dims = [init_dim, *map(lambda m: dim * m, up_dim_mults)]
        init_down_dim = up_dims[len(up_dim_mults) - len(down_dim_mults)]
        down_dims = [init_down_dim, *map(lambda m: dim * m, down_dim_mults)]
        self.init_conv = nn.Conv2d(input_channels, init_down_dim, 7, padding=3)

        up_in_out = list(zip(up_dims[:-1], up_dims[1:]))
        down_in_out = list(zip(down_dims[:-1], down_dims[1:]))

        block_klass = partial(
            ResnetBlock,
            groups=resnet_block_groups,
            num_conv_kernels=num_conv_kernels,
            style_dims=style_embed_split_dims,
        )

        FullAttention = partial(Transformer, flash_attn=flash_attn)
        *_, mid_dim = up_dims

        self.skip_connect_scale = default(skip_connect_scale, 2**-0.5)

        self.downs = nn.ModuleList([])
        self.ups = nn.ModuleList([])

        block_count = 6

        for ind, (
            (dim_in, dim_out),
            layer_full_attn,
            layer_attn_depth,
        ) in enumerate(zip(down_in_out, full_attn, attn_depths)):
            attn_klass = FullAttention if layer_full_attn else LinearTransformer

            blocks = []
            for i in range(block_count):
                blocks.append(block_klass(dim_in, dim_in))

            self.downs.append(
                nn.ModuleList(
                    [
                        nn.ModuleList(blocks),
                        nn.ModuleList(
                            [
                                (
                                    attn_klass(
                                        dim_in,
                                        dim_head=self_attn_dim_head,
                                        heads=self_attn_heads,
                                        depth=layer_attn_depth,
                                    )
                                    if layer_full_attn
                                    else None
                                ),
                                nn.Conv2d(
                                    dim_in, dim_out, kernel_size=3, stride=2, padding=1
                                ),
                            ]
                        ),
                    ]
                )
            )

        self.mid_block1 = block_klass(mid_dim, mid_dim)
        self.mid_attn = FullAttention(
            mid_dim,
            dim_head=self_attn_dim_head,
            heads=self_attn_heads,
            depth=mid_attn_depth,
        )
        self.mid_block2 = block_klass(mid_dim, mid_dim)

        *_, last_dim = up_dims

        for ind, (
            (dim_in, dim_out),
            layer_full_attn,
            layer_attn_depth,
        ) in enumerate(
            zip(
                reversed(up_in_out),
                reversed(full_attn),
                reversed(attn_depths),
            )
        ):
            attn_klass = FullAttention if layer_full_attn else LinearTransformer

            blocks = []
            input_dim = dim_in * 2 if ind < len(down_in_out) else dim_in
            for i in range(block_count):
                blocks.append(block_klass(input_dim, dim_in))

            self.ups.append(
                nn.ModuleList(
                    [
                        nn.ModuleList(blocks),
                        nn.ModuleList(
                            [
                                NearestNeighborhoodUpsample(
                                    last_dim if ind == 0 else dim_out,
                                    dim_in,
                                ),
                                (
                                    attn_klass(
                                        dim_in,
                                        dim_head=self_attn_dim_head,
                                        heads=self_attn_heads,
                                        depth=layer_attn_depth,
                                    )
                                    if layer_full_attn
                                    else None
                                ),
                            ]
                        ),
                    ]
                )
            )

        self.out_dim = default(out_dim, channels)
        self.final_res_block = block_klass(dim, dim)
        self.final_to_rgb = nn.Conv2d(dim, channels, 1)
        self.resize_mode = resize_mode
        self.style_to_conv_modulations = nn.Linear(
            style_network.dim_out, sum(style_embed_split_dims)
        )
        self.style_embed_split_dims = style_embed_split_dims

    @property
    def allowable_rgb_resolutions(self):
        input_res_base = int(log2(self.input_image_size))
        output_res_base = int(log2(self.image_size))
        allowed_rgb_res_base = list(range(input_res_base, output_res_base))
        return [*map(lambda p: 2**p, allowed_rgb_res_base)]

    @property
    def device(self):
        return self.style_network.net[0].weight.device

    @property
    def total_params(self):
        return sum([p.numel() for p in self.parameters()])

    def resize_image_to(self, x, size):
        return F.interpolate(x, (size, size), mode=self.resize_mode)

    def forward(
        self,
        lowres_image: torch.Tensor,
        styles: Optional[torch.Tensor] = None,
        noise: Optional[torch.Tensor] = None,
        global_text_tokens: Optional[torch.Tensor] = None,
        return_all_rgbs: bool = False,
    ):
        x = lowres_image

        noise_scale = 0.001  # Adjust the scale of the noise as needed
        noise_aug = torch.randn_like(x) * noise_scale
        x = x + noise_aug
        x = x.clamp(0, 1)

        shape = x.shape
        batch_size = shape[0]

        assert shape[-2:] == ((self.input_image_size,) * 2)

        # styles
        if not exists(styles):
            assert exists(self.style_network)

            noise = default(
                noise,
                torch.randn(
                    (batch_size, self.style_network.dim_in), device= self.device
                ),
            )
            styles = self.style_network(noise, global_text_tokens)

        # project styles to conv modulations
        conv_mods = self.style_to_conv_modulations(styles)
        conv_mods = conv_mods.split(self.style_embed_split_dims, dim=-1)
        conv_mods = iter(conv_mods)

        x = self.init_conv(x)

        h = []
        for blocks, (attn, downsample) in self.downs:
            for block in blocks:
                x = block(x, conv_mods_iter=conv_mods)
                h.append(x)

            if attn is not None:
                x = attn(x)

            x = downsample(x)

        x = self.mid_block1(x, conv_mods_iter=conv_mods)
        x = self.mid_attn(x)
        x = self.mid_block2(x, conv_mods_iter=conv_mods)

        for (
            blocks,
            (
                upsample,
                attn,
            ),
        ) in self.ups:
            x = upsample(x)
            for block in blocks:
                if h != []:
                    res = h.pop()
                    res = res * self.skip_connect_scale
                    x = torch.cat((x, res), dim=1)

                x = block(x, conv_mods_iter=conv_mods)

            if attn is not None:
                x = attn(x)

        x = self.final_res_block(x, conv_mods_iter=conv_mods)
        rgb = self.final_to_rgb(x)

        if not return_all_rgbs:
            return rgb

        return rgb, []


def tile_image(image, chunk_size=64):
    c, h, w = image.shape
    h_chunks = ceil(h / chunk_size)
    w_chunks = ceil(w / chunk_size)
    tiles = []
    for i in range(h_chunks):
        for j in range(w_chunks):
            tile = image[:, i * chunk_size:(i + 1) * chunk_size, j * chunk_size:(j + 1) * chunk_size]
            tiles.append(tile)
    return tiles, h_chunks, w_chunks

# This helps create a checkboard pattern with some edge blending
def create_checkerboard_weights(tile_size):
    x = torch.linspace(-1, 1, tile_size)
    y = torch.linspace(-1, 1, tile_size)

    x, y = torch.meshgrid(x, y, indexing='ij')
    d = torch.sqrt(x*x + y*y)
    sigma, mu = 0.5, 0.0
    weights = torch.exp(-((d-mu)**2 / (2.0 * sigma**2)))

    # saturate the values to sure get high weights in the center
    weights = weights**8

    return weights / weights.max()  # Normalize to [0, 1]

def repeat_weights(weights, image_size):
    tile_size = weights.shape[0]
    repeats = (math.ceil(image_size[0] / tile_size), math.ceil(image_size[1] / tile_size))
    return weights.repeat(repeats)[:image_size[0], :image_size[1]]

def create_offset_weights(weights, image_size):
    tile_size = weights.shape[0]
    offset = tile_size // 2
    full_weights = repeat_weights(weights, (image_size[0] + offset, image_size[1] + offset))
    return full_weights[offset:, offset:]

def merge_tiles(tiles, h_chunks, w_chunks, chunk_size=64):
    # Determine the shape of the output tensor
    c = tiles[0].shape[0]
    h = h_chunks * chunk_size
    w = w_chunks * chunk_size

    # Create an empty tensor to hold the merged image
    merged = torch.zeros((c, h, w), dtype=tiles[0].dtype)

    # Iterate over the tiles and place them in the correct position
    for idx, tile in enumerate(tiles):
        i = idx // w_chunks
        j = idx % w_chunks

        h_start = i * chunk_size
        w_start = j * chunk_size

        tile_h, tile_w = tile.shape[1:]
        merged[:, h_start:h_start+tile_h, w_start:w_start+tile_w] = tile

    return merged

class AuraSR:
    def __init__(self, config: dict[str, Any], device: str = "cuda"):
        self.upsampler = UnetUpsampler(**config).to(device)
        self.input_image_size = config["input_image_size"]

    @classmethod
    def from_pretrained(cls, model_id: str = "fal-ai/AuraSR", use_safetensors: bool = True, device: str = "cuda"):
        import json
        import torch
        from pathlib import Path
        from huggingface_hub import snapshot_download

        # Check if model_id is a local file
        if Path(model_id).is_file():
            local_file = Path(model_id)
            if local_file.suffix == '.safetensors':
                use_safetensors = True
            elif local_file.suffix == '.ckpt':
                use_safetensors = False
            else:
                raise ValueError(f"Unsupported file format: {local_file.suffix}. Please use .safetensors or .ckpt files.")

            # For local files, we need to provide the config separately
            config_path = local_file.with_name('config.json')
            if not config_path.exists():
                raise FileNotFoundError(
                    f"Config file not found: {config_path}. "
                    f"When loading from a local file, ensure that 'config.json' "
                    f"is present in the same directory as '{local_file.name}'. "
                    f"If you're trying to load a model from Hugging Face, "
                    f"please provide the model ID instead of a file path."
                )

            config = json.loads(config_path.read_text())
            hf_model_path = local_file.parent
        else:
            hf_model_path = Path(snapshot_download(model_id))
            config = json.loads((hf_model_path / "config.json").read_text())

        model = cls(config, device)

        if use_safetensors:
            try:
                from safetensors.torch import load_file
                checkpoint = load_file(hf_model_path / "model.safetensors" if not Path(model_id).is_file() else model_id)
            except ImportError:
                raise ImportError(
                    "The safetensors library is not installed. "
                    "Please install it with `pip install safetensors` "
                    "or use `use_safetensors=False` to load the model with PyTorch."
                )
        else:
            checkpoint = torch.load(hf_model_path / "model.ckpt" if not Path(model_id).is_file() else model_id)

        model.upsampler.load_state_dict(checkpoint, strict=True)
        return model
    @torch.no_grad()
    def upscale_2x(self, image: Image.Image, max_batch_size=8) -> Image.Image:
        tensor_transform = transforms.ToTensor()
        device = self.upsampler.device
    
        image_tensor = tensor_transform(image).unsqueeze(0)
        _, _, h, w = image_tensor.shape
        pad_h = (self.input_image_size - h % self.input_image_size) % self.input_image_size
        pad_w = (self.input_image_size - w % self.input_image_size) % self.input_image_size
    
        # Pad the image
        image_tensor = torch.nn.functional.pad(image_tensor, (0, pad_w, 0, pad_h), mode='reflect').squeeze(0)
        tiles, h_chunks, w_chunks = tile_image(image_tensor, self.input_image_size)
    
        # Batch processing of tiles
        num_tiles = len(tiles)
        batches = [tiles[i:i + max_batch_size] for i in range(0, num_tiles, max_batch_size)]
        reconstructed_tiles = []
    
        for batch in batches:
            model_input = torch.stack(batch).to(device)
            generator_output = self.upsampler(
                lowres_image=model_input,
                noise=torch.randn(model_input.shape[0], 128, device=device)
            )
            reconstructed_tiles.extend(list(generator_output.clamp_(0, 1).detach().cpu()))
    
        merged_tensor = merge_tiles(reconstructed_tiles, h_chunks, w_chunks, self.input_image_size * 2)
        unpadded = merged_tensor[:, :h * 2, :w * 2]
    
        to_pil = transforms.ToPILImage()
        return to_pil(unpadded)

    @torch.no_grad()
    def upscale_4x(self, image: Image.Image, max_batch_size=8) -> Image.Image:
        tensor_transform = transforms.ToTensor()
        device = self.upsampler.device

        image_tensor = tensor_transform(image).unsqueeze(0)
        _, _, h, w = image_tensor.shape
        pad_h = (self.input_image_size - h % self.input_image_size) % self.input_image_size
        pad_w = (self.input_image_size - w % self.input_image_size) % self.input_image_size

        # Pad the image
        image_tensor = torch.nn.functional.pad(image_tensor, (0, pad_w, 0, pad_h), mode='reflect').squeeze(0)
        tiles, h_chunks, w_chunks = tile_image(image_tensor, self.input_image_size)

        # Batch processing of tiles
        num_tiles = len(tiles)
        batches = [tiles[i:i + max_batch_size] for i in range(0, num_tiles, max_batch_size)]
        reconstructed_tiles = []

        for batch in batches:
            model_input = torch.stack(batch).to(device)
            generator_output = self.upsampler(
                lowres_image=model_input,
                noise=torch.randn(model_input.shape[0], 128, device=device)
            )
            reconstructed_tiles.extend(list(generator_output.clamp_(0, 1).detach().cpu()))

        merged_tensor = merge_tiles(reconstructed_tiles, h_chunks, w_chunks, self.input_image_size * 4)
        unpadded = merged_tensor[:, :h * 4, :w * 4]

        to_pil = transforms.ToPILImage()
        return to_pil(unpadded)
    # Tiled 4x upscaling with overlapping tiles to reduce seam artifacts
    # weights options are 'checkboard' and 'constant'
    @torch.no_grad()
    def upscale_4x_overlapped(self, image, max_batch_size=16, weight_type='checkboard'):
        tensor_transform = transforms.ToTensor()
        device = self.upsampler.device

        image_tensor = tensor_transform(image).unsqueeze(0)
        _, _, h, w = image_tensor.shape

        # Calculate paddings
        pad_h = (
            self.input_image_size - h % self.input_image_size
        ) % self.input_image_size
        pad_w = (
            self.input_image_size - w % self.input_image_size
        ) % self.input_image_size

        pad_h = min(pad_h-1, self.input_image_size-1)
        pad_w = min(pad_w-1, self.input_image_size-1)
        # Pad the image
        image_tensor = torch.nn.functional.pad(
            image_tensor, (0, pad_w, 0, pad_h), mode="reflect"
        ).squeeze(0)

        # Function to process tiles
        def process_tiles(tiles, h_chunks, w_chunks):
            num_tiles = len(tiles)
            batches = [
                tiles[i : i + max_batch_size]
                for i in range(0, num_tiles, max_batch_size)
            ]
            reconstructed_tiles = []

            for batch in batches:
                model_input = torch.stack(batch).to(device)
                generator_output = self.upsampler(
                    lowres_image=model_input,
                    noise=torch.randn(model_input.shape[0], 128, device=device),
                )
                reconstructed_tiles.extend(
                    list(generator_output.clamp_(0, 1).detach().cpu())
                )

            return merge_tiles(
                reconstructed_tiles, h_chunks, w_chunks, self.input_image_size
            )

        # First pass
        tiles1, h_chunks1, w_chunks1 = tile_image(image_tensor, self.input_image_size)
        result1 = process_tiles(tiles1, h_chunks1, w_chunks1)

        # Second pass with offset
        offset = self.input_image_size // 2
        image_tensor_offset = torch.nn.functional.pad(image_tensor, (offset, offset, offset, offset), mode='reflect').squeeze(0)

        tiles2, h_chunks2, w_chunks2 = tile_image(
            image_tensor_offset, self.input_image_size
        )
        result2 = process_tiles(tiles2, h_chunks2, w_chunks2)

        # unpad
        offset_4x = offset * 1
        result2_interior = result2[:, offset_4x:-offset_4x, offset_4x:-offset_4x]

        if weight_type == 'checkboard':
            weight_tile = create_checkerboard_weights(self.input_image_size * 1)

            weight_shape = result2_interior.shape[1:]
            weights_1 = create_offset_weights(weight_tile, weight_shape)
            weights_2 = repeat_weights(weight_tile, weight_shape)

            normalizer = weights_1 + weights_2
            weights_1 = weights_1 / normalizer
            weights_2 = weights_2 / normalizer

            weights_1 = weights_1.unsqueeze(0).repeat(3, 1, 1)
            weights_2 = weights_2.unsqueeze(0).repeat(3, 1, 1)
        elif weight_type == 'constant':
            weights_1 = torch.ones_like(result2_interior) * 0.5
            weights_2 = weights_1
        else:
            raise ValueError("weight_type should be either 'gaussian' or 'constant' but got", weight_type)

        result1 = result1 * weights_2
        result2 = result2_interior * weights_1

        # Average the overlapping region
        result1 = (
            result1 + result2
        )

        # Remove padding
        unpadded = result1[:, : h, : w]

        to_pil = transforms.ToPILImage()
        return to_pil(unpadded)

In [5]:
os.mkdir('testgradcam')
os.chdir('testgradcam')
os.mkdir('test')
os.mkdir('train')
os.chdir('test')
os.mkdir('FAKE')
os.mkdir('REAL')
os.chdir('..')
os.chdir('train')
os.mkdir('FAKE')
os.mkdir('REAL')
os.chdir('..')
os.chdir('..')

In [6]:
import requests
from io import BytesIO
from PIL import Image
import os
import cv2
from PIL import Image
import numpy as np
aura_sr = AuraSR.from_pretrained()
def load_image_from_url(url):
    response = requests.get(url)
    image_data = BytesIO(response.content)
    return Image.open(image_data)
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
from torchvision import transforms

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

LICENSE.md:   0%|          | 0.00/18.6k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/983 [00:00<?, ?B/s]

model.ckpt:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

In [7]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet50
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
import numpy as np
import cv2
from tqdm import tqdm
import os

def load_model(model_path, device='cuda'):
    model = resnet50(pretrained=False)
    model.fc = nn.Linear(model.fc.in_features, 2)
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()
    return model
def generate_intensity_matrix(grayscale_cam, size=(32, 32)):
    # Resize the grayscale CAM to desired dimensions
    intensity_matrix = cv2.resize(grayscale_cam, size, interpolation=cv2.INTER_LINEAR)
    # Values are already normalized between 0 and 1 from GradCAM
    return intensity_matrix

def generate_gradcam_visualizations(
    model_path,
    dataset_path,
    output_dir,
    num_samples=1,  # Changed to 1 for testing
    device='cuda'
):
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Load model
    model = load_model(model_path, device)

    # Define target layer for GradCAM
    target_layers = [model.layer4[-1]]
    # Initialize GradCAM
    cam = GradCAM(
        model=model,
        target_layers=target_layers,
    )

    # Setup data loading
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                           std=[0.229, 0.224, 0.225])
    ])

    # Load dataset
    dataset = torchvision.datasets.ImageFolder(
        root=dataset_path,
        transform=transform
    )

    # Select random samples
    num_samples = min(num_samples, len(dataset))
    indices = torch.randperm(len(dataset))[:num_samples]

    # Process each image
    for idx in tqdm(indices, desc="Generating Grad-CAM visualizations"):
        # Get image and label
        img_tensor, label = dataset[idx]
        input_tensor = img_tensor.unsqueeze(0)

        # Get original image for overlay
        img_path = dataset.imgs[idx][0]
        rgb_img = cv2.imread(img_path, 1)[:, :, ::-1]
        rgb_img = cv2.resize(rgb_img, (224, 224))
        rgb_img = np.float32(rgb_img) / 255

        # Generate class activation map
        grayscale_cam = cam(input_tensor=input_tensor, targets=None)
        grayscale_cam = grayscale_cam[0, :]

        # Generate 32x32 intensity matrix
        intensity_matrix = generate_intensity_matrix(grayscale_cam)

        # Save intensity matrix
        matrix_output_path = os.path.join(
            output_dir,
            f"intensity_matrix_{0}.npy"
        )
        np.save(matrix_output_path, intensity_matrix)

        # Print shape and some statistics of the intensity matrix
        print(f"Intensity Matrix Shape: {intensity_matrix.shape}")
        print(f"Min value: {intensity_matrix.min():.4f}")
        print(f"Max value: {intensity_matrix.max():.4f}")
        print(f"Mean value: {intensity_matrix.mean():.4f}")

        # Overlay CAM on original image
        visualization = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)

        # Get model prediction
        with torch.no_grad():
            output = model(input_tensor.to(device))
            pred = output.argmax(dim=1).item()

        # Save the visualization
        output_path = os.path.join(
            output_dir,
            f"gradcam_{idx}_pred{pred}_true{label}.jpg"
        )
        cv2.imwrite(output_path, visualization[:, :, ::-1])

In [8]:
import os
import json
import pandas as pd
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

class Filter():
    def __init__(self, csv_path ="/kaggle/input/artifact-interiit/Artifact_Description.csv", 
                 model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32"), 
                 processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")):
        self.csv_path = csv_path
        
        self.data = pd.read_csv(csv_path, on_bad_lines="skip")
        
        # Load CLIP model and processor
        self.model = model
        self.processor = processor
        
        # Prepare artifact descriptions
        self.artifact_texts = data['Artifact'] + ": " + data['Description']
        self.inputs_text = processor(text=artifact_texts.tolist(), return_tensors="pt", padding=True)
        
        # Extract text features
        with torch.no_grad():
            self.text_features = model.get_text_features(**inputs_text)

# Function to extract image features
    def get_image_features(self, image_path):
        # Load the image using Pillow
        image = Image.open(image_path).convert("RGB")
        # Process the image
        inputs_image = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs_image)
        return image_features
    
    # Compare image features with artifact text features and save results
    def find_artifacts(self, image_path, k=5):
        image_features = self.get_image_features(image_path)
        similarities = torch.nn.functional.cosine_similarity(image_features, self.text_features)
        top_k_indices = similarities.topk(k).indices
        top_k_similarities = similarities[top_k_indices].cpu().numpy()
        results = data.iloc[top_k_indices].copy()
        results['Cosine_Similarity'] = top_k_similarities
        return results
    
    # Process all images in a folder and save results to a JSON file
    def filter_artifacts(self, image_folder, output_json = "image_text_results.json", k=10):
        results_dict = {}
        
        for image_name in os.listdir(image_folder):
            image_path = os.path.join(image_folder, image_name)
            
            if os.path.isfile(image_path):  # Ensure it's a file
                print(f"Processing: {image_name}")
                
                try:
                    top_artifacts = self.find_artifacts(image_path, k=k)
                    # Convert DataFrame to a dictionary format for JSON serialization
                    results_dict[image_name] = top_artifacts.to_dict(orient='records')
                except Exception as e:
                    print(f"Error processing {image_name}: {e}")
        
        # Save the results dictionary to a JSON file
        with open(output_json, 'w') as json_file:
            json.dump(results_dict, json_file, indent=4)
        print(f"Results saved to {output_json}")
    
    # Define the folder path and output JSON file
    # image_folder = "/kaggle/input/random-mirror-lady-fake-lol"  # Replace with your folder path
    # output_json = "/kaggle/working/top_artifacts.json"
    
    # Run the folder processing
    # process_folder(image_folder, output_json, k=5)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



In [64]:
from PIL import Image
# from PIL import Image
import cv2

def process_img_final(image_path):
    # image = cv2.imread(image_path)
    # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = Image.open(image_path)
    # img = Image("/kaggle/input/pegasus/Screenshot 2024-11-24 161819.png")
    image = image.resize((32,32))
    image_2 = Image.fromarray(np.vstack((np.hstack((image,image)), np.hstack((image,image)))))
    # image.save("cropped_image.png")
    image.save("/kaggle/working/testgradcam/test/FAKE/peg32.png")
    image.save("/kaggle/working/testgradcam/test/REAL/peg32.png")
    
    image.save("/kaggle/working/testgradcam/train/FAKE/peg32.png")
    image.save("/kaggle/working/testgradcam/train/REAL/peg32.png")
    
    # # Load your 32x32 image (replace with your image path)
    # # image = Image.new("RGB", (32, 32), color="red")  # Example red image
    
    # # Calculate padding
    # pad_size = (48 - 32) // 2  # 8 pixels on each side
    
    # # Add padding
    # padded_image = Image.new("RGB", (48, 48), color=(0, 0, 0))  # Black padding
    # padded_image.paste(image, (pad_size, pad_size))  # Paste the original image centered
    
    # # Show the padded image
    # print(padded_image.size)
    # # import matplotlib.pyplot as plt
    # # for i in range(len(cifar10)):
    # #   img, label = cifar10[i]
    upscaled_image = aura_sr.upscale_4x(image_2)
    # upscaled_image = aura_sr.upscale_4x(upscaled_image)
    # # upscaled_image = cv2.resize(np.array(upscaled_image), (32,32), interpolation=cv2.INTER_CUBIC)
    # # cv2.imwrite("pegasus_aursr.png", upscaled_image)
    # # Image.save("pegasus_aursr", upscaled_image)
    # upscaled_image.save("pegasus_aursr.png")
    # print(upscaled_image.size)
    # upscaled_image
    #   # if i%100 == 0:
    #   #   print(i)
    # # from PIL import Image
    
    # # Open your 192x192 image (replace with your actual image path)
    # image = Image.open("/kaggle/working/pegasus_aursr.png")
    # print(image.size)
    # # Calculate the center coordinates
    # width, height = image.size
    # crop_size = 512
    # left = (width - crop_size) // 2
    # top = (height - crop_size) // 2
    # right = (width + crop_size) // 2
    # bottom = (height + crop_size) // 2
    
    # # Crop the image
    cropped_image = upscaled_image.crop((0, 0, 128, 128))
    
    # # Save or show the cropped image
    cropped_image.save("cropped_image.png")
    # # cropped_image
def process_img_final_32(image_path):
    # image = cv2.imread(image_path)
    # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = Image.open(image_path)
    # img = Image("/kaggle/input/pegasus/Screenshot 2024-11-24 161819.png")
    image = image.resize((32,32))
    image_2 = Image.fromarray(np.vstack((np.hstack((image,image)), np.hstack((image,image)))))
    # image.save("cropped_image.png")
    image.save("/kaggle/working/testgradcam/test/FAKE/peg32.png")
    image.save("/kaggle/working/testgradcam/test/REAL/peg32.png")
    
    image.save("/kaggle/working/testgradcam/train/FAKE/peg32.png")
    image.save("/kaggle/working/testgradcam/train/REAL/peg32.png")
    
    # # Load your 32x32 image (replace with your image path)
    # # image = Image.new("RGB", (32, 32), color="red")  # Example red image
    
    # # Calculate padding
    # pad_size = (48 - 32) // 2  # 8 pixels on each side
    
    # # Add padding
    # padded_image = Image.new("RGB", (48, 48), color=(0, 0, 0))  # Black padding
    # padded_image.paste(image, (pad_size, pad_size))  # Paste the original image centered
    
    # # Show the padded image
    # print(padded_image.size)
    # # import matplotlib.pyplot as plt
    # # for i in range(len(cifar10)):
    # #   img, label = cifar10[i]
    # upscaled_image = aura_sr.upscale_4x(image_2)
    # upscaled_image = aura_sr.upscale_4x(upscaled_image)
    # # upscaled_image = cv2.resize(np.array(upscaled_image), (32,32), interpolation=cv2.INTER_CUBIC)
    # # cv2.imwrite("pegasus_aursr.png", upscaled_image)
    # # Image.save("pegasus_aursr", upscaled_image)
    # upscaled_image.save("pegasus_aursr.png")
    # print(upscaled_image.size)
    # upscaled_image
    #   # if i%100 == 0:
    #   #   print(i)
    # # from PIL import Image
    
    # # Open your 192x192 image (replace with your actual image path)
    # image = Image.open("/kaggle/working/pegasus_aursr.png")
    # print(image.size)
    # # Calculate the center coordinates
    # width, height = image.size
    # crop_size = 512
    # left = (width - crop_size) // 2
    # top = (height - crop_size) // 2
    # right = (width + crop_size) // 2
    # bottom = (height + crop_size) // 2
    
    # # Crop the image
    # cropped_image = upscaled_image.crop((0, 0, 128, 128))
    
    # # Save or show the cropped image
    image.save("cropped_image.png")
    # # cropped_image



In [42]:
import json
import clip
import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
from PIL import Image

# Load CLIP model

class Final():
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.preprocess = clip.load("ViT-B/32", device=self.device)
        self.data = pd.read_csv("/kaggle/input/artifact-interiit/Artifact_Description.csv", on_bad_lines="skip")
    # def load_descriptions_from_json(json_path, top_n=10):
    #     """
    #     Load top N descriptions from the JSON file.
    #     """
    #     with open(json_path, "r") as f:
    #         data = json.load(f)
        
    #     # Collect the top N descriptions
    #     descriptions = []
    #     for image_name, artifacts in data.items():
    #         descriptions.extend(list(artifacts.keys())[:top_n])
        
    #     return list(set(descriptions))  # Deduplicate descriptions
    
    
    def image_text_search(self, image_patches, text_descriptions):
        """
        Compute text-image similarity for each patch and return scores.
        """
        text = clip.tokenize(text_descriptions).to(self.device)
        patch_results = []
    
        for patch in image_patches:
            patch = self.preprocess(patch).unsqueeze(0).to(self.device)
            with torch.no_grad():
                logits_per_image, _ = self.model(patch, text)
                probs = logits_per_image.softmax(dim=-1).cpu().numpy()
            patch_results.append(probs[0])
        
        return np.array(patch_results)  # Shape: (num_patches, num_descriptions)
    
    
    def split_image_into_patches(self, image, patch_size, intensities):
        """
        Split the image into non-overlapping patches of size (patch_size x patch_size).
        """
        image = np.array(image)
        h, w, c = image.shape
        patches = []
        intensity = []
        for i in range(0, h, patch_size):
            for j in range(0, w, patch_size):
                patch = image[i : i + patch_size, j : j + patch_size]
                gradpatch = intensities[i : i + patch_size, j : j + patch_size]
                if patch.shape[:2] == (patch_size, patch_size):  # Ensure patch size
                    patches.append(Image.fromarray(patch))
                    intensity.append(np.sum(gradpatch))
        return patches, intensity
    
    # def patch_weight(i, j, image, intensity)
    
    
    def give_result(self, patch_results, text_descriptions, patch_weights):
        """
        Perform majority voting to determine the most frequent description.
        """
        votes = np.argmax(patch_results, axis=1)  # Get index of max similarity for each patch
        weighted_avg = (votes==0)*patch_weights / ((votes!=2)*patch_weights)
        # unique, counts = np.unique(votes, return_counts=True)  # Count occurrences
        # majority_index = unique[np.argmax(counts)]  # Get the index with the highest count
        return weighted_avg
    
    
    def overall_majority_vote(self, results, text_descriptions):
        """
        Perform a majority vote across patch sizes.
        """
        all_votes = np.concatenate([np.argmax(res, axis=1) for res in results])
        unique, counts = np.unique(all_votes, return_counts=True)
        majority_index = unique[np.argmax(counts)]
        return text_descriptions[majority_index], dict(zip(unique, counts))
    
    def give_result(self, patch_results, text_descriptions, patch_weights):
        """
        Perform majority voting to determine the most frequent description.
        """
        votes = np.argmax(patch_results, axis=1)  # Get index of max similarity for each patch
        if (np.sum((votes!=2)*patch_weights)!=0):
            weighted_avg = np.sum((votes==0)*patch_weights) / np.sum((votes!=2)*patch_weights)
        else:
            weighted_avg = 0
        # unique, counts = np.unique(votes, return_counts=True)  # Count occurrences
        # majority_index = unique[np.argmax(counts)]  # Get the index with the highest count
        return weighted_avg
    
    def final_img_process(self, description_list):
        
        # Load descriptions from the JSON file
        json_path = "image_text_results.json"  # Path to the JSON file
        # descriptions = load_descriptions_from_json(json_path, top_n=10)
        descriptions = description_list
        
        # Parameters
        patch_sizes = [16, 32, 64, 128, 256]  # Patch sizes to process
        
        # Load and preprocess the image
        # image = cv2.imread("/kaggle/working/cropped_image.png")
        # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.open("/kaggle/working/cropped_image.png")
        intensity = np.load("/kaggle/working/gradcam_visualizations/intensity_matrix_0.npy")
        intensity = np.array(Image.fromarray(intensity).resize((512,512)))
        # Process the image with each patch size
        all_results = []
        all_weights = []
        for patch_size in patch_sizes:
            patches, patch_intensity = self.split_image_into_patches(image, patch_size, intensity)
            patch_results = self.image_text_search(patches, descriptions)
            all_results.append(list(patch_results))
            all_weights.append(patch_intensity)
        
        # all_results[0].shape
        
        # allnp.array([i for j in all_results for i in j])
        # # Perform majority voting for each patch size
        # individual_results = []
        
        stacked_results = np.array([i for j in all_results for i in j])
        stacked_weights = np.array([i for j in all_weights for i in j])
        
        
        # # for i, patch_results in enumerate(all_results):
        weighted_avg = self.give_result(stacked_results, descriptions, stacked_weights)
        #     # individual_results.append((patch_sizes[i], label, patch_votes))
        #     # print(f"Patch Size: {patch_sizes[i]}, Final Label: {label}, Patch Votes: {patch_votes}")
        
        # # Perform overall majority voting across all patch sizes
        # # final_label, overall_votes = overall_majority_vote(all_results, descriptions)
        
        # Display the image with final results
        # plt.imshow(np.array(image))
        # title_string = f"Final Label: {final_label}\nOverall Votes: {overall_votes}"
        # plt.title(title_string)
        # plt.axis("off")
        # plt.show()
        
        # print(weighted_avg)
        return weighted_avg

In [65]:
def setup_image(img_path):
    process_img_final(img_path)
    generate_gradcam_visualizations(
        model_path='/kaggle/input/forgradcam/pytorch/default/1/fake_detector.pth',
        dataset_path='/kaggle/working/testgradcam',
        output_dir='gradcam_visualizations',
        num_samples=1,
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )

def setup_image_32(img_path):
    process_img_final_32(img_path)
    generate_gradcam_visualizations(
        model_path='/kaggle/input/forgradcam/pytorch/default/1/fake_detector.pth',
        dataset_path='/kaggle/working/testgradcam',
        output_dir='gradcam_visualizations',
        num_samples=1,
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )


In [44]:
# filter = Filter()
# filter.filter_artifacts("/kaggle/input/cifake-test")

In [45]:
def extract_probable_artifacts_from_JSON(filename, json_file_path = "image_text_results.json"):
    
    try:
        # Load JSON data from the file
        with open(json_file_path, 'r') as file:
            data = json.load(file)
        
        # Extract artifacts for the given filename
        if filename in data:
            return [entry["Artifact"] for entry in data[filename]]
        else:
            return []
    except FileNotFoundError:
        print(f"Error: File not found at {json_file_path}")
        return []
    except json.JSONDecodeError:
        print("Error: Invalid JSON format")
        return []
    
    
def find_artifacts_in_image(folder_path, threshold = 0.5):
    sr_artifacts = [
        "Incorrect reflection mapping",
        "Abruptly cut off objects",
        "Ghosting effects: Semi-transparent duplicates of elements",
        "Dental anomalies in mammals",
        "Anatomically incorrect paw structures",
        "Unrealistic eye reflections",
        "Misshapen ears or appendages",
        "Unnatural pose artifacts",
        "Biological asymmetry errors",
        "Impossible foreshortening in animal bodies",
        "Impossible mechanical connections",
        "Impossible mechanical joints",
        "Physically impossible structural elements",
        "Incorrect wheel geometry",
        "Implausible aerodynamic structures",
        "Misaligned body panels",
        "Distorted window reflections",
        "Anatomically impossible joint configurations",
        "Non-manifold geometries in rigid structures",
        "Asymmetric features in naturally symmetric objects",
        "Misaligned bilateral elements in animal faces",
        "Irregular proportions in mechanical components",
        "Inconsistent scale of mechanical parts",
        "Incorrect perspective rendering",
        "Scale inconsistencies within single objects",
        "Spatial relationship errors",
        "Scale inconsistencies within the same object class",
        "Depth perception anomalies"
    ]

    tuple_desc = pd.read_csv('/kaggle/input/simplified-tuple-artifact/simplified_artifact_descriptions.csv')
    # filter_artifacts(folder_path)
    # filter = Filter()
    # filter.filter_artifacts(folder_path)
    result = {}
    artifact_detector = Final()
    for filename in os.listdir(folder_path):
        if filename.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
            # Read and preprocess the image
            image_path = os.path.join(folder_path, filename)
            # artifacts = extract_probable_artifacts_from_JSON(filename)
            artifacts_present = []
            setup_image(image_path)
            for artifact in sr_artifacts:
                # description = tuple_desc['Artifact'].loc(tuple_desc) #write code to get list of artifact 3 tuple
                row = tuple_desc[tuple_desc['Artifact'] == artifact]
                description = row.iloc[0, 1:].tolist()
                print(description)
                
                score = artifact_detector.final_img_process(description)
                if score>threshold: 
                    artifacts_present.append(artifact)
                print(filename, " processed for ", artifact, " with probability of,", score)
                
            result[filename] = artifacts_present
    return result
    

In [None]:
# def find_artifacts_in_image(folder_path, threshold = 0.5):
#     sr_artifacts = [
#         "Incorrect reflection mapping",
#         "Abruptly cut off objects",
#         "Ghosting effects: Semi-transparent duplicates of elements",
#         "Dental anomalies in mammals",
#         "Anatomically incorrect paw structures",
#         "Unrealistic eye reflections",
#         "Misshapen ears or appendages",
#         "Unnatural pose artifacts",
#         "Biological asymmetry errors",
#         "Impossible foreshortening in animal bodies",
#         "Impossible mechanical connections",
#         "Impossible mechanical joints",
#         "Physically impossible structural elements",
#         "Incorrect wheel geometry",
#         "Implausible aerodynamic structures",
#         "Misaligned body panels",
#         "Distorted window reflections",
#         "Anatomically impossible joint configurations",
#         "Non-manifold geometries in rigid structures",
#         "Asymmetric features in naturally symmetric objects",
#         "Misaligned bilateral elements in animal faces",
#         "Irregular proportions in mechanical components",
#         "Inconsistent scale of mechanical parts",
#         "Incorrect perspective rendering",
#         "Scale inconsistencies within single objects",
#         "Spatial relationship errors",
#         "Scale inconsistencies within the same object class",
#         "Depth perception anomalies"
#     ]

#     tuple_desc = pd.read_csv('/kaggle/input/simplified-tuple-artifact/simplified_artifact_descriptions.csv')
#     # filter_artifacts(folder_path)
#     # filter = Filter()
#     # filter.filter_artifacts(folder_path)
#     result = {}
#     artifact_detector = Final()
#     for filename in os.listdir(folder_path):
#         if filename.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
#             # Read and preprocess the image
#             image_path = os.path.join(folder_path, filename)
#             # artifacts = extract_probable_artifacts_from_JSON(filename)
#             artifacts_present = []
#             setup_image(image_path)
#             for artifact in sr_artifacts:
#                 # description = tuple_desc['Artifact'].loc(tuple_desc) #write code to get list of artifact 3 tuple
#                 row = tuple_desc[tuple_desc['Artifact'] == artifact]
#                 description = row.iloc[0, 1:].tolist()
#                 print(description)
                
#                 score = artifact_detector.final_img_process(description)
#                 if score>threshold: 
#                     artifacts_present.append(artifact)
#                 print(filename, " processed for ", artifact, " with probability of,", score)
                
#             result[filename] = artifacts_present
#     return result

In [54]:
result = find_artifacts_in_image("/kaggle/input/manually-labelled-artifacts", 0.7)
print(result)

  model.load_state_dict(torch.load(model_path))
Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 27.92it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0091
Max value: 0.9982
Mean value: 0.2934
['Reflections that do not match', 'Reflections accurately match the surrounding', 'N/A for non-reflective scenes']
Image_1.png  processed for  Incorrect reflection mapping  with probability of, 0.8156757
['Objects that end suddenly instead', 'Objects taper naturally, following proper', 'Fully rendered objects']
Image_1.png  processed for  Abruptly cut off objects  with probability of, 0.0
['Duplicate elements that appear faint', 'Elements are unique and do', 'Solid, non-transparent objects']
Image_1.png  processed for  Ghosting effects: Semi-transparent duplicates of elements  with probability of, 0.6418769
['Teeth that are shaped or', 'Teeth are correctly shaped and', 'N/A for Non-biological or abstract']
Image_1.png  processed for  Dental anomalies in mammals  with probability of, 0.045781504
['Paws that do not resemble', 'Paws follow the natural anatomy', 'N/A for Non-furry or non-biological']
Im

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 29.51it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 1.0000
Mean value: 0.2570
['Reflections that do not match', 'Reflections accurately match the surrounding', 'N/A for non-reflective scenes']
Image_6.png  processed for  Incorrect reflection mapping  with probability of, 0.327395
['Objects that end suddenly instead', 'Objects taper naturally, following proper', 'Fully rendered objects']
Image_6.png  processed for  Abruptly cut off objects  with probability of, 0.0
['Duplicate elements that appear faint', 'Elements are unique and do', 'Solid, non-transparent objects']
Image_6.png  processed for  Ghosting effects: Semi-transparent duplicates of elements  with probability of, 0.57234985
['Teeth that are shaped or', 'Teeth are correctly shaped and', 'N/A for Non-biological or abstract']
Image_6.png  processed for  Dental anomalies in mammals  with probability of, 0.16215932
['Paws that do not resemble', 'Paws follow the natural anatomy', 'N/A for Non-furry or non-biological']
Ima

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 30.46it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 0.9923
Mean value: 0.3921
['Reflections that do not match', 'Reflections accurately match the surrounding', 'N/A for non-reflective scenes']
Image_3.png  processed for  Incorrect reflection mapping  with probability of, 0.21772125
['Objects that end suddenly instead', 'Objects taper naturally, following proper', 'Fully rendered objects']
Image_3.png  processed for  Abruptly cut off objects  with probability of, 0.0
['Duplicate elements that appear faint', 'Elements are unique and do', 'Solid, non-transparent objects']
Image_3.png  processed for  Ghosting effects: Semi-transparent duplicates of elements  with probability of, 0.632673
['Teeth that are shaped or', 'Teeth are correctly shaped and', 'N/A for Non-biological or abstract']
Image_3.png  processed for  Dental anomalies in mammals  with probability of, 0.15273473
['Paws that do not resemble', 'Paws follow the natural anatomy', 'N/A for Non-furry or non-biological']
Ima

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 30.96it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 1.0000
Mean value: 0.1622
['Reflections that do not match', 'Reflections accurately match the surrounding', 'N/A for non-reflective scenes']
Image_2.png  processed for  Incorrect reflection mapping  with probability of, 0.040881604
['Objects that end suddenly instead', 'Objects taper naturally, following proper', 'Fully rendered objects']
Image_2.png  processed for  Abruptly cut off objects  with probability of, 0.38362142
['Duplicate elements that appear faint', 'Elements are unique and do', 'Solid, non-transparent objects']
Image_2.png  processed for  Ghosting effects: Semi-transparent duplicates of elements  with probability of, 0.6924018
['Teeth that are shaped or', 'Teeth are correctly shaped and', 'N/A for Non-biological or abstract']
Image_2.png  processed for  Dental anomalies in mammals  with probability of, 0.61835444
['Paws that do not resemble', 'Paws follow the natural anatomy', 'N/A for Non-furry or non-biologi

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 31.17it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 0.9645
Mean value: 0.1710
['Reflections that do not match', 'Reflections accurately match the surrounding', 'N/A for non-reflective scenes']
Image_5.png  processed for  Incorrect reflection mapping  with probability of, 0.112543546
['Objects that end suddenly instead', 'Objects taper naturally, following proper', 'Fully rendered objects']
Image_5.png  processed for  Abruptly cut off objects  with probability of, 0.0
['Duplicate elements that appear faint', 'Elements are unique and do', 'Solid, non-transparent objects']
Image_5.png  processed for  Ghosting effects: Semi-transparent duplicates of elements  with probability of, 0.9722256
['Teeth that are shaped or', 'Teeth are correctly shaped and', 'N/A for Non-biological or abstract']
Image_5.png  processed for  Dental anomalies in mammals  with probability of, 0.4049022
['Paws that do not resemble', 'Paws follow the natural anatomy', 'N/A for Non-furry or non-biological']
Im

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 30.95it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0023
Max value: 1.0000
Mean value: 0.4815
['Reflections that do not match', 'Reflections accurately match the surrounding', 'N/A for non-reflective scenes']
Image_7.png  processed for  Incorrect reflection mapping  with probability of, 0.23143366
['Objects that end suddenly instead', 'Objects taper naturally, following proper', 'Fully rendered objects']
Image_7.png  processed for  Abruptly cut off objects  with probability of, 0.0
['Duplicate elements that appear faint', 'Elements are unique and do', 'Solid, non-transparent objects']
Image_7.png  processed for  Ghosting effects: Semi-transparent duplicates of elements  with probability of, 0.57193583
['Teeth that are shaped or', 'Teeth are correctly shaped and', 'N/A for Non-biological or abstract']
Image_7.png  processed for  Dental anomalies in mammals  with probability of, 0.13941008
['Paws that do not resemble', 'Paws follow the natural anatomy', 'N/A for Non-furry or non-biological']
I

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 32.22it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 1.0000
Mean value: 0.2258
['Reflections that do not match', 'Reflections accurately match the surrounding', 'N/A for non-reflective scenes']
Image_10.png  processed for  Incorrect reflection mapping  with probability of, 1.0
['Objects that end suddenly instead', 'Objects taper naturally, following proper', 'Fully rendered objects']
Image_10.png  processed for  Abruptly cut off objects  with probability of, 0.0
['Duplicate elements that appear faint', 'Elements are unique and do', 'Solid, non-transparent objects']
Image_10.png  processed for  Ghosting effects: Semi-transparent duplicates of elements  with probability of, 1.0
['Teeth that are shaped or', 'Teeth are correctly shaped and', 'N/A for Non-biological or abstract']
Image_10.png  processed for  Dental anomalies in mammals  with probability of, 0.31926593
['Paws that do not resemble', 'Paws follow the natural anatomy', 'N/A for Non-furry or non-biological']
Image_10.pn

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 31.80it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 1.0000
Mean value: 0.1896
['Reflections that do not match', 'Reflections accurately match the surrounding', 'N/A for non-reflective scenes']
Image_8.png  processed for  Incorrect reflection mapping  with probability of, -3.5790195e-05
['Objects that end suddenly instead', 'Objects taper naturally, following proper', 'Fully rendered objects']
Image_8.png  processed for  Abruptly cut off objects  with probability of, 0.0
['Duplicate elements that appear faint', 'Elements are unique and do', 'Solid, non-transparent objects']
Image_8.png  processed for  Ghosting effects: Semi-transparent duplicates of elements  with probability of, 0.74489474
['Teeth that are shaped or', 'Teeth are correctly shaped and', 'N/A for Non-biological or abstract']
Image_8.png  processed for  Dental anomalies in mammals  with probability of, 0.35867646
['Paws that do not resemble', 'Paws follow the natural anatomy', 'N/A for Non-furry or non-biological

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 29.60it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 0.9977
Mean value: 0.3279
['Reflections that do not match', 'Reflections accurately match the surrounding', 'N/A for non-reflective scenes']
Image_4.png  processed for  Incorrect reflection mapping  with probability of, 0.10510454
['Objects that end suddenly instead', 'Objects taper naturally, following proper', 'Fully rendered objects']
Image_4.png  processed for  Abruptly cut off objects  with probability of, 0.0
['Duplicate elements that appear faint', 'Elements are unique and do', 'Solid, non-transparent objects']
Image_4.png  processed for  Ghosting effects: Semi-transparent duplicates of elements  with probability of, 0.9363378
['Teeth that are shaped or', 'Teeth are correctly shaped and', 'N/A for Non-biological or abstract']
Image_4.png  processed for  Dental anomalies in mammals  with probability of, 0.40120396
['Paws that do not resemble', 'Paws follow the natural anatomy', 'N/A for Non-furry or non-biological']
Im

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 32.39it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 1.0000
Mean value: 0.2523
['Reflections that do not match', 'Reflections accurately match the surrounding', 'N/A for non-reflective scenes']
Image_9.png  processed for  Incorrect reflection mapping  with probability of, 0.014033097
['Objects that end suddenly instead', 'Objects taper naturally, following proper', 'Fully rendered objects']
Image_9.png  processed for  Abruptly cut off objects  with probability of, 0.0
['Duplicate elements that appear faint', 'Elements are unique and do', 'Solid, non-transparent objects']
Image_9.png  processed for  Ghosting effects: Semi-transparent duplicates of elements  with probability of, 0.55091214
['Teeth that are shaped or', 'Teeth are correctly shaped and', 'N/A for Non-biological or abstract']
Image_9.png  processed for  Dental anomalies in mammals  with probability of, 0.31650913
['Paws that do not resemble', 'Paws follow the natural anatomy', 'N/A for Non-furry or non-biological']


In [106]:
# process_img_final_with_downsize_to_32("/kaggle/input/pegasus/Screenshot 2024-11-24 161819.png")
# # process_img_final_already_32("/kaggle/input/cifake-test/1019 (2).jpg")

(48, 48)
(768, 768)
(768, 768)


In [63]:
print(result["Image_6.png"])
print(image_artifacts['Image_6.png'])

['Unrealistic eye reflections', 'Misshapen ears or appendages', 'Impossible foreshortening in animal bodies', 'Impossible mechanical joints', 'Incorrect wheel geometry', 'Misaligned body panels', 'Distorted window reflections', 'Non-manifold geometries in rigid structures', 'Asymmetric features in naturally symmetric objects', 'Misaligned bilateral elements in animal faces', 'Scale inconsistencies within the same object class']
['Inconsistent scale of mechanical parts']


In [74]:
def find_artifacts_in_image_32(folder_path, threshold = 0.5):
    non_sr_artifacts = [
        "Improper fur direction flows",
        "Incorrect skin tones",
        "Inconsistent object boundaries",
        "Blurred boundaries in fine details",
        "Over-sharpening artifacts",
        "Excessive sharpness in certain image regions",
        "Aliasing along high-contrast edges",
        "Jagged edges in curved structures",
        "Fake depth of field",
        "Artificial depth of field in object presentation",
        "Discontinuous surfaces",
        "Unnaturally glossy surfaces",
        "Metallic surface artifacts",
        "Texture bleeding between adjacent regions",
        "Texture repetition patterns",
        "Over-smoothing of natural textures",
        "Regular grid-like artifacts in textures",
        "Artificial noise patterns in uniform surfaces",
        "Random noise patterns in detailed areas",
        "Repeated element patterns",
        "Systematic color distribution anomalies",
        "Unnatural color transitions",
        "Color coherence breaks",
        "Frequency domain signatures",
        "Artificial smoothness",
        "Cinematization effects",
        "Movie-poster-like composition of ordinary scenes",
        "Exaggerated characteristic features",
        "Synthetic material appearance",
        "Floating or disconnected components",
        "Inconsistent material properties",
        "Depth perception anomalies",
        "Loss of fine detail in complex structures",
        "Resolution inconsistencies within regions",
        "Abruptly cut off objects",
        "Unrealistic specular highlights",
    ]


    tuple_desc = pd.read_csv('/kaggle/input/simplified-tuple-artifact/simplified_artifact_descriptions.csv')
    # filter_artifacts(folder_path)
    # filter = Filter()
    # filter.filter_artifacts(folder_path)
    result = {}
    artifact_detector = Final()
    for filename in os.listdir(folder_path):
        if filename.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
            # Read and preprocess the image
            image_path = os.path.join(folder_path, filename)
            # artifacts = extract_probable_artifacts_from_JSON(filename)
            artifacts_present = []
            setup_image(image_path)
            for artifact in non_sr_artifacts:
                # description = tuple_desc['Artifact'].loc(tuple_desc) #write code to get list of artifact 3 tuple
                row = tuple_desc[tuple_desc['Artifact'] == artifact]
                if not (row.empty): 
                    description = row.iloc[0, 1:].tolist()
                else:
                    continue
                print(description)
                
                score = artifact_detector.final_img_process(description)
                if score>threshold: 
                    artifacts_present.append(artifact)
                print(filename, " processed for ", artifact, " with probability of,", score)
                
            result[filename] = artifacts_present
    return result

In [76]:
image_artifacts = {
    "Image_1.png": [
        "Biological asymmetry error",
        "Anatomically impossible joint configurations",
        "Scale inconsistencies within single objects",
        "Unnaturally glossy surfaces",
        "Unnatural color transitions",
        "Cinematization effects",
        "Over smoothing of natural textures",
    ],
    "Image_2.png": [
        "Unnatural glossy surfaces",
        "Excessive sharpness in certain image regions",
        "Color coherence breaks",
    ],
    "Image_3.png": [
        "Missshapen ears or appendages",
        "Anatomically incorrect paw structures",
        "Dental anomalies in animals",
        "Over smoothing of natural textures",
    ],
    "Image_4.png": [
        "Abruptly cut off objects",
        "Biological asymmetry errors",
        "Asymmetric features in naturally symmetric objects",
        "Scale inconsistencies within single objects",
        "Impossible foreshortening in animal bodies",
    ],
    "Image_5.png": [
        "Anatomically impossible joint configurations",
        "Systematic color distribution anomalies",
        "Artificial smoothness",
    ],
    "Image_6.png": [
        "Glow or light bleed around object boundaries",
        "Inconsistent scale of mechanical parts",
    ],
    "Image_7.png": [
        "Dramatic lighting that defies natural surfaces",
        "Metallic surface artifacts",
        "Multiple light source conflicts",
        "Unnatural color transitions",
        "Unrealistic specular highlights",
    ],
    "Image_8.png": [
        "Inconsistent shadow directions",
        "Impossible mechanical connections",
        "Non-manifold geometries in rigid structures",
        "Implausible aerodynamic structures",
    ],
    "Image_9.png": [
        "Incorrect reflection mapping",
        "Scale inconsistencies within single objects",
        "Blurred boundaries in fine details",
    ],
}


In [77]:
def calculate_iou(dict1, dict2):
    iou_scores = {}
    
    for image_name in dict1:
        if image_name in dict2:
            # Get the sets of artifacts
            artifacts1 = set(dict1[image_name])
            artifacts2 = set(dict2[image_name])
            
            # Calculate intersection and union
            intersection = artifacts1 & artifacts2
            union = artifacts1 | artifacts2
            
            # Calculate IoU
            iou = len(intersection) / len(union) if union else 0
            iou_scores[image_name] = iou
    
    return iou_scores

# Example usage
iou_scores = calculate_iou(image_artifacts, result_32)
print(iou_scores)


{'Image_1.png': 0.05263157894736842, 'Image_2.png': 0.09090909090909091, 'Image_3.png': 0.0, 'Image_4.png': 0.0, 'Image_5.png': 0.047619047619047616, 'Image_6.png': 0.0, 'Image_7.png': 0.125, 'Image_8.png': 0.0, 'Image_9.png': 0.0}


In [79]:
result_32 = find_artifacts_in_image_32("/kaggle/input/manually-labelled-artifacts", 0.9)
print(result_32)

  model.load_state_dict(torch.load(model_path))
Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 28.21it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0091
Max value: 0.9982
Mean value: 0.2934
['Fur that grows in unrealistic', 'Fur grows naturally with proper', 'N/A for Non-furry objects']
Image_1.png  processed for  Improper fur direction flows  with probability of, 0.97259974
['Edges of objects are unclear,', 'Objects have clear and well-defined', 'Flat 2D images, simple backgrounds']
Image_1.png  processed for  Inconsistent object boundaries  with probability of, 0.96396244
['Details appear soft or smudged.', 'Details are sharp, clear, and', 'Simple, large objects']
Image_1.png  processed for  Blurred boundaries in fine details  with probability of, 0.00353237
['Edges appear unnaturally sharp.', 'Edges are naturally smooth and', 'Soft or out-of-focus images']
Image_1.png  processed for  Over-sharpening artifacts  with probability of, 0.9012657
['Some parts of the image', 'Sharpness is consistent and realistic', 'Soft, blurred regions']
Image_1.png  processed for  Excessive sharpness in

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 30.68it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 1.0000
Mean value: 0.2570
['Fur that grows in unrealistic', 'Fur grows naturally with proper', 'N/A for Non-furry objects']
Image_6.png  processed for  Improper fur direction flows  with probability of, 0.29831597
['Edges of objects are unclear,', 'Objects have clear and well-defined', 'Flat 2D images, simple backgrounds']
Image_6.png  processed for  Inconsistent object boundaries  with probability of, 1.0
['Details appear soft or smudged.', 'Details are sharp, clear, and', 'Simple, large objects']
Image_6.png  processed for  Blurred boundaries in fine details  with probability of, 0.122857295
['Edges appear unnaturally sharp.', 'Edges are naturally smooth and', 'Soft or out-of-focus images']
Image_6.png  processed for  Over-sharpening artifacts  with probability of, 1.0
['Some parts of the image', 'Sharpness is consistent and realistic', 'Soft, blurred regions']
Image_6.png  processed for  Excessive sharpness in certain ima

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 29.82it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 0.9923
Mean value: 0.3921
['Fur that grows in unrealistic', 'Fur grows naturally with proper', 'N/A for Non-furry objects']
Image_3.png  processed for  Improper fur direction flows  with probability of, 0.7307074
['Edges of objects are unclear,', 'Objects have clear and well-defined', 'Flat 2D images, simple backgrounds']
Image_3.png  processed for  Inconsistent object boundaries  with probability of, 1.0
['Details appear soft or smudged.', 'Details are sharp, clear, and', 'Simple, large objects']
Image_3.png  processed for  Blurred boundaries in fine details  with probability of, 0.045492258
['Edges appear unnaturally sharp.', 'Edges are naturally smooth and', 'Soft or out-of-focus images']
Image_3.png  processed for  Over-sharpening artifacts  with probability of, 0.83822954
['Some parts of the image', 'Sharpness is consistent and realistic', 'Soft, blurred regions']
Image_3.png  processed for  Excessive sharpness in certa

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 31.61it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 1.0000
Mean value: 0.1622
['Fur that grows in unrealistic', 'Fur grows naturally with proper', 'N/A for Non-furry objects']
Image_2.png  processed for  Improper fur direction flows  with probability of, 0.15591703
['Edges of objects are unclear,', 'Objects have clear and well-defined', 'Flat 2D images, simple backgrounds']
Image_2.png  processed for  Inconsistent object boundaries  with probability of, 1.0
['Details appear soft or smudged.', 'Details are sharp, clear, and', 'Simple, large objects']
Image_2.png  processed for  Blurred boundaries in fine details  with probability of, 0.099534936
['Edges appear unnaturally sharp.', 'Edges are naturally smooth and', 'Soft or out-of-focus images']
Image_2.png  processed for  Over-sharpening artifacts  with probability of, 0.7909428
['Some parts of the image', 'Sharpness is consistent and realistic', 'Soft, blurred regions']
Image_2.png  processed for  Excessive sharpness in certa

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 29.27it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 0.9645
Mean value: 0.1710
['Fur that grows in unrealistic', 'Fur grows naturally with proper', 'N/A for Non-furry objects']
Image_5.png  processed for  Improper fur direction flows  with probability of, 0.4025282
['Edges of objects are unclear,', 'Objects have clear and well-defined', 'Flat 2D images, simple backgrounds']
Image_5.png  processed for  Inconsistent object boundaries  with probability of, 1.0
['Details appear soft or smudged.', 'Details are sharp, clear, and', 'Simple, large objects']
Image_5.png  processed for  Blurred boundaries in fine details  with probability of, 0.12787387
['Edges appear unnaturally sharp.', 'Edges are naturally smooth and', 'Soft or out-of-focus images']
Image_5.png  processed for  Over-sharpening artifacts  with probability of, 0.8560775
['Some parts of the image', 'Sharpness is consistent and realistic', 'Soft, blurred regions']
Image_5.png  processed for  Excessive sharpness in certain

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 30.99it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0023
Max value: 1.0000
Mean value: 0.4815
['Fur that grows in unrealistic', 'Fur grows naturally with proper', 'N/A for Non-furry objects']
Image_7.png  processed for  Improper fur direction flows  with probability of, 0.11443758
['Edges of objects are unclear,', 'Objects have clear and well-defined', 'Flat 2D images, simple backgrounds']
Image_7.png  processed for  Inconsistent object boundaries  with probability of, 1.0
['Details appear soft or smudged.', 'Details are sharp, clear, and', 'Simple, large objects']
Image_7.png  processed for  Blurred boundaries in fine details  with probability of, 0.0
['Edges appear unnaturally sharp.', 'Edges are naturally smooth and', 'Soft or out-of-focus images']
Image_7.png  processed for  Over-sharpening artifacts  with probability of, 0.9625127
['Some parts of the image', 'Sharpness is consistent and realistic', 'Soft, blurred regions']
Image_7.png  processed for  Excessive sharpness in certain image

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 30.33it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 1.0000
Mean value: 0.2258
['Fur that grows in unrealistic', 'Fur grows naturally with proper', 'N/A for Non-furry objects']
Image_10.png  processed for  Improper fur direction flows  with probability of, 1.0
['Edges of objects are unclear,', 'Objects have clear and well-defined', 'Flat 2D images, simple backgrounds']
Image_10.png  processed for  Inconsistent object boundaries  with probability of, 1.0
['Details appear soft or smudged.', 'Details are sharp, clear, and', 'Simple, large objects']
Image_10.png  processed for  Blurred boundaries in fine details  with probability of, 0.040940855
['Edges appear unnaturally sharp.', 'Edges are naturally smooth and', 'Soft or out-of-focus images']
Image_10.png  processed for  Over-sharpening artifacts  with probability of, 0.9444406
['Some parts of the image', 'Sharpness is consistent and realistic', 'Soft, blurred regions']
Image_10.png  processed for  Excessive sharpness in certain

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 30.34it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 1.0000
Mean value: 0.1896
['Fur that grows in unrealistic', 'Fur grows naturally with proper', 'N/A for Non-furry objects']
Image_8.png  processed for  Improper fur direction flows  with probability of, 0.2509062
['Edges of objects are unclear,', 'Objects have clear and well-defined', 'Flat 2D images, simple backgrounds']
Image_8.png  processed for  Inconsistent object boundaries  with probability of, 1.0
['Details appear soft or smudged.', 'Details are sharp, clear, and', 'Simple, large objects']
Image_8.png  processed for  Blurred boundaries in fine details  with probability of, 0.11168563
['Edges appear unnaturally sharp.', 'Edges are naturally smooth and', 'Soft or out-of-focus images']
Image_8.png  processed for  Over-sharpening artifacts  with probability of, 1.0
['Some parts of the image', 'Sharpness is consistent and realistic', 'Soft, blurred regions']
Image_8.png  processed for  Excessive sharpness in certain image

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 32.67it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 0.9977
Mean value: 0.3279
['Fur that grows in unrealistic', 'Fur grows naturally with proper', 'N/A for Non-furry objects']
Image_4.png  processed for  Improper fur direction flows  with probability of, 0.28718692
['Edges of objects are unclear,', 'Objects have clear and well-defined', 'Flat 2D images, simple backgrounds']
Image_4.png  processed for  Inconsistent object boundaries  with probability of, 1.0
['Details appear soft or smudged.', 'Details are sharp, clear, and', 'Simple, large objects']
Image_4.png  processed for  Blurred boundaries in fine details  with probability of, 0.44732067
['Edges appear unnaturally sharp.', 'Edges are naturally smooth and', 'Soft or out-of-focus images']
Image_4.png  processed for  Over-sharpening artifacts  with probability of, 0.7116731
['Some parts of the image', 'Sharpness is consistent and realistic', 'Soft, blurred regions']
Image_4.png  processed for  Excessive sharpness in certai

Generating Grad-CAM visualizations: 100%|██████████| 1/1 [00:00<00:00, 29.46it/s]


Intensity Matrix Shape: (32, 32)
Min value: 0.0000
Max value: 1.0000
Mean value: 0.2523
['Fur that grows in unrealistic', 'Fur grows naturally with proper', 'N/A for Non-furry objects']
Image_9.png  processed for  Improper fur direction flows  with probability of, 0.28664318
['Edges of objects are unclear,', 'Objects have clear and well-defined', 'Flat 2D images, simple backgrounds']
Image_9.png  processed for  Inconsistent object boundaries  with probability of, 1.0
['Details appear soft or smudged.', 'Details are sharp, clear, and', 'Simple, large objects']
Image_9.png  processed for  Blurred boundaries in fine details  with probability of, 0.18258812
['Edges appear unnaturally sharp.', 'Edges are naturally smooth and', 'Soft or out-of-focus images']
Image_9.png  processed for  Over-sharpening artifacts  with probability of, 0.8776951
['Some parts of the image', 'Sharpness is consistent and realistic', 'Soft, blurred regions']
Image_9.png  processed for  Excessive sharpness in certai

In [81]:
def concatenate_dicts(dict1, dict2):
    # Ensure both dictionaries have the same keys
    if set(dict1.keys()) != set(dict2.keys()):
        raise ValueError("Dictionaries do not have the same keys.")
    
    # Create the new dictionary with concatenated lists
    concatenated_dict = {key: dict1[key] + dict2[key] for key in dict1.keys()}
    return concatenated_dict
results = concatenate_dicts(result, result_32)
iou_scores = calculate_iou(image_artifacts, results)
print(iou_scores)

{'Image_1.png': 0.09090909090909091, 'Image_2.png': 0.06666666666666667, 'Image_3.png': 0.0, 'Image_4.png': 0.16666666666666666, 'Image_5.png': 0.0, 'Image_6.png': 0.0, 'Image_7.png': 0.06666666666666667, 'Image_8.png': 0.047619047619047616, 'Image_9.png': 0.0}
