In [9]:
import torch
from torch import nn, Tensor
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import CIFAR100
from typing import Optional, Callable
import os
import timm
import numpy as np
import pandas as pd
from torchvision.transforms import v2
from torch.backends import cudnn
from torch import GradScaler
from torch import optim
from tqdm import tqdm

In [None]:
list_name = [
    'caformer_s18',
    'caformer_m36',
    'caformer_b36',
    'maxvit_small_tf_224',
    'maxvit_base_tf_224'
]

In [3]:
device = torch.device('cuda')
cudnn.benchmark = True
pin_memory = True
enable_half = True  # Disable for CPU, it is slower!
scaler = GradScaler(device, enabled=enable_half)

In [4]:
class SimpleCachedDataset(Dataset):
    def __init__(self, dataset, transform):
        self.data = dataset.data
        self.targets = dataset.targets
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.transform(self.data[i]), self.targets[i]

In [5]:
class CIFAR100_noisy_fine(Dataset):
    """
    See https://github.com/UCSC-REAL/cifar-10-100n, https://www.noisylabels.com/ and `Learning with Noisy Labels
    Revisited: A Study Using Real-World Human Annotations`.
    """

    def __init__(
        self, root: str, train: bool, transform: Optional[Callable], download: bool
    ):
        cifar100 = CIFAR100(
            root=root, train=train, transform=None, download=download
        )
        data, targets = tuple(zip(*cifar100))

        if train:
            noisy_label_file = os.path.join(root, "CIFAR-100-noisy.npz")
            if not os.path.isfile(noisy_label_file):
                raise FileNotFoundError(
                    f"{type(self).__name__} need {noisy_label_file} to be used!"
                )

            noise_file = np.load(noisy_label_file)
            if not np.array_equal(noise_file["clean_label"], targets):
                raise RuntimeError("Clean labels do not match!")
            targets = noise_file["noisy_label"]

        self.data = data
        self.targets = targets
        self.transform = transform

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, i: int):
        return self.transform(self.data[i]), self.targets[i]

In [6]:
train_transform = v2.Compose([
    v2.Resize((224, 224)),
    v2.RandAugment(num_ops=6, magnitude=9),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=(0.5071, 0.4865, 0.4409), std=(0.2673, 0.2564, 0.2762), inplace=True),
])
test_transform = v2.Compose([
    v2.Resize((224, 224)),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=(0.5071, 0.4865, 0.4409), std=(0.2673, 0.2564, 0.2762), inplace=True),
])
train_set = CIFAR100_noisy_fine('./fii-atnn-2024-project-noisy-cifar-100', download=False, train=True, transform=test_transform)
test_set = CIFAR100_noisy_fine('./fii-atnn-2024-project-noisy-cifar-100', download=False, train=False, transform=test_transform)
train_set = SimpleCachedDataset(train_set, transform=test_transform)
test_set = SimpleCachedDataset(test_set, transform=test_transform)

train_loader = DataLoader(train_set, batch_size=1, shuffle=False, pin_memory=pin_memory)
test_loader = DataLoader(test_set, batch_size=8, pin_memory=pin_memory)

In [7]:
def create_base_model(name):
    return timm.create_model(
        name,
        pretrained=True,
        num_classes=100
    )

In [None]:
@torch.inference_mode()
def inference_to_get_predicted_labels():
    models = []
    for name in list_name:
        model = create_base_model(name)
        model_path = f'./v3/best_{name}.pth'
        state = torch.load(model_path, map_location=device)
        model.load_state_dict(state)
        model.to(device)
        model.eval()
        models.append(model)

    all_preds = []
    for inputs, _ in tqdm(train_loader):
        inputs = inputs.to(device, non_blocking=True)

        with torch.autocast(device_type=device.type, enabled=enable_half):
            logits = [m(inputs) for m in models]
            summed = torch.stack(logits, dim=0).sum(dim=0)

        preds = summed.argmax(dim=1)          # [B]
        all_preds.extend(preds.cpu().tolist())

    return all_preds

In [9]:
labels_pred = inference_to_get_predicted_labels()

100%|█████████████████████████████████████████████████████████████████████████| 50000/50000 [1:40:38<00:00,  8.28it/s]


In [10]:
print(len(labels_pred))

50000


In [10]:
@torch.inference_mode()
def inference_to_get_test_labels():
    models = []
    for name in list_name:
        model = create_base_model(name)
        model_path = f'./v3/best_{name}.pth'
        state = torch.load(model_path, map_location=device)
        model.load_state_dict(state)
        model.to(device)
        model.eval()
        models.append(model)

    all_preds = []
    for inputs, _ in tqdm(test_loader):
        inputs = inputs.to(device, non_blocking=True)

        with torch.autocast(device_type=device.type, enabled=enable_half):
            logits = [m(inputs) for m in models]
            summed = torch.stack(logits, dim=0).sum(dim=0)

        preds = summed.argmax(dim=1)          # [B]
        all_preds.extend(preds.cpu().tolist())

    return all_preds

In [11]:
test_loader = DataLoader(test_set, batch_size=8, pin_memory=pin_memory)
test_pred = inference_to_get_test_labels()

100%|█████████████████████████████████████████████████████████████████████████████| 1250/1250 [02:45<00:00,  7.56it/s]


In [12]:
print(len(test_pred))

10000


In [13]:
test_pred[0]

68

In [14]:
test_loader = DataLoader(test_set, batch_size=1, pin_memory=pin_memory)

acc = 0
cont = 0
for _, targ in test_loader:
    if targ == test_pred[cont]:
        acc += 1
    cont += 1
print(acc/len(test_pred))

0.843


In [11]:
cifar100 = CIFAR100(root='./fii-atnn-2024-project-noisy-cifar-100', train=True, transform=None, download=False)
data, targets = tuple(zip(*cifar100))

noisy_label_file = os.path.join('./fii-atnn-2024-project-noisy-cifar-100', "CIFAR-100-noisy.npz")
if not os.path.isfile(noisy_label_file):
    raise FileNotFoundError(f"{type(self).__name__} need {noisy_label_file} to be used!")

noise_file = np.load(noisy_label_file)
if not np.array_equal(noise_file["clean_label"], targets):
    raise RuntimeError("Clean labels do not match!")
targets = noise_file["noisy_label"]

print(len(data))
print(len(targets))

50000
50000


In [12]:
### create dataset 1: delete instances with pred != noisy
### create dataset 2: change the label for instances with pred != noisy with pred value

new_data_1 = []
new_targets_1 = []
new_data_2 = []
new_targets_2 = []

new_values_1 = [0] * 100
new_values_2 = [0] * 100

idx = 0
for i, j in zip(labels_pred, targets):
    if i == j:
        # new_data_1.append(data[idx])
        # new_data_2.append(data[idx])
        new_targets_1.append((i, idx))
        new_targets_2.append((i, idx))

        new_values_1[i] += 1
        new_values_2[i] += 1
    else:
        # new_data_2.append(data[idx])
        new_targets_2.append((i, idx))
        
        new_values_2[i] += 1
    idx += 1

In [13]:
print(new_values_1)

[469, 369, 435, 383, 40, 424, 309, 354, 420, 458, 269, 382, 255, 391, 382, 380, 339, 391, 285, 382, 523, 423, 418, 462, 375, 320, 306, 306, 373, 274, 314, 411, 145, 310, 329, 307, 406, 390, 274, 407, 351, 356, 232, 428, 355, 136, 618, 174, 448, 373, 387, 290, 428, 462, 325, 108, 318, 344, 488, 46, 284, 507, 391, 66, 160, 277, 270, 192, 412, 355, 340, 373, 135, 253, 93, 210, 344, 337, 333, 304, 254, 147, 495, 382, 346, 331, 462, 372, 370, 392, 369, 218, 241, 345, 423, 141, 107, 284, 465, 342]


In [14]:
print(new_values_2)

[543, 629, 543, 619, 155, 517, 469, 546, 495, 531, 362, 508, 387, 557, 494, 484, 458, 550, 460, 481, 585, 554, 491, 631, 479, 414, 536, 507, 522, 463, 674, 495, 276, 605, 566, 398, 782, 510, 464, 503, 466, 467, 414, 537, 538, 297, 751, 298, 500, 523, 1060, 498, 1103, 514, 509, 341, 562, 432, 555, 124, 436, 680, 706, 188, 322, 452, 567, 447, 561, 505, 450, 577, 381, 550, 343, 483, 508, 533, 453, 530, 489, 319, 556, 499, 458, 485, 515, 462, 533, 536, 516, 462, 374, 563, 522, 302, 292, 498, 612, 603]


In [15]:
np.savez("dataset_delete_test_v4.npz", targets=new_targets_1)
np.savez("dataset_change_test_v4.npz", targets=new_targets_2)

In [16]:
cifar100 = CIFAR100(root='./fii-atnn-2024-project-noisy-cifar-100', train=True, transform=None, download=False)
data, targets = tuple(zip(*cifar100))

print(len(targets))

50000


In [17]:
acc = 0
for dataset in [new_targets_1, new_targets_2]:
    acc = 0
    for it in dataset:
        if targets[it[1]] == it[0]:
            acc += 1
    print(acc / len(dataset), len(dataset))
# for i, j in zip(targets, new_targets_2):
#     if i == j[0]:
#         acc += 1
# print(acc / len(new_targets_2))

0.8763455532308438 32979
0.82916 50000


In [18]:
cifar100 = CIFAR100(root='./fii-atnn-2024-project-noisy-cifar-100', train=True, transform=None, download=False)
data, targets = tuple(zip(*cifar100))

print(len(targets))

noisy_label_file = os.path.join('./fii-atnn-2024-project-noisy-cifar-100', "CIFAR-100-noisy.npz")
if not os.path.isfile(noisy_label_file):
    raise FileNotFoundError(f"{type(self).__name__} need {noisy_label_file} to be used!")

noise_file = np.load(noisy_label_file)
if not np.array_equal(noise_file["clean_label"], targets):
    raise RuntimeError("Clean labels do not match!")
targets_noisy = noise_file["noisy_label"]

50000


In [19]:
acc = 0
for it in range(50000):
    if targets[it] == targets_noisy[it]:
        acc += 1
print(acc / 50000)

0.598
