In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from torchvision import transforms, models, datasets
from torchvision.transforms.functional import InterpolationMode
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import copy
from utils import get_model, get_transforms, get_dataset, get_features, get_recon_loss, learn_reconstruct, do_feat_recon, get_logits_from_feats

  from .autonotebook import tqdm as notebook_tqdm


Given the predicted logits from a ResNet18, pretrained on ImageNet, can we learn the features with a single linear layer?

In [17]:
model = get_model('resnet18')

In [3]:
test_transform = get_transforms('cifar100')
train_set, test_set = get_dataset('cifar100', test_transform=test_transform, get_train=True)
len(test_set), len(train_set)

Files already downloaded and verified
Files already downloaded and verified


(10000, 50000)

In [23]:
cifar_feats_train, cifar_logits_train, cifar_labels_train = get_features(model, train_set)
cifar_feats_test, cifar_logits_test, cifar_labels_test = get_features(model, test_set)
cifar_feats_train.shape, cifar_feats_test.shape, cifar_logits_train.shape, cifar_logits_test.shape

1563it [00:29, 52.88it/s]                          
313it [00:06, 49.26it/s]                         


((50000, 512), (10000, 512), (50000, 1000), (10000, 1000))

In [24]:
recon_model = learn_reconstruct(cifar_logits_train, cifar_feats_train, cifar_logits_test, cifar_feats_test)

Epoch 0, train loss=82.71377904620022, test loss = 5.457098842598498
Epoch 10, train loss=7.264157226425596, test loss = 1.5426271691685542
Epoch 20, train loss=4.583923751953989, test loss = 0.9979622986866161
Epoch 30, train loss=3.319168064976111, test loss = 0.7295367528568022
Epoch 40, train loss=2.5593346460082103, test loss = 0.5646934271790087
Epoch 50, train loss=2.0483272130950354, test loss = 0.4522753198398277
Epoch 60, train loss=1.6807755785412155, test loss = 0.37070933749782853
Epoch 70, train loss=1.4042739442666061, test loss = 0.30904794664820656
Epoch 80, train loss=1.189472611164092, test loss = 0.26104325967025943
Epoch 90, train loss=1.0185217096150154, test loss = 0.22283535328460857


In [25]:
recon_cifar_feats_test = do_feat_recon(cifar_logits_test, recon_model)
recon_cifar_feats_test.shape

(10000, 512)

In [26]:
preds_recon = get_logits_from_feats(recon_cifar_feats_test,  model.original_fc)
preds_recon.shape

313it [00:00, 382.99it/s]                         


(10000, 1000)

In [27]:
top_recon = np.argmax(preds_recon, axis=1)
top_orig = np.argmax(cifar_logits_test, axis=1)
np.mean(top_orig == top_recon)

0.9935

Training on CIFAR-100 train, and testing on CIFAR-100 test, we get a reconstruction, MSE loss of 0.224 total over the 10k test examples.

If we take these reconstructed features and then use the model's linear head, the top-1 prediction matches the original prediction 99.24% of the time.

Given the predicted logits from a ResNet18, pretrained on ImageNet, can we learn the features from a pretrained ResNet50 with a single linear layer?

In [4]:
resnet50 = get_model('resnet50')

In [5]:
cifar_feats_train_resnet50, cifar_logits_train_resnet50, cifar_labels_train_resnet50 = get_features(resnet50, train_set)
cifar_feats_test_resnet50, cifar_logits_test_resnet50, cifar_labels_test_resnet50 = get_features(resnet50, test_set)
cifar_feats_train_resnet50.shape, cifar_feats_test_resnet50.shape, cifar_logits_train_resnet50.shape, cifar_logits_test_resnet50.shape

1563it [00:59, 26.34it/s]                          
313it [00:12, 25.56it/s]                         


((50000, 2048), (10000, 2048), (50000, 1000), (10000, 1000))

In [7]:
recon_model_resnet50 = learn_reconstruct(cifar_logits_train_resnet50, cifar_feats_train_resnet50, cifar_logits_test_resnet50, cifar_feats_test_resnet50)

Epoch 0, train loss=39.300385073293, test loss = 2.8255152758210897
Epoch 10, train loss=3.7770108753466047, test loss = 0.7561160636250861
Epoch 20, train loss=3.034100213786587, test loss = 0.6142645400250331
Epoch 30, train loss=2.717832017049659, test loss = 0.5525207920582034
Epoch 40, train loss=2.5255862835620064, test loss = 0.5147924707562197
Epoch 50, train loss=2.3897917853319086, test loss = 0.48809869555407204
Epoch 60, train loss=2.285780160455033, test loss = 0.4676349123183172
Epoch 70, train loss=2.201998257514788, test loss = 0.4511379585310351
Epoch 80, train loss=2.132155106140999, test loss = 0.4373734011896886
Epoch 90, train loss=2.072465785226086, test loss = 0.4255984641495161


In [9]:
recon_cifar_feats_test_resnet50 = do_feat_recon(cifar_logits_test_resnet50, recon_model_resnet50)
preds_recon_resnet50 = get_logits_from_feats(recon_cifar_feats_test_resnet50,  resnet50.original_fc)
top_recon = np.argmax(preds_recon_resnet50, axis=1)
top_orig = np.argmax(cifar_logits_test_resnet50, axis=1)
np.mean(top_orig == top_recon)

313it [00:00, 393.13it/s]                         


0.9822

Training on CIFAR-100 train, and testing on CIFAR-100 test, we get a reconstruction, MSE loss of 5.17 total over the 10k test examples.

If we take these reconstructed features and then use the model's linear head, the top-1 prediction matches the original prediction 99.47% of the time.

Given the predicted logits from a ResNet18, pretrained on ImageNet, can we learn the features with a single linear layer, while training / testing on a different dataset?

In [12]:
places_test = get_dataset("places365", get_train=False)
len(places_test)

36500

In [30]:
places_feats_test, places_logits_test, places_labels_test = get_features(model, places_test)
places_feats_test.shape, places_logits_test.shape

1141it [00:23, 49.08it/s]                          


((36500, 512), (36500, 1000))

In [31]:
recon_places_feats_test = do_feat_recon(places_logits_test, recon_model)
places_preds_recon = get_logits_from_feats(recon_places_feats_test,  model.original_fc)
top_recon = np.argmax(places_preds_recon, axis=1)
top_orig = np.argmax(places_logits_test, axis=1)
np.mean(top_orig == top_recon)

1141it [00:01, 629.53it/s]                          


0.9765205479452055

If we take these reconstructed features and then use the model's linear head, the top-1 prediction matches the original prediction 96.48% of the time.

Given the predicted logits from a ResNet18, pretrained on ImageNet, can we learn the features from a ViT model with a single linear layer?

In [64]:
import open_clip
clip_vit, _, test_transform = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')

In [63]:
test_set = datasets.CIFAR10(root=f'/home/ubuntu/data/cifar10', train=False, download=True, transform=test_transform)
train_set = datasets.CIFAR10(root=f'/home/ubuntu/data/cifar10', train=True, download=True, transform=test_transform)
len(test_set), len(train_set)

Files already downloaded and verified
Files already downloaded and verified


(10000, 50000)

In [65]:
cifar_feats_train_vit = get_features(clip_vit, train_set)
cifar_feats_test_vit = get_features(clip_vit, test_set)
cifar_feats_train_vit.shape, cifar_feats_test_vit.shape

AttributeError: 'CLIP' object has no attribute 'fc'