In [3]:
from torchvision import transforms, models, datasets
from torchvision.transforms.functional import InterpolationMode
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import copy

  from .autonotebook import tqdm as notebook_tqdm


Given the predicted logits from a ResNet18, pretrained on ImageNet, can we learn the features with a single linear layer?

In [4]:
# Load torchvision's ResNet18 with pretrained weights; this is the network
# whose logits and penultimate features are compared in the cells below.
model = models.resnet18(pretrained=True)

In [3]:
# Evaluation preprocessing: resize, centre-crop, convert to float, normalise.
# mean/std are presumably the usual ImageNet channel statistics -- confirm.
# NOTE(review): in the recorded run this cell raised FileNotFoundError because
# the validation directory was missing; test_set is redefined by the CIFAR-10
# cell that follows.
resize_size, crop_size = 256, 224
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)

test_transform = transforms.Compose([
    transforms.Resize(resize_size, interpolation=InterpolationMode.BILINEAR),
    transforms.CenterCrop(crop_size),
    transforms.PILToTensor(),
    transforms.ConvertImageDtype(torch.float),
    transforms.Normalize(mean=mean, std=std),
])
test_set = datasets.ImageFolder(
    '/home/ubuntu/data/Imagenet/ILSVRC/Data/CLS-LOC/val/',
    transform=test_transform,
)
len(test_set)

FileNotFoundError: [Errno 2] No such file or directory: '/home/ubuntu/data/Imagenet/ILSVRC/Data/CLS-LOC/val/'

In [5]:
# CIFAR-10 train/test splits, both using the same deterministic transform
# (no augmentation) so extracted features are reproducible.
# NOTE(review): CIFAR-10 images are 32x32; CenterCrop to 224x224 on a smaller
# image is documented to pad rather than upsample -- confirm that padding (and
# not transforms.Resize) is what this experiment intends.
crop_size = 224
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
cifar_root = '/home/ubuntu/data/cifar10'

test_transform = transforms.Compose([
    transforms.CenterCrop((crop_size, crop_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])
# Built in the same order as before: test split first, then train split.
test_set, train_set = (
    datasets.CIFAR10(root=cifar_root, train=is_train, download=True, transform=test_transform)
    for is_train in (False, True)
)
len(test_set), len(train_set)

Files already downloaded and verified
Files already downloaded and verified


(10000, 50000)

In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [7]:
def append_np(original, to_append):
    """Append ``to_append`` to ``original`` along the first axis.

    Args:
        original: Array accumulated so far, or ``None`` when nothing has been
            accumulated yet.
        to_append: Array to add after ``original``.

    Returns:
        ``to_append`` itself when ``original`` is ``None``; otherwise a new
        array produced by ``np.concatenate((original, to_append))``.
    """
    if original is not None:
        return np.concatenate((original, to_append))
    return to_append

In [8]:
def get_features(model, dataset, batch_size=32, num_workers=8):
    """Return the inputs of ``model.fc`` for every item in ``dataset``.

    ``model.fc`` is temporarily replaced by ``torch.nn.Identity`` so that the
    forward pass returns what the final layer would have received. The
    original layer is put back before returning, even when the forward loop
    raises or is interrupted, so the model is never left without its head.

    Args:
        model: Network exposing its final layer as ``model.fc``.
        dataset: Dataset yielding ``(images, label)`` pairs; labels are ignored.
        batch_size: Number of items per forward pass.
        num_workers: Worker count passed to the ``DataLoader``.

    Returns:
        A NumPy array with one row per dataset item, in dataset order
        (the loader does not shuffle), or ``None`` if the loader yields no
        batches.

    Side effects:
        Moves ``model`` to the module-level ``device`` and switches it to eval
        mode. The original ``fc`` is detached from the model while it is
        moved, so it stays on whatever device it was on before the call.
    """
    original_fc = model.fc
    model.fc = torch.nn.Identity()
    try:
        model.to(device)
        model.eval()
        dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
        batch_feats = []
        with torch.no_grad():
            # len(dataloader) includes the final partial batch, unlike
            # len(dataset) // batch_size, so the progress bar total is exact.
            for images, _ in tqdm(dataloader, total=len(dataloader)):
                batch_feats.append(model(images.to(device)).cpu().numpy())
    finally:
        # Always restore the head so a failed run cannot leak the Identity.
        model.fc = original_fc
    if not batch_feats:
        return None
    # Concatenate once instead of re-copying the growing array every batch.
    return np.concatenate(batch_feats)

In [9]:
# Extract ResNet18 penultimate features for both CIFAR splits.
# The recorded run produced arrays of shape (50000, 512) and (10000, 512).
cifar_feats_train = get_features(model, train_set)
cifar_feats_test = get_features(model, test_set)
cifar_feats_train.shape, cifar_feats_test.shape

1563it [01:15, 20.81it/s]                          
313it [00:08, 39.12it/s]                         


((50000, 512), (10000, 512))

In [None]:
def compute_logits(model, dataset, batch_size=32, num_workers=8):
    """Run ``model`` over ``dataset`` and collect its outputs and the labels.

    Args:
        model: Network to evaluate; called on each image batch.
        dataset: Dataset yielding ``(images, label)`` pairs.
        batch_size: Number of items per forward pass.
        num_workers: Worker count passed to the ``DataLoader``.

    Returns:
        ``(all_logits, all_labels)`` as NumPy arrays in dataset order (the
        loader does not shuffle), or ``(None, None)`` if the loader yields no
        batches.

    Side effects:
        Moves ``model`` to the module-level ``device`` and switches it to eval
        mode.
    """
    model.to(device)
    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
    logit_batches = []
    label_batches = []
    with torch.no_grad():
        # len(dataloader) includes the final partial batch, so the progress
        # bar total is exact even when batch_size does not divide the dataset.
        for images, label in tqdm(dataloader, total=len(dataloader)):
            logit_batches.append(model(images.to(device)).cpu().numpy())
            label_batches.append(label.numpy())
    if not logit_batches:
        return None, None
    # Concatenate once instead of re-copying the growing arrays every batch.
    return np.concatenate(logit_batches), np.concatenate(label_batches)

'def get_logits(model, dataset, batch_size=32, num_workers=8):\n    model.to(device)\n    model.eval()\n    dataloader = DataLoader(dataset,batch_size=batch_size, num_workers=num_workers,shuffle=False)\n    all_logits = None\n    all_labels = None\n    for (images, label) in tqdm(dataloader, total=len(dataset) // batch_size):\n        with torch.no_grad():\n            output = model(images.to(device)).cpu().numpy()\n            all_logits = append_np(all_logits, output)\n            all_labels = append_np(all_labels, label.numpy())\n    return all_logits, all_labels'

In [22]:
class FeatDataset(torch.utils.data.Dataset):
    """Dataset over precomputed feature rows, optionally paired with labels.

    ``features`` is converted to a ``torch.FloatTensor`` once at construction.
    Indexing yields the feature row alone when no labels were given, and a
    ``(feature, label)`` tuple otherwise.
    """

    def __init__(self, features, labels=None):
        self.features = torch.FloatTensor(features)
        # Stored as given; only ever indexed with the same idx as features.
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        row = self.features[idx]
        return row if self.labels is None else (row, self.labels[idx])

In [23]:
def get_logits(feats, lin, batch_size=16, num_workers=8):
    """Apply the layer ``lin`` to precomputed features, batch by batch.

    Args:
        feats: Feature rows; wrapped in a ``FeatDataset`` (which converts them
            to a float tensor).
        lin: Module applied to each batch. Its output is converted with
            ``.numpy()`` directly, so it must produce CPU tensors.
        batch_size: Rows per batch (defaults to the previously hard-coded 16).
        num_workers: Worker count passed to the ``DataLoader`` (defaults to
            the previously hard-coded 8).

    Returns:
        A NumPy array of ``lin`` outputs in the same row order as ``feats``,
        or ``None`` if ``feats`` is empty.

    Side effects:
        Switches ``lin`` to eval mode.
    """
    lin.eval()
    dataloader = DataLoader(FeatDataset(feats), batch_size=batch_size, num_workers=num_workers, shuffle=False)
    batch_preds = []
    with torch.no_grad():
        # len(dataloader) includes the final partial batch, so the progress
        # bar total is exact even when batch_size does not divide len(feats).
        for batch_feats in tqdm(dataloader, total=len(dataloader)):
            batch_preds.append(lin(batch_feats).numpy())
    if not batch_preds:
        return None
    # Concatenate once instead of re-copying the growing array every batch.
    return np.concatenate(batch_preds)

In [24]:
# Push the stored features through ResNet18's own head to get its logits.
# The head is moved to the CPU because get_logits converts the output with
# .numpy() directly.
cifar_logits_train = get_logits(cifar_feats_train, model.fc.cpu())
cifar_logits_test = get_logits(cifar_feats_test, model.fc.cpu())
cifar_logits_train.shape, cifar_logits_test.shape

100%|██████████| 3125/3125 [02:56<00:00, 17.66it/s]
100%|██████████| 625/625 [00:05<00:00, 112.22it/s]


((50000, 1000), (10000, 1000))

In [53]:
def get_recon_loss(recon_model, logits, features, batch_size=16):
    """Accumulate the reconstruction MSE of ``recon_model`` over mini-batches.

    Args:
        recon_model: Module mapping a batch of logits to predicted features.
        logits: Input rows, sliced as ``logits[start:start + batch_size]``.
        features: Target rows aligned with ``logits``.
        batch_size: Rows per slice.

    Returns:
        The sum, over batches, of ``torch.nn.MSELoss()`` evaluated on each
        batch (mean reduction is that loss's default). This is a sum of
        per-batch means, so it scales with the number of batches; it is
        neither a per-element mean nor a per-example total. Returns ``0``
        when ``logits`` is empty.

    Side effects:
        Switches ``recon_model`` to eval mode.
    """
    recon_model.eval()
    criterion = torch.nn.MSELoss()
    accumulated = 0
    with torch.no_grad():
        for start in range(0, len(logits), batch_size):
            stop = start + batch_size
            batch_loss = criterion(recon_model(logits[start:stop]), features[start:stop])
            accumulated += batch_loss.item()
    return accumulated

In [25]:
def learn_reconstruct(logits, features, test_logits, test_features,
                      epochs=100, batch_size=16, lr=0.1, momentum=0.9):
    """Fit a single linear layer that maps logits back to features.

    Trains ``torch.nn.Linear(logits.shape[1], features.shape[1])`` with SGD
    and an MSE objective, visiting the training rows in their given order
    (no shuffling). Every 10th epoch, starting with epoch 0, it prints the
    accumulated train loss and the held-out loss from ``get_recon_loss``.

    Args:
        logits: Training inputs; must expose ``.shape`` (e.g. a NumPy array).
        features: Training targets aligned row-for-row with ``logits``.
        test_logits: Held-out inputs used only for the periodic report.
        test_features: Held-out targets aligned with ``test_logits``.
        epochs: Number of passes over the training rows (was hard-coded 100).
        batch_size: Rows per optimisation step (was hard-coded 16).
        lr: SGD learning rate (was hard-coded 0.1).
        momentum: SGD momentum (was hard-coded 0.9).

    Returns:
        The trained ``torch.nn.Linear`` module.
    """
    lin = torch.nn.Linear(logits.shape[1], features.shape[1])
    rec_loss = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(lin.parameters(), lr=lr, momentum=momentum)

    # as_tensor accepts arrays, lists and tensors alike and casts to float32.
    logits = torch.as_tensor(logits, dtype=torch.float32)
    features = torch.as_tensor(features, dtype=torch.float32)
    test_logits = torch.as_tensor(test_logits, dtype=torch.float32)
    test_features = torch.as_tensor(test_features, dtype=torch.float32)

    for i in range(epochs):
        # Sum of per-batch mean losses for this epoch (reporting only).
        total_loss = 0
        for b in range(0, len(logits), batch_size):
            optimizer.zero_grad()
            x = logits[b:b+batch_size]
            y = features[b:b+batch_size]
            pred_y = lin(x)
            loss = rec_loss(pred_y, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if i % 10 == 0:
            test_loss = get_recon_loss(lin, test_logits, test_features, batch_size=batch_size)
            print(f"Epoch {i}, train loss={total_loss}, test loss = {test_loss}")
            # get_recon_loss switched the layer to eval mode; switch back.
            lin.train()

    return lin

In [26]:
# Train the logits -> features linear map on the CIFAR train split; the test
# split is used only for the loss printed every 10 epochs.
recon_model = learn_reconstruct(cifar_logits_train, cifar_feats_train, cifar_logits_test, cifar_feats_test)

Epoch 0, train loss=71.42700271867216, test loss = 6.059823074378073
Epoch 10, train loss=6.075193635420874, test loss = 1.572297875303775
Epoch 20, train loss=3.936132945585996, test loss = 1.029624812770635
Epoch 30, train loss=2.9237039703293703, test loss = 0.7714758660295047
Epoch 40, train loss=2.3084190539084375, test loss = 0.6153458339977078
Epoch 50, train loss=1.889220092096366, test loss = 0.5092021900927648
Epoch 60, train loss=1.583752770384308, test loss = 0.4315956605132669
Epoch 70, train loss=1.350930875953054, test loss = 0.3719757022045087
Epoch 80, train loss=1.1676412418746622, test loss = 0.3245321188296657
Epoch 90, train loss=1.0197711210639682, test loss = 0.285793171817204


In [27]:
def do_feat_recon(logits, recon_model_, batch_size=16):
    """Reconstruct features from logits with a trained reconstruction module.

    Args:
        logits: Input rows (array, list or tensor); cast to float32.
        recon_model_: Module applied to each batch. Its output is converted
            with ``.numpy()`` directly, so it must produce CPU tensors.
        batch_size: Rows per forward pass (defaults to the previously
            hard-coded 16).

    Returns:
        A NumPy array of reconstructed features in the same row order as
        ``logits``, or ``None`` if ``logits`` is empty.
    """
    logits = torch.as_tensor(logits, dtype=torch.float32)

    batch_outputs = []
    with torch.no_grad():
        for b in range(0, len(logits), batch_size):
            batch_outputs.append(recon_model_(logits[b:b + batch_size]).numpy())
    if not batch_outputs:
        return None
    # Concatenate once instead of re-copying the growing array every batch.
    return np.concatenate(batch_outputs)

In [28]:
# Reconstruct the test-split features from the test-split logits.
recon_cifar_feats_test = do_feat_recon(cifar_logits_test, recon_model)
recon_cifar_feats_test.shape

(10000, 512)

In [29]:
# Feed both the true and the reconstructed features through ResNet18's head.
# preds_orig repeats the call that produced cifar_logits_test earlier.
preds_orig = get_logits(cifar_feats_test, model.fc.cpu())
preds_recon = get_logits(recon_cifar_feats_test,  model.fc.cpu())
preds_orig.shape, preds_recon.shape

100%|██████████| 625/625 [00:05<00:00, 112.99it/s]
100%|██████████| 625/625 [00:05<00:00, 118.85it/s]


((10000, 1000), (10000, 1000))

In [30]:
# Fraction of test rows whose top-1 class is the same for the original and
# the reconstructed features.
top_orig = preds_orig.argmax(axis=1)
top_recon = preds_recon.argmax(axis=1)
(top_orig == top_recon).mean()

0.9919

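Training on the CIFAR-10 train split and testing on the CIFAR-10 test split, we get a reconstruction MSE loss of about 0.29 at the last reported epoch (epoch 90). This figure is the sum of the per-batch mean MSE values over the 10k test examples (625 batches of 16), not a per-example total.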

If we take these reconstructed features and then use the model's linear head, the top-1 prediction matches the original prediction 99.19% of the time.

Given the predicted logits from a ResNet18, pretrained on ImageNet, can we learn the features from a pretrained ResNet50 with a single linear layer?

In [31]:
# Load torchvision's ResNet50 with pretrained weights; its penultimate
# features are the reconstruction target in this section.
resnet50 = models.resnet50(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/ubuntu/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 212MB/s] 


In [37]:
# Extract ResNet50 penultimate features for both CIFAR splits.
# The recorded run produced arrays of shape (50000, 2048) and (10000, 2048).
cifar_feats_train_resnet50 = get_features(resnet50, train_set)
cifar_feats_test_resnet50 = get_features(resnet50, test_set)
cifar_feats_train_resnet50.shape, cifar_feats_test_resnet50.shape

1563it [04:05,  6.36it/s]                          
313it [00:19, 15.97it/s]                         


((50000, 2048), (10000, 2048))

In [38]:
# Push the stored ResNet50 features through ResNet50's own head.
# NOTE(review): the recorded output shape is (N, 2048), the same width as the
# features, which suggests resnet50.fc was not the classification head when
# this ran (possibly still the Identity that get_features swaps in) --
# re-run from a fresh kernel to confirm.
cifar_logits_train_resnet50  = get_logits(cifar_feats_train_resnet50 , resnet50.fc.cpu())
cifar_logits_test_resnet50  = get_logits(cifar_feats_test_resnet50 , resnet50.fc.cpu())
cifar_logits_train_resnet50.shape, cifar_logits_test_resnet50.shape

100%|██████████| 3125/3125 [06:02<00:00,  8.61it/s]
100%|██████████| 625/625 [00:14<00:00, 42.94it/s] 


((50000, 2048), (10000, 2048))

In [39]:
# Cross-model reconstruction: inputs are the ResNet18 logits, targets are the
# ResNet50 features.
recon_model_resnet50 = learn_reconstruct(cifar_logits_train, cifar_feats_train_resnet50, cifar_logits_test, cifar_feats_test_resnet50)

Epoch 0, train loss=69.57651571463794, test loss = 7.393297683447599
Epoch 10, train loss=28.515093367546797, test loss = 5.487259719520807
Epoch 20, train loss=27.672723228111863, test loss = 5.346435167361051
Epoch 30, train loss=27.309775521978736, test loss = 5.286164271645248
Epoch 40, train loss=27.093121097888798, test loss = 5.250551599543542
Epoch 50, train loss=26.94400716898963, test loss = 5.226170690264553
Epoch 60, train loss=26.83283564262092, test loss = 5.208049410954118
Epoch 70, train loss=26.74560745153576, test loss = 5.193862346466631
Epoch 80, train loss=26.674702115822583, test loss = 5.182348498608917
Epoch 90, train loss=26.61555330688134, test loss = 5.172754235565662


In [40]:
# Reconstruct ResNet50 features from ResNet18 logits, run both the true and
# the reconstructed features through resnet50.fc, and report how often the
# top-1 index agrees.
# NOTE(review): top_orig / top_recon overwrite the ResNet18 values computed in
# an earlier cell. Also confirm resnet50.fc is the 1000-way head here; the
# earlier recorded ResNet50 "logits" had 2048 columns.
recon_cifar_feats_test_resnet50 = do_feat_recon(cifar_logits_test, recon_model_resnet50)
preds_orig_resnet50 = get_logits(cifar_feats_test_resnet50, resnet50.fc.cpu())
preds_recon_resnet50 = get_logits(recon_cifar_feats_test_resnet50, resnet50.fc.cpu())
top_orig = preds_orig_resnet50.argmax(axis=1)
top_recon = preds_recon_resnet50.argmax(axis=1)
(top_orig == top_recon).mean()

100%|██████████| 625/625 [00:14<00:00, 44.03it/s] 
100%|██████████| 625/625 [00:14<00:00, 44.26it/s] 


0.9947

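Training on the CIFAR-10 train split and testing on the CIFAR-10 test split, we get a reconstruction MSE loss of 5.17 at the last reported epoch (epoch 90). This figure is the sum of the per-batch mean MSE values over the 10k test examples (625 batches of 16), not a per-example total.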

If we take these reconstructed features and then use the model's linear head, the top-1 prediction matches the original prediction 99.47% of the time.

Given the predicted logits from a ResNet18, pretrained on ImageNet, can we learn the features with a single linear layer, while training / testing on a different dataset?

In [45]:
# Download the small-image Places365 'train-standard' split.
# The original line read `places_tdatasets.Places365(...)`, a garbled
# assignment that would raise NameError; the assignment target is restored.
# NOTE(review): in the recorded run the ~26 GB archive downloaded but
# extraction failed with "No space left on device"; nothing later in the
# notebook uses this split.
places_train = datasets.Places365(root='/home/ubuntu/data/Places365_small', split='train-standard', small=True, download=True)

Downloading http://data.csail.mit.edu/places/places365/filelist_places365-standard.tar to /home/ubuntu/data/Places365_small/filelist_places365-standard.tar


67499008it [00:03, 19400614.23it/s]                              


Extracting /home/ubuntu/data/Places365_small/filelist_places365-standard.tar to /home/ubuntu/data/Places365_small
Downloading http://data.csail.mit.edu/places/places365/train_256_places365standard.tar to /home/ubuntu/data/Places365_small/train_256_places365standard.tar


100%|██████████| 26103685120/26103685120 [17:53<00:00, 24312527.29it/s]


Extracting /home/ubuntu/data/Places365_small/train_256_places365standard.tar to /home/ubuntu/data/Places365_small


OSError: [Errno 28] No space left on device

In [50]:
# Load the already-downloaded Places365 validation split (download=False)
# with the same transform used for the CIFAR sets.
places_test = datasets.Places365(root='/home/ubuntu/data/Places365_small', split='val',small=True, download=False, transform=test_transform)
len(places_test)

36500

In [51]:
# ResNet18 features and logits for the Places365 validation images.
# The recorded run produced shapes (36500, 512) and (36500, 1000).
places_feats_test = get_features(model, places_test)
places_logits_test  = get_logits(places_feats_test , model.fc.cpu())
places_feats_test.shape, places_logits_test.shape

1141it [00:41, 27.32it/s]                          
2282it [01:30, 25.20it/s]                          


((36500, 512), (36500, 1000))

In [52]:
# Apply the CIFAR-trained reconstruction layer to Places365 logits, then
# compare top-1 predictions from true vs. reconstructed features.
# places_preds_orig repeats the call that produced places_logits_test.
# NOTE(review): top_orig / top_recon overwrite the values from earlier cells.
recon_places_feats_test = do_feat_recon(places_logits_test, recon_model)
places_preds_orig = get_logits(places_feats_test, model.fc.cpu())
places_preds_recon = get_logits(recon_places_feats_test, model.fc.cpu())
top_orig = places_preds_orig.argmax(axis=1)
top_recon = places_preds_recon.argmax(axis=1)
(top_orig == top_recon).mean()

2282it [01:30, 25.14it/s]                          
2282it [01:29, 25.49it/s]                          


0.9647945205479452

In [59]:
# Mean squared error over all elements between reconstructed and true
# Places365 features. Unlike get_recon_loss, this is one mean over the whole
# array rather than a sum of per-batch means, so the two numbers are not
# directly comparable.
torch.nn.MSELoss()(torch.FloatTensor(recon_places_feats_test), 
                torch.FloatTensor(places_feats_test)).item()

0.031231464818120003