# Mount Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
!pip install -U -q PyDrive
!pip install httplib2==0.15.0
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from pydrive.files import GoogleDriveFileList
from google.colab import auth
from oauth2client.client import GoogleCredentials

from getpass import getpass
import urllib

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Cloning PAL_2021 to access modules.
# Need password to access private repo.

if 'CLIPPER' not in os.listdir():
    cmd_string = 'git clone https://github.com/PAL-ML/CLIPPER.git'
    os.system(cmd_string)



# Installation

## Install multi label metrics dependencies

In [4]:
! pip install scikit-learn==0.24



## Install CLIP dependencies

In [5]:
import subprocess

CUDA_version = [s for s in subprocess.check_output(["nvcc", "--version"]).decode("UTF-8").split(", ") if s.startswith("release")][0].split(" ")[-1]
print("CUDA version:", CUDA_version)

if CUDA_version == "10.0":
    torch_version_suffix = "+cu100"
elif CUDA_version == "10.1":
    torch_version_suffix = "+cu101"
elif CUDA_version == "10.2":
    torch_version_suffix = ""
else:
    torch_version_suffix = "+cu110"

CUDA version: 11.0


In [6]:
! pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex

Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [7]:
! pip install ftfy regex
! wget https://openaipublic.azureedge.net/clip/bpe_simple_vocab_16e6.txt.gz -O bpe_simple_vocab_16e6.txt.gz

--2021-06-01 19:03:38--  https://openaipublic.azureedge.net/clip/bpe_simple_vocab_16e6.txt.gz
Resolving openaipublic.azureedge.net (openaipublic.azureedge.net)... 13.107.246.39, 13.107.213.39, 2620:1ec:bdf::39, ...
Connecting to openaipublic.azureedge.net (openaipublic.azureedge.net)|13.107.246.39|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1356917 (1.3M) [application/octet-stream]
Saving to: ‘bpe_simple_vocab_16e6.txt.gz’


2021-06-01 19:03:38 (7.30 MB/s) - ‘bpe_simple_vocab_16e6.txt.gz’ saved [1356917/1356917]



In [8]:
!pip install git+https://github.com/Sri-vatsa/CLIP # using this fork because of visualization capabilities

Collecting git+https://github.com/Sri-vatsa/CLIP
  Cloning https://github.com/Sri-vatsa/CLIP to /tmp/pip-req-build-5dbp0sri
  Running command git clone -q https://github.com/Sri-vatsa/CLIP /tmp/pip-req-build-5dbp0sri
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-cp37-none-any.whl size=1368623 sha256=564f735f1a6d8ec015b46beb4e2c25c4c04302ba6b8f5147d557dee4409a7128
  Stored in directory: /tmp/pip-ephem-wheel-cache-nhlpx4fi/wheels/cc/55/69/0d411dabbd5009fd069d47b47cf7839c54e595dc61725b307b
Successfully built clip


## Install clustering dependencies

In [9]:
!pip -q install umap-learn>=0.3.7

## Install dataset manager dependencies

In [10]:
!pip install wget



# Imports

In [11]:
# ML Libraries
import tensorflow as tf
import tensorflow_hub as hub
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from tensorflow import keras


# Data processing
import PIL
import base64
import imageio
import pandas as pd
import numpy as np
import json

from PIL import Image
import cv2
from sklearn.feature_extraction.image import extract_patches_2d

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from IPython.core.display import display, HTML
from matplotlib import cm
import matplotlib.image as mpimg

# Models
import clip

# Datasets
import tensorflow_datasets as tfds

# Clustering
# import umap

from sklearn import metrics
from sklearn.cluster import KMeans
#from yellowbrick.cluster import KElbowVisualizer

# Misc
import progressbar
import logging
from abc import ABC, abstractmethod
import time
import urllib.request
import os
from sklearn.metrics import jaccard_score, hamming_loss, accuracy_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer


# Modules
from CLIPPER.code.ExperimentModules import embedding_models
from CLIPPER.code.ExperimentModules.dataset_manager import DatasetManager
from CLIPPER.code.ExperimentModules.weight_imprinting_classifier import WeightImprintingClassifier
from CLIPPER.code.ExperimentModules import simclr_data_augmentations
from CLIPPER.code.ExperimentModules.utils import (save_npy, load_npy, 
                                                       get_folder_id, 
                                                       create_expt_dir, 
                                                       save_to_drive, 
                                                       load_all_from_drive_folder, 
                                                       download_file_by_name, 
                                                       delete_file_by_name)

logging.getLogger('googleapicliet.discovery_cache').setLevel(logging.ERROR)

# Initialization & Constants

## Dataset details

In [12]:
IMG_HEIGHT = 224
IMG_WIDTH = 224

experiment_id = "ImagenetA-Embeddings"

folder_name = experiment_id+"-28-02-21"

# Change parentid to match that of experiments root folder in gdrive
parentid = '1bK72W-Um20EQDEyChNhNJthUNbmoSEjD'

In [13]:
# Initialize sepcific experiment folder in drive
folderid = create_expt_dir(drive, parentid, folder_name)

title: ImagenetA-Embeddings-28-02-21, id: 13IXmLLCxY96gh9FQMR6Il_dXDB2LsYcR


## Embedding function

In [14]:
def run_data_through_model(
    data, 
    embedder, 
    filename, 
    drive,
    folderid,
    total_num_images,
    max_num_samples=5000,
):
    embedder.load_model()

    embeddings = None
    num_images_done = 0

    while embeddings is None or num_images_done < total_num_images:
        download_file_by_name(drive, folderid, filename)

        if filename in os.listdir():
            embeddings = np.load(filename)['data']
            num_images_done = embeddings.shape[0]
            if num_images_done == total_num_images:
                print("All images done already.")
                break
            else:
                print("{}/{} images done already".format(
                    num_images_done, total_num_images)
                )

        print("Running for image indices {}-{}.".format(
            num_images_done, num_images_done+max_num_samples
            )
        )
        if (num_images_done+max_num_samples) <= total_num_images:
            batch = data[num_images_done:num_images_done+max_num_samples]
        else:
            batch = data[num_images_done:]

        processed_batch = embedder.preprocess_data(batch)
        embeddings_batch = embedder.embed_images(
            processed_batch, batch_size=50
            )
        
        if embeddings is None:
            embeddings = embeddings_batch
        else:
            embeddings = np.concatenate(
                [embeddings, embeddings_batch]
                )
            
        delete_file_by_name(drive, folderid, filename)
        embedder.save_embeddings_to_drive(
            embeddings, 
            filename,
            drive,
            folderid
            )
        num_images_done = embeddings.shape[0]
        print("{}/{} images done".format(num_images_done, total_num_images))


# Test data split

## Load Data

In [15]:
dm = DatasetManager()
test_data_generator = dm.load_dataset('imagenet_a', split="test")

In [16]:
_data = []
_labels = []
for i, example in enumerate(test_data_generator):
    resized_image = cv2.resize(example['image'], (IMG_WIDTH, IMG_HEIGHT)).astype(np.uint8)
    _data.append(resized_image)
    _labels.append(example['label'])

    del resized_image

test_data = np.stack(_data)
test_labels = np.array(_labels)

del _data
del _labels

print('Images shape: ', test_data.shape)
print('Alphabet labels length: ', len(test_labels))


Images shape:  (7500, 224, 224, 3)
Alphabet labels length:  7500


In [17]:
# Save test labels
test_labels_filename = 'test_labels.npz'

if test_labels_filename not in os.listdir():
    save_npy(test_labels_filename, test_labels)
    save_to_drive(drive, folderid, test_labels_filename)

In [18]:
total_test_images = len(test_data)
total_test_images

7500

## Inception V3

In [19]:
max_num_samples = 5000 # Colab crashes with too many images
inceptionv3_test_filename = 'rerun_inceptionv3_embeddings_test.npz'

inceptionv3_test_embedder = embedding_models.InceptionV3EmbeddingWrapper()

run_data_through_model(
    test_data, 
    inceptionv3_test_embedder, 
    inceptionv3_test_filename,
    drive,
    folderid,
    total_test_images,
    max_num_samples
    )

Running for image indices 0-5000.
Data saved to xrerun_inceptionv3_embeddings_test.npz
Uploaded xrerun_inceptionv3_embeddings_test.npz to https://drive.google.com/drive/u/1/folders/13IXmLLCxY96gh9FQMR6Il_dXDB2LsYcR
5000/7500 images done
Downloading xrerun_inceptionv3_embeddings_test.npz from GDrive
5000/7500 images done already
Running for image indices 5000-10000.
Deleting xrerun_inceptionv3_embeddings_test.npz from GDrive
Data saved to xrerun_inceptionv3_embeddings_test.npz
Uploaded xrerun_inceptionv3_embeddings_test.npz to https://drive.google.com/drive/u/1/folders/13IXmLLCxY96gh9FQMR6Il_dXDB2LsYcR
7500/7500 images done


## Resnet 50

In [20]:
max_num_samples = 3000
resnet50_test_filename = 'resnet50_embeddings_test.npz'

resnet50_test_embedder = embedding_models.Resnet50EmbeddingWrapper()

run_data_through_model(
    test_data, 
    resnet50_test_embedder, 
    resnet50_test_filename,
    drive,
    folderid,
    total_test_images,
    max_num_samples
    )

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth


HBox(children=(FloatProgress(value=0.0, max=102502400.0), HTML(value='')))


Running for image indices 0-3000.
Data saved to xrerun_resnet50_embeddings_test.npz
Uploaded xrerun_resnet50_embeddings_test.npz to https://drive.google.com/drive/u/1/folders/13IXmLLCxY96gh9FQMR6Il_dXDB2LsYcR
3000/7500 images done
Downloading xrerun_resnet50_embeddings_test.npz from GDrive
3000/7500 images done already
Running for image indices 3000-6000.
Deleting xrerun_resnet50_embeddings_test.npz from GDrive
Data saved to xrerun_resnet50_embeddings_test.npz
Uploaded xrerun_resnet50_embeddings_test.npz to https://drive.google.com/drive/u/1/folders/13IXmLLCxY96gh9FQMR6Il_dXDB2LsYcR
6000/7500 images done
Downloading xrerun_resnet50_embeddings_test.npz from GDrive
6000/7500 images done already
Running for image indices 6000-9000.
Deleting xrerun_resnet50_embeddings_test.npz from GDrive
Data saved to xrerun_resnet50_embeddings_test.npz
Uploaded xrerun_resnet50_embeddings_test.npz to https://drive.google.com/drive/u/1/folders/13IXmLLCxY96gh9FQMR6Il_dXDB2LsYcR
7500/7500 images done


## MoCo Resnet 50

In [21]:
max_num_samples = 2000
moco_resnet50_test_filename = 'moco_resnet50_embeddings_test.npz'

moco_resnet50_test_embedder = embedding_models.MoCoResnet50EmbeddingWrapper()

run_data_through_model(
    test_data, 
    moco_resnet50_test_embedder, 
    moco_resnet50_test_filename,
    drive,
    folderid,
    total_test_images,
    max_num_samples
    )

Running for image indices 0-2000.
Data saved to xrerun_moco_resnet50_embeddings_test.npz
Uploaded xrerun_moco_resnet50_embeddings_test.npz to https://drive.google.com/drive/u/1/folders/13IXmLLCxY96gh9FQMR6Il_dXDB2LsYcR
2000/7500 images done
Downloading xrerun_moco_resnet50_embeddings_test.npz from GDrive
2000/7500 images done already
Running for image indices 2000-4000.
Deleting xrerun_moco_resnet50_embeddings_test.npz from GDrive
Data saved to xrerun_moco_resnet50_embeddings_test.npz
Uploaded xrerun_moco_resnet50_embeddings_test.npz to https://drive.google.com/drive/u/1/folders/13IXmLLCxY96gh9FQMR6Il_dXDB2LsYcR
4000/7500 images done
Downloading xrerun_moco_resnet50_embeddings_test.npz from GDrive
4000/7500 images done already
Running for image indices 4000-6000.
Deleting xrerun_moco_resnet50_embeddings_test.npz from GDrive
Data saved to xrerun_moco_resnet50_embeddings_test.npz
Uploaded xrerun_moco_resnet50_embeddings_test.npz to https://drive.google.com/drive/u/1/folders/13IXmLLCxY96g

## PCL Resnet 50

In [25]:
max_num_samples = 2000
pcl_resnet50_test_filename = 'pcl_resnet50_embeddings_test.npz'

pcl_resnet50_test_embedder = embedding_models.PCLResnet50EmbeddingWrapper()

run_data_through_model(
    test_data, 
    pcl_resnet50_test_embedder, 
    pcl_resnet50_test_filename,
    drive,
    folderid,
    total_test_images,
    max_num_samples
    )

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 33, in <module>
    from oauth2client.contrib.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.contrib.locked_file'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 37, in <module>
    from oauth2client.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.locked_file'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/googleapiclient/discovery_cache/__init__.py", line 44, in autodetect
    from . import file_cache
  File "/usr/local/lib/python3.7/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 41, in <module>
    "file_cach

Downloading pcl_resnet50_embeddings_test.npz from GDrive
All images done already.


## SwAV Resnet 50

In [26]:
max_num_samples = 3000
swav_resnet50_test_filename = 'swav_resnet50_embeddings_test.npz'

swav_resnet50_test_embedder = embedding_models.SwAVResnet50EmbeddingWrapper()

run_data_through_model(
    test_data, 
    swav_resnet50_test_embedder, 
    swav_resnet50_test_filename,
    drive,
    folderid,
    total_test_images,
    max_num_samples
    )

Downloading: "https://github.com/facebookresearch/swav/archive/master.zip" to /root/.cache/torch/hub/master.zip
Downloading: "https://dl.fbaipublicfiles.com/deepcluster/swav_800ep_pretrain.pth.tar" to /root/.cache/torch/hub/checkpoints/swav_800ep_pretrain.pth.tar


HBox(children=(FloatProgress(value=0.0, max=113703565.0), HTML(value='')))


Downloading swav_resnet50_embeddings_test.npz from GDrive
All images done already.


## SimCLR

In [27]:
max_num_samples = 3000
simclr_test_filename = 'simclr_embeddings_test.npz'

simclr_test_embedder = embedding_models.SimCLREmbeddingWrapper()

run_data_through_model(
    test_data, 
    simclr_test_embedder, 
    simclr_test_filename,
    drive,
    folderid,
    total_test_images,
    max_num_samples
    )



Downloading simclr_embeddings_test.npz from GDrive
All images done already.


## VGG16

In [28]:
max_num_samples = 3000
vgg16_test_filename = 'vgg16_embeddings_test.npz'

vgg16_test_embedder = embedding_models.VGG16EmbeddingWrapper()

run_data_through_model(
    test_data, 
    vgg16_test_embedder, 
    vgg16_test_filename,
    drive,
    folderid,
    total_test_images,
    max_num_samples
    )

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
Downloading vgg16_embeddings_test.npz from GDrive
All images done already.


## CLIP

In [29]:
max_num_samples = 500
clip_test_filename = 'clip_embeddings_test.npz'

clip_test_embedder = embedding_models.CLIPEmbeddingWrapper()

run_data_through_model(
    test_data, 
    clip_test_embedder, 
    clip_test_filename,
    drive,
    folderid,
    total_test_images,
    max_num_samples
    )

100%|████████████████████████████████████████| 354M/354M [00:03<00:00, 100MiB/s]


Downloading clip_embeddings_test.npz from GDrive
All images done already.
