# Mount Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
!pip install -U -q PyDrive
!pip install httplib2==0.15.0
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from pydrive.files import GoogleDriveFileList
from google.colab import auth
from oauth2client.client import GoogleCredentials

from getpass import getpass
import urllib

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Cloning PAL_2021 to access modules.
# Need password to access private repo.

if 'CLIPPER' not in os.listdir():
    cmd_string = 'git clone https://github.com/PAL-ML/CLIPPER.git'
    os.system(cmd_string)



# Installation

## Install multi label metrics dependencies

In [3]:
! pip install scikit-learn==0.24

Collecting scikit-learn==0.24
[?25l  Downloading https://files.pythonhosted.org/packages/b1/ed/ab51a8da34d2b3f4524b21093081e7f9e2ddf1c9eac9f795dcf68ad0a57d/scikit_learn-0.24.0-cp37-cp37m-manylinux2010_x86_64.whl (22.3MB)
[K     |████████████████████████████████| 22.3MB 1.3MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl
Installing collected packages: threadpoolctl, scikit-learn
  Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.0 threadpoolctl-2.1.0


## Install CLIP dependencies

In [4]:
import subprocess

CUDA_version = [s for s in subprocess.check_output(["nvcc", "--version"]).decode("UTF-8").split(", ") if s.startswith("release")][0].split(" ")[-1]
print("CUDA version:", CUDA_version)

if CUDA_version == "10.0":
    torch_version_suffix = "+cu100"
elif CUDA_version == "10.1":
    torch_version_suffix = "+cu101"
elif CUDA_version == "10.2":
    torch_version_suffix = ""
else:
    torch_version_suffix = "+cu110"

CUDA version: 11.0


In [5]:
! pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.7.1+cu110
[?25l  Downloading https://download.pytorch.org/whl/cu110/torch-1.7.1%2Bcu110-cp37-cp37m-linux_x86_64.whl (1156.8MB)
[K     |███████████████████████         | 834.1MB 1.3MB/s eta 0:04:03tcmalloc: large alloc 1147494400 bytes == 0x55fe930ec000 @  0x7f48cc959615 0x55fe594c6cdc 0x55fe595a652a 0x55fe594c9afd 0x55fe595bafed 0x55fe5953d988 0x55fe595384ae 0x55fe594cb3ea 0x55fe5953d7f0 0x55fe595384ae 0x55fe594cb3ea 0x55fe5953a32a 0x55fe595bbe36 0x55fe59539853 0x55fe595bbe36 0x55fe59539853 0x55fe595bbe36 0x55fe59539853 0x55fe595bbe36 0x55fe5963e3e1 0x55fe5959e6a9 0x55fe59509cc4 0x55fe594ca559 0x55fe5953e4f8 0x55fe594cb30a 0x55fe595393b5 0x55fe595387ad 0x55fe594cb3ea 0x55fe595393b5 0x55fe594cb30a 0x55fe595393b5
[K     |█████████████████████████████▏  | 1055.7MB 1.2MB/s eta 0:01:26tcmalloc: large alloc 1434370048 bytes == 0x55fed7742000 @  0x7f48cc959615 0x55fe594c6cdc 0x55fe595a652a 0x55fe594c9a

In [6]:
! pip install ftfy regex
! wget https://openaipublic.azureedge.net/clip/bpe_simple_vocab_16e6.txt.gz -O bpe_simple_vocab_16e6.txt.gz

--2021-06-01 19:25:38--  https://openaipublic.azureedge.net/clip/bpe_simple_vocab_16e6.txt.gz
Resolving openaipublic.azureedge.net (openaipublic.azureedge.net)... 13.107.246.69, 13.107.213.69, 2620:1ec:bdf::69, ...
Connecting to openaipublic.azureedge.net (openaipublic.azureedge.net)|13.107.246.69|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1356917 (1.3M) [application/octet-stream]
Saving to: ‘bpe_simple_vocab_16e6.txt.gz’


2021-06-01 19:25:38 (24.4 MB/s) - ‘bpe_simple_vocab_16e6.txt.gz’ saved [1356917/1356917]



In [7]:
!pip install git+https://github.com/Sri-vatsa/CLIP # using this fork because of visualization capabilities

Collecting git+https://github.com/Sri-vatsa/CLIP
  Cloning https://github.com/Sri-vatsa/CLIP to /tmp/pip-req-build-4vygj1qf
  Running command git clone -q https://github.com/Sri-vatsa/CLIP /tmp/pip-req-build-4vygj1qf
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-cp37-none-any.whl size=1368623 sha256=9387a71e174a57d732566c2640daf9a3376343e29397b216ce878a25e47d0ecf
  Stored in directory: /tmp/pip-ephem-wheel-cache-osrjfoax/wheels/cc/55/69/0d411dabbd5009fd069d47b47cf7839c54e595dc61725b307b
Successfully built clip
Installing collected packages: clip
Successfully installed clip-1.0


## Install clustering dependencies

In [8]:
!pip -q install umap-learn>=0.3.7

## Install dataset manager dependencies

In [9]:
!pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9681 sha256=b52a8fbacc283d39e5278f9735da209a2aea189e5177d76135d74b53b23c3a9a
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


# Imports

In [10]:
# ML Libraries
import tensorflow as tf
import tensorflow_hub as hub
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from tensorflow import keras


# Data processing
import PIL
import base64
import imageio
import pandas as pd
import numpy as np
import json

from PIL import Image
import cv2
from sklearn.feature_extraction.image import extract_patches_2d

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from IPython.core.display import display, HTML
from matplotlib import cm
import matplotlib.image as mpimg

# Models
import clip

# Datasets
import tensorflow_datasets as tfds

# Clustering
# import umap

from sklearn import metrics
from sklearn.cluster import KMeans
#from yellowbrick.cluster import KElbowVisualizer

# Misc
import progressbar
import logging
from abc import ABC, abstractmethod
import time
import urllib.request
import os
from sklearn.metrics import jaccard_score, hamming_loss, accuracy_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer


# Modules
from CLIPPER.code.ExperimentModules import embedding_models
from CLIPPER.code.ExperimentModules.dataset_manager import DatasetManager
from CLIPPER.code.ExperimentModules.weight_imprinting_classifier import WeightImprintingClassifier
from CLIPPER.code.ExperimentModules import simclr_data_augmentations
from CLIPPER.code.ExperimentModules.utils import (save_npy, load_npy, 
                                                       get_folder_id, 
                                                       create_expt_dir, 
                                                       save_to_drive, 
                                                       load_all_from_drive_folder, 
                                                       download_file_by_name, 
                                                       delete_file_by_name)

logging.getLogger('googleapicliet.discovery_cache').setLevel(logging.ERROR)

# Initialization & Constants

## Dataset details

In [11]:
IMG_HEIGHT = 112
IMG_WIDTH = 112

experiment_id = "IndoorSceneRecognition-Embeddings"

folder_name = experiment_id+"-28-02-21"

# Change parentid to match that of experiments root folder in gdrive
parentid = '1bK72W-Um20EQDEyChNhNJthUNbmoSEjD'

In [12]:
# Initialize sepcific experiment folder in drive
folderid = create_expt_dir(drive, parentid, folder_name)

title: IndoorSceneRecognition-Embeddings-28-02-21, id: 1byZ1iLbfdF3Hr6EZNsTubgMys9IDy8gZ


# Embedding function

In [13]:
def run_data_through_model(
    data, 
    embedder, 
    filename, 
    drive,
    folderid,
    total_num_images,
    max_num_samples=5000,
):
    embedder.load_model()

    embeddings = None
    num_images_done = 0

    while embeddings is None or num_images_done < total_num_images:
        download_file_by_name(drive, folderid, filename)

        if filename in os.listdir():
            embeddings = np.load(filename)['data']
            num_images_done = embeddings.shape[0]
            if num_images_done == total_num_images:
                print("All images done already.")
                break
            else:
                print("{}/{} images done already".format(
                    num_images_done, total_num_images)
                )

        print("Running for image indices {}-{}.".format(
            num_images_done, num_images_done+max_num_samples
            )
        )
        if (num_images_done+max_num_samples) <= total_num_images:
            batch = data[num_images_done:num_images_done+max_num_samples]
        else:
            batch = data[num_images_done:]

        processed_batch = embedder.preprocess_data(batch)
        embeddings_batch = embedder.embed_images(
            processed_batch, batch_size=50
            )
        
        if embeddings is None:
            embeddings = embeddings_batch
        else:
            embeddings = np.concatenate(
                [embeddings, embeddings_batch]
                )
            
        delete_file_by_name(drive, folderid, filename)
        embedder.save_embeddings_to_drive(
            embeddings, 
            filename,
            drive,
            folderid
            )
        num_images_done = embeddings.shape[0]
        print("{}/{} images done".format(num_images_done, total_num_images))


# Train data split

## Load Data

In [14]:
dm = DatasetManager()
train_data_generator = dm.load_dataset('indoor_scene_recognition', split="train")

Found 15620 files belonging to 67 classes.
Using 12496 files for training.


In [15]:
_data = []
_labels = []

data_gen = iter(train_data_generator)

while True:
  try:
    image, label = next(data_gen)
    resized_image = cv2.resize(image[0], (IMG_WIDTH, IMG_HEIGHT)).astype(np.uint8)
    _data.append(resized_image)
    _labels.append(label[0])
  except StopIteration:
    break
  except:
    continue

train_data = np.stack(_data)
train_labels = np.array(_labels)

del _data
del _labels

print('Images shape: ', train_data.shape)
print('Labels length: ', len(train_labels))


Images shape:  (9198, 112, 112, 3)
Labels length:  9198


In [16]:
# Save train labels
train_labels_filename = 'train_labels.npz'

if train_labels_filename not in os.listdir():
    save_npy(train_labels_filename, train_labels)
    save_to_drive(drive, folderid, train_labels_filename)

Data saved to train_labels.npz
Uploaded train_labels.npz to https://drive.google.com/drive/u/1/folders/1byZ1iLbfdF3Hr6EZNsTubgMys9IDy8gZ


In [17]:
total_train_images = len(train_data)
total_train_images

9198

## Inception V3

In [18]:
max_num_samples = 4000 # Colab crashes with too many images
inceptionv3_train_filename = 'inceptionv3_embeddings_train.npz'

inceptionv3_train_embedder = embedding_models.InceptionV3EmbeddingWrapper()

run_data_through_model(
    train_data, 
    inceptionv3_train_embedder, 
    inceptionv3_train_filename,
    drive,
    folderid,
    total_train_images,
    max_num_samples
    )

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
Running for image indices 0-4000.
Data saved to x_inceptionv3_embeddings_train.npz
Uploaded x_inceptionv3_embeddings_train.npz to https://drive.google.com/drive/u/1/folders/1byZ1iLbfdF3Hr6EZNsTubgMys9IDy8gZ
4000/9198 images done
Downloading x_inceptionv3_embeddings_train.npz from GDrive
4000/9198 images done already
Running for image indices 4000-8000.
Deleting x_inceptionv3_embeddings_train.npz from GDrive
Data saved to x_inceptionv3_embeddings_train.npz
Uploaded x_inceptionv3_embeddings_train.npz to https://drive.google.com/drive/u/1/folders/1byZ1iLbfdF3Hr6EZNsTubgMys9IDy8gZ
8000/9198 images done
Downloading x_inceptionv3_embeddings_train.npz from GDrive
8000/9198 images done already
Running for image indices 8000-12000.
Deleting x_inceptionv3_embeddings_train.npz from GDrive
Data saved to x_inceptionv3_embeddings_train.npz
Uploaded 

## Resnet 50

In [19]:
max_num_samples = 2000
resnet50_train_filename = 'resnet50_embeddings_train.npz'

resnet50_train_embedder = embedding_models.Resnet50EmbeddingWrapper()

run_data_through_model(
    train_data, 
    resnet50_train_embedder, 
    resnet50_train_filename,
    drive,
    folderid,
    total_train_images,
    max_num_samples
    )

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth


HBox(children=(FloatProgress(value=0.0, max=102502400.0), HTML(value='')))


Running for image indices 0-2000.
Data saved to x_resnet50_embeddings_train.npz
Uploaded x_resnet50_embeddings_train.npz to https://drive.google.com/drive/u/1/folders/1byZ1iLbfdF3Hr6EZNsTubgMys9IDy8gZ
2000/9198 images done
Downloading x_resnet50_embeddings_train.npz from GDrive
2000/9198 images done already
Running for image indices 2000-4000.
Deleting x_resnet50_embeddings_train.npz from GDrive
Data saved to x_resnet50_embeddings_train.npz
Uploaded x_resnet50_embeddings_train.npz to https://drive.google.com/drive/u/1/folders/1byZ1iLbfdF3Hr6EZNsTubgMys9IDy8gZ
4000/9198 images done
Downloading x_resnet50_embeddings_train.npz from GDrive
4000/9198 images done already
Running for image indices 4000-6000.
Deleting x_resnet50_embeddings_train.npz from GDrive
Data saved to x_resnet50_embeddings_train.npz
Uploaded x_resnet50_embeddings_train.npz to https://drive.google.com/drive/u/1/folders/1byZ1iLbfdF3Hr6EZNsTubgMys9IDy8gZ
6000/9198 images done
Downloading x_resnet50_embeddings_train.npz fr

## MoCo Resnet 50

In [22]:
max_num_samples = 2000
moco_resnet50_train_filename = 'moco_resnet50_embeddings_train.npz'

moco_resnet50_train_embedder = embedding_models.MoCoResnet50EmbeddingWrapper()

run_data_through_model(
    train_data, 
    moco_resnet50_train_embedder, 
    moco_resnet50_train_filename,
    drive,
    folderid,
    total_train_images,
    max_num_samples
    )

Downloading x_moco_resnet50_embeddings_train.npz from GDrive
6000/9198 images done already
Running for image indices 6000-8000.
Deleting x_moco_resnet50_embeddings_train.npz from GDrive
Data saved to x_moco_resnet50_embeddings_train.npz
Uploaded x_moco_resnet50_embeddings_train.npz to https://drive.google.com/drive/u/1/folders/1byZ1iLbfdF3Hr6EZNsTubgMys9IDy8gZ
8000/9198 images done
Downloading x_moco_resnet50_embeddings_train.npz from GDrive
8000/9198 images done already
Running for image indices 8000-10000.
Deleting x_moco_resnet50_embeddings_train.npz from GDrive
Data saved to x_moco_resnet50_embeddings_train.npz
Uploaded x_moco_resnet50_embeddings_train.npz to https://drive.google.com/drive/u/1/folders/1byZ1iLbfdF3Hr6EZNsTubgMys9IDy8gZ
9198/9198 images done


## PCL Resnet 50

In [23]:
max_num_samples = 2000
pcl_resnet50_train_filename = 'pcl_resnet50_embeddings_train.npz'

pcl_resnet50_train_embedder = embedding_models.PCLResnet50EmbeddingWrapper()

run_data_through_model(
    train_data, 
    pcl_resnet50_train_embedder, 
    pcl_resnet50_train_filename,
    drive,
    folderid,
    total_train_images,
    max_num_samples
    )

Downloading pcl_resnet50_embeddings_train.npz from GDrive
All images done already.


## SwAV Resnet 50

In [None]:
max_num_samples = 2000
swav_resnet50_train_filename = 'swav_resnet50_embeddings_train.npz'

swav_resnet50_train_embedder = embedding_models.SwAVResnet50EmbeddingWrapper()

run_data_through_model(
    train_data, 
    swav_resnet50_train_embedder, 
    swav_resnet50_train_filename,
    drive,
    folderid,
    total_train_images,
    max_num_samples
    )

## SimCLR

In [None]:
max_num_samples = 3000
simclr_train_filename = 'simclr_embeddings_train.npz'

simclr_train_embedder = embedding_models.SimCLREmbeddingWrapper()

run_data_through_model(
    train_data, 
    simclr_train_embedder, 
    simclr_train_filename,
    drive,
    folderid,
    total_train_images,
    max_num_samples
    )

## VGG16

In [None]:
max_num_samples = 3000
vgg16_train_filename = 'vgg16_embeddings_train.npz'

vgg16_train_embedder = embedding_models.VGG16EmbeddingWrapper()

run_data_through_model(
    train_data, 
    vgg16_train_embedder, 
    vgg16_train_filename,
    drive,
    folderid,
    total_train_images,
    max_num_samples
    )

## CLIP

In [None]:
max_num_samples = 3000
clip_train_filename = 'clip_embeddings_train.npz'

clip_train_embedder = embedding_models.CLIPEmbeddingWrapper()

run_data_through_model(
    train_data, 
    clip_train_embedder, 
    clip_train_filename,
    drive,
    folderid,
    total_train_images,
    max_num_samples
    )

# Val data split

## Load Data

In [None]:
dm = DatasetManager()
val_data_generator = dm.load_dataset('indoor_scene_recognition', split="val")

Found 1340 files belonging to 67 classes.


In [None]:
_data = []
_labels = []

data_gen = iter(val_data_generator)

while True:
  try:
    image, label = next(data_gen)
    resized_image = cv2.resize(image[0], (IMG_WIDTH, IMG_HEIGHT)).astype(np.uint8)
    _data.append(resized_image)
    _labels.append(label[0])
  except StopIteration:
    break
  except:
    continue

#for i, (image, label) in enumerate(val_data_generator):
#    resized_image = cv2.resize(image[0], (IMG_WIDTH, IMG_HEIGHT)).astype(np.uint8)
#    _data.append(resized_image)
#    _labels.append(label[0])

val_data = np.stack(_data)
val_labels = np.array(_labels)

del _data
del _labels

print('Images shape: ', val_data.shape)
print('Labels length: ', len(val_labels))


Images shape:  (1340, 112, 112, 3)
Labels length:  1340


In [None]:
# Save val labels
val_labels_filename = 'val_labels.npz'

if val_labels_filename not in os.listdir():
    save_npy(val_labels_filename, val_labels)
    save_to_drive(drive, folderid, val_labels_filename)

Data saved to val_labels.npz
Uploaded val_labels.npz to https://drive.google.com/drive/u/1/folders/1byZ1iLbfdF3Hr6EZNsTubgMys9IDy8gZ


In [None]:
total_val_images = len(val_data)
total_val_images

1340

## Inception V3

In [None]:
max_num_samples = 4000 # Colab crashes with too many images
inceptionv3_val_filename = 'inceptionv3_embeddings_val.npz'

inceptionv3_val_embedder = embedding_models.InceptionV3EmbeddingWrapper()

run_data_through_model(
    val_data, 
    inceptionv3_val_embedder, 
    inceptionv3_val_filename,
    drive,
    folderid,
    total_val_images,
    max_num_samples
    )

Downloading inceptionv3_embeddings_val.npz from GDrive
All images done already.


## Resnet 50

In [None]:
max_num_samples = 2000
resnet50_val_filename = 'resnet50_embeddings_val.npz'

resnet50_val_embedder = embedding_models.Resnet50EmbeddingWrapper()

run_data_through_model(
    val_data, 
    resnet50_val_embedder, 
    resnet50_val_filename,
    drive,
    folderid,
    total_val_images,
    max_num_samples
    )

Downloading resnet50_embeddings_val.npz from GDrive
All images done already.


## MoCo Resnet 50

In [None]:
max_num_samples = 2000
moco_resnet50_val_filename = 'moco_resnet50_embeddings_val.npz'

moco_resnet50_val_embedder = embedding_models.MoCoResnet50EmbeddingWrapper()

run_data_through_model(
    val_data, 
    moco_resnet50_val_embedder, 
    moco_resnet50_val_filename,
    drive,
    folderid,
    total_val_images,
    max_num_samples
    )

Downloading moco_resnet50_embeddings_val.npz from GDrive
All images done already.


## PCL Resnet 50

In [None]:
max_num_samples = 2000
pcl_resnet50_val_filename = 'pcl_resnet50_embeddings_val.npz'

pcl_resnet50_val_embedder = embedding_models.PCLResnet50EmbeddingWrapper()

run_data_through_model(
    val_data, 
    pcl_resnet50_val_embedder, 
    pcl_resnet50_val_filename,
    drive,
    folderid,
    total_val_images,
    max_num_samples
    )

Downloading pcl_resnet50_embeddings_val.npz from GDrive
All images done already.


## SwAV Resnet 50

In [None]:
max_num_samples = 3000
swav_resnet50_val_filename = 'swav_resnet50_embeddings_val.npz'

swav_resnet50_val_embedder = embedding_models.SwAVResnet50EmbeddingWrapper()

run_data_through_model(
    val_data, 
    swav_resnet50_val_embedder, 
    swav_resnet50_val_filename,
    drive,
    folderid,
    total_val_images,
    max_num_samples
    )

Using cache found in /root/.cache/torch/hub/facebookresearch_swav_master


Downloading swav_resnet50_embeddings_val.npz from GDrive
All images done already.


## SimCLR

In [None]:
max_num_samples = 3000
simclr_val_filename = 'simclr_embeddings_val.npz'

simclr_val_embedder = embedding_models.SimCLREmbeddingWrapper()

run_data_through_model(
    val_data, 
    simclr_val_embedder, 
    simclr_val_filename,
    drive,
    folderid,
    total_val_images,
    max_num_samples
    )



Downloading simclr_embeddings_val.npz from GDrive
All images done already.


## VGG16

In [None]:
max_num_samples = 3000
vgg16_val_filename = 'vgg16_embeddings_val.npz'

vgg16_val_embedder = embedding_models.VGG16EmbeddingWrapper()

run_data_through_model(
    val_data, 
    vgg16_val_embedder, 
    vgg16_val_filename,
    drive,
    folderid,
    total_val_images,
    max_num_samples
    )

Downloading vgg16_embeddings_val.npz from GDrive
All images done already.


## CLIP

In [None]:
max_num_samples = 100
clip_val_filename = 'clip_embeddings_val.npz'

clip_val_embedder = embedding_models.CLIPEmbeddingWrapper()

run_data_through_model(
    val_data, 
    clip_val_embedder, 
    clip_val_filename,
    drive,
    folderid,
    total_val_images,
    max_num_samples
    )

Running for image indices 0-2000.
Data saved to clip_embeddings_val.npz
Uploaded clip_embeddings_val.npz to https://drive.google.com/drive/u/1/folders/1byZ1iLbfdF3Hr6EZNsTubgMys9IDy8gZ
1340/1340 images done
