## [Example of training SimCLR18 is here](https://b1g7e2p8fugfc16727hk.storage.yandexcloud.net/74cd94c0-a86e-49b9-9eee-487b9e0a14b9/user-data/resources/system/7eb752c2-2c04-4de8-bbb0-24251c0a864161bdeaed-ee85-4a96-ae0d-8348158988fb.html?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200605T191230Z&X-Amz-SignedHeaders=host&X-Amz-Expires=604800&X-Amz-Credential=mXap0Jx_lcNqHbMis1m-%2F20200605%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=d92e5c7248054a67176224063ab5a915c8f4a41634571260fea612252c37a09e) - availible for one week
[Model checkpoint is here](https://drive.google.com/file/d/1ZKJSPXyABLK46rxtpjMlawzsQ1JPOuGx/view?usp=sharing)

[Converted weights of pretrained Resnet50-1x from the original article are here](https://drive.google.com/file/d/1YgouQx4Vn2st--GwO7r2MGTs-fiUWCKP/view?usp=sharing)

In [1]:
import os
import random
import torch
from torch import cuda
import numpy as np
import torch.nn as nn
import torchvision
from torchvision import transforms
import torch.nn.functional as F
from torch.utils.data import DataLoader

#from torchsummary import summary
from torch.utils.tensorboard import SummaryWriter
from IPython import display
from tqdm import trange
import matplotlib.pyplot as plt
import PIL
from pathlib import Path

# Import scripts
from CustomModels import CustomSimCLR50, CustomSimCLR18
from data import DataWrapper, FeatureExtractor, CustomDataset
from loss import ContrastiveLoss
from aux import count_parameters, save_ckp, load_ckp, get_lr
from train import *

ImportError: TensorBoard logging requires TensorBoard with Python summary writer installed. This should be available in 1.14 or above.

In [2]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LATENT_DIM = 64 # as in the original article RL
HEAD_DIM = 256  # dimensionality of heads output 
BACKBONE = 'Resnet18' # whether finetune 'Resnet50' or train/finetune 'Resnet18'
BATCH_SIZE = 512 # try to increase 
N_EPOCHS = 300 # adjust further
CLOUD = True # if in cloud TB doesn't work

print(cuda.get_device_name())
print(cuda.get_device_capability())

AssertionError: 
Found no NVIDIA driver on your system. Please check that you
have an NVIDIA GPU and installed a driver from
http://www.nvidia.com/Download/index.aspx

In [None]:
if not Path("data_ver2/warehouse_time_step_0.jpg").is_file():
    import zipfile
    with zipfile.ZipFile('./warehouse_data_ver2.zip', 'r') as archive:
        archive.extractall()
else:
    print('Already there')

In [None]:
im = np.asarray(PIL.Image.open('data_ver2/warehouse_time_step_150.jpg'))
plt.imshow(im)
print(im.shape)

In [None]:
wrapper = DataWrapper(BATCH_SIZE, 0.1, (128, 128, 3), './data_ver2')
train_iterator, val_iterator = wrapper.get_loaders()
print(len(train_iterator))
print(len(val_iterator))

In [None]:
sample = next(iter(train_iterator))
print(len(sample))

In [None]:
image = transforms.ToPILImage()(sample[0][17]).convert("RGB")
plt.imshow(image)
plt.show()
image = transforms.ToPILImage()(sample[1][17]).convert("RGB")
plt.imshow(image)
plt.show()

In [None]:
if BACKBONE == 'Resnet50':
    model = CustomSimCLR50(BATCH_SIZE, LATENT_DIM, HEAD_DIM).to(DEVICE)
elif BACKBONE == 'Resnet18':
    model = CustomSimCLR18(BATCH_SIZE, LATENT_DIM, HEAD_DIM, pretrained = True).to(DEVICE)
else:
    print(BACKBONE, " doesn't match any")

criterion = ContrastiveLoss(True, 0.3, DEVICE, BATCH_SIZE) 
print("Current model is:",  model.__class__.__name__)
print(f"Model has {count_parameters(model):,} trainable parameters")

In [None]:
print(cuda.memory_summary(device = DEVICE, abbreviated = True))

In [None]:
train_loss_history, val_loss_history = train(model, DEVICE, 't0.3HD256BS512', lr = 0.001, weight_decay = 1e-5,
                    gamma = 0.5, step_size = 20, n_epochs = 300, cloud = CLOUD, 
                    train_iterator = train_iterator, val_iterator = val_iterator, 
                    criterion = criterion, scheduler_type = 'ReduceLROnPlateau')

In [None]:
dataset_to_test = CustomDataset('./data_ver2', transform = transforms.Compose([transforms.ToTensor()]))
print(len(dataset_to_test))
print(dataset_to_test[0].shape)

In [None]:
data_iterator = DataLoader(dataset_to_test, batch_size = 512, num_workers = 4, pin_memory = True, drop_last = False)
next(iter(data_iterator)).shape

In [None]:
Extractor = FeatureExtractor(model, DEVICE, data_iterator, "./t0.3HD256BS512best_model.pth")

In [None]:
feature_matrix = Extractor.get_features()
print(feature_matrix.shape)

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering, Birch, MiniBatchKMeans
from sklearn.metrics import silhouette_score

In [None]:
pca = PCA()
principalComponents = pca.fit_transform(feature_matrix)
print(principalComponents.shape)

In [None]:
plt.figure(figsize = (15, 15))
plt.scatter(principalComponents[:, 0], principalComponents[:, 1])

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

In [None]:
# look if our latent vectors can be clustered which is really doubtful
aggl = AgglomerativeClustering(n_clusters = 5, affinity = 'cosine', linkage = 'average').fit(feature_matrix)
birth = Birch(n_clusters = 5).fit(feature_matrix)
kmeans = MiniBatchKMeans(n_clusters = 5, max_iter = 1e3).fit(feature_matrix)

In [None]:
fig, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (20, 7))

axes[0].scatter(feature_matrix[:, 0], feature_matrix[:, 1], c = aggl.labels_)
axes[1].scatter(feature_matrix[:, 0], feature_matrix[:, 1], c = birth.predict(feature_matrix))
axes[2].scatter(feature_matrix[:, 0], feature_matrix[:, 1], c = kmeans.predict(feature_matrix))

In [None]:
# clusters are very overlapping. sh about 9 %
print(silhouette_score(feature_matrix, aggl.labels_))
print(silhouette_score(feature_matrix, birth.predict(feature_matrix)))
print(silhouette_score(feature_matrix, kmeans.predict(feature_matrix)))