# Perperation

Import dataset from git and setup test set.

In [2]:
!git clone https://github.com/Rasilu/cil
!cp -r cil/datasets/kaggle_data/test/ ./

Cloning into 'cil'...
remote: Enumerating objects: 21138, done.[K
remote: Counting objects: 100% (4947/4947), done.[K
remote: Compressing objects: 100% (4912/4912), done.[K
remote: Total 21138 (delta 40), reused 4933 (delta 31), pack-reused 16191[K
Receiving objects: 100% (21138/21138), 2.96 GiB | 63.01 MiB/s, done.
Resolving deltas: 100% (1066/1066), done.
Checking out files: 100% (18487/18487), done.


In [1]:
!pip install lpips
import lpips
import csv 
import torch
from google.colab import files
import numpy as np
from glob import glob
import math
import torch
from PIL import Image

MAX_BATCH = 2500.

device = 'cuda' if torch.cuda.is_available() else 'cpu'

loss_fn_alex = lpips.LPIPS(net='alex').cuda() # best forward scores
loss_fn_vgg = lpips.LPIPS(net='vgg').cuda() # best forward scores


def np_to_tensor(x, device):
    # allocates tensors from np.arrays
    if device == 'cpu':
        return torch.from_numpy(x).cpu()
    else:
        return torch.from_numpy(x).contiguous().pin_memory().to(device=device, non_blocking=True)

def load_all_from_path_normalized(path, batch):
    # loads all HxW .pngs contained in path as a 4D np.array of shape (n_images, H, W, 3)
    # images are loaded as floats with values in the interval [0., 1.]
    file_names = sorted(glob(path + '/*.png'))
    file_names = file_names[int(batch * MAX_BATCH) : int(min((batch + 1) * MAX_BATCH, len(file_names)))]
    items = np.stack([np.array(Image.open(f).convert('RGB')) for f in file_names])
    trans = []
    for i in range(len(items)):
        trans.append([items[i, :, :, 0], items[i, :, :, 1], items[i, :, :, 2]])

    result = (np.stack(trans).astype(np.float32) - 127.5) / 127.5

    print(f"{len(file_names)} files loaded. Shape = {result.shape}. Max Value = {result.max()}. Min Value = {result.min()}")
    return result

def compare_similarities(images1, images2, saveAs):

    similarities_alex = []
    similarities_vgg = []
    
    
    for i in range(math.ceil(len(glob(images1 + "/*.png")) / MAX_BATCH)):
      images1_tensor = []
      images2_tensor = []
      gc.collect()
      images1_tensor = np_to_tensor(load_all_from_path_normalized(images1, i), device)
      images2_tensor = np_to_tensor(load_all_from_path_normalized(images2, 0), device)

      torch.cuda.empty_cache()
      for img1 in images1_tensor:
          similaritiesAlexRow = []
          similaritiesVggRow = []
          for img2 in images2_tensor:
              result_alex = loss_fn_alex(img1, img2)
              result_vgg = loss_fn_alex(img1, img2)
              similaritiesAlexRow.append(result_alex.item())
              similaritiesVggRow.append(result_vgg.item())
          similarities_alex.append(similaritiesAlexRow)
          similarities_vgg.append(similaritiesVggRow)
    

    nameAlex = "similarities_" + saveAs + "_alex.csv"
    with open(nameAlex, 'w') as f:
        write = csv.writer(f)
        write.writerows(similarities_alex) 
    nameVgg = "similarities_" + saveAs + "_vgg.csv"
    with open(nameVgg, 'w') as f:
        write = csv.writer(f)
        write.writerows(similarities_vgg) 
    #!cp {nameAlex} '/content/gdrive/My Drive/Data/'
    #!cp {nameVgg} '/content/gdrive/My Drive/Data/'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lpips
  Downloading lpips-0.1.4-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.7 MB/s 
Installing collected packages: lpips
Successfully installed lpips-0.1.4
Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]


  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth


  0%|          | 0.00/233M [00:00<?, ?B/s]

Loading model from: /usr/local/lib/python3.7/dist-packages/lpips/weights/v0.1/alex.pth
Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off]


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth


  0%|          | 0.00/528M [00:00<?, ?B/s]

Loading model from: /usr/local/lib/python3.7/dist-packages/lpips/weights/v0.1/vgg.pth


In [None]:
# needed for pipeline
import gc
import time

datasets = ["kaggle_data", "new_data", "mass_roads"]
train_images_dataset_path = "/content/cil/datasets/kaggle_data/training/images"
train_groundtruth_dataset_path = "/content/cil/datasets/kaggle_data/training/groundtruth"

for dataset in datasets:
  start_similarity_time = time.time()
  path_images = "/content/cil/datasets/" + dataset + "/training/images"
  path_groundtruth = "/content/cil/datasets/" + dataset + "/training/groundtruth"
  file_name_images = dataset + "_" + "kaggle_data_train_images"
  file_name_groundtruth = dataset + "_" + "kaggle_data_train_groundtruth"
  compare_similarities(path_images, train_images_dataset_path, file_name_images)
  gc.collect()
  stop_similarity_time = time.time()
  print(f"Time for images in {dataset} dataset is: {stop_similarity_time - start_similarity_time}")
  start_similarity_time = time.time()
  compare_similarities(path_groundtruth, train_groundtruth_dataset_path, file_name_groundtruth)
  gc.collect()
  stop_similarity_time = time.time()
  print(f"Time for groundtruth in {dataset} dataset is: {stop_similarity_time - start_similarity_time}")

2500 files loaded. Shape = (2500, 3, 400, 400). Max Value = 1.0. Min Value = -1.0
144 files loaded. Shape = (144, 3, 400, 400). Max Value = 1.0. Min Value = -1.0
2355 files loaded. Shape = (2355, 3, 400, 400). Max Value = 1.0. Min Value = -1.0
144 files loaded. Shape = (144, 3, 400, 400). Max Value = 1.0. Min Value = -1.0
Time for images in mass_roads dataset is: 4069.127070903778
2500 files loaded. Shape = (2500, 3, 400, 400). Max Value = 1.0. Min Value = -1.0
144 files loaded. Shape = (144, 3, 400, 400). Max Value = 1.0. Min Value = -1.0
2355 files loaded. Shape = (2355, 3, 400, 400). Max Value = 1.0. Min Value = -1.0
144 files loaded. Shape = (144, 3, 400, 400). Max Value = 1.0. Min Value = -1.0
Time for groundtruth in mass_roads dataset is: 4011.762069940567
