In [1]:
import torch
import random
import numpy as np
import pickle
from torchvision import transforms
import torch.nn as nn

from tqdm import tqdm
import h5py

# Seed Python's built-in `random` module so the random.sample / random.shuffle
# calls used for the train/test split are repeatable on a fresh kernel run.
# Only the stdlib RNG is seeded here; NumPy and torch RNGs are left untouched.
random.seed(42)

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def tensor_transform(
    tensor_as_np_array,
    size=(224, 224),
    mean=(0.485, 0.456, 0.406),
    std=(0.229, 0.224, 0.225),
):
    """Resize a channel-first image array and standardise it per channel.

    Args:
        tensor_as_np_array: NumPy array laid out as (channels, height, width).
            Values are used as-is (no division by 255), so they are assumed
            to already be on the scale that matches ``mean``/``std``
            -- TODO confirm against the raw h5 data.
        size: Target (height, width) for the bilinear resize.
        mean: Per-channel means subtracted after resizing. The default is the
            ImageNet mean.
        std: Per-channel standard deviations to divide by. The default is the
            ImageNet std.

    Returns:
        A float32 ``torch.Tensor`` of shape (channels, size[0], size[1]) on
        the CPU.

    Raises:
        ValueError: If the input is not a 3-D array.
    """
    image = torch.from_numpy(tensor_as_np_array).float()
    if image.dim() != 3:
        raise ValueError(
            f"expected a (channels, height, width) array, got shape {tuple(image.shape)}"
        )

    # Bilinear interpolation needs a 4-D (batch, C, H, W) input, so add a
    # temporary batch axis and drop it again afterwards.
    image = nn.functional.interpolate(
        image.unsqueeze(0), size=size, mode='bilinear', align_corners=False
    ).squeeze(0)

    # Shape the statistics as (C, 1, 1) so they broadcast over height and width.
    mean_t = torch.as_tensor(mean, dtype=image.dtype, device=image.device).view(-1, 1, 1)
    std_t = torch.as_tensor(std, dtype=image.dtype, device=image.device).view(-1, 1, 1)

    return (image - mean_t) / std_t

#### Load the full images from the h5 files, resize them to a uniform size, and normalise them

In [4]:
# Raw h5 files per animal. The position in each list is the class label:
# index 0 = images of random animals (label 0), index 1 = images of the same
# animal (label 1).
h5_files_cat = ["assets/raw dataset/pet_sentinel_data_random cats.h5", "assets/raw dataset/pet_sentinel_data_same cat.h5"]
h5_files_dog = ["assets/raw dataset/pet_sentinel_data_random dogs.h5", "assets/raw dataset/pet_sentinel_data_same dog.h5"]


def load_first_group_arrays(h5_path):
    """Read every dataset of the first group in an h5 file into memory.

    Args:
        h5_path: Path of the h5 file to open read-only.

    Returns:
        A list with one NumPy array per dataset in the first group, in the
        file's iteration order.

    Raises:
        ValueError: If the file has no top-level members.
    """
    with h5py.File(h5_path, 'r') as h5_file:
        first_key = next(iter(h5_file), None)
        if first_key is None:
            raise ValueError(f"{h5_path} contains no groups")
        # NOTE(review): only the first group is consumed, exactly as before.
        # If a file ever holds several groups, the others are ignored.
        group = h5_file[first_key]
        return [group[dset_name][()] for dset_name in group]


# Pair each animal name with its files explicitly instead of guessing the
# animal from a substring of the file path.
for animal, h5_files in (("cat", h5_files_cat), ("dog", h5_files_dog)):

    # (normalised tensor, label) pairs for this animal; pickled below.
    tensor_label_list = []

    for label, h5_path in enumerate(h5_files):
        for tensor_as_np in tqdm(load_first_group_arrays(h5_path)):
            tensor_label_list.append((tensor_transform(tensor_as_np), label))

    # Save to pickle file
    with open(f"assets/transformed data/tensor_label_data_{animal}.pkl", "wb") as f:
        pickle.dump(tensor_label_list, f)


100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 546.79it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 404.76it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 424.59it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 509.08it/s]


## Take the transformed data from the pickle files and create train and test sets

In [8]:
# Fraction of the label-1 ("your pet") images that goes into the training set.
# The same number of label-0 images is drawn so the training set is balanced.
TRAIN_RATIO = 0.9


def split_by_indices(items, n_train):
    """Randomly pick ``n_train`` entries of ``items`` for training; the rest are test.

    Positions are sampled rather than comparing ``id()`` of the stored
    tensors, so the split stays correct even if two entries share one object.

    Args:
        items: Sequence of (tensor, label) pairs belonging to one class.
        n_train: Number of entries to draw for the training set.

    Returns:
        ``(train_items, test_items)``. ``train_items`` is in sampled order and
        ``test_items`` keeps the original order of ``items``.

    Raises:
        ValueError: If ``n_train`` exceeds ``len(items)``.
    """
    if n_train > len(items):
        raise ValueError(
            f"cannot draw {n_train} training items from only {len(items)} available"
        )
    train_positions = random.sample(range(len(items)), n_train)
    chosen = set(train_positions)
    train_items = [items[i] for i in train_positions]
    test_items = [item for i, item in enumerate(items) if i not in chosen]
    return train_items, test_items


# Create train and test datasets
for animal in ["cat", "dog"]:

    # These pickles are written by the preprocessing cell of this notebook.
    # pickle.load can execute arbitrary code, so never point it at untrusted files.
    with open(f"assets/transformed data/tensor_label_data_{animal}.pkl", "rb") as f:
        data = pickle.load(f)

    # Separate by label
    data_0 = [item for item in data if item[1] == 0]  # other pets
    data_1 = [item for item in data if item[1] == 1]  # your pet

    # Balanced training set: equal counts from both classes.
    n_train = int(len(data_1) * TRAIN_RATIO)

    # Label 1 is drawn before label 0, matching the order of RNG calls that
    # the seeded run has always used.
    train_data_1, test_data_1 = split_by_indices(data_1, n_train)
    train_data_0, test_data_0 = split_by_indices(data_0, n_train)

    train_data = train_data_0 + train_data_1
    random.shuffle(train_data)

    with open(f"assets/transformed data/train_data_{animal}.pkl", "wb") as f:
        pickle.dump(train_data, f)

    # Everything not drawn for training goes to the test set (imbalanced).
    test_data = test_data_0 + test_data_1
    random.shuffle(test_data)

    with open(f"assets/transformed data/test_data_{animal}.pkl", "wb") as f:
        pickle.dump(test_data, f)


#### The code below displays an image after resizing, with the normalisation undone

In [12]:
# Load the cat test split written by the train/test cell of this notebook.
# pickle.load can execute arbitrary code, so only open files you produced yourself.
with open("assets/transformed data/test_data_cat.pkl", "rb") as f:
    data = pickle.load(f)

# Tensor of the first (tensor, label) pair; it is resized and normalised.
tens = data[0][0]

# Undo the ImageNet mean/std normalisation: x = x_norm * std + mean
mean = torch.tensor([0.485, 0.456, 0.406], device=tens.device).view(-1, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225], device=tens.device).view(-1, 1, 1)

# ToPILImage scales float tensors by 255 and casts them to uint8, so a value
# nudged just outside [0, 1] by rounding would wrap around. Clamp first.
# Assumes the raw pixel values were in [0, 1] -- TODO confirm.
denormalised = (tens * std + mean).clamp(0.0, 1.0)

# Convert to PIL image
to_pil = transforms.ToPILImage()
image = to_pil(denormalised)

# Show the image (opens the system image viewer)
image.show()