# Requirements

In [1]:
import os
import hickle as hkl
import requests
import h5py
import gzip
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import glob
from VGG import get_image_embeddings, load_image_bytes, load_image, get_score

# Reading the csv

In [2]:
tqdm.pandas()
path = os.getcwd() + "/Datasets"
csv_files = glob.glob(os.path.join(path, "*.csv"))
csv_dfs = {}
for f in csv_files:
    file_name = f.split("\\")[-1]
    
    # remove the .csv extension
    file_name = file_name[:-4]
    
    if file_name == "Amazon-Products":
        continue
    
    # read the csv file 
    df = pd.read_csv(f)
    csv_dfs[file_name] = df

---
# Saving the images embedding

we take the entire dataset Amazon-Products.csv and save the numpy vectors embeddings of the images in the dataset to a gzip file.

In [3]:
# download image
def download_image(url):
    response = requests.get(url)
    if response.status_code != 200:
        return None
    img_data = response.content
    return img_data

In [4]:
def embed_image_from_url(url):
    img_data = download_image(url)
    if img_data is None:
        return np.zeros((1, 512))
    img = load_image_bytes(img_data)
    embedding = get_image_embeddings(img)
    return embedding

In [25]:
if not os.path.exists("images"):
    os.mkdir("images/")

# .keep file to keep the directory in git
with open("images/.keep", 'w') as f:
    pass

In [12]:
with h5py.File("images/dataset.hdf5", "a") as f:
    for file in csv_dfs:
        print("Processing", file)
        if file in f:
            continue
            
        urls = csv_dfs[file]["image"]
        
        # apply embed_image_from_url to all the urls and create a pd.Series with the embeddings
        embeddings = urls.progress_apply(embed_image_from_url)
        # create a dataset for each file in a specific group
        grp = f.create_group(file)
        dataset = grp.create_dataset("images", (len(urls), 1, embeddings[0].shape[1]), data=embeddings.to_list(),
                                     shuffle=True, dtype='f', compression="gzip", compression_opts=9)    

Processing Air Conditioners
Processing All Appliances
Processing All Books
Processing All Car and Motorbike Products
Processing All Electronics
Processing All English
Processing All Exercise and Fitness
Processing All Grocery and Gourmet Foods
Processing All Hindi
Processing All Home and Kitchen


  0%|          | 0/1224 [00:00<?, ?it/s]


KeyboardInterrupt



### Usage

In [6]:
with h5py.File("images/dataset.hdf5", "r") as f:
    print(f.keys())
    print(f["Air Conditioners"]["images"][1].shape)

<KeysViewHDF5 ['Air Conditioners', 'All Appliances', 'All Books', 'All Car and Motorbike Products', 'All Electronics', 'All English', 'All Exercise and Fitness', 'All Grocery and Gourmet Foods', 'All Hindi']>
(1, 512)


---
# Multi-threading

In [5]:
urls = {file: csv_dfs[file]["image"] for file in csv_dfs}

# only values 2 keys
urls = {k: urls[k] for k in list(urls)[:16]}

In [6]:
def download_csv_images(file, img_urls):
    # add 
    tqdm.pandas(desc=f"{file}", postfix=None)
    embeddings = img_urls.progress_apply(embed_image_from_url)
    with h5py.File(f"images/{file}.hdf5", "a") as f:
        if "images" not in f:
            dataset = f.create_dataset("images", (len(img_urls), 1, embeddings[0].shape[1]), data=embeddings.to_list(),
                                         shuffle=True, dtype='f', compression="gzip", compression_opts=9)

In [None]:
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

with ThreadPoolExecutor(max_workers=16) as executor:
    for file in urls:
        executor.submit(download_csv_images, file, urls[file])

Air Conditioners:   0%|          | 0/720 [00:00<?, ?it/s]

All Appliances:   0%|          | 0/9576 [00:00<?, ?it/s]

All Books: 0it [00:00, ?it/s]

All Car and Motorbike Products:   0%|          | 0/1272 [00:00<?, ?it/s]

All Exercise and Fitness:   0%|          | 0/1176 [00:00<?, ?it/s]

All Hindi: 0it [00:00, ?it/s]

All Electronics:   0%|          | 0/9600 [00:00<?, ?it/s]

All Pet Supplies:   0%|          | 0/648 [00:00<?, ?it/s]

All English: 0it [00:00, ?it/s]

All Music: 0it [00:00, ?it/s]

All Grocery and Gourmet Foods:   0%|          | 0/960 [00:00<?, ?it/s]

All Home and Kitchen:   0%|          | 0/1224 [00:00<?, ?it/s]

All Sports Fitness and Outdoors:   0%|          | 0/1224 [00:00<?, ?it/s]

All Video Games: 0it [00:00, ?it/s]

Amazon Fashion:   0%|          | 0/2352 [00:00<?, ?it/s]

All Movies and TV Shows: 0it [00:00, ?it/s]

---
# Testing the embeddings

In [5]:
input_img_path = "sunflower/sunflower1.jpg"
input_img = load_image(input_img_path)
input_embedding = get_image_embeddings(input_img)

with h5py.File("images/dataset.hdf5", "r") as f:
    embeddings = f["Air Conditioners"]["images"]
    print("Embeddings shape", embeddings.shape)
    for embedding in embeddings:
        embedding_score = get_score(input_embedding, embedding)[0][0]
        print(embedding_score)

Embeddings shape (720, 1, 512)
0.32450312
0.33937034
0.33937034
0.33937034
0.36557698
0.32078904
0.29679808
0.29679808
0.34270072
0.3252776
0.26361418
0.27883372
0.3739883
0.35587588
0.3752666
0.38051835
0.29679808
0.32746947
0.31756395
0.26076716
0.2629915
0.30895996
0.33937034
0.3739883
0.26476973
0.38754058
0.29415733
0.3863021
0.23242761
0.31756395
0.30219442
0.3592698
0.3752666
0.34920347
0.30219442
0.33937034
0.27883372
0.275722
0.273017
0.36314684
0.36278927
0.26132676
0.2935021
0.3101657
0.2835472
0.35587588
0.28153965
0.3380111
0.33948916
0.34173658
0.37599438
0.36557698
0.29307726
0.26476973
0.31606075
0.34920347
0.37372002
0.29774988
0.29096532
0.29676807
0.38304275
0.36517778
0.2750073
0.27212194
0.29585546
0.33019212
0.3095168
0.3880611
0.24493861
0.29942435
0.30606925
0.3268444
0.2605198
0.2757086
0.33937034
0.30219442
0.3101177
0.33745518
0.29415733
0.3730833
0.28402275
0.40402573
0.31096005
0.33019212
0.29096532
0.2664227
0.27518934
0.32981095
0.3739883
0.32136628
0.289