# Requirements

In [3]:
import os
import hickle as hkl
import requests
import h5py
import gzip
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import glob
from VGG import get_image_embeddings, load_image_bytes

# Reading the CSV files

In [4]:
# Register tqdm's pandas integration so that `progress_apply` is available.
tqdm.pandas()
def dataset_name_from_path(csv_path):
    """Return the file name of *csv_path* without directory or extension.

    Uses ``os.path`` so the result is correct with the native path separator
    of the running platform (splitting on a literal backslash only works on
    Windows).
    """
    return os.path.splitext(os.path.basename(csv_path))[0]


path = os.path.join(os.getcwd(), "Datasets")
# Sorted so the files are always processed in the same order.
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

# Maps each CSV's base name (no directory, no ".csv") to its DataFrame.
csv_dfs = {}
for f in csv_files:
    file_name = dataset_name_from_path(f)

    # "Amazon-Products" is deliberately left out of the per-file datasets.
    if file_name == "Amazon-Products":
        continue

    # read the csv file
    df = pd.read_csv(f)
    csv_dfs[file_name] = df

---
# Saving the image embeddings

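We take every CSV file loaded above (the full `Amazon-Products.csv` file is skipped when the CSVs are read), compute an embedding vector for each product image, and save the embeddings as NumPy arrays in a gzip-compressed HDF5 file (`images/dataset.hdf5`), with one group per CSV file.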

In [5]:
# download image
def download_image(url, timeout=10):
    """Fetch the raw bytes of the image at *url*.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the run forever.

    Returns:
        The response body as ``bytes``, or ``None`` when the request fails
        (connection error, timeout, invalid URL) or the server answers with
        a status code other than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat transport-level failures like a non-200 answer so one bad
        # URL does not abort the caller's whole batch.
        return None
    if response.status_code != 200:
        return None
    return response.content

In [6]:
def embed_image_from_url(url):
    """Download the image at *url* and return its embedding.

    The downloaded bytes are decoded with ``load_image_bytes`` and passed to
    ``get_image_embeddings``; that function's result is returned unchanged.
    When ``download_image`` yields ``None`` a zero array of shape (1, 512) is
    returned instead.
    """
    raw_bytes = download_image(url)
    if raw_bytes is not None:
        return get_image_embeddings(load_image_bytes(raw_bytes))
    # No image available: placeholder vector.
    # NOTE(review): presumably 512 matches the embedding width - confirm.
    return np.zeros((1, 512))

In [7]:
# Create the output directory if it is missing. exist_ok avoids the
# check-then-create race and makes re-running this cell a no-op.
os.makedirs("images", exist_ok=True)

# .keep file to keep the directory in git
with open("images/.keep", "w"):
    pass

In [ ]:
# Append mode: results from earlier runs are kept, so the cell can be re-run
# to resume after an interruption.
with h5py.File("images/dataset.hdf5", "a") as f:
    for file in csv_dfs:
        print("Processing", file)
        # A file counts as done only once its "images" dataset exists; a
        # group left without a dataset by a failed run is processed again.
        if file in f and "images" in f[file]:
            continue

        urls = csv_dfs[file]["image"]
        if urls.empty:
            # Nothing to embed, and no first element to take the width from.
            print("No image URLs in", file)
            continue

        # apply embed_image_from_url to all the urls and create a pd.Series with the embeddings
        embeddings = urls.progress_apply(embed_image_from_url)

        # Positional access: the Series index need not contain the label 0.
        first_embedding = embeddings.iloc[0]

        # create a dataset for each file in a specific group
        # (require_group reuses a group that already exists without data)
        grp = f.require_group(file)
        grp.create_dataset(
            "images",
            (len(urls), 1, first_embedding.shape[1]),
            data=embeddings.to_list(),
            shuffle=True,
            dtype="f",
            compression="gzip",
            compression_opts=9,
        )

Processing Air Conditioners
Processing All Appliances
Processing All Books
Processing All Car and Motorbike Products
Processing All Electronics
Processing All English
Processing All Exercise and Fitness
Processing All Grocery and Gourmet Foods


  0%|          | 0/960 [00:00<?, ?it/s]

### Usage

In [ ]:
# Open the embeddings file read-only, list its groups, then show the shape of
# the second entry stored for the "Air Conditioners" file.
with h5py.File("images/dataset.hdf5", "r") as hdf:
    print(hdf.keys())
    second_entry = hdf["Air Conditioners"]["images"][1]
    print(second_entry.shape)