# Requirements

In [2]:
import os
import hickle as hkl
import requests
import h5py
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import glob
from VGG import get_image_embeddings, load_image_bytes, load_image, get_score, label

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


# Reading the csv

In [3]:
tqdm.pandas()
path = os.getcwd() + "/Datasets"
csv_files = glob.glob(os.path.join(path, "*.csv"))
csv_dfs = {}
for f in csv_files:
    file_name = f.split("\\")[-1]
    
    # remove the .csv extension
    file_name = file_name[:-4]
    
    if file_name == "Amazon-Products":
        continue
    
    # read the csv file 
    df = pd.read_csv(f)
    csv_dfs[file_name] = df

---
# Saving the images embedding

we take the entire dataset Amazon-Products.csv and save the numpy vectors embeddings of the images in the dataset to a gzip file.

In [4]:
# download image
def download_image(url):
    response = requests.get(url)
    if response.status_code != 200:
        return None
    img_data = response.content
    return img_data

In [5]:
def embed_image_from_url(url):
    img_data = download_image(url)
    if img_data is None:
        return np.zeros((1, 512))
    img = load_image_bytes(img_data)
    embedding = get_image_embeddings(img)
    return embedding

In [6]:
if not os.path.exists("images"):
    os.mkdir("images/")

# .keep file to keep the directory in git
with open("images/.keep", 'w') as f:
    pass

In [7]:
with h5py.File("images/dataset.hdf5", "a") as f:
    for file in csv_dfs:
        print("Processing", file)
        if file in f:
            continue
            
        urls = csv_dfs[file]["image"]
        
        # apply embed_image_from_url to all the urls and create a pd.Series with the embeddings
        embeddings = urls.progress_apply(embed_image_from_url)
        # create a dataset for each file in a specific group
        grp = f.create_group(file)
        dataset = grp.create_dataset("images", (len(urls), 1, embeddings[0].shape[1]), data=embeddings.to_list(),
                                     shuffle=True, dtype='f', compression="gzip", compression_opts=9)    

Processing Air Conditioners
Processing All Appliances
Processing All Books
Processing All Car and Motorbike Products
Processing All Electronics
Processing All English
Processing All Exercise and Fitness
Processing All Grocery and Gourmet Foods
Processing All Hindi
Processing All Home and Kitchen


  0%|          | 0/1224 [00:00<?, ?it/s]

Processing All Movies and TV Shows


0it [00:00, ?it/s]

KeyError: 0

### Usage

In [8]:
with (h5py.File("images/dataset.hdf5", "r") as f):
    print(f.keys())
    emb = f["Air Conditioners"]["images"][0]
    emb2 = embed_image_from_url(csv_dfs["Air Conditioners"]["image"][0])
    print(emb.shape)
    print(emb == emb2)

<KeysViewHDF5 ['Air Conditioners', 'All Appliances', 'All Books', 'All Car and Motorbike Products', 'All Electronics', 'All English', 'All Exercise and Fitness', 'All Grocery and Gourmet Foods', 'All Hindi', 'All Home and Kitchen', 'All Movies and TV Shows']>
(1, 512)
[[False False  True  True False False False  True  True False  True  True
  False  True False False  True False False False  True False False  True
   True False False False False False False  True False  True  True False
   True False  True False False False False  True False False False False
  False False False False False False False False  True False False False
  False  True False  True False  True False False False  True  True False
  False False False False  True False  True  True  True False False  True
   True  True False  True  True False False  True False  True False False
   True False False False  True False False  True False  True False False
  False False  True  True  True False  True False  True False  Tr

---
# Multi-threading

In [9]:
urls = {file: csv_dfs[file]["image"] for file in csv_dfs}

# only values 2 keys
urls = {k: urls[k] for k in list(urls)[32:48]}

In [10]:
def download_csv_images(file, img_urls):
    # add 
    tqdm.pandas(desc=f"{file}", postfix=None)
    embeddings = img_urls.progress_apply(embed_image_from_url)
    with h5py.File(f"images/{file}.hdf5", "a") as f:
        if "images" not in f:
            f.create_dataset("images", (len(img_urls), 1, embeddings[0].shape[1]), data=embeddings.to_list(),
                                         shuffle=True, dtype='f', compression="gzip", compression_opts=9)

In [None]:
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

with ThreadPoolExecutor(max_workers=32) as executor:
    for file in urls:
        executor.submit(download_csv_images, file, urls[file])

Car Electronics:   0%|          | 0/1008 [00:00<?, ?it/s]

Car Parts:   0%|          | 0/1224 [00:00<?, ?it/s]

Cardio Equipment:   0%|          | 0/240 [00:00<?, ?it/s]

Casual Shoes:   0%|          | 0/19056 [00:00<?, ?it/s]

Childrens Books: 0it [00:00, ?it/s]

Coffee Tea and Beverages:   0%|          | 0/1296 [00:00<?, ?it/s]

Cricket:   0%|          | 0/1224 [00:00<?, ?it/s]

Diet and Nutrition:   0%|          | 0/1200 [00:00<?, ?it/s]

Dog supplies:   0%|          | 0/984 [00:00<?, ?it/s]

Clothing:   0%|          | 0/19152 [00:00<?, ?it/s]

Ethnic Wear:   0%|          | 0/19056 [00:00<?, ?it/s]

Cycling:   0%|          | 0/1152 [00:00<?, ?it/s]

Diapers:   0%|          | 0/1080 [00:00<?, ?it/s]

Fashion and Silver Jewellery:   0%|          | 0/19104 [00:00<?, ?it/s]

Exam Central: 0it [00:00, ?it/s]

Entertainment Collectibles: 0it [00:00, ?it/s]

---
# Testing the embeddings

In [32]:
def argmax_score(input_embedding, embeddings):
    return np.argmax([get_score(input_embedding, emb) for emb in embeddings])

In [44]:
def argmax_csv(input_embedding, csv_dfs):
    csv_scores = {}
    for file in tqdm(csv_dfs):
        if not os.path.exists(f"images/{file}.hdf5"):
            continue
        
        with h5py.File(f"images/{file}.hdf5", "r") as f:
            if "images" in f:
                csv_embeddings = f["images"]
                csv_scores[file] = argmax_score(input_embedding, csv_embeddings)
    return csv_scores

In [45]:
input_img_path = "sunflower/sunflower1.jpg"
input_img = load_image(input_img_path)
input_embedding = get_image_embeddings(input_img)

csv_scores = argmax_csv(input_embedding, csv_dfs)
max_csv = max(csv_scores, key=csv_scores.get)
max_row = max(csv_scores.values())
print(max_csv, max_row)


display(csv_dfs[max_csv].iloc[max_row])

  0%|          | 0/139 [00:00<?, ?it/s]

Amazon Fashion 1861


name              PMW - Hair Accessories - Hair Bun - Bharatanat...
main_category                                                stores
sub_category                                         Amazon Fashion
image             https://m.media-amazon.com/images/I/61yBpKUuV9...
link              https://www.amazon.in/PMW-Accessories-Bharatan...
ratings                                                         4.4
no_of_ratings                                                     9
discount_price                                                 ₹229
actual_price                                                   ₹599
Name: 1861, dtype: object