In [1]:
import re
import typing
import numpy as np
import pandas as pd
from PIL import Image

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import sys

# Make the parent directory importable so the `shared` package used below
# resolves. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [None]:
from shared.utils.images.extractor import ImageExtractor
from shared.utils.common import get_list_from_text_tuple
from shared.model.data.features.embedding.image import ImageGroupEmbedding

In [5]:
%pip install aiofiles

Collecting aiofiles
  Using cached aiofiles-23.1.0-py3-none-any.whl (14 kB)
Installing collected packages: aiofiles
Successfully installed aiofiles-23.1.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Image directory (relative to this notebook) handed to both ImageExtractor
# and ImageGroupEmbedding further down.
PATH_TO_IMAGE_FOLDER: str = "../shared/data/images"

## Loading Data

In [None]:
# Read the gzip-compressed, pre-processed product table into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../shared/data/amz_products_small_pre_processed.csv.gz',
    compression='gzip',
)

## Visualizing some Images

In [None]:
# Preview the first five rows of the loaded table.
df.head(n=5)

In [None]:
# Keep only the rows whose `image` column is populated.
has_image = df['image'].notna()
df_images = df[has_image]

In [None]:
image_extractor = ImageExtractor(PATH_TO_IMAGE_FOLDER)

file_paths = []
for image_urls in df_images.loc[:20, 'image'].values:
    
    file_paths += await image_extractor.extraction_and_save_image_urls(image_urls=image_urls)

In [None]:
# Fail with a clear message rather than matplotlib's error for zero columns.
assert file_paths, "No image files were extracted; nothing to display."

# squeeze=False always yields a 2-D array of Axes, so this cell also works
# when there is exactly one image (otherwise plt.subplots returns a bare
# Axes object, which has no .flatten()).
fig, axs = plt.subplots(1, len(file_paths), figsize=(12, 6), squeeze=False)
axs = axs.flatten()

for img_path, ax in zip(file_paths, axs):
    # Open through a context manager so the file handle is released;
    # imshow converts the image to an array when it is called.
    with Image.open(img_path) as img:
        ax.imshow(img)
    # Hide tick marks: the axes carry no meaningful scale for pictures.
    ax.set_xticks([])
    ax.set_yticks([])

plt.show()

We can see that, in this case, all of the images show automotive products.

## Image Embeddings

For each row we might have none, one or multiple images.

We are going to extract the image embeddings for each row (and, if a row has multiple images, take the mean of their embeddings).

In [None]:
# Build the embedder, pointing it at the same image folder used above.
image_group_embedding = ImageGroupEmbedding(source_folder=PATH_TO_IMAGE_FOLDER)

In [None]:
# Pick one example row by position (not by index label) and display its raw
# `image` value.
example_position = 150000
image_group_text = df_images['image'].iloc[example_position]
image_group_text

In [None]:
# Split the raw text value into the individual images it refers to.
image_group_list = get_list_from_text_tuple(image_group_text)
image_group_list

We can then compute the embedding for this group of images in three steps:
1. Extract the images (if they do not already exist in the local file system)
2. Process the images in a batch through the model
3. Average the resulting embeddings

In [None]:
image_embedding = await image_group_embedding.get_image_group_embedding(image_group_text)
image_embedding