Le but de ce notebook est de générer les embeddings des items à partir de leur description.

Ces embeddings seront ensuite stockés dans Feast afin d'être utilisés par Merlin dans le conteneur.

In [None]:
# Dependency installation, kept commented out; uncomment to run it once per environment.
#!pip install -U fashion-clip
#!pip install torch

In [None]:
from fashion_clip.fashion_clip import FashionCLIP
import pandas as pd
import numpy as np
from collections import Counter
from PIL import Image
import numpy as np
import os

In [None]:
import torch

On extrait la description des items

In [None]:
# Root folder of the dataset; the article table and the image tree are both
# resolved relative to it.
data_path = '/home/smegdoud/HetM_projet/data'
articles = pd.read_csv(os.path.join(data_path, 'articles.csv'))

# Columns concatenated (space-separated, in this order) into the text that is embedded.
text_columns = [
    'product_group_name',
    'colour_group_name',
    'graphical_appearance_name',
    'index_group_name',
    'section_name',
    'detail_desc',
]

# Deduplicate on the id at the row level (rather than calling .unique() on the
# id column alone) so that `items` and `texts` always have the same length and
# order; the later index-based filtering relies on that alignment.
articles_unique = articles.drop_duplicates(subset='article_id')
items = articles_unique['article_id'].tolist()

# Missing values would otherwise be formatted as the literal word "nan" and
# end up in the embedded text, so they are replaced by empty strings and
# skipped when joining.
text_parts = articles_unique[text_columns].fillna('').astype(str)
texts = [
    ' '.join(part for part in row if part)
    for row in text_parts.itertuples(index=False, name=None)
]

On liste les paths des images

In [None]:
def _image_path_for(article_id):
    """Return the expected JPEG path of one article under ``data_path/images``.

    The id is prefixed with '0'; the first three characters of the prefixed id
    name the sub-folder and the full prefixed id names the file.
    """
    prefixed_id = '0' + str(article_id)
    return os.path.join(data_path, 'images', prefixed_id[:3], prefixed_id + '.jpg')


# One candidate image path per item, in the same order as `items`
images = [_image_path_for(article_id) for article_id in items]

In [None]:
# Remove every entry whose image file does not exist. Walking the indices
# from last to first keeps the remaining (smaller) indices valid while
# elements are being deleted from the three parallel lists.
for idx in reversed(range(len(items))):
    if os.path.isfile(images[idx]):
        continue
    for parallel_list in (images, texts, items):
        del parallel_list[idx]

In [None]:
# Select the GPU to use. With a single visible device, that device is the one
# addressed as index 0 below.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# Load FashionCLIP
fclip = FashionCLIP('fashion-clip')

# Make sure the underlying model runs on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Only query the device name when CUDA is usable; calling it on a
    # CPU-only host raises and would defeat the CPU fallback above.
    print(torch.cuda.get_device_name(0))
else:
    print('CUDA is not available, running on CPU')

# Trailing semicolon: do not dump the whole module repr as cell output
fclip.model.to(device);

Embedding des textes (l'encodage des images est désactivé dans la cellule ci-dessous)

In [None]:
# Encode the texts in small batches to keep memory usage under control
torch.cuda.empty_cache()  # release cached GPU memory before encoding
batch_size = 16

# Inference only: disabling gradient tracking saves memory
with torch.no_grad():
    # Image encoding is currently disabled:
    # image_embeddings = fclip.encode_images(images, batch_size=batch_size)
    text_embeddings = fclip.encode_text(texts, batch_size=batch_size)

# Scale each embedding to unit L2 norm so that a plain dot product can be
# used in place of cosine similarity when comparing embeddings
# image_embeddings = image_embeddings/np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True)
text_norms = np.linalg.norm(text_embeddings, ord=2, axis=-1, keepdims=True)
text_embeddings = text_embeddings / text_norms

In [None]:
# Wrap the text embeddings in a DataFrame, then cast it to float32
# (image counterpart, disabled: image_df = pd.DataFrame(image_embeddings).astype('float32'))
text_frame_raw = pd.DataFrame(text_embeddings)
text_df = text_frame_raw.astype('float32')

# Optional CSV export of the raw embedding frames (disabled)
#image_df.to_csv('image_embeddings.csv', index=False)
#text_df.to_csv('text_embeddings.csv', index=False)

In [None]:
# Optional reload of embeddings from CSV (disabled). The files only exist if
# the commented-out to_csv export in the previous cell has been run.
#loaded_image_df = pd.read_csv('image_embeddings.csv')
#loaded_text_df = pd.read_csv('text_embeddings.csv')

In [None]:
# The ids and the embedding rows are joined purely by position. If their
# counts differ, concat(axis=1) would silently pad the shorter side with NaN,
# so fail loudly instead of exporting misaligned data.
if len(items) != len(text_df):
    raise ValueError(
        f"items ({len(items)}) and text embeddings ({len(text_df)}) are not aligned"
    )

# Single-column frame holding the item ids as int32
items_id_df = pd.DataFrame({'item_id': items}).astype('int32')

# Item id followed by one column per embedding dimension
feast_embeddings = pd.concat([items_id_df, text_df], axis=1)
feast_embeddings.head(2)

In [None]:
# Every column except the id holds one embedding dimension
embedding_columns = feast_embeddings.columns.drop('item_id')

# Collapse the per-dimension columns into a single list-valued 'embedding'
# column. Converting the whole block at once replaces a Python-level call per
# row and avoids copying the wide frame first.
embedding_lists = feast_embeddings[embedding_columns].to_numpy().tolist()
faiss_embeddings = pd.DataFrame({
    'item_id': feast_embeddings['item_id'],
    'embedding': pd.Series(embedding_lists, index=feast_embeddings.index, dtype=object),
})

# Preview only; the full frame is too large to display usefully
faiss_embeddings.head()


In [None]:
# Inspect the column dtypes of the frame that is exported below
faiss_embeddings.dtypes

Export des embeddings pour les utiliser plus tard dans la pipeline

In [None]:
# Write the item embeddings to a CSV file in the current working directory.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column, and the list-valued 'embedding' column is stored as
# its string representation; confirm the downstream reader expects both.
faiss_embeddings.to_csv(path_or_buf='faiss_items_embeddings.csv')