# Zalando Bot: AI-Powered Fashion Assistant

> Retriever - Hybrid search with Pinecone

In [None]:
%pip install -qU pinecone
%pip install -qU pinecone-client pinecone-text pinecone-notebooks

In [1]:
import os                                   # type: ignore
import time
import pandas as pd                         # type: ignore
import matplotlib.pyplot as plt             # type: ignore
from tqdm import tqdm                       # type: ignore
from tqdm.notebook import tqdm              # type: ignore
from PIL import Image                       # type: ignore
from IPython.core.display import HTML       # type: ignore
from io import BytesIO
from base64 import b64encode

import nltk                                 # type: ignore
# download the NLTK 'punkt_tab' resource; presumably needed by the BM25
# tokenizer used further down — TODO confirm
nltk.download('punkt_tab')

In [3]:
from pinecone import Pinecone, ServerlessSpec   # type: ignore
from pinecone_text.sparse import BM25Encoder    # type: ignore

In [4]:
import torch                                            # type: ignore
from sentence_transformers import SentenceTransformer   # type: ignore

In [None]:
from dotenv import load_dotenv                          # type: ignore
# load environment variables (presumably from a local .env file) so that the
# os.getenv("PINECONE_API_KEY") lookup in the Pinecone cell can find the key
load_dotenv()

## Load dataset

In [46]:
# dataset locations, given relative to the current working directory
# img_uri: directory that is joined with each image file name further down
img_uri = '../data/master_data/zalando_fashionista_x300'
# metadata_uri: CSV file read into the `metadata` DataFrame in the next cell
metadata_uri = '../data/master_data/metadata.csv'

In [None]:
# read the product metadata CSV into a DataFrame and preview the first rows
metadata = pd.read_csv(metadata_uri)
metadata.head()

In [None]:
# number of rows in the metadata table
len(metadata)

## Preprocess dataset

### Process image

In [None]:
# build the path of each product image by joining the image directory
# (`img_uri`) with the file name stored in the `x300_img` column
metadata['img_uri'] = metadata['x300_img'].apply(lambda i: os.path.join(img_uri, i))

# drop the raw image-name columns now that `img_uri` holds the path.
# NOTE: this mutates `metadata` in place, so re-running the cell fails with a
# KeyError because `x300_img` no longer exists.
metadata.drop(columns=['x100_img', 'x300_img'], inplace=True)

metadata.head()

In [None]:
# open the first row's image as a quick check that the joined paths resolve;
# the bare `image` expression lets the notebook render it
image = Image.open(metadata.iloc[0]['img_uri'])
image

### Process metadata

In [None]:
# replace every missing value with 0, in place and across all columns (so
# text columns receive the number 0 as well). The price checks in
# create_product_description treat 0 as "no value".
metadata.fillna(0, inplace=True)
metadata.head()

In [54]:
def _has_value(value):
    """Return True when *value* is neither missing (NaN/None) nor the 0 placeholder."""
    return pd.notna(value) and value != 0


def create_product_description(row):
    """Build a multi-line, plain-text description of one product.

    A field counts as "absent" when it is NaN/None or 0; 0 is the placeholder
    the notebook writes over missing values with ``fillna(0)``. Absent sizes,
    prices and discounts are left out of the text instead of being printed
    as ``0``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``sku``, ``name``, ``brand``, ``category``, ``url``, ``sizes``,
            ``classification``, ``original_price``, ``discount_percentage``
            and ``promotional_price``.

    Returns:
        The description as a string; every line ends with ``".\\n"``.

    Raises:
        KeyError: if *row* lacks one of the keys listed above.
    """
    # extract the optional columns up front so a missing key fails immediately
    sizes = row['sizes']
    original_price = row['original_price']
    discount_percentage = row['discount_percentage']
    promotional_price = row['promotional_price']

    # identity lines that are always present
    lines = [
        f"SKU: {row['sku']}.",
        f"Product Name: {row['name']} by {row['brand']}.",
        f"Category: {row['category']}. Status: {row['classification']}.",
    ]

    # add available sizes, if present (0 is the "missing" placeholder)
    if _has_value(sizes):
        lines.append(f"Available Sizes: {sizes}.")

    # add price and promotional information
    if _has_value(original_price) and _has_value(promotional_price):
        lines.append(
            f"Original Price: {original_price}, now available for {promotional_price}."
        )
        if _has_value(discount_percentage):
            lines.append(f"Discount: {discount_percentage} off.")
    elif _has_value(original_price):
        lines.append(f"Price: {original_price}.")

    # add URL for reference
    lines.append(f"More details can be found at: {row['url']}.")

    return "".join(f"{line}\n" for line in lines)

In [None]:
# build the text description of every product, row by row (axis=1), and store
# it in a new `description` column
metadata['description'] = metadata.apply(create_product_description, axis=1)
metadata.head()

In [None]:
# spot-check the generated descriptions of rows 0, 1 and 100
metadata.iloc[0]['description'], metadata.iloc[1]['description'], metadata.iloc[100]['description']

## Connect to Pinecone

In [60]:
# create the Pinecone client; the API key is read from the PINECONE_API_KEY
# environment variable (os.getenv yields None when the variable is not set)
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [None]:
index_name = "zalando-query-retriever"

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # must equal the length of the dense vectors upserted later
        # (presumably 512 for the CLIP model loaded below — TODO confirm)
        dimension=512,
        # NOTE(review): Pinecone's sparse-dense (hybrid) queries are documented
        # for dotproduct indexes — confirm against the current docs
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ) 
    )

    # wait for index to be initialized (polls once per second, no timeout)
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)

# view index stats
index.describe_index_stats()

## Sparse Vectors

In [62]:
# initialize the BM25 sparse encoder and fit it on the product descriptions,
# so the statistics it uses come from this corpus
bm25 = BM25Encoder()
bm25.fit(metadata['description'])

In [None]:
# get the sparse embedding for the text using BM25.
# BM25Encoder exposes `encode_documents` for corpus text and `encode_queries`
# for search queries; it has no `embed_documents` method.
desc = metadata.iloc[0]['description']
sparse_embedding = bm25.encode_documents(desc)
print("Description:\n", desc, "\nSparse embedding:\n", sparse_embedding)

In [None]:
# get sparse vector for query: encode a free-text search query with the
# query-side BM25 encoding
description = "I want blue jeans"
bm25.encode_queries(description)

In [None]:
# encode a full product description with the query-side encoder
# (presumably to compare against the document-side output above — confirm)
bm25.encode_queries(metadata.iloc[0]['description'])

## Dense Vectors

In [None]:
# run on the GPU when CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device: ', device)

# load a CLIP model from huggingface; the same model encodes product images
# (upsert cell) and text queries (search cells)
model = SentenceTransformer(
    'sentence-transformers/clip-ViT-B-32',
    device=device
)
model

## Upsert Documents

In [None]:
# define the batch size (number of vectors sent per upsert call)
batch_size = 200

# vectors collected for the next upsert call
items = []

# iterate over the rows in the DataFrame; `total` gives tqdm a real progress
# bar, since the iterrows() generator has no length
for idx, row in tqdm(metadata.iterrows(), total=len(metadata)):
    # the SKU doubles as the vector ID (assumes SKUs are unique — confirm);
    # named `item_id` so the builtin `id` is not shadowed
    item_id = row['sku']

    # create a dictionary of metadata (every column of the row)
    meta_dict = row.to_dict()

    # create sparse BM25 vectors with the *document* encoding; `encode_queries`
    # is reserved for the search-time query text
    sparse_embeds = bm25.encode_documents(row['description'])

    # create dense vectors from the image; the context manager closes the file
    # once it is encoded, and .tolist() yields a plain list of floats like the
    # query cells send
    with Image.open(row['img_uri']) as product_image:
        dense_embeds = model.encode(product_image).tolist()

    # append the item dictionary to the items list
    items.append({
        'id': item_id,
        'sparse_values': sparse_embeds,
        'values': dense_embeds,
        'metadata': meta_dict
    })

    # if the number of items reaches the batch size, upload the batch
    if len(items) >= batch_size:
        # upsert the current batch of items to the index
        index.upsert(items)

        # clear the items list to prepare for the next batch
        items.clear()

# after the loop, if there are any remaining items, upload them
if items:
    index.upsert(items)

# show the index description after uploading the documents
index.describe_index_stats()

## Hybrid search

In [84]:
# function to display product images
def display_result(image_batch):
    """Render a batch of images as a wrapping row of thumbnails.

    Each image is saved as PNG into an in-memory buffer, base64-encoded and
    embedded in an ``<img>`` tag through a ``data:`` URI.

    Args:
        image_batch: iterable of image objects exposing ``save(buffer, format=...)``.

    Returns:
        An ``HTML`` object wrapping all figures in one flex container.
    """
    def _as_figure(picture):
        # serialize the picture to PNG bytes without touching the disk
        buffer = BytesIO()
        picture.save(buffer, format='png')
        encoded = b64encode(buffer.getvalue()).decode('utf-8')
        return f'''
            <figure style="margin: 5px !important;">
              <img src="data:image/png;base64,{encoded}" style="width: 90px; height: 120px" >
            </figure>
        '''

    gallery = ''.join(_as_figure(picture) for picture in image_batch)
    return HTML(data=f'''
        <div style="display: flex; flex-flow: row wrap; text-align: center;">
        {gallery}
        </div>
    ''')

### Method - 1

In [None]:
query = "dark blue french connection jeans for men"

# create sparse and dense vectors from the same query text
sparse = bm25.encode_queries(query)
dense = model.encode(query).tolist()

# search: send both vectors unscaled and ask for the 5 best matches
result = index.query(
    top_k=5,
    vector=dense,
    sparse_vector=sparse,
    include_metadata=True
)

# map each matched ID (a SKU) back to its row in `metadata` and open the image
imgs = []
garments = [item['id'] for item in result['matches']]
for garment in garments:
    img = Image.open(metadata[metadata['sku']==garment]['img_uri'].values[0])
    imgs.append(img)

display_result(imgs)

### Method - 2

In [86]:
def hybrid_scale(dense, sparse, alpha: float):
    """Hybrid vector scaling using a convex combination

    alpha * dense + (1 - alpha) * sparse

    Args:
        dense: iterable of floats representing the dense vector
        sparse: a dict of `indices` and `values`
        alpha: float between 0 and 1 where 0 == sparse only
               and 1 == dense only

    Returns:
        A tuple ``(hdense, hsparse)``: the dense values multiplied by
        ``alpha`` as a new list, and a new dict holding the original
        `indices` with the `values` multiplied by ``1 - alpha``.

    Raises:
        ValueError: if alpha is outside [0, 1] or is NaN.
    """
    # a chained comparison also rejects NaN, which fails every comparison and
    # would otherwise slip through and turn both vectors into NaNs
    if not 0 <= alpha <= 1:
        raise ValueError("Alpha must be between 0 and 1")
    # scale sparse and dense vectors to create hybrid search vecs
    sparse_weight = 1 - alpha
    hsparse = {
        'indices': sparse['indices'],
        'values': [v * sparse_weight for v in sparse['values']],
    }
    hdense = [v * alpha for v in dense]
    return hdense, hsparse

In [None]:
query = "soft purple topwear"

# create sparse and dense vectors from the same query text
sparse = bm25.encode_queries(query)
dense = model.encode(query).tolist()

# scale sparse and dense vectors: alpha=0.05 multiplies the dense values by
# 0.05 and the sparse values by 0.95, so the keyword side dominates
hdense, hsparse = hybrid_scale(dense, sparse, alpha=0.05)

# search with the scaled vectors and ask for the 5 best matches
result = index.query(
    top_k=5,
    vector=hdense,
    sparse_vector=hsparse,
    include_metadata=True
)

# map each matched ID (a SKU) back to its row in `metadata` and open the image
imgs = []
garments = [item['id'] for item in result['matches']]
for garment in garments:
    img = Image.open(metadata[metadata['sku']==garment]['img_uri'].values[0])
    imgs.append(img)

display_result(imgs)

## Delete the Index

In [104]:
# clean up: delete the index created above; everything upserted into it is
# removed with it, so the upsert cell must be re-run before searching again
pc.delete_index(index_name)