# Zalando Bot: AI-Powered Fashion Assistant

> Retriever - Hybrid search with Milvus

In [None]:
%pip install -U pymilvus

In [14]:
import os
import json
import numpy as np                          # type: ignore
import pandas as pd                         # type: ignore
from PIL import Image                       # type: ignore
import matplotlib.pyplot as plt             # type: ignore
from tqdm.autonotebook import tqdm, trange  # type: ignore
from tqdm import tqdm                       # type: ignore

In [79]:
from langchain.retrievers import BM25Retriever      # type: ignore
from langchain.document_loaders import TextLoader   # type: ignore
from langchain.schema import Document               # type: ignore

In [None]:
from sentence_transformers import SentenceTransformer   # type: ignore
import torch                                            # type: ignore

## Load dataset

In [69]:
# Locations of the product image folder and of the product metadata CSV.
img_uri, metadata_uri = (
    '../data/master_data/hf_df_x300',
    '../data/master_data/metadata.csv',
)

In [None]:
# Load the product metadata CSV into a DataFrame and preview its first rows.
metadata = pd.read_csv(metadata_uri)
metadata.head()

In [None]:
# Print the column dtypes and non-null counts, which shows where values are missing.
metadata.info()

In [None]:
# Add the full path of each product's x300 image.
# Guarded so the cell can be re-run: once the raw image-tag columns have been
# dropped, the existing 'img_uri' column is kept instead of raising KeyError.
if 'x300_img' in metadata.columns:
    metadata['img_uri'] = metadata['x300_img'].apply(lambda i: os.path.join(img_uri, i))

# Drop the raw image-tag columns; errors='ignore' makes a repeated run a no-op.
metadata = metadata.drop(columns=['x100_img', 'x300_img'], errors='ignore')

metadata.head()

In [None]:
# Open the image referenced by the first metadata row.
first_image_path = metadata['img_uri'].iloc[0]
image = Image.open(first_image_path)

# Draw it inline on explicit axes, with the axes hidden.
fig, ax = plt.subplots()
ax.imshow(image)
ax.axis('off')

plt.show()

## Sparse Vectors

In [87]:
def _has_content(value):
    """Return True when an optional field holds something worth printing."""
    # Missing values (None / NaN / NA, e.g. empty CSV cells) have no len() and must be skipped.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    # Strings and containers count as empty when their length is zero.
    if hasattr(value, '__len__'):
        return len(value) != 0
    # Any other scalar (e.g. a number) is treated as present.
    return True


def create_product_description(row):
    """Build a plain-text description of a single product.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'sku', 'name', 'brand', 'category', 'url',
        'sizes', 'classification', 'original_price',
        'discount_percentage' and 'promotional_price'.

    Returns
    -------
    str
        Newline-separated description. The name/brand, URL and SKU lines are
        always present. Category, status and sizes lines are added only when
        the field is neither missing nor empty. Price lines are added only
        when the corresponding prices are not missing.
    """
    # extract the necessary columns
    sku = row['sku']
    name = row['name']
    brand = row['brand']
    url = row['url']
    # The optional fields are used as-is: the former json.dumps/json.loads
    # round trip returned its input unchanged for text values.
    category = row['category']
    sizes = row['sizes']
    classification = row['classification']
    original_price = row['original_price']
    discount_percentage = row['discount_percentage']
    promotional_price = row['promotional_price']

    # start building the description
    description = f"Product Name: {name} by {brand}.\n"

    if _has_content(category):
        description += f"Category: {category}.\n"
    if _has_content(classification):
        description += f"Status: {classification}.\n"

    # add available sizes, if present
    if _has_content(sizes):
        description += f"Available Sizes: {sizes}.\n"

    # add price and promotional information
    if pd.notna(original_price) and pd.notna(promotional_price):
        description += f"Original Price: {original_price}, now available for {promotional_price}.\n"
        # the discount line is only meaningful alongside a promotional price
        if pd.notna(discount_percentage):
            description += f"Discount: {discount_percentage} off.\n"
    elif pd.notna(original_price):
        description += f"Price: {original_price}.\n"

    # add URL for reference
    description += f"More details can be found at: {url}.\n"

    # add SKU (if relevant for your use case)
    description += f"SKU: {sku}."

    return description

In [None]:
# Run create_product_description on every row (axis=1) and store the text in a new 'description' column.
metadata['description'] = metadata.apply(create_product_description, axis=1)
metadata.head()

In [None]:
# Inspect the generated description of the first product.
metadata.iloc[0]['description']

In [93]:
# BM25Retriever is built from Document objects, so wrap each product's text in one.
documents = [
    Document(page_content=f"SKU: {sku}, Description: {description}")
    for sku, description in zip(metadata['sku'], metadata['description'])
]

# build the BM25 retriever over those documents
bm25_retriever = BM25Retriever.from_documents(documents)

In [None]:
# Smoke-test the BM25 retriever with a free-text query; the bare `result` displays whatever it returned.
result = bm25_retriever.invoke("Blue colour coat")
result

## Dense Vectors

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device: ', device)

# load a CLIP model from huggingface onto the selected device
model = SentenceTransformer('sentence-transformers/clip-ViT-B-32', device=device)