### Import dependencies

In [3]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PayloadSchemaType, PointStruct

import pandas as pd
import openai

### Load Amazon dataset

In [4]:
# Load the item metadata into a DataFrame; lines=True reads the file as
# JSON Lines (one JSON object per line).
df_items = pd.read_json("../data/meta_Electronics_2022_2023_with_category_ratings_100_sample_1000.jsonl", lines=True)

### Define functions to preprocess the title and features data and to extract the URL of the first large image in the images list

In [5]:
# Preview the first two rows to see which columns are available.
df_items.head(2)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,All Electronics,2 Pack-iPhone Earbuds Wired Lightning Headphon...,3.4,598,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],WASABI MANGO,"[Electronics, Headphones, Earbuds & Accessorie...",{'Product Dimensions': '23.62 x 19.69 x 27.56 ...,B0B1ZVC7GJ,,,
1,Computers,"Mini PC 16GB DDR4 256GB M.2 SSD,Quad-Core 2.7G...",4.3,450,[【Meet to Sufficient Memory Storage】This Mini ...,[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],OUVISLITE,"[Electronics, Computers & Accessories, Compute...","{'Screen Resolution': '3840 x 2160', 'Max Scre...",B0B1HNV2V9,,,


In [6]:
def preprocess_data(row):
    """Build the text that gets embedded for one item.

    Joins the item's title and its feature bullet points with single spaces.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``title``
            entry and a ``features`` entry holding a list of strings.

    Returns:
        str: ``"<title> <feature 1> <feature 2> ..."``. Missing pieces are
        skipped, so the result never contains a literal ``"None"``/``"nan"``
        and never ends in a dangling space when there are no features.
    """
    title = row["title"]
    features = row["features"]

    parts = []

    # A missing title arrives as None or float NaN (NaN is the only value
    # that is not equal to itself); skip it instead of embedding "None"/"nan".
    if title is not None and title == title and str(title) != "":
        parts.append(str(title))

    if isinstance(features, str):
        # A bare string is treated as a single feature rather than being
        # split into characters by str.join.
        if features:
            parts.append(features)
    elif isinstance(features, (list, tuple)):
        # Keep only non-empty string entries; a missing features value
        # (None / NaN) is not a list and is therefore ignored entirely.
        parts.extend(item for item in features if isinstance(item, str) and item)

    return " ".join(parts)

In [7]:
def extract_first_large_image(row):
    """Return the ``large`` URL of the item's first image, or ``""``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose ``images``
            entry is a list of dicts, each optionally carrying a ``large``
            key.

    Returns:
        The value stored under ``large`` in the first image dict, or an
        empty string when the item has no images, the first entry is not a
        dict, or ``large`` is absent or empty.
    """
    images = row["images"]

    # Items without images would otherwise raise IndexError (empty list) or
    # TypeError (missing value) on the [0] lookup.
    if not isinstance(images, (list, tuple)) or not images:
        return ""

    first_image = images[0]
    if not isinstance(first_image, dict):
        return ""

    # `or ""` also normalizes an explicit None stored under "large".
    return first_image.get("large") or ""

In [8]:
# Add the derived columns by applying each row-level helper across the frame,
# in the same order as before: text to embed first, then the image URL.
for derived_column, row_function in (
    ("preprocessed_data", preprocess_data),
    ("first_large_image", extract_first_large_image),
):
    df_items[derived_column] = df_items.apply(row_function, axis=1)

### Create a new Qdrant collection for hybrid search

In [9]:
# Connect to the Qdrant instance listening on localhost port 6333.
qdrant_url = "http://localhost:6333"
qdrant_client = QdrantClient(url=qdrant_url)

In [10]:
# Create the collection only when it is missing so this cell can be re-run
# (e.g. Restart & Run All) without failing because the collection already exists.
# NOTE: an existing collection is reused as-is; delete it first if its vector
# configuration needs to change.
if not qdrant_client.collection_exists("Amazon-items-collection-01-hybrid"):
    qdrant_client.create_collection(
        collection_name="Amazon-items-collection-01-hybrid",
        # 1536-dimensional vectors compared with cosine distance.
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
    )

# Show whether the collection is now present.
qdrant_client.collection_exists("Amazon-items-collection-01-hybrid")

True

### Create a full-text search index on the payload field "text"

In [11]:
# Index the "text" payload field with the TEXT schema so it can be filtered
# with text-match conditions; the result is shown as the cell output.
text_index_result = qdrant_client.create_payload_index(
    field_schema=PayloadSchemaType.TEXT,
    field_name="text",
    collection_name="Amazon-items-collection-01-hybrid",
)
text_index_result

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

### Sample the dataset, embed the text data and add additional fields to the payload of each vector

In [12]:
# Draw 50 rows; the fixed random_state makes the same rows come back on every run.
df_sample = df_items.sample(n=50, random_state=25)

In [13]:
def get_embedding(text, model="text-embedding-3-small"):
    """Embed a single piece of text with the OpenAI embeddings API.

    Args:
        text: The text to embed. It is sent as a one-element input list.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    embedding_response = openai.embeddings.create(model=model, input=[text])
    # Exactly one input was sent, so the first data item is the one we want.
    first_item = embedding_response.data[0]
    return first_item.embedding

In [14]:
# Keep only the columns needed for embedding and for the point payload, then
# turn each sampled row into a plain dict.
embed_columns = [
    "preprocessed_data",
    "first_large_image",
    "rating_number",
    "price",
    "average_rating",
]
data_to_embed = df_sample[embed_columns].to_dict(orient="records")

In [15]:
# Build one point per sampled record: the position in the list becomes the
# point id, the embedded text becomes the vector, and the remaining fields
# are stored as payload (the embedded text itself is kept under "text").
pointstructs = [
    PointStruct(
        id=point_id,
        vector=get_embedding(record["preprocessed_data"]),
        payload={
            "text": record["preprocessed_data"],
            "first_large_image": record["first_large_image"],
            "average_rating": record["average_rating"],
            "rating_number": record["rating_number"],
            "price": record["price"],
        },
    )
    for point_id, record in enumerate(data_to_embed)
]

### Write the embedding vectors to the Qdrant collection

In [16]:
# Upload all points in one request; wait=True makes the call return only
# after the operation has been applied. The result is shown as cell output.
upsert_result = qdrant_client.upsert(
    points=pointstructs,
    wait=True,
    collection_name="Amazon-items-collection-01-hybrid",
)
upsert_result

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

### Perform hybrid search and fuse the retrieved results with Reciprocal Rank Fusion (RRF)

In [17]:
from qdrant_client.models import Prefetch, Filter, FieldCondition, MatchText, FusionQuery

def retrieve_data(
    query,
    k=5,
    collection_name="Amazon-items-collection-01-hybrid",
    prefetch_limit=20,
):
    """Run a hybrid search and fuse both candidate lists with RRF.

    Two prefetch branches are evaluated and then combined:

    * a dense branch that searches by the embedding of ``query``;
    * a text branch that filters on the ``"text"`` payload field with a
      ``MatchText`` condition built from the raw ``query`` string.

    Args:
        query: The user's search string. It is both embedded (dense branch)
            and used verbatim for the text match.
        k: Maximum number of fused results to return.
        collection_name: Collection to query. Defaults to the collection
            created in this notebook.
        prefetch_limit: Maximum number of candidates each branch contributes
            before fusion. Defaults to 20, the previously hard-coded value.

    Returns:
        The response object returned by ``qdrant_client.query_points``.
    """
    query_embedding = get_embedding(query)

    # Dense branch: candidates retrieved by vector similarity.
    dense_prefetch = Prefetch(
        query=query_embedding,
        limit=prefetch_limit,
    )

    # Text branch: candidates whose "text" payload satisfies the text match.
    text_prefetch = Prefetch(
        filter=Filter(
            must=[
                FieldCondition(
                    key="text",
                    match=MatchText(text=query),
                )
            ]
        ),
        limit=prefetch_limit,
    )

    results = qdrant_client.query_points(
        collection_name=collection_name,
        prefetch=[dense_prefetch, text_prefetch],
        # Merge the two ranked lists with reciprocal rank fusion.
        query=FusionQuery(fusion="rrf"),
        limit=k,
    )

    return results

### Example of structured outputs usage with Instructor

In [18]:
import instructor
from pydantic import BaseModel
from openai import OpenAI
from typing import List

In [19]:
# Schema for the structured LLM response: a single string field holding the answer.
# NOTE(review): documented with `#` comments on purpose; a class docstring would
# presumably end up in the model's schema description, so adding one could change
# what is sent to the LLM.
class RAGGenerationResponse(BaseModel):
    answer: str  # the generated answer text

# Wrap a default OpenAI client with Instructor so completions can be requested
# with a `response_model`.
client = instructor.from_openai(OpenAI())

prompt = """
You are a helpful assistant.
Return an answer to the question.
Question: What is your name?
"""

# Single-turn conversation carrying the prompt as the user message.
chat_messages = [{"role": "user", "content": prompt}]

# create_with_completion returns a pair, unpacked here into the parsed
# response object and the raw completion.
response, raw_response = client.chat.completions.create_with_completion(
    messages=chat_messages,
    response_model=RAGGenerationResponse,
    model="gpt-4.1",
    temperature=0.5,
)

In [20]:
# Display the parsed structured response.
response

RAGGenerationResponse(answer='My name is ChatGPT. I am an AI assistant created by OpenAI.')