In [54]:
from datasets import load_dataset, load_from_disk

In [19]:
# Fetch the train split of the Hub dataset, caching the files under ./dataset.
dataset = load_dataset(
    path="smartcat/Amazon_Products_2023",
    split="train",
    cache_dir="./dataset",
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 22767/22767 [00:00<00:00, 78249.00 examples/s]


In [21]:
# Columns carried forward into the saved subset; every other column is dropped.
selected_columns = [
    "main_category",
    "title",
    "average_rating",
    "rating_number",
    "features",
    "description",
    "price",
    "images",
    "store",
    "categories",
    "brand",
    "color",
    "manufacturer",
]

# Keep only the selected columns, then persist the subset to a local directory.
# NOTE(review): the target directory is named "amazon_india_products_22k" while
# the source dataset id is "smartcat/Amazon_Products_2023" — confirm the naming.
dataset = dataset.select_columns(selected_columns)
dataset.save_to_disk("./dataset/amazon_india_products_22k")

Saving the dataset (0/1 shards):   0%|          | 0/22767 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 22767/22767 [00:00<00:00, 203192.03 examples/s]


## Dataset exploration

In [10]:
import pandas as pd
import numpy as np

In [9]:
# Load the on-disk dataset directory and convert it to a pandas DataFrame.
# NOTE(review): no visible cell writes "./dataset/amazon_products_2023" —
# presumably it was created outside this notebook; confirm it exists.
data = load_from_disk(
    "./dataset/amazon_products_2023",
).to_pandas()

### Filling the missing main category column values

In [23]:
def get_first_category(row):
    """Return the row's main category, falling back to its first listed category.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``"main_category"`` and ``"categories"``.

    Returns
    -------
    object
        ``row["main_category"]`` when it is not missing; otherwise the first
        element of ``row["categories"]`` when that is a non-empty list, tuple
        or NumPy array; otherwise the string ``"Unknown"``.
    """
    main_category = row["main_category"]
    if not pd.isna(main_category):
        return main_category

    categories = row["categories"]
    # List-typed Arrow columns are materialised as numpy arrays by
    # ``to_pandas()``, so checking for ``list`` alone would send every row
    # with a missing main category straight to "Unknown".
    if isinstance(categories, (list, tuple, np.ndarray)) and len(categories) > 0:
        return categories[0]
    return "Unknown"


# Apply row by row (axis=1) so each row's `categories` can back-fill its own `main_category`.
data["main_category"] = data.apply(get_first_category, axis=1)

### Filling missing pricing values

In [27]:
# Mean price of each main category, indexed by category name.
category_avg_price = data.groupby("main_category")["price"].mean()

# Impute missing prices with the mean price of the product's category.
# Mapping the category column through the per-category means is vectorised
# (no Python-level work per row) and yields NaN instead of raising KeyError
# for a category that is absent from `category_avg_price`.
data["price"] = data["price"].fillna(data["main_category"].map(category_avg_price))

### Filling missing store values

In [29]:
# Label products that have no store with the placeholder "Unknown".
data["store"] = data["store"].fillna("Unknown")

In [None]:
# Drop every row that still has a missing value in any column.
data = data.dropna()

# Persist the cleaned frame, then reload it from the parquet file just written.
data.to_parquet("./dataset/amazon_products_2023.parquet", engine="pyarrow")
data = pd.read_parquet("./dataset/amazon_products_2023.parquet")

In [51]:
# Read the cleaned frame back from its parquet file.
data = pd.read_parquet("./dataset/amazon_products_2023.parquet")

In [52]:
# Show the column names of the reloaded frame.
data.columns

Index(['title', 'description', 'main_category', 'categories', 'store',
       'average_rating', 'rating_number', 'price', 'features', 'details',
       'image'],
      dtype='object')

## Explore 30k products

In [78]:
# Load the 30k-product dataset directory from disk as a pandas DataFrame.
# Note: this rebinds `dataset`, which earlier in the notebook held the
# object returned by `load_dataset`.
dataset = load_from_disk(
    "./dataset/amazon_india_products_30k",
).to_pandas()

In [80]:
# Show the column names of the 30k-product frame.
dataset.columns

Index(['Category', 'Product Title', 'Product Description', 'Brand', 'Mrp',
       'Price', 'Image Urls'],
      dtype='object')

In [58]:
# Start the 30k clean-up from the frame loaded above (`dataset`). `data` still
# holds the previous section's frame, which has no "Product Description"
# column, so filtering it raises KeyError on a fresh Restart & Run All.
data = dataset.dropna(subset=["Product Description"])

In [60]:
# Parse prices via their string form; anything non-numeric becomes NaN.
price_as_text = data["Price"].astype(str)
data["Price"] = pd.to_numeric(price_as_text, errors="coerce")

# Most frequent price within each category, aligned row by row with `data`
# (0.0 for a category whose prices have no mode at all).
prices_by_category = data.groupby("Category")["Price"]
category_mode_prices = prices_by_category.transform(
    lambda group: 0.0 if group.mode().empty else group.mode().iloc[0]
)

# First fallback for missing prices: the mode of the product's category.
data["Price"] = data["Price"].fillna(category_mode_prices)

# Second fallback for anything still missing: the overall mode.
overall_mode = data["Price"].mode().iloc[0]
data["Price"] = data["Price"].fillna(overall_mode)

In [62]:
# Parse MRP values via their string form; anything non-numeric becomes NaN.
mrp_as_text = data["Mrp"].astype(str)
data["Mrp"] = pd.to_numeric(mrp_as_text, errors="coerce")

# Most frequent MRP within each category, aligned row by row with `data`
# (0.0 for a category whose MRP values have no mode at all).
mrp_by_category = data.groupby("Category")["Mrp"]
category_mode_prices = mrp_by_category.transform(
    lambda group: 0.0 if group.mode().empty else group.mode().iloc[0]
)

# First fallback for missing MRP values: the mode of the product's category.
data["Mrp"] = data["Mrp"].fillna(category_mode_prices)

# Second fallback for anything still missing: the overall mode.
overall_mode = data["Mrp"].mode().iloc[0]
data["Mrp"] = data["Mrp"].fillna(overall_mode)

In [64]:
# Persist the cleaned 30k-product frame as parquet.
data.to_parquet("./dataset/amazon_india_products_30k.parquet", engine="pyarrow")

## Explore 22k products

In [65]:
# Load the 22k-product dataset directory from disk as a pandas DataFrame.
data = load_from_disk(
    "./dataset/amazon_india_products_22k",
).to_pandas()

In [68]:
# Derive the main category from the first entry of `categories`. Rows whose
# value is empty, or has no length at all (e.g. a missing value), become
# "Unknown" instead of raising TypeError.
data["main_category"] = data["categories"].apply(
    lambda cats: cats[0] if hasattr(cats, "__len__") and len(cats) > 0 else "Unknown"
)

# Impute missing prices with the mean price of the product's main category.
# A category with no known price at all has a NaN mean, so whatever is still
# missing after the first fill falls back to 0.0.
category_mean_price = data.groupby("main_category")["price"].transform("mean")
data["price"] = data["price"].fillna(category_mean_price).fillna(0.0)

# Use the manufacturer as the store where the store is missing; the value
# stays missing when the manufacturer is missing too.
data["store"] = data["store"].fillna(data["manufacturer"])

In [70]:
# Persist the cleaned 22k-product frame as parquet.
data.to_parquet("./dataset/amazon_india_products_22k.parquet", engine="pyarrow")

In [11]:
# Read the 22k-product frame back from its parquet file.
data = pd.read_parquet("./dataset/amazon_india_products_22k.parquet")

In [12]:
# Show the column names of the 22k-product frame.
data.columns

Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'store', 'categories', 'brand',
       'color', 'manufacturer'],
      dtype='object')

In [75]:
# Read the saved 30k-product parquet file and show its column names.
data2 = pd.read_parquet("./dataset/amazon_india_products_30k.parquet")
data2.columns

Index(['Category', 'Product Title', 'Product Description', 'Brand', 'Mrp',
       'Price', 'Image Urls'],
      dtype='object')

In [85]:
# List the distinct brand values.
# NOTE(review): the displayed output includes the literal 'N/A.' — presumably
# a placeholder for a missing brand; confirm before treating it as a real brand.
data["brand"].unique()

array(['Kisbaby', 'Kandoo', 'BERON', ..., 'Woozle Time', 'N/A.', 'EBOOT'],
      shape=(7550,), dtype=object)

In [7]:
from smolagents.models import LiteLLMModel
import os

# API version for the Azure backend, supplied through the environment.
os.environ["AZURE_API_VERSION"] = "2024-08-01-preview"

# Credentials must never be committed in a notebook: read the key from the
# environment instead. The key that used to be hard-coded here has been
# exposed and should be rotated.
azure_api_key = os.environ.get("AZURE_API_KEY")
if not azure_api_key:
    raise RuntimeError(
        "AZURE_API_KEY is not set; export it before running this cell."
    )

model = LiteLLMModel(
    model_id="azure/GPT4-Turbo-128K-0125-preview",
    api_base="https://703227482-gpt4-turbo-0125-latest-version.openai.azure.com/",
    api_key=azure_api_key,
)

In [8]:
# Smoke-test the model with a single user message; the reply is shown as the cell output.
model(
    messages=[
        {"content": "Hello, how are you?", "role": "user"},
    ]
)

"I'm just a computer program, so I don't have feelings, but thanks for asking! How can I help you today?"