In [2]:
pip install tqdm


Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.


In [19]:
pip install pinecone

Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl.metadata (30 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Using cached pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting packaging<25.0,>=24.2 (from pinecone-plugin-assistant<2.0.0,>=1.6.0->pinecone)
  Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pinecone-7.3.0-py3-none-any.whl (587 kB)
   ---------------------------------------- 0.0/587.6 kB ? eta -:--:--
   ---------------------------------------- 587.6/587.6 kB 6.9 MB/s  0:00:00
Downloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl (259 kB)
Using cached packaging-24.2-py3-none-any.whl (65 kB)
Using cached pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, packaging, p

In [1]:
# Step 1: Imports and path setup
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import pinecone

# Input / output locations (relative paths)
DATA_PATH = "../data/preprocessed_products.csv"   # CSV loaded in Step 2
EMB_DIR = "../data/embeddings"                    # folder that receives the saved vectors
SCHEMA_PATH = "../pinecone/pinecone_schema.json"  # intended location of the schema JSON

# Create the embeddings folder (including parents) unless it is already there
if not os.path.isdir(EMB_DIR):
    os.makedirs(EMB_DIR, exist_ok=True)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 2: Load cleaned dataset
df = pd.read_csv(DATA_PATH)

# Fail fast, with a readable message, if a column that the embedding and
# upload cells read is absent from the file.
required_columns = {
    "uniq_id", "title", "brand", "price",
    "categories", "color", "material", "text_blob",
}
missing_columns = sorted(required_columns - set(df.columns))
if missing_columns:
    raise ValueError(f"{DATA_PATH} is missing required columns: {missing_columns}")

print("Loaded:", df.shape)
df.head(3)


Loaded: (305, 14)


Unnamed: 0,title,brand,description,price,categories,images,manufacturer,package_dimensions,country_of_origin,material,color,uniq_id,text_blob,primary_category
0,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",GOYMFK,"multiple shoes, coats, hats, and other items E...",24.99,"['Home & Kitchen', 'Storage & Organization', '...",['https://m.media-amazon.com/images/I/416WaLx1...,GOYMFK,"2.36""D x 7.87""W x 21.6""H",China,Metal,White,02593e81-5c09-5069-8516-b0b29f439ded,"goymfk 1pc free standing shoe rack, multi-laye...",Home & Kitchen
1,"subrtex Leather ding Room, Dining Chairs Set o...",subrtex,subrtex Dining chairs Set of 2,54.09,"['Home & Kitchen', 'Furniture', 'Dining Room F...",['https://m.media-amazon.com/images/I/31SejUEW...,Subrtex Houseware INC,"18.5""D x 16""W x 35""H",,Sponge,Black,5938d217-b8c5-5d3e-b1cf-e28e340f292e,"subrtex leather ding room, dining chairs set o...",Home & Kitchen
2,Plant Repotting Mat MUYETOL Waterproof Transpl...,MUYETOL,,5.98,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/41RgefVq...,MUYETOL,"26.8""L x 26.8""W",,Polyethylene,Green,b2ede786-3f51-5a45-9a5b-bcf856958cd8,plant repotting mat muyetol waterproof transpl...,"Patio, Lawn & Garden"


In [3]:
# Step 3: Generate embeddings for text_blob
model = SentenceTransformer('all-MiniLM-L6-v2')   # fast + light

# Replace missing text with "" before casting: astype(str) on its own turns a
# NaN cell into the literal word "nan", which would then be embedded as if it
# were product text.
texts = df['text_blob'].fillna("").astype(str).tolist()

# One vector per row of df, in row order; normalization is requested so the
# vectors suit the cosine metric used for the index.
embeddings = model.encode(texts, batch_size=32, show_progress_bar=True, normalize_embeddings=True)
print("Embeddings shape:", embeddings.shape)

# Save locally
np.save(os.path.join(EMB_DIR, "text_vectors.npy"), embeddings)
print("✅ Saved local embeddings to data/embeddings/text_vectors.npy")


Batches: 100%|██████████| 10/10 [00:09<00:00,  1.07it/s]

Embeddings shape: (305, 384)
✅ Saved local embeddings to data/embeddings/text_vectors.npy





In [47]:
pip install pinecone

Note: you may need to restart the kernel to use updated packages.


In [None]:
from pinecone import Pinecone, ServerlessSpec

# Read the API key from the environment instead of writing it into the
# notebook. A KeyError here means PINECONE_API_KEY has not been exported.
# The legacy `environment` argument is dropped: the index location is chosen
# through ServerlessSpec(cloud=..., region=...) when the index is created.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [None]:
from pinecone import Pinecone, ServerlessSpec

# Read the API key from the environment instead of writing it into the
# notebook. A KeyError here means PINECONE_API_KEY has not been exported.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

index_name = "ikarus-text-embeddings"
# Single source for the region so the spec and the log message cannot drift.
# ✅ Choose valid region (use "us-east-1" or "us-west-2")
index_region = "us-east-1"

existing_indexes = [idx["name"] for idx in pc.list_indexes()]

# Create the index only when it is absent, so re-running this cell is harmless.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=embeddings.shape[1],  # vector width produced by the encoder
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=index_region)
    )
    print(f"✅ Created index '{index_name}' in {index_region}")
else:
    print(f"ℹ️ Index '{index_name}' already exists")

# Handle used by the upload cell.
index = pc.Index(index_name)


✅ Created index 'ikarus-text-embeddings' in us-east-1


In [12]:
# Step 5: Upload embeddings in batches using new SDK
from tqdm import tqdm


def _text_or_empty(value):
    """Return ``value`` as a string, or "" when pandas treats it as missing.

    Assumes scalar cell values, which is what ``pd.read_csv`` produces.
    Without this, ``str(NaN)`` would store the literal text "nan" as metadata.
    """
    return "" if pd.isna(value) else str(value)


# Row k of `embeddings` was encoded from row k of `df`; a length mismatch means
# one of them is stale (for example, df was reloaded without re-encoding).
if len(df) != len(embeddings):
    raise ValueError(
        f"df has {len(df)} rows but embeddings has {len(embeddings)} vectors; "
        "re-run the embedding cell"
    )

vectors_to_upsert = []
# enumerate() supplies the row *position*. The label yielded by iterrows() is
# only a valid array offset while df keeps its default 0..n-1 index.
for pos, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
    meta = {
        "uniq_id": str(row.uniq_id),
        "title": _text_or_empty(row.title),
        "brand": _text_or_empty(row.brand),
        "categories": _text_or_empty(row.categories),
        "primary_category": _text_or_empty(row.get("primary_category", "")),
        "color": _text_or_empty(row.color),
        "material": _text_or_empty(row.material)
    }
    # A missing price is left out rather than sent as NaN, which is not a
    # usable numeric metadata value.
    if not pd.isna(row.price):
        meta["price"] = float(row.price)

    vectors_to_upsert.append({
        "id": str(row.uniq_id),
        "values": embeddings[pos].tolist(),
        "metadata": meta
    })

# batch upload in chunks
batch_size = 100
for start in range(0, len(vectors_to_upsert), batch_size):
    batch = vectors_to_upsert[start:start + batch_size]
    index.upsert(vectors=batch)
    print(f"✅ Uploaded {start + len(batch)} / {len(vectors_to_upsert)} vectors")

print("✅ Pinecone index population complete.")


100%|██████████| 305/305 [00:00<00:00, 3953.56it/s]


✅ Uploaded 100 / 305 vectors
✅ Uploaded 200 / 305 vectors
✅ Uploaded 300 / 305 vectors
✅ Uploaded 305 / 305 vectors
✅ Pinecone index population complete.


In [None]:
import json

# File the schema is written to (replaces any SCHEMA_PATH value set earlier)
SCHEMA_PATH = "pinecone_schema.json"  # change to your preferred path/filename

# Metadata keys listed in the schema, one entry per key
metadata_fields = [
    "uniq_id",
    "title",
    "brand",
    "price",
    "categories",
    "primary_category",
    "color",
    "material",
]

# Description of the index: its name, vector width, metric and metadata keys
schema = dict(
    index_name=index_name,
    dimension=embeddings.shape[1],
    metric="cosine",
    fields=metadata_fields,
)

# Serialize as indented JSON
with open(SCHEMA_PATH, "w") as schema_file:
    json.dump(schema, schema_file, indent=4)

print(f"✅ Pinecone schema saved to {SCHEMA_PATH}")


✅ Pinecone schema saved to pinecone_schema.json
