In [None]:
import os
import pandas as pd
import pinecone
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from tqdm.auto import tqdm
import ast

# Read the backend's .env file so its entries become visible through the
# process environment.
load_dotenv(dotenv_path='../backend/.env')

# Abort immediately when the Pinecone credential is missing or empty, since
# nothing further in this script can work without it.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if PINECONE_API_KEY is None or PINECONE_API_KEY == "":
    raise ValueError("PINECONE_API_KEY not found in environment variables. Check your .env file.")

print("Environment variables loaded.")

# Load the product catalogue and keep only rows that have every field the
# later embedding/upsert steps read; the index is renumbered from zero.
file_path = '../backend/data/intern_data_ikarus.csv'
df = (
    pd.read_csv(file_path)
    .dropna(subset=['title', 'description', 'uniq_id', 'images'])
    .reset_index(drop=True)
)

def get_first_image_url(images_str):
    """Return the first image URL from a stringified Python list.

    Args:
        images_str: Text holding a Python list literal of URL strings,
            e.g. ``"['https://a/1.jpg', 'https://a/2.jpg']"``.

    Returns:
        The first list entry with surrounding whitespace removed, or
        ``None`` when the input cannot be parsed, is not a non-empty list,
        or its first entry is not a non-blank string.
    """
    try:
        images_list = ast.literal_eval(images_str)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises TypeError for some malformed literals (for
        # example a dict with an unhashable key); treat it like any other
        # unparsable cell instead of aborting the whole column apply.
        return None

    if not isinstance(images_list, list) or not images_list:
        return None

    first = images_list[0]
    if not isinstance(first, str):
        return None

    # Blank entries become None so the caller's dropna() discards them.
    return first.strip() or None

# Derive a single image URL per product from the stringified list column.
df['image_url'] = df['images'].apply(get_first_image_url)

# Products for which no image URL could be extracted are discarded, and the
# index is renumbered from zero.
df = df.dropna(subset=['image_url']).reset_index(drop=True)

# Keep only digits and dots in the price text, then parse it; anything that
# still is not a valid number ends up as NaN.
df['price'] = pd.to_numeric(
    df['price'].astype(str).str.replace(r'[^\d.]', '', regex=True),
    errors='coerce',
)

print("\nPrice column has been robustly cleaned and converted to a numeric type.")

print(f"\nLoaded and cleaned dataset for upserting. Final Shape: {df.shape}")
display(df[['uniq_id', 'title', 'image_url', 'price']].head())

# Embedding model used to turn product text into vectors.
print("Loading 'clip-ViT-B-32' model...")
model = SentenceTransformer('clip-ViT-B-32')
print("Model loaded successfully.")

# Pinecone client authenticated with the key read from the environment.
print("Initializing Pinecone connection...")
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
print("Pinecone connection successful.")

INDEX_NAME = "product"
# NOTE(review): presumably the embedding width of 'clip-ViT-B-32' - confirm.
DIMENSION = 512

# Reuse the index when it is already present; otherwise create a serverless
# cosine-similarity index first.
if INDEX_NAME in pc.list_indexes().names():
    print(f"Index '{INDEX_NAME}' already exists. Connecting to it.")
else:
    print(f"Index '{INDEX_NAME}' not found. Creating a new one...")
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric='cosine',
        spec=pinecone.ServerlessSpec(cloud='aws', region='us-east-1'),
    )
    print("Index created successfully.")

index = pc.Index(INDEX_NAME)

# Show the index state before any data is written.
print("\n--- Current Index Stats ---")
print(index.describe_index_stats())

# Number of products embedded and sent to Pinecone per upsert call.
BATCH_SIZE = 128

print(f"\nStarting data upsert in batches of {BATCH_SIZE}...")

for i in tqdm(range(0, len(df), BATCH_SIZE)):
    i_end = min(i + BATCH_SIZE, len(df))
    batch_df = df.iloc[i:i_end]

    # One descriptive string per product; optional fields fall back to 'N/A'
    # so a missing value never turns the whole string into NaN.
    texts_to_embed = (
        "Title: " + batch_df['title'].astype(str) +
        "; Brand: " + batch_df['brand'].fillna('N/A').astype(str) +
        "; Description: " + batch_df['description'].astype(str) +
        "; Material: " + batch_df['material'].fillna('N/A').astype(str) +
        "; Color: " + batch_df['color'].fillna('N/A').astype(str)
    ).tolist()

    embeddings = model.encode(texts_to_embed).tolist()

    # Metadata stored alongside each vector. 'brand' can be missing (the
    # embedding text above already treats it as nullable), and NaN is not a
    # usable metadata value, so it gets the same 'N/A' placeholder here.
    # A missing price is stored as 0.0.
    metadata = [
        {
            'title': row['title'],
            'description': row['description'],
            'price': float(row['price']) if pd.notna(row['price']) else 0.0,
            'brand': str(row['brand']) if pd.notna(row['brand']) else 'N/A',
            'images': row['image_url'],
        }
        for _, row in batch_df.iterrows()
    ]

    # (id, values, metadata) tuples; ids are coerced to str because vector
    # ids are expected to be strings.
    vectors_to_upsert = list(
        zip(batch_df['uniq_id'].astype(str), embeddings, metadata)
    )

    index.upsert(vectors=vectors_to_upsert)

print("\nData upsert process complete!")

print("\n--- Index Stats After Upsert ---")
final_stats = index.describe_index_stats()
print(final_stats)

# NOTE(review): the stats may lag behind very recent upserts, and rows that
# share a uniq_id overwrite each other, so a warning here is not necessarily
# a failed upload - confirm before acting on it.
if final_stats['total_vector_count'] >= len(df):
    print("\nVerification successful! All vectors have been uploaded.")
else:
    print("\nVerification warning: The number of vectors in the index is less than the number of rows in the dataframe.")

Environment variables loaded.
Loaded and cleaned dataset for upserting. Final Shape: (159, 13)


Unnamed: 0,uniq_id,title,image_url
0,02593e81-5c09-5069-8516-b0b29f439ded,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",https://m.media-amazon.com/images/I/416WaLx10j...
1,5938d217-b8c5-5d3e-b1cf-e28e340f292e,"subrtex Leather ding Room, Dining Chairs Set o...",https://m.media-amazon.com/images/I/31SejUEWY7...
2,8fd9377b-cfa6-5f10-835c-6b8eca2816b5,"Pickleball Doormat, Welcome Doormat Absorbent ...",https://m.media-amazon.com/images/I/61vz1Igler...
3,bdc9aa30-9439-50dc-8e89-213ea211d66a,JOIN IRON Foldable TV Trays for Eating Set of ...,https://m.media-amazon.com/images/I/41p4d4VJnN...
4,20da3703-26f1-53e5-aa0b-a8104527d1bb,"LOVMOR 30'' Bathroom Vanity Sink Base Cabine, ...",https://m.media-amazon.com/images/I/41zMuj2wvv...


Loading 'clip-ViT-B-32' model...
Model loaded successfully.
Initializing Pinecone connection...
Pinecone connection successful.
Index 'product' already exists. Connecting to it.

--- Current Index Stats ---
{'dimension': 512,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

Starting data upsert in batches of 128...


  0%|          | 0/2 [00:00<?, ?it/s]

ValueError: could not convert string to float: '$24.99'