In [1]:
# Cell 1: Setup and Initialization
# ---------------------------------
# This cell imports the necessary libraries and sets up the configuration variables
# that you will need to edit.

import pandas as pd
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import os 


def read_env_secret(name, env=None):
    """Return the secret stored under `name`, or None when it is unset or blank.

    Args:
        name: Name of the environment variable to read.
        env: Mapping to read from. Defaults to `os.environ`; a plain dict can be
            passed instead (useful for checking the helper without touching the
            real environment).

    Returns:
        The value with surrounding whitespace removed, or None if the variable
        is missing or contains only whitespace.
    """
    source = os.environ if env is None else env
    value = (source.get(name) or "").strip()
    return value or None


# 1. The API key is a secret, so it must never be written into the notebook itself.
#    Copy it from the "API Keys" section of your Pinecone dashboard and export it
#    before starting Jupyter, e.g.  export PINECONE_API_KEY="..."
#    Any key that was ever pasted into a shared notebook should be rotated.
PINECONE_API_KEY = read_env_secret("PINECONE_API_KEY")
if PINECONE_API_KEY is None:
    print("WARNING: PINECONE_API_KEY is not set. Export it in your environment before connecting to Pinecone.")

# 2. This is the name of the index you created in the Pinecone dashboard.
PINECONE_INDEX_NAME = "recomend"

# 3. Get this from your Pinecone index dashboard. Click on your index, and you will see
#    the "Host" value. It looks like a URL.
INDEX_HOST = "https://recomend-6c7pmst.svc.aped-4627-b74a.pinecone.io"


# --- Location of the cleaned dataset ---
# Relative path to the CSV that the data-analytics notebook writes out.
CLEANED_DATA_PATH = os.path.join('data', 'cleaned_products.csv')

# --- Read the dataset, falling back to an empty frame when the file is absent ---
print("Loading cleaned data from: " + CLEANED_DATA_PATH)
try:
    df = pd.read_csv(CLEANED_DATA_PATH)
except FileNotFoundError:
    # An empty frame lets the later `df.empty` guards skip their work.
    df = pd.DataFrame()
    print(" ERROR: Cleaned data not found. Please run the 'Data_Analytics_Notebook.ipynb' first to create it.")
else:
    print("Loaded {} cleaned products successfully.".format(len(df)))

# Show a short preview only when there is something to show.
if not df.empty:
    preview = df.head()
    display(preview)


Loading cleaned data from: data/cleaned_products.csv
Loaded 305 cleaned products successfully.


Unnamed: 0,title,brand,description,price,categories,images,manufacturer,package_dimensions,country_of_origin,material,color,uniq_id
0,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",GOYMFK,"multiple shoes, coats, hats, and other items E...",24.99,Home & Kitchen,['https://m.media-amazon.com/images/I/416WaLx1...,GOYMFK,"2.36""D x 7.87""W x 21.6""H",China,Metal,White,02593e81-5c09-5069-8516-b0b29f439ded
1,"subrtex Leather ding Room, Dining Chairs Set o...",subrtex,subrtex Dining chairs Set of 2,53.99,Home & Kitchen,['https://m.media-amazon.com/images/I/31SejUEW...,Subrtex Houseware INC,"18.5""D x 16""W x 35""H",,Sponge,Black,5938d217-b8c5-5d3e-b1cf-e28e340f292e
2,Plant Repotting Mat MUYETOL Waterproof Transpl...,MUYETOL,No description available,5.98,"Patio, Lawn & Garden",['https://m.media-amazon.com/images/I/41RgefVq...,MUYETOL,"26.8""L x 26.8""W",,Polyethylene,Green,b2ede786-3f51-5a45-9a5b-bcf856958cd8
3,"Pickleball Doormat, Welcome Doormat Absorbent ...",VEWETOL,The decorative doormat features a subtle textu...,13.99,"Patio, Lawn & Garden",['https://m.media-amazon.com/images/I/61vz1Igl...,Contrence,"24""L x 16""W",,Rubber,A5589,8fd9377b-cfa6-5f10-835c-6b8eca2816b5
4,JOIN IRON Foldable TV Trays for Eating Set of ...,JOIN IRON Store,Set of Four Folding Trays With Matching Storag...,89.99,Home & Kitchen,['https://m.media-amazon.com/images/I/41p4d4VJ...,,"18.9""D x 14.2""W x 26""H",,Iron,Grey Set of 4,bdc9aa30-9439-50dc-8e89-213ea211d66a


In [2]:
# Cell 2: Initialize AI Model and Connect to Pinecone
# ----------------------------------------------------
# This cell loads the AI model that converts text into vectors and establishes
# the connection to your Pinecone index.

# Bind both names up front so they always exist. The upsert cell guards on
# `index is not None`; without this, an empty dataset on a fresh kernel would make
# that guard raise NameError instead of skipping cleanly.
model = None
index = None

if not df.empty:
    # --- Initialize Text Embedding Model ---
    print("Loading the sentence-transformer model (all-MiniLM-L6-v2)...")
    # This model creates 384-dimension vectors, which must match the index dimension.
    # The weights are loaded from the local './all-MiniLM-L6-v2' directory.
    # We use 'cpu' to ensure it works on any computer, even without a GPU.
    model = SentenceTransformer('./all-MiniLM-L6-v2', device='cpu')
    print("AI Model loaded successfully.")

    # --- Initialize Pinecone Connection ---
    print("\nConnecting to Pinecone...")
    try:
        # Create the client, then target the index directly by its host URL.
        pc = Pinecone(api_key=PINECONE_API_KEY)
        index = pc.Index(host=INDEX_HOST)
        print("Pinecone index connected successfully.")
        
        # Fetching the stats doubles as a connectivity check.
        # Before running the next cell, the vector count should be 0.
        print("\nCurrent Index Stats:")
        print(index.describe_index_stats())
    except Exception as e:
        # Deliberately broad: any failure here (bad key, wrong host, network) should
        # produce a readable hint rather than a traceback.
        print(f"ERROR: Could not connect to the index. Please check your API Key and Index Host URL. Error: {e}")
        index = None # Set index to None to prevent the next cell from running
else:
    print("Skipping model and Pinecone setup because no data was loaded.")

Loading the sentence-transformer model (all-MiniLM-L6-v2)...
AI Model loaded successfully.

Connecting to Pinecone...
Pinecone index connected successfully.

Current Index Stats:
{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [3]:
# Cell 3: Generate Embeddings and Upsert Data to Pinecone
# --------------------------------------------------------
# This is the main workhorse cell. It iterates through your data in batches,
# creates vector embeddings for the text, and uploads them to Pinecone.

def is_missing(value):
    """Return True when `value` is a scalar null such as None or NaN."""
    return pd.api.types.is_scalar(value) and pd.isna(value)


def first_image_url(images_field):
    """Return the first URL from the stringified list in the `images` column.

    The column holds text such as "['https://a.jpg', 'https://b.jpg']". The first
    comma-separated entry is taken and the list/quote characters are stripped.

    Returns:
        The first URL as a string, or None when the field is missing or empty.
    """
    if is_missing(images_field):
        return None
    url = str(images_field).split(',')[0].strip("[]'\" ")
    return url or None


def build_metadata(title, price, images):
    """Build the metadata dict stored next to one product vector.

    Keys whose value is missing are left out instead of being sent as NaN/None;
    Pinecone does not accept null metadata values, so one bad row would otherwise
    be able to fail the whole batch.

    Returns:
        A dict containing any of 'title', 'price' and 'image_url'.
    """
    meta = {}
    if not is_missing(title):
        meta['title'] = title
    if not is_missing(price):
        # Unwrap numpy scalars into plain Python numbers for serialization.
        meta['price'] = price.item() if hasattr(price, 'item') else price
    image_url = first_image_url(images)
    if image_url is not None:
        # We store the first image URL to show in the frontend
        meta['image_url'] = image_url
    return meta


if not df.empty and index is not None:
    print("\n--- Starting Embedding Generation and Upsert Process ---")
    batch_size = 100  # We will process and upload 100 products at a time.
    total_sent = 0    # Local count of vectors handed to Pinecone.

    # We use tqdm to show a progress bar
    for i in tqdm(range(0, len(df), batch_size)):
        i_end = min(i + batch_size, len(df))
        batch_df = df.iloc[i:i_end]

        # 1. Prepare IDs for the batch (forced to strings).
        ids = batch_df['uniq_id'].astype(str).tolist()

        # 2. Prepare the text data by combining title and description.
        # This gives the model more context to create a better embedding.
        # Missing text becomes '' so the literal word "nan" is never embedded.
        titles = batch_df['title'].fillna('').astype(str)
        descriptions = batch_df['description'].fillna('').astype(str)
        combined_text = (titles + ". " + descriptions).tolist()

        # 3. Generate the vector embeddings for the text
        vectors = model.encode(combined_text).tolist()

        # 4. Prepare the metadata for each vector
        # This is the extra information that gets stored alongside the vector.
        metadata = [
            build_metadata(title, price, images)
            for title, price, images in zip(
                batch_df['title'], batch_df['price'], batch_df['images']
            )
        ]

        # 5. Format the data for upserting as (id, vector, metadata) tuples.
        to_upsert = list(zip(ids, vectors, metadata))

        # 6. Upsert the batch to Pinecone
        index.upsert(vectors=to_upsert)
        total_sent += len(to_upsert)

    print("\n Upsert process complete.")
    print(f"Sent {total_sent} vectors to Pinecone.")
    print("\nFinal Pinecone index stats:")
    # NOTE: the stats can lag behind recent writes, so the vector count printed here
    # may still read 0 right after the upsert. Re-run this line a little later; it
    # should then match the number of products sent.
    print(index.describe_index_stats())


--- Starting Embedding Generation and Upsert Process ---


  0%|          | 0/4 [00:00<?, ?it/s]


 Upsert process complete.

Final Pinecone index stats:
{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}
