In [1]:
# [Cell 1] Install necessary libraries
# We need sentence-transformers for embeddings and pinecone as our vector DB.
!pip install sentence-transformers pinecone-client




[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# [Cell 2] Import libraries
import pandas as pd
from sentence_transformers import SentenceTransformer
import pinecone
from getpass import getpass # To securely ask for your API key
from tqdm.auto import tqdm # Shows a progress bar
import warnings
# Silence every warning category so library noise stays out of the cell outputs.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [8]:
# [Cell 3] Load the dataset
# We reload the dataset to start fresh in this notebook.
# Read the product catalogue and, in the same step, replace every missing value
# (NaN) with an empty string so the text columns can be concatenated later
# without failing on missing entries.
df = pd.read_csv('products_dataset.csv').fillna("")

total_products = len(df)
print(f"Dataset loaded. Total products: {total_products}")

Dataset loaded. Total products: 312


In [9]:
# [Cell 4] Load the NLP Model (all-MiniLM-L6-v2)
# This model is excellent at creating 'embeddings' (vector representations) for text.
# It maps sentences with similar meanings to similar vectors.
# This model creates 384-dimensional vectors.
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-embedding model by name and report the size of the
# vectors it produces (the Pinecone index must be created with the same size).
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)

embedding_dimension = model.get_sentence_embedding_dimension()
print("Model loaded. Embedding dimension: ", embedding_dimension)

Model loaded. Embedding dimension:  384


In [11]:
# [Cell 5] Initialize Pinecone (NEW SYNTAX)
import pinecone
from pinecone import Pinecone, ServerlessSpec
from getpass import getpass

# Prompt for the API key; getpass does not echo what is typed, and the key is
# never written into the notebook source.
PINECONE_API_KEY = getpass("Enter your Pinecone API Key: ")

# Cloud provider and region handed to the serverless spec when the index is created.
PINECONE_CLOUD = "aws"
PINECONE_REGION = "us-east-1"

index_name = 'product-recommender'

# 1. Build the Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# 2. Ask Pinecone which indexes already exist so we never create a duplicate
existing_index_names = pc.list_indexes().names()

if index_name in existing_index_names:
    print(f"Index '{index_name}' already exists.")
else:
    print(f"Creating new index: {index_name}")
    # 3. The index dimension must match the embedding model's output size
    serverless_spec = ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION)
    pc.create_index(
        name=index_name,
        dimension=model.get_sentence_embedding_dimension(),
        metric='cosine',
        spec=serverless_spec,
    )
    print(f"Index '{index_name}' created.")

# 4. Open a handle to the index for the upsert calls that follow
index = pc.Index(index_name)

Enter your Pinecone API Key:  ········


Index 'product-recommender' already exists.


In [12]:
# [Cell 6] Generate Embeddings and Upload to Pinecone
# This is the main loop. We'll go through the CSV in batches.
from tqdm.auto import tqdm # Shows a progress bar

import ast  # parses the stringified image lists stored in the 'images' column

batch_size = 100 # We'll process 100 products at a time


def _first_image_url(raw_images):
    """Return the first image URL found in a raw 'images' cell, or "" if there is none.

    The cell may hold a stringified Python list (e.g. "['https://a', 'https://b']")
    or a plain comma-separated string. Splitting a stringified list on commas
    would leave the leading "['" glued to the URL, so it is parsed with
    ast.literal_eval first and only falls back to comma splitting when the cell
    is not a Python literal.
    """
    if not isinstance(raw_images, str) or not raw_images.strip():
        return ""
    try:
        parsed = ast.literal_eval(raw_images)
    except (ValueError, SyntaxError):
        # Not a Python literal: treat it as a plain comma-separated string.
        return raw_images.split(',')[0].strip()
    if isinstance(parsed, (list, tuple)):
        first = parsed[0] if parsed else ""
        return first.strip() if isinstance(first, str) else ""
    return parsed.strip() if isinstance(parsed, str) else ""


print("Starting to generate embeddings and upload to Pinecone...")

for i in tqdm(range(0, len(df), batch_size)):
    # Get a batch of rows from the dataframe
    i_end = min(i + batch_size, len(df))
    batch = df.iloc[i:i_end]

    # 1. Create a combined text field for embedding:
    #    one descriptive string per product, built from its text columns.
    combined_text = (
        "Title: " + batch['title'] +
        "; Brand: " + batch['brand'] +
        "; Categories: " + batch['categories'] +
        "; Description: " + batch['description']
    ).tolist()

    # 2. Generate embeddings (one vector per entry of combined_text, same order)
    embeddings = model.encode(combined_text).tolist()

    # 3. Prepare data for Pinecone: unique ID, embedding 'values', and the
    #    'metadata' that is returned with search results.
    # Rows and embeddings are paired by position (zip) rather than by index
    # label, so the pairing stays correct even if df no longer has a default
    # 0..n-1 index (e.g. after filtering or sorting).
    to_upsert = []
    for (_, row), embedding in zip(batch.iterrows(), embeddings):
        to_upsert.append({
            "id": str(row['uniq_id']),  # str() is a no-op when the id is already a string
            "values": embedding,
            "metadata": {
                "title": row['title'],
                "price": row['price'],
                # First image URL, or empty string if none
                "image_url": _first_image_url(row['images']),
                "brand": row['brand'],
                "categories": row['categories']
            }
        })

    # 4. Upload the batch to Pinecone
    index.upsert(vectors=to_upsert)

print("All products have been embedded and uploaded to Pinecone.")

Starting to generate embeddings and upload to Pinecone...


  0%|          | 0/4 [00:00<?, ?it/s]

All products have been embedded and uploaded to Pinecone.


In [13]:
# [Cell 7] Install PyTorch and other libraries for CV
!pip install torch torchvision Pillow requests




[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
# [Cell 8] Import CV libraries
import os
import requests
from PIL import Image
from io import BytesIO

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms, models

# Prefer CUDA when PyTorch can see a GPU; otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cpu


In [16]:
# [Cell 9 - New Cell] Clean the categories column
import ast # This library safely evaluates string-lists

def clean_category(cat_str):
    """Reduce a raw 'categories' cell to a single main-category label.

    Args:
        cat_str: Raw cell value. Typically a stringified Python list such as
            "['Home & Kitchen', 'Furniture', ...]", but it may also be a plain
            string, a blank string, or a missing value (NaN/None).

    Returns:
        - the second element of the parsed list when it has at least two
          elements (e.g. 'Furniture');
        - the only element when the list has exactly one;
        - "Uncategorized" for an empty list, a non-list literal, a blank
          string, or a missing value;
        - the original value unchanged when it is not a Python literal at all
          (e.g. a plain category name).
    """
    fallback = "Uncategorized"

    # Blank strings carry no category information. Without this guard they
    # would fail to parse below and be returned as an empty category name.
    if isinstance(cat_str, str) and not cat_str.strip():
        return fallback

    try:
        # Safely evaluate the string as a Python literal (no code execution)
        cat_list = ast.literal_eval(cat_str)
    except (ValueError, SyntaxError):
        # Not a list-string: keep the original text, or fall back for NaN/None
        return cat_str if pd.notna(cat_str) else fallback

    if not isinstance(cat_list, list) or not cat_list:
        return fallback

    # Heuristic: the second element is the main category; a one-element list
    # only has its single entry to offer.
    return cat_list[1] if len(cat_list) > 1 else cat_list[-1]

# Derive the simplified category for every product and store it as a new column
df['clean_category'] = df['categories'].apply(clean_category)

# Summarise the five most frequent cleaned categories as a quick sanity check
top_cleaned_counts = df['clean_category'].value_counts().nlargest(5)
print("New Top 5 Cleaned Categories:")
print(top_cleaned_counts)

New Top 5 Cleaned Categories:
clean_category
Furniture                 203
Outdoor Décor              22
Storage & Organization     21
Home Décor Products        21
Hardware                   17
Name: count, dtype: int64


In [19]:
# [Cell 10 - CORRECTED] Define a function to download our sample images
import os
import requests
from PIL import Image
from io import BytesIO
from tqdm.auto import tqdm
import ast # Import ast to clean the string-lists

# Root folder for the downloaded training images; download_sample_images()
# creates one sub-folder per category underneath it.
DATA_PATH = "data/cv_training"

def _extract_primary_image_url(raw_images):
    """Return the first image URL found in a raw 'images' cell, or "" if there is none."""
    if not isinstance(raw_images, str) or not raw_images.strip():
        return ""
    try:
        # The cell is normally a stringified Python list of URLs
        parsed = ast.literal_eval(raw_images)
    except (ValueError, SyntaxError):
        # Fallback if it's not a literal (e.g. a bare, comma-separated URL string)
        return raw_images.split(',')[0].strip()
    if isinstance(parsed, (list, tuple)):
        first = parsed[0] if parsed else ""
        return first.strip() if isinstance(first, str) else ""
    # A single quoted URL evaluates to a plain string
    return parsed.strip() if isinstance(parsed, str) else ""


def download_sample_images(df, num_per_category=20, num_categories=5, data_path=None):
    """Download a sample of product images for the most frequent categories.

    Images are saved as "<data_path>/<category>/<uniq_id>.jpg", one sub-folder
    per category. In the folder name '/' is replaced by '_' and '&' by 'and'.

    Args:
        df: DataFrame with 'clean_category', 'images' and 'uniq_id' columns.
        num_per_category: Maximum number of products sampled per category
            (sampling uses random_state=42, so the selection is repeatable).
        num_categories: How many of the most frequent categories to use.
        data_path: Output root folder. Defaults to the module-level DATA_PATH.

    Returns:
        The list of selected category names (most frequent first), or an empty
        list when there is nothing to sample.

    Download failures are reported with print() and skipped; they do not abort
    the run.
    """
    top_categories = df['clean_category'].value_counts().nlargest(num_categories).index.tolist()
    print(f"Top {num_categories} clean categories: {top_categories}")

    sample_df_list = []
    for category in top_categories:
        category_df = df[df['clean_category'] == category]
        # Never ask .sample() for more rows than the category actually has
        n_samples = min(num_per_category, len(category_df))
        if n_samples > 0:
            sample_df_list.append(category_df.sample(n=n_samples, random_state=42))

    if not sample_df_list:
        print("No samples to download. Check your category cleaning.")
        return []

    target_root = DATA_PATH if data_path is None else data_path
    sample_df = pd.concat(sample_df_list)
    print(f"Downloading {len(sample_df)} sample images...")

    # Create directories and download
    for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
        category_name = str(row['clean_category']).replace('/', '_').replace('&', 'and')
        category_path = os.path.join(target_root, category_name)
        # exist_ok avoids the check-then-create race of a separate os.path.exists test
        os.makedirs(category_path, exist_ok=True)

        img_url = _extract_primary_image_url(row['images'])
        if not img_url:
            print(f"Skipping product {row['uniq_id']} - No valid image URL found.")
            continue

        try:
            response = requests.get(img_url, timeout=5)
            # Check if the request was successful
            if response.status_code == 200:
                # Converting to RGB lets palette/alpha images be written as JPEG
                img = Image.open(BytesIO(response.content)).convert("RGB")
                img.save(os.path.join(category_path, f"{row['uniq_id']}.jpg"))
            else:
                print(f"Could not download {img_url} - Status Code: {response.status_code}")
        except Exception as e:
            # Deliberately broad: one bad URL or corrupt image must not stop the batch
            print(f"Could not download {img_url}: {e}")

    print("Image download complete.")
    return top_categories

# Download up to 20 images per selected category and keep the list of
# category names that the function returns.
top_5_categories = download_sample_images(df, num_per_category=20)

Top 5 clean categories: ['Furniture', 'Outdoor Décor', 'Storage & Organization', 'Home Décor Products', 'Hardware']
Downloading 97 sample images...


  0%|          | 0/97 [00:00<?, ?it/s]

Image download complete.


In [21]:
# [Cell 11 - CORRECTED] Create a PyTorch Dataset and DataLoaders
# We'll use a standard ImageFolder dataset, which is perfect for this structure.
# We also define transforms to make all images the same size and normalize them.

from torch.utils.data import DataLoader, random_split
# --- FIX: Import 'datasets' from torchvision ---
from torchvision import transforms, models, datasets

# Every image is resized to 224x224, converted to a tensor and normalised
# per channel with the mean/std values below.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the dataset from the folder we just created.
# ImageFolder treats each sub-folder of DATA_PATH as one class.
full_dataset = datasets.ImageFolder(root=DATA_PATH, transform=transform)

# Split into training and validation (80% train, 20% validation)
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
# A seeded generator makes the split reproducible: without it, which images land
# in the validation set changes on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    full_dataset, [train_size, val_size], generator=split_generator
)

# Create DataLoaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Save the class names (so we know what category '0', '1', '2' etc. mean)
class_names = full_dataset.classes
print(f"Dataset created. Using {len(class_names)} classes: {class_names}")
print(f"Training samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}")

Dataset created. Using 5 classes: ['Furniture', 'Hardware', 'Home Décor Products', 'Outdoor Décor', 'Storage and Organization']
Training samples: 75, Validation samples: 19


In [22]:
# [Cell 12] Load the pre-trained CV Model (ResNet-18)
# We use 'transfer learning' - using a model already trained on millions of images
# and just fine-tuning the last layer for our 5 categories.

import torch.nn as nn
from torchvision import models

# Load ResNet-18 with its ImageNet weights. The `weights=` enum replaces the
# deprecated `pretrained=True` flag and selects the same checkpoint.
model_cv = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Freeze every pre-trained parameter so training leaves the backbone untouched
for param in model_cv.parameters():
    param.requires_grad = False

# Replace the final layer with a fresh one that outputs one score per category.
# A newly created nn.Linear has requires_grad=True, so it is the only trainable part.
num_ftrs = model_cv.fc.in_features
model_cv.fc = nn.Linear(num_ftrs, len(class_names))

model_cv = model_cv.to(device) # Move the model to the GPU if available
print("ResNet-18 model loaded and final layer replaced.")

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\taman/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth


100%|█████████████████████████████████████████████████████████████████████████████| 44.7M/44.7M [00:12<00:00, 3.66MB/s]

ResNet-18 model loaded and final layer replaced.





In [23]:
# [Cell 13] Train the CV Model
# This cell will run for a few minutes. It will print the progress for each 'epoch' (pass).
import torch.optim as optim

# Loss for the multi-class classification head
criterion = nn.CrossEntropyLoss()
# Only the replacement head (model_cv.fc) is handed to the optimizer
optimizer = optim.Adam(model_cv.fc.parameters(), lr=0.001)

num_epochs = 10
print("Starting CV model training...")

for epoch in range(num_epochs):
    # Training mode for this pass
    model_cv.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        # Tensors must live on the same device as the model
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Reset gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass, then loss against the true labels
        outputs = model_cv(inputs)
        loss = criterion(outputs, labels)

        # Back-propagate and apply the update
        loss.backward()
        optimizer.step()

        # loss.item() is the batch mean; weighting by batch size turns the
        # running total into a per-sample sum
        running_loss += inputs.size(0) * loss.item()

    # Average loss per training sample for this epoch
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

print("Training complete.")

# [Cell 14 - Save the model]
# Persist the learned weights (state_dict) of the fine-tuned classifier.
cv_model_path = 'cv_classifier_model.pth'
torch.save(model_cv.state_dict(), cv_model_path)
print(f"CV Model saved as '{cv_model_path}' in your project folder.")

Starting CV model training...
Epoch 1/10, Loss: 1.8886
Epoch 2/10, Loss: 1.6903
Epoch 3/10, Loss: 1.4433
Epoch 4/10, Loss: 1.2180
Epoch 5/10, Loss: 1.0967
Epoch 6/10, Loss: 0.9351
Epoch 7/10, Loss: 0.8242
Epoch 8/10, Loss: 0.7380
Epoch 9/10, Loss: 0.6489
Epoch 10/10, Loss: 0.6073
Training complete.
CV Model saved as 'cv_classifier_model.pth' in your project folder.
