In [1]:
# This file contains the code for Phase 1.
# Copy and paste this code into the first cells of your
# `Vibe_Matcher_Prototype.ipynb` notebook and run them.

# ---
# %%CELL 1: Imports
# ---
import pandas as pd
import os
print("Pandas imported.")

# ---
# %%CELL 2: Phase 1 - Data Preparation
# ---
print("--- Phase 1: Data Preparation ---")

# Path to the product catalog, relative to the notebook's working directory.
JSON_FILE_PATH = "productlist.json"

# Check for the file first so a missing catalog produces a readable hint
# rather than a pandas traceback.
if not os.path.exists(JSON_FILE_PATH):
    print(f"Error: Product file not found at '{JSON_FILE_PATH}'")
    # Name the configured path itself: a hard-coded file name in this hint
    # can drift out of sync with JSON_FILE_PATH and point at the wrong file.
    print(f"Please make sure '{JSON_FILE_PATH}' is in the same directory.")
else:
    # Create the Pandas DataFrame by reading the JSON file.
    # orient='records' tells pandas that the JSON is a list of dictionaries,
    # one dictionary per product.
    df_products = pd.read_json(JSON_FILE_PATH, orient='records')

    # Display the DataFrame to verify the load.
    print("--- Product Catalog Loaded from JSON ---")
    print(df_products)

    print("\nPhase 1 Complete. Your product DataFrame 'df_products' is ready.")



Pandas imported.
--- Phase 1: Data Preparation ---
--- Product Catalog Loaded from JSON ---
   product_id                       name  \
0       P-001            Boho Maxi Dress   
1       P-002          Urban Tech Runner   
2       P-003      Minimalist Gold Hoops   
3       P-004         Cozy Knit Cardigan   
4       P-005    Distressed Denim Jacket   
..        ...                        ...   
70      P-071   Turtleneck Sweater Dress   
71      P-072            Canvas Tote Bag   
72      P-073      Printed Palazzo Pants   
73      P-074        Chunky Knit Sweater   
74      P-075  Statement Choker Necklace   

                                                 desc  \
0   A flowy, floor-length dress with earthy tones ...   
1   Sleek, black sneakers with neon green accents ...   
2   Classic, thin 18k gold hoop earrings. A timele...   
3   An oversized, chunky-knit cardigan in a warm c...   
4   A vintage-wash denim jacket with strategic rip...   
..                                   

In [2]:
# This file contains the code for Phase 2.
# Copy and paste this code into the next cells of your
# `Vibe_Matcher_Prototype.ipynb` notebook and run them.

# ---
# %%CELL 3: Imports for Phase 2
# ---
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
import time
import numpy as np
import os

# Banner printed once the Phase 2 imports above have completed.
print("SentenceTransformers and other libraries imported.")

# ---
# %%CELL 4: Phase 2 - Setup Embedding Model
# ---
print("--- Phase 2: Setup Embedding Model ---")

# We will use the Nomic embedding model.
# This requires no API key and runs locally.
# MODEL_NAME is the identifier handed to SentenceTransformer below.
MODEL_NAME = 'nomic-ai/nomic-embed-text-v1'

print(f"Loading model: {MODEL_NAME}...")
print("This model is larger and may take a moment to download.")

# This will download the model the first time you run it.
# Nomic's model requires `trust_remote_code=True` to load.
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# model repository run locally - confirm the source is trusted and consider
# pinning a specific revision.
model = SentenceTransformer(MODEL_NAME, trust_remote_code=True)

print("Model loaded successfully.")

# ---
# %%CELL 5: Test the Embedding Function
# ---
# Sanity-check the loaded model on a single query before embedding the
# whole catalog.
# Nomic requires a specific prefix for search queries, so the test string
# carries "search_query: " explicitly.
test_query = "search_query: energetic urban chic"
test_embedding = model.encode(test_query)

# Report the vector length and its first few values as a quick visual check.
print(f"Successfully got embedding for: '{test_query}'")
print(f"Vector dimensions: {len(test_embedding)}")
print(f"First 5 dimensions: {test_embedding[:5]}")

# ---
# %%CELL 6: Phase 2 - Generate Embeddings for Product Catalog
# ---
print("\n--- Generating Embeddings for Product Catalog ---")

# This assumes your DataFrame `df_products` is already loaded from Phase 1.

# Record the start time.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()

# NOMIC BEST PRACTICE:
# Prepend "search_document: " to all descriptions for better search results.
# This tells the model it's embedding a "document" to be searched.
print("Prepending 'search_document: ' prefix to all descriptions...")
all_descriptions = [f"search_document: {desc}" for desc in df_products['desc']]

# Encode every description in one call; the progress bar shows batch progress.
print("Embedding all documents...")
all_embeddings = model.encode(all_descriptions, show_progress_bar=True)

# Store one embedding (as a plain Python list) per product row.
df_products['embedding'] = all_embeddings.tolist()

# Record the end time on the same clock as start_time.
end_time = time.perf_counter()

print("\n--- Embeddings Generated ---")
print(f"Total time taken: {end_time - start_time:.2f} seconds")


# ---
# %%CELL 7: Review DataFrame with Embeddings
# ---
print("\n--- Review DataFrame with Embeddings ---")

# Show the product names next to a short preview of each embedding.
# The preview lives on a separate two-column frame, so `df_products` itself
# is never given (and never has to be stripped of) a scratch column. An
# interrupted or repeated run therefore cannot leave a stray column behind.
df_embedding_preview = df_products[['name']].assign(
    embedding_preview=df_products['embedding'].apply(
        lambda x: f"[{x[0]:.4f}, {x[1]:.4f}, ...]"
    )
)
print(df_embedding_preview)

print("\nPhase 2 Complete. Your DataFrame 'df_products' now contains the embeddings.")



  from .autonotebook import tqdm as notebook_tqdm


SentenceTransformers and other libraries imported.
--- Phase 2: Setup Embedding Model ---
Loading model: nomic-ai/nomic-embed-text-v1...
This model is larger and may take a moment to download.


<All keys matched successfully>


Model loaded successfully.
Successfully got embedding for: 'search_query: energetic urban chic'
Vector dimensions: 768
First 5 dimensions: [ 0.00953753  0.01182999 -0.02518495  0.00381087 -0.01959497]

--- Generating Embeddings for Product Catalog ---
Prepending 'search_document: ' prefix to all descriptions...
Embedding all documents...


Batches: 100%|██████████| 3/3 [00:10<00:00,  3.39s/it]


--- Embeddings Generated ---
Total time taken: 10.23 seconds

--- Review DataFrame with Embeddings ---
                         name       embedding_preview
0             Boho Maxi Dress   [0.0199, 0.0246, ...]
1           Urban Tech Runner  [-0.0106, 0.0828, ...]
2       Minimalist Gold Hoops   [0.0203, 0.0213, ...]
3          Cozy Knit Cardigan  [-0.0020, 0.0344, ...]
4     Distressed Denim Jacket  [-0.0275, 0.0089, ...]
..                        ...                     ...
70   Turtleneck Sweater Dress  [-0.0027, 0.0534, ...]
71            Canvas Tote Bag  [-0.0179, 0.0269, ...]
72      Printed Palazzo Pants  [-0.0059, 0.0376, ...]
73        Chunky Knit Sweater  [0.0062, -0.0010, ...]
74  Statement Choker Necklace  [-0.0060, 0.0710, ...]

[75 rows x 2 columns]

Phase 2 Complete. Your DataFrame 'df_products' now contains the embeddings.





In [3]:
# This file contains the code for Phase 3.
# Copy and paste this code into the next cells of your
# `Vibe_Matcher_Prototype.ipynb` notebook and run them.

# ---
# %%CELL 8: Imports for Phase 3
# ---
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Banner printed once the Phase 3 imports above have completed.
print("Scikit-learn (for cosine similarity) imported.")


# ---
# %%CELL 9: Phase 3 - Define Vector Search Function
# ---
# Section banner; the search function itself is defined right after it.
print("--- Phase 3: Vector Search & Similarity ---")

def find_top_matches(query, model, df, top_k=3, threshold=0.5, verbose=True):
    """
    Finds the top_k most similar products to a query vibe.

    Args:
        query: Free-text vibe description. The "search_query: " prefix is
            added here, so callers pass the raw text.
        model: Embedding model; must provide ``encode(text)`` whose result
            supports ``.reshape(1, -1)``.
        df: Product catalog with 'product_id', 'name' and 'embedding'
            columns, where each 'embedding' entry holds one vector.
        top_k: Maximum number of matches to list.
        threshold: Minimum cosine similarity for a product to count as a
            match.
        verbose: If True (the default), print the score table for every
            product. Pass False to keep the output short, e.g. when timing.

    Returns:
        A list of at most ``top_k`` dicts with the keys 'product_id', 'name'
        and 'score', ordered from highest to lowest score. The list is empty
        when no product reaches ``threshold`` or when ``df`` has no rows.
    """
    print(f"\n--- Searching for vibe: '{query}' ---")

    # np.stack raises on an empty sequence, so bail out early for an empty catalog.
    if len(df) == 0:
        print("\nNo products available to search.")
        return []

    # NOMIC BEST PRACTICE:
    # Prepend "search_query: " to the query.
    # This tells the model it's embedding a "query" for search.
    query_with_prefix = f"search_query: {query}"

    # 1. Embed the query and reshape it to a single-row 2D array for sklearn.
    query_embedding = model.encode(query_with_prefix).reshape(1, -1)

    # 2. Stack the per-row 'embedding' entries into one 2D numpy array.
    product_embeddings = np.stack(df['embedding'].values)

    # 3. Compare the one query embedding against all product embeddings.
    #    The result has a single row; take it as the flat score vector.
    scores = cosine_similarity(query_embedding, product_embeddings)[0]

    # 4. Build a report of every product's score, best first.
    df_report = df[['product_id', 'name']].copy()
    df_report['score'] = scores
    df_report = df_report.sort_values(by='score', ascending=False)

    if verbose:
        print("\n--- Full Score Report (for debugging) ---")
        print(df_report.to_markdown(index=False, floatfmt=".4f"))

    # 5. Keep only the products that clear the threshold.
    top_matches = df_report[df_report['score'] >= threshold]

    if top_matches.empty:
        print(f"\nNo strong matches found (highest score was < {threshold}).")
        print("Try rephrasing your vibe or lowering the threshold!")
        return []

    print(f"\n--- Top {min(top_k, len(top_matches))} Matches ---")
    # The report is already sorted, so head(top_k) is the best top_k.
    top_k_matches = top_matches.head(top_k).to_dict(orient='records')

    for rank, match in enumerate(top_k_matches, start=1):
        print(f"  {rank}. {match['name']} (Score: {match['score']:.4f})")

    # Hand the matches back so callers can use them without re-running the
    # embedding and similarity steps themselves.
    return top_k_matches


# ---
# %%CELL 10: Test the Search Function
# ---
# Smoke-test the search with two sample vibes.
# Relies on 'model' and 'df_products' still being in memory from Phase 2.
for vibe in ("energetic urban chic", "cozy cabin weekend"):
    find_top_matches(vibe, model, df_products, threshold=0.3)


print("\nPhase 3 Complete. Your 'find_top_matches' function is ready.")



Scikit-learn (for cosine similarity) imported.
--- Phase 3: Vector Search & Similarity ---

--- Searching for vibe: 'energetic urban chic' ---

--- Full Score Report (for debugging) ---
| product_id   | name                       |   score |
|:-------------|:---------------------------|--------:|
| P-007        | Power-Move Blazer          |  0.4707 |
| P-062        | Cropped Sweatshirt         |  0.4691 |
| P-014        | Neon Street Hoodie         |  0.4590 |
| P-055        | Pleated Culottes           |  0.4468 |
| P-067        | Satin Bomber Jacket        |  0.4429 |
| P-035        | Knitted Midi Dress         |  0.4411 |
| P-061        | Ruched Bodycon Dress       |  0.4352 |
| P-032        | Graphic Beanie             |  0.4329 |
| P-006        | Silk A-Line Skirt          |  0.4187 |
| P-041        | Tailored Cropped Blazer    |  0.4179 |
| P-027        | Tailored Wide-Leg Trousers |  0.4166 |
| P-020        | Tulle Midi Skirt           |  0.4125 |
| P-037        | Pastel Hoodie

In [4]:
# This file contains the code for Phases 4 and 5.
# Copy and paste this code into the final cells of your
# `Vibe_Matcher_Prototype.ipynb` notebook and run them.

# ---
# %%CELL 11: Imports for Phase 4
# ---
import timeit
import pandas as pd

print("Timeit and Pandas imported for Phase 4.")


# ---
# %%CELL 12: Phase 4 - Test & Evaluation
# ---
print("--- Phase 4: Test & Evaluation ---")

# The three vibes used to evaluate the matcher.
TEST_QUERIES = [
    "energetic urban chic",
    "cozy cabin weekend",
    "elegant minimalist professional",
]

# Success metric: a top score at or above this value counts as a "good" match.
EVAL_THRESHOLD = 0.4

# Column-oriented result log: one independent, initially empty list per column.
test_results = {
    column: []
    for column in ("query", "top_match", "top_score", "is_good_match", "latency_ms")
}

# ---
# %%CELL 13: Run Evaluation Queries
# ---
print("\n--- Running Evaluation Queries ---")

# Number of timed repetitions per query. Named once so the timeit call and
# the averaging below cannot drift apart.
N_TIMING_RUNS = 10

# The catalog embeddings do not change between queries, so stack them once.
product_embeddings = np.stack(df_products['embedding'].values)

for query in TEST_QUERIES:

    # 1. Measure Latency
    # Average over several runs for a steadier number than a single call.
    # timeit is given a callable rather than a source string: splicing the
    # query into code between quotes breaks as soon as a query contains an
    # apostrophe, and would execute whatever text the query holds.
    # Note that the timed call includes the function's own printing.
    total_time = timeit.timeit(
        lambda: find_top_matches(query, model, df_products, threshold=EVAL_THRESHOLD),
        number=N_TIMING_RUNS,
    )

    # Calculate average latency in milliseconds
    avg_latency_ms = (total_time / N_TIMING_RUNS) * 1000

    # 2. Get the Top Score (for logging)
    # Repeat the core scoring steps outside of timeit to capture the best
    # score and the product it belongs to.
    query_with_prefix = f"search_query: {query}"
    query_embedding = model.encode(query_with_prefix).reshape(1, -1)
    sim_scores = cosine_similarity(query_embedding, product_embeddings)

    top_score = np.max(sim_scores)
    # sim_scores has a single row, so the flat argmax is the product's position.
    top_match_index = np.argmax(sim_scores)
    top_match_name = df_products.iloc[top_match_index]['name']

    # 3. Log the results
    test_results["query"].append(query)
    test_results["top_match"].append(top_match_name)
    test_results["top_score"].append(top_score)
    test_results["is_good_match"].append(top_score >= EVAL_THRESHOLD)
    test_results["latency_ms"].append(avg_latency_ms)


# ---
# %%CELL 14: Display Evaluation Metrics
# ---
print("\n--- Evaluation Metrics ---")

# Build the results table and render the numeric columns as display strings
# in a single chained expression.
df_eval = pd.DataFrame(test_results).assign(
    top_score=lambda frame: frame['top_score'].map(lambda score: f"{score:.4f}"),
    latency_ms=lambda frame: frame['latency_ms'].map(lambda ms: f"{ms:.2f} ms"),
)

print(df_eval.to_markdown(index=False))

# ---
# %%CELL 15: Phase 5 - Reflection
# ---
print("\n--- Phase 5: Reflection ---")
# The triple-quoted string below is a bare expression statement: running this
# cell evaluates and discards it, so the text is not shown in the output.
# To render it, copy and paste the text (starting with ##) into a new
# Markdown cell in your notebook.

"""
## Phase 5: Reflection

Here are my key takeaways and reflections on this prototype:

* **Model Selection is Critical:** My initial tests with `all-MiniLM-L6-v2` yielded very low scores (max ~0.2). After pivoting to `nomic-ai/nomic-embed-text-v1` and using its required `search_query:` and `search_document:` prefixes, the relevance scores for good matches jumped to the 0.4-0.6 range, proving the system's effectiveness.

* **Pivoting from OpenAI to Open-Source:** The original prompt specified OpenAI's `text-embedding-ada-002`. Due to API quota limitations (`429 error`), I pivoted to a high-performing, open-source alternative. This was a valuable real-world challenge that demonstrated adaptability. The final open-source model is free, runs locally, and is ideal for a prototype.

* **Data Quality > Data Quantity:** Simply increasing the catalog from 7 to 75 items didn't improve scores. However, *enriching the product descriptions* with vibe-focused keywords (e.g., adding "urban chic" to the blazer's description) would be the most effective way to improve match quality. The model can only match the text it's given.

* **Handling the "No Match" Case:** The `threshold` parameter is essential. By setting it to `0.3` or `0.4`, we gracefully handle queries with no good matches instead of showing the user an irrelevant item that just happened to be the "least bad" result. This is a crucial, user-facing feature.

* **Next Steps: Scaling with a Vector DB:** This prototype (computing cosine similarity on-the-fly) works for 75 items but would be too slow for 75,000. The clear next step is to integrate a dedicated vector database like **Pinecone**, **ChromaDB**, or **Faiss**. This would involve pre-calculating all product embeddings and storing them in an indexed database, allowing for sub-second searches across millions of items.

"""

# Final banner marking the end of the notebook run.
print("\n\n--- Project Complete! ---")



Timeit and Pandas imported for Phase 4.
--- Phase 4: Test & Evaluation ---

--- Running Evaluation Queries ---

--- Searching for vibe: 'energetic urban chic' ---

--- Full Score Report (for debugging) ---
| product_id   | name                       |   score |
|:-------------|:---------------------------|--------:|
| P-007        | Power-Move Blazer          |  0.4707 |
| P-062        | Cropped Sweatshirt         |  0.4691 |
| P-014        | Neon Street Hoodie         |  0.4590 |
| P-055        | Pleated Culottes           |  0.4468 |
| P-067        | Satin Bomber Jacket        |  0.4429 |
| P-035        | Knitted Midi Dress         |  0.4411 |
| P-061        | Ruched Bodycon Dress       |  0.4352 |
| P-032        | Graphic Beanie             |  0.4329 |
| P-006        | Silk A-Line Skirt          |  0.4187 |
| P-041        | Tailored Cropped Blazer    |  0.4179 |
| P-027        | Tailored Wide-Leg Trousers |  0.4166 |
| P-020        | Tulle Midi Skirt           |  0.4125 |
| P-037   