# Vibe Matcher — Mini Recommendation System

This notebook demonstrates a small AI-powered recommender that maps a short vibe query (e.g. `energetic urban chic`) to the top-3 matching fashion products using OpenAI embeddings and cosine similarity.

Instructions:
- Provide an `OPENAI_API_KEY` via a `.env` file in the notebook environment or paste it when prompted.
- Run cells top-to-bottom.

In [None]:
# Install dependencies (run once in Colab or Jupyter)
# `%pip` is an IPython magic, so this cell is meant to run inside a notebook
# kernel rather than as a plain Python script; `--quiet` reduces pip's output.
%pip install --quiet openai pandas numpy scikit-learn python-dotenv

# Note: this message is printed unconditionally after the install command;
# it does not itself verify that the packages were installed.
print('Installed: openai, pandas, numpy, scikit-learn, python-dotenv')

In [None]:
# Imports and environment setup
import os
from dotenv import load_dotenv
from getpass import getpass
import time
import timeit
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import openai

# Load .env (if present) so OPENAI_API_KEY can be picked up from the environment
load_dotenv()
if not os.getenv('OPENAI_API_KEY'):
    try:
        # Hidden interactive prompt (works in Colab/Jupyter)
        key = getpass('Enter your OpenAI API key (or leave blank to stop): ')
    except Exception as exc:
        # Best-effort: prompting can fail when no interactive input is
        # available. Report why instead of failing silently, then continue so
        # the "present" check below still runs.
        print(f'Could not prompt for an API key ({type(exc).__name__}: {exc})')
        key = ''
    # A pasted key may carry stray whitespace or a trailing newline; strip it
    # so it is not stored as part of the key.
    key = key.strip()
    if key:
        os.environ['OPENAI_API_KEY'] = key

# Initialize OpenAI with the key from the environment (None when still unset)
openai.api_key = os.getenv('OPENAI_API_KEY')
print('OPENAI_API_KEY present:', bool(openai.api_key))

In [None]:
# Mock fashion catalog: one record per product with a display name, a free-text
# description (the text that gets embedded) and a short list of style tags.
products = [
    dict(
        name="Boho Dress",
        desc="Flowy maxi dress in earthy tones with embroidered details — festival-ready bohemian vibes.",
        tags=["boho", "flowy", "festival"],
    ),
    dict(
        name="Urban Bomber",
        desc="Matte black bomber jacket with structured shoulders — energetic and urban chic for nights out.",
        tags=["urban", "chic", "edgy"],
    ),
    dict(
        name="Cozy Knit Sweater",
        desc="Oversized chunky knit sweater in soft neutrals — warm, relaxed, and cozy.",
        tags=["cozy", "casual", "warm"],
    ),
    dict(
        name="Minimal Blazer",
        desc="Tailored single-breasted blazer in a minimalist palette — clean, polished workwear.",
        tags=["minimal", "workwear", "polished"],
    ),
    dict(
        name="Sport Luxe Jogger",
        desc="Sleek joggers with luxe fabric and tapered fit — sporty yet streetwise and energetic.",
        tags=["sporty", "streetwear", "energetic"],
    ),
    dict(
        name="Retro Floral Blouse",
        desc="Romantic floral blouse with puff sleeves — colorful, vintage-inspired style.",
        tags=["retro", "floral", "romantic"],
    ),
    dict(
        name="Tailored Trousers",
        desc="High-waisted tailored trousers for confident city looks — polished and structured.",
        tags=["polished", "city", "confident"],
    ),
]

# Tabular view of the catalog; columns follow the record keys (name, desc, tags).
df = pd.DataFrame(products)
df

In [None]:
# Helper: get embedding using OpenAI embeddings API (text-embedding-ada-002)
def _extract_embedding(resp):
    """Return the first embedding vector from an embeddings API response.

    Supports both response shapes: mapping-style responses that allow
    ``resp['data'][0]['embedding']`` and object-style responses that expose
    ``resp.data[0].embedding``.
    """
    try:
        return resp['data'][0]['embedding']
    except TypeError:
        # Object-style responses are not subscriptable; read attributes instead.
        return resp.data[0].embedding


def get_embedding(text, model='text-embedding-ada-002'):
    """Return a numpy array embedding for the given text using OpenAI.

    The legacy ``openai.Embedding.create`` interface is tried first; if that
    call fails, the newer ``openai.embeddings.create`` interface is tried so
    the helper works across client versions.

    Args:
        text: The text to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        A 1-D ``numpy.ndarray`` of floats holding the embedding vector.

    Raises:
        RuntimeError: If the API key is missing, if both client interfaces
            fail, or if the response does not contain an embedding.
    """
    if not openai.api_key:
        raise RuntimeError('OPENAI_API_KEY not set. Please provide an API key in .env or via the prompt.')
    try:
        resp = openai.Embedding.create(model=model, input=text)
    except Exception as legacy_err:
        # try alternate client attribute (compatibility across client versions)
        try:
            resp = openai.embeddings.create(model=model, input=text)
        except AttributeError:
            # The newer interface does not exist on this client, so the legacy
            # failure is the real error; report that one instead of masking it.
            raise RuntimeError('Embedding request failed: ' + str(legacy_err)) from legacy_err
        except Exception as e2:
            raise RuntimeError('Embedding request failed: ' + str(e2)) from e2
    try:
        return np.array(_extract_embedding(resp), dtype=float)
    except Exception as parse_err:
        # Keep the documented contract: callers only need to handle RuntimeError.
        raise RuntimeError('Embedding request failed: ' + str(parse_err)) from parse_err

# Embed every product description (one API call each), time the whole pass,
# and attach the resulting vectors to the DataFrame as an 'embedding' column.
embeddings = []
start_all = time.perf_counter()
embeddings.extend(map(get_embedding, df['desc']))
end_all = time.perf_counter()
df['embedding'] = embeddings
print(f'Generated {len(embeddings)} embeddings in {end_all - start_all:.2f} seconds')

# Preview copy: keep only the first five components of each vector so the
# rendered table stays readable; the full vectors remain in `df`.
display_df = df.copy()
display_df['embedding'] = display_df['embedding'].map(lambda vec: vec[:5].tolist())
display_df

In [None]:
# Matching logic: embed query, compute cosine similarity, return top-k results and timing info
def match_query(query, top_k=3, model='text-embedding-ada-002', timeit_runs=3):
    """Return top-k matches for the query along with latency samples and average (seconds).

    The query is embedded ``timeit_runs`` times via ``timeit.repeat``
    (``number=1`` per sample) to measure embedding latency. The embedding from
    the last timed call is reused for the similarity search, so no extra API
    call is made. Similarity is the cosine similarity between the query
    embedding and each row of the module-level ``df['embedding']`` column.

    Args:
        query: Free-text vibe query to match against the catalog.
        top_k: Number of best-scoring products to return.
        model: Embedding model name forwarded to ``get_embedding``.
        timeit_runs: Number of timed embedding calls; values that are not a
            positive integer fall back to a single run.

    Returns:
        A dict with keys ``query``, ``latency_avg_s``, ``latency_samples_s``
        (list of per-call seconds) and ``results`` (list of dicts with
        ``name``, ``desc``, ``tags`` and ``score``, best match first).

    Raises:
        RuntimeError: Propagated from ``get_embedding`` when embedding fails.
    """
    last_embedding = {}

    def embed_once():
        # Keep the vector so the final timed call doubles as the real query embedding.
        last_embedding['value'] = get_embedding(query, model=model)

    # At least one run is needed: it supplies both a latency sample and the embedding.
    runs = timeit_runs if isinstance(timeit_runs, int) and timeit_runs > 0 else 1

    # Pass the callable itself. A statement string such as 'embed_once()' is
    # resolved against module globals, where this nested function is not
    # defined, so it would raise NameError instead of timing anything.
    samples = timeit.repeat(embed_once, repeat=runs, number=1)
    latency_avg = float(sum(samples) / len(samples))

    # Compare the query vector against every product vector in one call
    q_emb = last_embedding['value']
    prod_embs = np.vstack(df['embedding'].values)
    sims = cosine_similarity(q_emb.reshape(1, -1), prod_embs)[0]

    # Collect top-k matches (argsort is ascending, so reverse for best-first)
    top_idx = sims.argsort()[::-1][:top_k]
    results = []
    for idx in top_idx:
        row = df.iloc[idx]
        results.append({
            'name': row['name'],
            'desc': row['desc'],
            'tags': row['tags'],
            'score': float(sims[idx])
        })

    return {
        'query': query,
        'latency_avg_s': latency_avg,
        'latency_samples_s': samples,
        'results': results
    }

In [None]:
# Evaluation: run sample queries and print formatted results, counts > 0.7, and latency
sample_queries = ['energetic urban chic', 'cozy boho festival', 'minimalist workwear']
overall_high = 0  # running total of top-3 matches scoring above 0.7 across all queries
latencies = []    # per-query average embedding latency, in seconds
print('Running sample queries...')
for q in sample_queries:
    out = match_query(q, top_k=3, timeit_runs=3)
    latencies.append(out['latency_avg_s'])
    # Line breaks are written as \n escapes: a quoted string literal cannot
    # span physical lines.
    print('\nQuery:', out['query'])
    print(f"Latency (avg over samples): {out['latency_avg_s']*1000:.1f} ms")
    print('Top-3 matches:')
    for r in out['results']:
        print(f" - {r['name']} (score={r['score']:.4f})\n     desc: {r['desc']}\n     tags: {r['tags']}")
    # count matches above threshold
    count_high = sum(1 for r in out['results'] if r['score'] > 0.7)
    overall_high += count_high
    print(f"Matches with similarity > 0.7: {count_high}")

# Summary metrics
print('\nSummary:')
print(f'Queries run: {len(sample_queries)}')
print(f'Total top-3 matches with similarity > 0.7 (aggregated): {overall_high}')
print('Per-query average latencies (ms):', [f"{l*1000:.1f}" for l in latencies])
print(f'Average latency across queries: {np.mean(latencies)*1000:.1f} ms')

## Reflection & Improvements

- Accuracy: With a small mock catalog, embeddings capture broad semantic similarity but may confuse closely related styles. More descriptive product texts and tags improve precision.
- Latency: Embedding latency depends on network and model; `text-embedding-ada-002` is reasonably fast but each query requires an API call. Use batching or local models for lower latency.
- Improvements: Persist embeddings to disk (e.g. Parquet) so they are not recomputed on every run, or use a vector index or database (e.g. FAISS, Pinecone) for large catalogs; expand the dataset; experiment with larger embedding models; add a user feedback loop for personalization.