# Vibe Matcher — Mini Recommendation Notebook (with local fallback)

This notebook is identical to the original, except that it adds a local fallback based on sentence-transformers for use when no OpenAI API key is available.

How it works:
- If `OPENAI_API_KEY` is set, the notebook uses OpenAI's `text-embedding-ada-002` to create embeddings.
- If no key is present (or OpenAI calls fail), it falls back to a local SentenceTransformers model (`all-MiniLM-L6-v2`).
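- This keeps the demo runnable without an API key (the local model is downloaded once on first use and then runs offline). The trade-off is that the two backends produce different, mutually incompatible vectors (1536 dimensions for `text-embedding-ada-002`, 384 for `all-MiniLM-L6-v2`), so product and query embeddings must always come from the same backend.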

In [None]:
# Install the notebook's dependencies with the IPython `%pip` magic; `--quiet` reduces pip's output.
%pip install --quiet openai pandas scikit-learn numpy sentence-transformers
# Confirmation message shown after the install command has run.
print('Installed/available packages')

In [None]:
import os
from getpass import getpass
import time
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import openai

# Ask for an API key only when the environment does not already provide one.
if not os.getenv('OPENAI_API_KEY'):
    try:
        key = getpass('Enter your OpenAI API key (or leave blank to use local fallback): ')
    except Exception as exc:
        # Best effort: the prompt may be unavailable (e.g. no interactive input).
        # Report why instead of swallowing the error silently, then carry on
        # without a key so the local fallback is used.
        print('Could not prompt for an OpenAI API key; using local fallback:', exc)
        key = ''
    # A pasted key often carries stray whitespace, and whitespace-only input
    # must count as "left blank" rather than as a real key.
    key = key.strip()
    if key:
        os.environ['OPENAI_API_KEY'] = key
# Normalise an empty or whitespace-only value to None so the truthiness checks
# on `openai.api_key` elsewhere reliably select the local fallback.
openai.api_key = (os.getenv('OPENAI_API_KEY') or '').strip() or None
print('OpenAI key set:', bool(openai.api_key))

In [None]:
# Product catalogue: each record has a display name, a free-text description
# (the text that gets embedded) and a list of vibe tags.
products = [
    {'name': title, 'desc': blurb, 'tags': vibe_tags}
    for title, blurb, vibe_tags in (
        (
            'Boho Dress',
            'Flowy, earthy tones with embroidery — festival-ready, laid-back bohemian vibe.',
            ['boho', 'flowy', 'festival', 'earthy'],
        ),
        (
            'Urban Bomber',
            'Sleek bomber jacket in matte black — energetic, urban chic, great for nights out.',
            ['urban', 'chic', 'energetic', 'edgy'],
        ),
        (
            'Cozy Knit Sweater',
            'Chunky knit, soft neutrals — comfortable, cozy, perfect for relaxed days.',
            ['cozy', 'casual', 'warm'],
        ),
        (
            'Minimal Blazer',
            'Crisp lines and neutral tones for minimalist workwear — clean and refined.',
            ['minimal', 'workwear', 'refined'],
        ),
        (
            'Sport Luxe Jogger',
            'Athletic silhouette with luxe fabric — casual but energetic streetwear.',
            ['sporty', 'streetwear', 'energetic'],
        ),
        (
            'Retro Floral Blouse',
            'Vintage-inspired floral print — romantic, colorful, a playful retro feel.',
            ['retro', 'floral', 'romantic'],
        ),
        (
            'Tailored Trousers',
            'Tailored, high-waist trousers for polished and confident city looks.',
            ['polished', 'city', 'confident'],
        ),
    )
]
# One row per product, with columns in record-key order: name, desc, tags.
df = pd.DataFrame(products)
df

In [None]:
# Embedding helper with OpenAI + local fallback
# Lazily-created SentenceTransformer instance; loaded on first use and then
# shared by every later call so the model is only initialised once.
_local_embed_model = None

def get_local_embedding(text):
    """Embed *text* with the local ``all-MiniLM-L6-v2`` SentenceTransformers model.

    The model is instantiated on the first call and cached in the module-level
    ``_local_embed_model``.

    Args:
        text: Input handed to ``SentenceTransformer.encode``.

    Returns:
        The encoder output converted to a NumPy array of dtype ``float``.

    Raises:
        RuntimeError: If ``sentence_transformers`` cannot be imported. The
            underlying import error is attached as ``__cause__``.
    """
    global _local_embed_model
    if _local_embed_model is None:
        try:
            # Imported lazily so the package is only required when the
            # fallback path is actually taken.
            from sentence_transformers import SentenceTransformer
        except Exception as e:
            # Chain the original error so the real reason (missing package,
            # broken dependency, ...) stays visible in the traceback.
            raise RuntimeError('sentence-transformers is required for local fallback. Install it with pip.') from e
        _local_embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    emb = _local_embed_model.encode(text, convert_to_numpy=True)
    return np.array(emb, dtype=float)

def _first_embedding_vector(resp):
    """Return the first embedding from an OpenAI response of either client generation."""
    try:
        # openai>=1.0 returns a typed object that is not subscriptable.
        return resp.data[0].embedding
    except (AttributeError, TypeError):
        # Legacy (<1.0) clients return a dict-like response.
        return resp['data'][0]['embedding']

def get_embedding(text, model='text-embedding-ada-002'):
    """Embed *text* with OpenAI when a key is configured, otherwise locally.

    Args:
        text: Text to embed.
        model: OpenAI embedding model name; ignored on the local path.

    Returns:
        The embedding as a NumPy array of dtype ``float``. If the OpenAI call
        fails, the failure is printed and the result of
        ``get_local_embedding(text)`` is returned instead.

    Raises:
        RuntimeError: Propagated from ``get_local_embedding`` when the local
            fallback is needed but unavailable.
    """
    if not openai.api_key:
        return get_local_embedding(text)
    try:
        # Choose the call style by client generation: `openai.OpenAI` only
        # exists in openai>=1.0, where the legacy `openai.Embedding` interface
        # has been removed. Making a single attempt keeps the reported error
        # the real one and avoids a second request after a genuine failure.
        if hasattr(openai, 'OpenAI'):
            resp = openai.embeddings.create(model=model, input=text)
        else:
            resp = openai.Embedding.create(model=model, input=text)
        return np.array(_first_embedding_vector(resp), dtype=float)
    except Exception as e:
        # Best effort: any OpenAI failure degrades to the local model.
        # NOTE(review): local vectors have a different size than OpenAI ones,
        # so a fallback in the middle of a run yields mixed dimensions.
        print('OpenAI embedding failed — falling back to local model:', e)
        return get_local_embedding(text)

print('Embedding helper ready')

In [None]:
# Generate embeddings for each product description and store in DataFrame
start = time.perf_counter()
embeddings = [get_embedding(desc) for desc in df['desc']]
end = time.perf_counter()
df['embedding'] = embeddings
print(f'Generated {len(embeddings)} embeddings in {end - start:.2f} seconds')

# get_embedding may fall back to the local model for individual calls; vectors
# from different backends differ in size and cannot be stacked or compared, so
# flag that here rather than letting a later cell fail with an opaque error.
embedding_shapes = {np.shape(vec) for vec in embeddings}
if len(embedding_shapes) > 1:
    print(f'Warning: embeddings have mixed shapes {sorted(embedding_shapes)}; re-run this cell so all vectors come from one backend.')

# Preview copy: show only the first five components of every vector so the
# table stays readable; the full vectors remain in `df`.
display_df = df.copy()
display_df['embedding'] = display_df['embedding'].apply(lambda v: v[:5].tolist())
display_df

In [None]:
import timeit

def match_query(query, top_k=3, model='text-embedding-ada-002', timeit_runs=3):
    """Rank the products in ``df`` by cosine similarity to *query*.

    The query is embedded ``timeit_runs`` times (at least once) so that the
    embedding latency can be reported; the vector from the last run is used
    for the ranking. Requires ``df['embedding']`` to be populated.

    Args:
        query: Free-text vibe description to match against the catalogue.
        top_k: Number of best-scoring products to return.
        model: Embedding model name forwarded to ``get_embedding``.
        timeit_runs: Number of timed embedding calls.

    Returns:
        A dict with keys ``query``, ``latency_avg_s`` (mean seconds per
        embedding call), ``latency_samples_s`` (list of per-call seconds) and
        ``results`` (list of dicts with ``name``, ``desc``, ``tags``, ``score``,
        best match first).
    """
    # Time the embedding calls directly. The calls are timed here rather than
    # through timeit with a string statement, because such a statement is
    # evaluated against module globals and cannot see names local to this
    # function. Keeping the last result also avoids one extra embedding
    # request per query.
    runs = max(1, timeit_runs)
    times = []
    q_emb = None
    for _ in range(runs):
        t0 = time.perf_counter()
        q_emb = get_embedding(query, model=model)
        times.append(time.perf_counter() - t0)
    latency_avg = float(sum(times) / len(times))

    # Stack the per-product vectors into one (n_products, dim) matrix.
    prod_embs = np.vstack(df['embedding'].values)
    sims = cosine_similarity(q_emb.reshape(1, -1), prod_embs)[0]
    # argsort is ascending, so reverse it to get the highest scores first.
    top_idx = sims.argsort()[::-1][:top_k]
    results = []
    for idx in top_idx:
        row = df.iloc[idx]
        results.append({'name': row['name'], 'desc': row['desc'], 'tags': row['tags'], 'score': float(sims[idx])})
    return {'query': query, 'latency_avg_s': latency_avg, 'latency_samples_s': times, 'results': results}

print('Matching function ready')

In [None]:
# Run a few demo queries, print the ranked matches for each, and summarise how
# many top-3 matches clear the 0.7 similarity mark along with embedding latency.
sample_queries = ['energetic urban chic','cozy boho festival','minimalist workwear']
per_query_counts = []
latencies = []
for query_text in sample_queries:
    report = match_query(query_text, top_k=3, timeit_runs=5)
    avg_latency_s = report['latency_avg_s']
    hits = report['results']
    latencies.append(avg_latency_s)

    # Number of returned matches scoring strictly above 0.7.
    strong_hits = len([hit for hit in hits if hit['score'] > 0.7])
    per_query_counts.append({'query': query_text, 'count_gt_0_7': strong_hits})

    print('\nQuery:', report['query'])
    print(f"Embedding latency (avg): {avg_latency_s*1000:.1f} ms")
    print('Latency samples (ms):', [f"{sample*1000:.1f}" for sample in report['latency_samples_s']])
    print('Top results:')
    for hit in hits:
        print(f" - {hit['name']} (score={hit['score']:.4f}) | tags={hit['tags']}")
    print(f"Matches > 0.7 for this query: {strong_hits}")

# Overall count is the sum of the per-query counts collected above.
total_high_confidence = sum(entry['count_gt_0_7'] for entry in per_query_counts)

print('\nSummary:')
print(f'Ran {len(sample_queries)} queries.')
print('Per-query counts > 0.7:')
for entry in per_query_counts:
    print(f" - {entry['query']}: {entry['count_gt_0_7']}")
print(f'Overall total top-3 matches with similarity > 0.7: {total_high_confidence}')
print('Latencies (ms, per-query average):', [f"{value*1000:.1f}" for value in latencies])
print(f'Average latency across queries: {np.mean(latencies)*1000:.1f} ms')

## Reflection & Improvements
- Use a dedicated vector DB (Pinecone, Weaviate) for scale.
- Increase dataset size and collect user feedback for offline evaluation.
- Cache embeddings to disk to avoid repeated API calls.
- Try newer/larger embedding models for better semantic alignment.