In [1]:
import numpy as np
import matplotlib.pyplot as plt

def _plot_similarity_histogram(scores):
    """Draw a 20-bin histogram of the given cosine similarity scores."""
    # Use an explicit figure/axes pair so the histogram never lands on
    # whatever figure happened to be current in the kernel.
    fig, ax = plt.subplots()
    ax.hist(scores, bins=20, color='skyblue', edgecolor='black')
    ax.set_title("Cosine Similarity Distribution")
    ax.set_xlabel("Cosine Similarity")
    ax.set_ylabel("Frequency")
    ax.grid(True)
    fig.tight_layout()
    plt.show()


def evaluate_retrieval(ground_truth, predicted, similarity_scores, k=5, plot=True):
    """
    Evaluates image retrieval performance and optionally plots the
    distribution of the top-k similarity scores.

    Args:
        ground_truth (dict): {query: [relevant_image_ids]}
        predicted (dict): {query: [retrieved_image_ids]}; queries missing
            from this dict are scored as having retrieved nothing.
        similarity_scores (dict): {query: [cosine_sim_scores aligned with predicted]};
            only used for the histogram, never for the metrics.
        k (int): top-K value to evaluate; must be positive.
        plot (bool): when True (default) show a histogram of the top-k
            similarity scores of all queries.

    Returns:
        dict: "Precision@K", "Recall@K" and "Top-K Accuracy", each a plain
        float averaged over the queries in ``ground_truth`` and rounded to
        4 decimals. All three are 0.0 when ``ground_truth`` is empty.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    if k <= 0:
        # k == 0 would divide by zero; a negative k would slice from the
        # end of the lists and produce negative precision values.
        raise ValueError(f"k must be a positive integer, got {k!r}")
    if not ground_truth:
        # Nothing to average over; avoid nan from np.mean([]) and a
        # division by zero in the accuracy computation.
        return {"Precision@K": 0.0, "Recall@K": 0.0, "Top-K Accuracy": 0.0}

    precision_list, recall_list, top_k_correct = [], [], 0
    all_cosine_scores = []

    for query, relevant in ground_truth.items():
        gt_ids = set(relevant)
        pred_ids = predicted.get(query, [])[:k]
        all_cosine_scores.extend(similarity_scores.get(query, [])[:k])

        relevant_retrieved = gt_ids.intersection(pred_ids)
        # Precision is always divided by k, even when fewer than k items
        # were retrieved.
        precision_list.append(len(relevant_retrieved) / k)
        recall_list.append(len(relevant_retrieved) / len(gt_ids) if gt_ids else 0)
        # A query counts as a hit when at least one relevant id is in the top k.
        if relevant_retrieved:
            top_k_correct += 1

    results = {
        "Precision@K": float(round(np.mean(precision_list), 4)),
        "Recall@K": float(round(np.mean(recall_list), 4)),
        "Top-K Accuracy": round(top_k_correct / len(ground_truth), 4),
    }

    if plot:
        _plot_similarity_histogram(all_cosine_scores)

    return results


In [2]:
from fastapi import FastAPI, HTTPException
import streamlit as st
import os, json, requests
from openai import OpenAI
from dotenv import load_dotenv
from snowflake_list import annotations_list
from typing import Optional
import boto3
from PIL import Image
from io import BytesIO

In [3]:
from pages.Stylist import interact_with_gpt

  warn_incompatible_dep(


In [4]:
# Loopback endpoint of the image-search service that keyword queries are POSTed to.
URL_CLIP = "http://127.0.0.1:8000/image-search"

In [5]:
# Natural-language prompts used as the evaluation queries.
queries = [
    "outfit for a party",
    "Give me an outfit for a day outing. I would like it to be casual and calm",
    "What would you suggest if I am going on a date?",
    "What dress would you suggest me if I am going on a road trip? I want it to be a T-shirt.",
]

In [6]:
# Build {query: [retrieved_image_ids]} for every evaluation query:
# get a recommendation from interact_with_gpt, turn the recommended item
# into a "+"-joined keyword string, and POST it to the image-search endpoint.
predicted = {}
gender = "Men"
for query in queries:
    recs = interact_with_gpt(query)
    # Only the "Top" garment is evaluated here.
    for part in ["Top"]:
        item = recs[part]
        keyword = "+".join([item["color"], item["clothing type"], item["pattern"], gender])
        # 1) fetch similar images via your FastAPI
        # A timeout keeps the cell from hanging forever if the service is down.
        img_resp = requests.post(URL_CLIP, json={"query": keyword}, timeout=30)
        # Fail loudly on HTTP errors instead of with an opaque KeyError/JSON error below.
        img_resp.raise_for_status()
        predicted[query] = img_resp.json()["image_ids"]

In [7]:
# Hand-labelled relevant image ids for each evaluation query.
ground_truth = {
    "outfit for a party": [
        "22.jpg", "24.jpg", "5.jpg",
    ],
    "Give me an outfit for a day outing. I would like it to be casual and calm": [
        "14.jpg", "1.jpg", "33.jpg",
    ],
    "What would you suggest if I am going on a date?": [
        "26.jpg", "18.jpg", "27.jpg",
    ],
    "What dress would you suggest me if I am going on a road trip? I want it to be a T-shirt.": [
        "15.jpg", "13.jpg", "12.jpg",
    ],
}

In [8]:
# Display the retrieved image ids collected for each query.
predicted

{'outfit for a party': ['28.jpg', '18.jpg'],
 'Give me an outfit for a day outing. I would like it to be casual and calm': ['31884252_fpx.png',
  '17.jpg'],
 'What would you suggest if I am going on a date?': ['1.jpg', '2.jpg'],
 'What dress would you suggest me if I am going on a road trip? I want it to be a T-shirt.': ['15.jpg',
  '13.jpg']}

In [9]:
import numpy as np
import matplotlib.pyplot as plt

def evaluate_retrieval(ground_truth, predicted, k=5):
    """
    Evaluates image retrieval performance.

    Args:
        ground_truth (dict): {query: [relevant_image_ids]}
        predicted (dict): {query: [retrieved_image_ids]}; queries missing
            from this dict are scored as having retrieved nothing.
        k (int): top-K value to evaluate; must be positive.

    Returns:
        dict: "Precision@K", "Recall@K" and "Top-K Accuracy", each a plain
        float averaged over the queries in ``ground_truth`` and rounded to
        4 decimals. All three are 0.0 when ``ground_truth`` is empty.

    Raises:
        ValueError: if ``k`` is not positive.
    """
    if k <= 0:
        # k == 0 would divide by zero; a negative k would slice from the
        # end of the prediction list and produce negative precision.
        raise ValueError(f"k must be a positive integer, got {k!r}")
    if not ground_truth:
        # Nothing to average over; avoid nan from np.mean([]) and a
        # division by zero in the accuracy computation.
        return {"Precision@K": 0.0, "Recall@K": 0.0, "Top-K Accuracy": 0.0}

    precision_list, recall_list, top_k_correct = [], [], 0

    for query, relevant in ground_truth.items():
        gt_ids = set(relevant)
        pred_ids = predicted.get(query, [])[:k]

        relevant_retrieved = gt_ids.intersection(pred_ids)
        # Precision is always divided by k, even when fewer than k items
        # were retrieved.
        precision_list.append(len(relevant_retrieved) / k)
        recall_list.append(len(relevant_retrieved) / len(gt_ids) if gt_ids else 0)
        # A query counts as a hit when at least one relevant id is in the top k.
        if relevant_retrieved:
            top_k_correct += 1

    results = {
        "Precision@K": float(round(np.mean(precision_list), 4)),
        "Recall@K": float(round(np.mean(recall_list), 4)),
        "Top-K Accuracy": round(top_k_correct / len(ground_truth), 4),
    }

    return results


In [13]:
# Report the retrieval metrics at each cutoff of interest.
for cutoff in (1, 3, 5):
    metrics_at_cutoff = evaluate_retrieval(ground_truth, predicted, cutoff)
    print(metrics_at_cutoff)

{'Precision@K': np.float64(0.25), 'Recall@K': np.float64(0.0833), 'Top-K Accuracy': 0.25}
{'Precision@K': np.float64(0.1667), 'Recall@K': np.float64(0.1667), 'Top-K Accuracy': 0.25}
{'Precision@K': np.float64(0.1), 'Recall@K': np.float64(0.1667), 'Top-K Accuracy': 0.25}


In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# ---- 1. GPT-Based Evaluation: Text Recommendations ----

def compute_bleu_scores(references, candidates):
    """
    Compute mean sentence-level BLEU-2 and BLEU-4 over paired texts.

    Each reference/candidate pair is tokenised on whitespace and scored
    with ``sentence_bleu`` using ``SmoothingFunction().method4``.

    Args:
        references: sequence of reference strings.
        candidates: sequence of candidate strings, aligned with ``references``.

    Returns:
        tuple: (mean BLEU-2, mean BLEU-4) as plain floats; (0.0, 0.0) when
        there are no pairs to score.

    Raises:
        ValueError: if the two sequences have different lengths.
    """
    references, candidates = list(references), list(candidates)
    if len(references) != len(candidates):
        # zip() would silently drop the unmatched tail and skew the mean.
        raise ValueError(
            f"references and candidates must have the same length, "
            f"got {len(references)} and {len(candidates)}"
        )
    if not references:
        # np.mean([]) would return nan with a RuntimeWarning.
        return 0.0, 0.0

    smoothie = SmoothingFunction().method4
    # Tokenise once and reuse the tokens for both n-gram orders.
    token_pairs = [(ref.split(), cand.split()) for ref, cand in zip(references, candidates)]

    def _mean_bleu(weights):
        # Average sentence-level BLEU for the given n-gram weights.
        return float(np.mean([
            sentence_bleu([ref_tokens], cand_tokens, weights=weights, smoothing_function=smoothie)
            for ref_tokens, cand_tokens in token_pairs
        ]))

    bleu2 = _mean_bleu((0.5, 0.5))
    bleu4 = _mean_bleu((0.25, 0.25, 0.25, 0.25))
    return bleu2, bleu4

def compute_rouge_l(references, candidates):
    """
    Compute the mean ROUGE-L F-measure over paired texts.

    Args:
        references: sequence of reference strings.
        candidates: sequence of candidate strings, aligned with ``references``.

    Returns:
        float: mean ``rougeL`` F-measure (stemming enabled); 0.0 when there
        are no pairs to score.

    Raises:
        ValueError: if the two sequences have different lengths.
    """
    references, candidates = list(references), list(candidates)
    if len(references) != len(candidates):
        # zip() would silently drop the unmatched tail and skew the mean.
        raise ValueError(
            f"references and candidates must have the same length, "
            f"got {len(references)} and {len(candidates)}"
        )
    if not references:
        # np.mean([]) would return nan with a RuntimeWarning.
        return 0.0

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = [scorer.score(ref, cand)['rougeL'].fmeasure for ref, cand in zip(references, candidates)]
    return float(np.mean(scores))

# Sample input (replace with actual model predictions and human references)
references = [
    "a floral summer dress perfect for beach outings",
]
candidates = [
    "a summer floral dress good for beach outings",
]

# Score the sample candidates against their references.
bleu2, bleu4 = compute_bleu_scores(references, candidates)
rouge_l = compute_rouge_l(references, candidates)

# Print each metric to two decimals.
print(f"BLEU-2 Score: {bleu2:.2f}")
print(f"BLEU-4 Score: {bleu4:.2f}")
print(f"ROUGE-L Score: {rouge_l:.2f}")





In [None]:
# ---- 2. CLIP-Based Evaluation: Top-K Retrieval ----

def precision_recall_at_k(gt_list, pred_list, k):
    """
    Compute mean precision@k and recall@k over paired queries.

    Args:
        gt_list: sequence whose i-th entry is the collection of relevant
            ids for query i.
        pred_list: sequence whose i-th entry is the ranked list of
            retrieved ids for query i.
        k (int): cutoff; only the first ``k`` predictions are considered.
            Must be positive.

    Returns:
        tuple: (mean precision@k, mean recall@k) as plain floats;
        (0.0, 0.0) when there are no queries.

    Raises:
        ValueError: if ``k`` is not positive or the two sequences have
            different lengths.
    """
    if k <= 0:
        # k == 0 would divide by zero; a negative k would slice from the
        # end of each prediction list and produce negative precision.
        raise ValueError(f"k must be a positive integer, got {k!r}")
    gt_list, pred_list = list(gt_list), list(pred_list)
    if len(gt_list) != len(pred_list):
        # zip() would silently drop the unmatched tail and skew the means.
        raise ValueError(
            f"gt_list and pred_list must have the same length, "
            f"got {len(gt_list)} and {len(pred_list)}"
        )
    if not gt_list:
        # np.mean([]) would return nan with a RuntimeWarning.
        return 0.0, 0.0

    precision_list, recall_list = [], []
    for gt, pred in zip(gt_list, pred_list):
        relevant = set(gt)
        hits = len(relevant.intersection(pred[:k]))
        # Precision is always divided by k, even when fewer than k items
        # were retrieved.
        precision_list.append(hits / k)
        # Divide by the number of distinct relevant ids so duplicates in
        # gt cannot deflate recall (hits are counted on the set too).
        recall_list.append(hits / len(relevant) if relevant else 0)
    return float(np.mean(precision_list)), float(np.mean(recall_list))

# Sample input (replace with your own indices)
ground_truth_indices = [
    [101],
    [202],
    [303],
]
predicted_indices = [
    [101, 102, 103],
    [204, 202, 206],
    [307, 303, 301],
]

# Report mean precision and recall at each cutoff.
for k in (1, 3, 5):
    p_at_k, r_at_k = precision_recall_at_k(ground_truth_indices, predicted_indices, k)
    print(f"Precision@{k}: {p_at_k:.2f}, Recall@{k}: {r_at_k:.2f}")