# Multimodal Model Embedding Test

This notebook tests the embedding capabilities of multimodal models using the provided API endpoint.

## Setup

First, let's install the required dependencies and set up the API configuration.

In [None]:
# Install required packages
%pip install ipykernel jupyter requests pillow numpy matplotlib seaborn scikit-learn python-dotenv

In [None]:
import requests
import os
from dotenv import load_dotenv
import json
import base64
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import io
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Any, Optional, Union
import warnings
warnings.filterwarnings('ignore')
load_dotenv()

# Set up API configuration (values come from the environment / .env file)
API_BASE_URL = os.getenv("VLM_API")
API_KEY = os.getenv("VLM_API_KEY")
MODEL_NAME = "vlm2vec-full"

# os.getenv returns None for a missing variable, so the key only counts as
# configured when it is set, non-empty, and not the template placeholder.
API_KEY_CONFIGURED = bool(API_KEY) and API_KEY != 'YOUR_API_KEY_HERE'

# headers for API requests
headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}

print(f"API Base URL: {API_BASE_URL}")
print(f"API Key configured: {'Yes' if API_KEY_CONFIGURED else 'No'}")
if not API_BASE_URL:
    # Every request URL is built from API_BASE_URL, so flag this early.
    print("Warning: VLM_API is not set; embedding requests will fail.")

## Helper Functions

Let's create helper functions for making API calls and processing embeddings.

In [93]:
def load_image_from_url(url, timeout=30):
    """Fetch an image from a URL (wikimedia commons) and return it as an RGB PIL Image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded image converted to RGB, or None when the download fails
        or the downloaded bytes cannot be decoded as an image.
    """
    # Named distinctly so the module-level API `headers` is not shadowed.
    request_headers = {'User-Agent': 'MyImageLoader/1.0'}
    try:
        response = requests.get(url, headers=request_headers, timeout=timeout)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content))
        return image.convert('RGB')
    except requests.exceptions.RequestException as e:
        # Must come before OSError: requests' exceptions derive from it.
        print(f"Error loading image from {url}: {e}")
    except OSError as e:
        # PIL reports unreadable or truncated image data as OSError subclasses.
        print(f"Error decoding image from {url}: {e}")
    return None

def encode_image_to_base64(image_path: str) -> str:
    """Read the file at `image_path` and return its raw bytes as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def encode_pil_image_to_base64_jpeg(image: Image.Image) -> str:
    """Serialize a PIL Image as JPEG and return the bytes as base64 text."""
    rgb_image = image.convert('RGB')
    jpeg_stream = io.BytesIO()
    rgb_image.save(jpeg_stream, format='JPEG', quality=50, optimize=True)
    jpeg_bytes = jpeg_stream.getvalue()
    return base64.b64encode(jpeg_bytes).decode('utf-8')

def encode_pil_image_to_base64_png(image: Image.Image) -> str:
    """Encode a PIL Image to a PNG base64 string.

    The image is converted to RGB before saving, so any alpha channel is
    dropped.

    Args:
        image: The PIL image to serialize.

    Returns:
        The PNG file bytes encoded as a base64 UTF-8 string.
    """
    buffer = io.BytesIO()
    # PNG is lossless: the writer has no `quality` option, so only `optimize`
    # is passed (the previous quality=20 argument had no effect).
    image.convert('RGB').save(buffer, format='PNG', optimize=True)
    return base64.b64encode(buffer.getvalue()).decode('utf-8')

# Upper bound, in seconds, on how long a single embedding request may block.
_EMBEDDING_REQUEST_TIMEOUT = 60

def _request_embedding(payload: Dict[str, Any]) -> Optional[List[float]]:
    """POST `payload` to the /v1/embeddings endpoint and return the first embedding.

    Failures are reported with print() and signalled by returning None, so
    callers can keep looping over remaining inputs.
    """
    try:
        response = requests.post(
            f"{API_BASE_URL}/v1/embeddings",
            headers=headers,
            json=payload,
            timeout=_EMBEDDING_REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()["data"][0]["embedding"]
    except requests.HTTPError as e:
        print(f"Embedding error: {e.response.status_code} - {e.response.text}")
    except Exception as e:
        # Deliberately broad: covers connection problems, timeouts and
        # malformed response bodies without aborting the notebook cell.
        print(f"Unexpected error: {e}")
    return None

def _image_text_payload(image_url: str, text: str) -> Dict[str, Any]:
    """Build the chat-style request body pairing one image URL with one text part."""
    return {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text",      "text": text}
                ]
            }
        ],
        "encoding_format": "float"
    }

def get_text_embedding(text: str) -> Optional[List[float]]:
    """
    Get text embedding from the provided model.

    Args:
        text: The string to embed.

    Returns:
        The embedding vector, or None if the request failed.
    """
    payload = {
        "input": text,
        "model": MODEL_NAME
    }
    return _request_embedding(payload)

def get_image_embedding_from_url(url: str) -> Optional[List[float]]:
    """
    Fetches an embedding for the image at `url` using vLLM's OpenAI-compatible /v1/embeddings API.

    The image is sent together with a fixed instruction text.

    Args:
        url: Address of the image to embed.

    Returns:
        The embedding vector, or None if the request failed.
    """
    payload = _image_text_payload(url, "Generate an embedding for this image.")
    return _request_embedding(payload)

def get_multimodal_embedding(image_url: str, text: str) -> Optional[List[float]]:
    """
    Fetches a joint image+text embedding for the given image URL and text prompt
    using vLLM's OpenAI-compatible /v1/embeddings API.

    Args:
        image_url: Address of the image to embed.
        text: Text sent alongside the image in the same message.

    Returns:
        The embedding vector, or None if the request failed.
    """
    payload = _image_text_payload(image_url, text)
    return _request_embedding(payload)

def cosine_similarity_vectors(vec1: List[float], vec2: List[float]) -> float:
    """Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as `vec1`.

    Returns:
        The cosine of the angle between the vectors as a plain float.
        Returns 0.0 when either vector has zero length, instead of the NaN
        that a division by zero would produce.
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # The direction of a zero vector is undefined; treat it as "no similarity".
        return 0.0
    return float(np.dot(a, b) / denominator)

def analyze_embedding(embedding: List[float], name: str = "Embedding") -> None:
    """Print summary statistics for one embedding and plot its values.

    Shows a histogram of all values next to a line plot of the first 100
    dimensions; `name` is used as the label in the printout and plot titles.
    """
    values = np.array(embedding)

    print(f"\n{name} Analysis:")
    print(f"  - Dimension: {len(embedding)}")
    # Each statistic is computed right before it is printed.
    statistics = (
        ("Mean", np.mean),
        ("Std", np.std),
        ("Min", np.min),
        ("Max", np.max),
        ("L2 Norm", np.linalg.norm),
    )
    for label, compute in statistics:
        print(f"  - {label}: {compute(values):.6f}")

    fig = plt.figure(figsize=(12, 4))

    # Left panel: distribution of the values
    hist_ax = fig.add_subplot(1, 2, 1)
    hist_ax.hist(values, bins=50, alpha=0.7, edgecolor='black')
    hist_ax.set_title(f'{name} Distribution')
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

    # Right panel: raw values of the leading 100 dimensions
    line_ax = fig.add_subplot(1, 2, 2)
    line_ax.plot(values[:100])
    line_ax.set_title(f'{name} - First 100 Dimensions')
    line_ax.set_xlabel('Dimension')
    line_ax.set_ylabel('Value')

    fig.tight_layout()
    plt.show()

## Test 1: Text Embeddings

Let's test the text embedding capabilities with various types of text.

In [None]:
# Sample sentences covering a range of unrelated topics
test_texts = [
    "A beautiful sunset over the ocean",
    "A cat sitting on a windowsill",
    "The latest advances in artificial intelligence",
    "A delicious pizza with melted cheese",
    "A car driving on a mountain road",
    "A person reading a book in a library",
    "A flower garden in full bloom",
    "A computer screen showing code",
    "A dog playing in the park",
    "A city skyline at night"
]

print("Testing text embeddings...")
text_embeddings = {}
total_texts = len(test_texts)

# Embed each sentence, keeping only the requests that returned a vector
for position, text in enumerate(test_texts, start=1):
    print(f"Processing text {position}/{total_texts}: {text[:50]}...")
    embedding = get_text_embedding(text)
    if not embedding:
        print("  :( Failed")
        continue
    text_embeddings[text] = embedding
    print(f"  :) Success - Embedding dimension: {len(embedding)}")

print(f"\nSuccessfully generated {len(text_embeddings)} text embeddings")

In [None]:
# Inspect the text embeddings
if text_embeddings:
    texts = list(text_embeddings)

    # Detailed statistics for the first embedding only
    first_text = texts[0]
    first_embedding = text_embeddings[first_text]
    analyze_embedding(first_embedding, f"Text: {first_text[:30]}...")

    # Pairwise cosine similarity across every text embedding
    embeddings_matrix = np.array([text_embeddings[sentence] for sentence in texts])
    similarity_matrix = cosine_similarity(embeddings_matrix)

    # Annotated heatmap, labelled with the first 20 characters of each text
    short_labels = [sentence[:20] + '...' for sentence in texts]
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        similarity_matrix,
        xticklabels=short_labels,
        yticklabels=short_labels,
        cmap='viridis',
        annot=True,
        fmt='.3f',
        square=True,
    )
    plt.title('Text Embedding Similarities')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

## Test 2: Image Embeddings

2.1) Let's test image embedding capabilities. First, we'll do a sanity check to confirm that the server embeds the actual image content rather than just the text of the image URL.

In [None]:

test_image_url = (
    "https://upload.wikimedia.org/wikipedia/commons/d/df/Porsche_911_GT3_Touring%2C_IAA_2017%2C_Frankfurt_%281Y7A2766%29.jpg"
)

# 1) Embed the image via URL
image_emb = get_image_embedding_from_url(test_image_url)
# 2) Embed the URL string as plain text
text_emb  = get_text_embedding(test_image_url)

if image_emb is None or text_emb is None:
    raise RuntimeError("Failed to retrieve one or both embeddings.")

print(f"Embedding dimension: image={len(image_emb)}, text={len(text_emb)}")

sim = cosine_similarity_vectors(image_emb, text_emb)
print(f"Cosine similarity(image vs text): {sim:.6f}")

# If the model were just embedding the URL string, sim would be ≈1.0
# Expect it to be *low* (say < 0.5) for a true image embedding
URL_TEXT_SIMILARITY_THRESHOLD = 0.5
if sim >= URL_TEXT_SIMILARITY_THRESHOLD:
    print(f"High similarity ({sim:.3f}) -> server may be embedding the URL text, not the image content.")
else:
    print(f"Low similarity ({sim:.3f}) -> server is embedding the image content.")


2.2) Next, let's load some sample images for testing. If more than one image dictionary is defined, comment out the ones you don't want to use.

In [None]:
# URLs to images for url input
sample_images = {
    "apple": "https://upload.wikimedia.org/wikipedia/commons/1/15/Red_Apple.jpg",
    "banana": "https://upload.wikimedia.org/wikipedia/commons/8/8a/Banana-Single.jpg",
    "orange": "https://upload.wikimedia.org/wikipedia/commons/c/c4/Orange-Fruit-Pieces.jpg",
    "watermelon": "https://upload.wikimedia.org/wikipedia/commons/4/47/Taiwan_2009_Tainan_City_Organic_Farm_Watermelon_FRD_7962.jpg",
    "peach": "https://upload.wikimedia.org/wikipedia/commons/9/9e/Autumn_Red_peaches.jpg",
    "tennis racket": "https://upload.wikimedia.org/wikipedia/commons/3/3e/Tennis_Racket_and_Balls.jpg" 
}

# Keep only the images that actually downloaded: load_image_from_url returns
# None on failure, and imshow cannot draw None.
loaded_images = {}
for name, url in sample_images.items():
    img = load_image_from_url(url)
    if img is None:
        print(f"Skipping '{name}': image could not be loaded")
        continue
    loaded_images[name] = img

# Display sample images
if loaded_images:
    # squeeze=False keeps `axes` two-dimensional even when one image loaded,
    # so the first row can always be iterated.
    fig, axes = plt.subplots(1, len(loaded_images), figsize=(15, 3), squeeze=False)
    for ax, (name, img) in zip(axes[0], loaded_images.items()):
        ax.imshow(img)
        ax.set_title(name)
        ax.axis('off')

    plt.tight_layout()
    plt.show()
else:
    print("No sample images could be loaded; nothing to display.")

In [None]:
# Request an embedding for every sample image URL
print("Testing image embeddings...")
image_embeddings = {}

for name, image_url in sample_images.items():
    print(f"Processing image: {name}")
    embedding = get_image_embedding_from_url(image_url)
    if not embedding:
        print("  :( Failed")
        continue
    image_embeddings[name] = embedding
    print(f"  :) Success - Embedding dimension: {len(embedding)}")

print(f"\nSuccessfully generated {len(image_embeddings)} image embeddings")

In [None]:
# Inspect the image embeddings
if image_embeddings:
    images = list(image_embeddings)

    # Detailed statistics for the first image only
    first_image = images[0]
    first_embedding = image_embeddings[first_image]
    analyze_embedding(first_embedding, f"Image: {first_image}")

    # Pairwise cosine similarity across every image embedding
    embeddings_matrix = np.array([image_embeddings[label] for label in images])
    similarity_matrix = cosine_similarity(embeddings_matrix)

    # Annotated heatmap labelled with the image names
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        similarity_matrix,
        xticklabels=images,
        yticklabels=images,
        cmap='viridis',
        annot=True,
        fmt='.3f',
        square=True,
    )
    plt.title('Image Embedding Similarities')
    plt.tight_layout()
    plt.show()

## Test 3: Multimodal Embeddings

Now let's test multimodal embeddings that combine text and images.

In [None]:
# Request a joint embedding for each (image URL, caption) pair
print("Testing multimodal embeddings...")
multimodal_embeddings = {}

multimodal_tests_url = [
    ("https://upload.wikimedia.org/wikipedia/commons/f/f4/Honeycrisp.jpg", "A bright red apple"),
    ("https://upload.wikimedia.org/wikipedia/commons/9/98/Bananas_on_black_background_02.jpg", "A ripe yellow banana"),
    ("https://upload.wikimedia.org/wikipedia/commons/4/43/Ambersweet_oranges.jpg", "Some bright cut oranges"),
    ("https://upload.wikimedia.org/wikipedia/commons/8/89/Citrullus_lanatus5SHSU.jpg", "A watermelon in a pile of leaves"),
    ("https://upload.wikimedia.org/wikipedia/commons/c/cb/White_nectarine_and_cross_section02_edit.jpg", "A cut open juicy peach"),
    ("https://upload.wikimedia.org/wikipedia/commons/3/3e/Tennis_Racket_and_Balls.jpg", "A tennis racket and ball on a hard court")
]

# Results are keyed by the caption text
for image, text in multimodal_tests_url:
    print(f"Processing multimodal: '{text}'")
    embedding = get_multimodal_embedding(image, text)
    if not embedding:
        print("  :( Failed")
        continue
    multimodal_embeddings[text] = embedding
    print(f"  :) Success - Embedding dimension: {len(embedding)}")

print(f"\nSuccessfully generated {len(multimodal_embeddings)} multimodal embeddings")

In [None]:
# Inspect the multimodal embeddings
if multimodal_embeddings:
    multimodal_items = list(multimodal_embeddings)

    # Detailed statistics for the first embedding only
    first_multimodal = multimodal_items[0]
    first_embedding = multimodal_embeddings[first_multimodal]
    analyze_embedding(first_embedding, f"Multimodal: {first_multimodal}")

    # Pairwise cosine similarity across every multimodal embedding
    embeddings_matrix = np.array([multimodal_embeddings[caption] for caption in multimodal_items])
    similarity_matrix = cosine_similarity(embeddings_matrix)

    # Annotated heatmap, labelled with the first 15 characters of each caption
    short_labels = [caption[:15] + '...' for caption in multimodal_items]
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        similarity_matrix,
        xticklabels=short_labels,
        yticklabels=short_labels,
        cmap='viridis',
        annot=True,
        fmt='.3f',
        square=True,
    )
    plt.title('Multimodal Embedding Similarities')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

## Test 4: Cross-Modal Similarity Analysis

Let's analyze how similar embeddings are across different modalities (text, image, multimodal).

In [None]:
# Cross-modal similarity analysis with comprehensive testing.
# Requires the image embeddings (keyed by item name) and the multimodal
# embeddings (keyed by caption text) produced by the earlier tests.
if image_embeddings and multimodal_embeddings:
    print("Cross-modal similarity analysis...")
    print("Generating descriptive texts and their embeddings...")

    # Create descriptive texts for each item
    descriptive_texts = {
        "apple": "A juicy red fruit with a crisp texture and sweet-tart flavor, often enjoyed fresh or in pies",
        "banana": "An elongated yellow tropical fruit with a soft, creamy texture and naturally sweet taste",
        "orange": "A round citrus fruit with bright orange skin, juicy segments, and a tangy-sweet flavor",
        "watermelon": "A large green striped melon with refreshing red flesh filled with black seeds and high water content",
        "peach": "A fuzzy stone fruit with soft orange-pink flesh, sweet aromatic flavor, and a large pit in the center",
        "tennis racket": "A tennis racket and ball on a hard court"
    }

    # Generate embeddings for descriptive texts
    descriptive_embeddings = {}
    for item_name, description in descriptive_texts.items():
        print(f"  Generating embedding for {item_name} description...")
        embedding = get_text_embedding(description)
        if embedding:
            descriptive_embeddings[item_name] = embedding
            print(f"    ✓ Success")
        else:
            print(f"    ✗ Failed")

    # Map image names to their corresponding multimodal texts
    image_to_multimodal = {
        "apple": "A bright red apple",
        "banana": "A ripe yellow banana",
        "orange": "Some bright cut oranges",
        "watermelon": "A watermelon in a pile of leaves",
        "peach": "A cut open juicy peach",
        "tennis racket": "A tennis racket and ball on a hard court"
    }

    # Collect the items for which all three embeddings are available
    all_items = []
    for item_name in descriptive_embeddings:
        multimodal_text = image_to_multimodal.get(item_name)
        if item_name in image_embeddings and multimodal_text in multimodal_embeddings:
            all_items.append({
                'name': item_name,
                'descriptive_text': descriptive_texts[item_name],
                'multimodal_text': multimodal_text,
                'desc_emb': descriptive_embeddings[item_name],
                'image_emb': image_embeddings[item_name],
                'multimodal_emb': multimodal_embeddings[multimodal_text]
            })

    if all_items:
        print(f"\nFound {len(all_items)} items for comprehensive analysis")

        # 1. SELF-SIMILARITY ANALYSIS (same item across modalities)
        print("\n" + "="*80)
        print("SELF-SIMILARITY ANALYSIS (same item across modalities)")
        print("="*80)

        self_results = []
        for item in all_items:
            desc_img_sim = cosine_similarity_vectors(item['desc_emb'], item['image_emb'])
            desc_multi_sim = cosine_similarity_vectors(item['desc_emb'], item['multimodal_emb'])
            img_multi_sim = cosine_similarity_vectors(item['image_emb'], item['multimodal_emb'])

            self_results.append({
                'name': item['name'],
                'desc_img': desc_img_sim,
                'desc_multimodal': desc_multi_sim,
                'img_multimodal': img_multi_sim
            })

            print(f"\n{item['name'].upper()}:")
            print(f"  Descriptive Text ↔ Image:      {desc_img_sim:.4f}")
            print(f"  Descriptive Text ↔ Multimodal: {desc_multi_sim:.4f}")
            print(f"  Image ↔ Multimodal:            {img_multi_sim:.4f}")

        # 2. CROSS-ITEM SIMILARITY ANALYSIS
        print("\n" + "="*80)
        print("CROSS-ITEM SIMILARITY ANALYSIS")
        print("="*80)

        # Create similarity matrices for each modality pair
        item_names = [item['name'] for item in all_items]
        n_items = len(all_items)

        # Initialize similarity matrices (row = first item, column = second item)
        img_to_img_matrix = np.zeros((n_items, n_items))
        desc_to_desc_matrix = np.zeros((n_items, n_items))
        multi_to_multi_matrix = np.zeros((n_items, n_items))
        img_to_multi_matrix = np.zeros((n_items, n_items))
        desc_to_img_matrix = np.zeros((n_items, n_items))
        desc_to_multi_matrix = np.zeros((n_items, n_items))

        # Calculate all pairwise similarities
        for i, item1 in enumerate(all_items):
            for j, item2 in enumerate(all_items):
                img_to_img_matrix[i, j] = cosine_similarity_vectors(item1['image_emb'], item2['image_emb'])
                desc_to_desc_matrix[i, j] = cosine_similarity_vectors(item1['desc_emb'], item2['desc_emb'])
                multi_to_multi_matrix[i, j] = cosine_similarity_vectors(item1['multimodal_emb'], item2['multimodal_emb'])
                img_to_multi_matrix[i, j] = cosine_similarity_vectors(item1['image_emb'], item2['multimodal_emb'])
                desc_to_img_matrix[i, j] = cosine_similarity_vectors(item1['desc_emb'], item2['image_emb'])
                desc_to_multi_matrix[i, j] = cosine_similarity_vectors(item1['desc_emb'], item2['multimodal_emb'])

        # Visualize similarity matrices on a 2x3 grid
        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('Cross-Item Similarity Matrices', fontsize=16)

        matrices = [
            (img_to_img_matrix, 'Image -> Image'),
            (desc_to_desc_matrix, 'Desc -> Desc'),
            (multi_to_multi_matrix, 'Multi -> Multi'),
            (img_to_multi_matrix, 'Image -> Multi'),
            (desc_to_img_matrix, 'Desc -> Image'),
            (desc_to_multi_matrix, 'Desc -> Multi')
        ]

        for idx, (matrix, title) in enumerate(matrices):
            ax = axes[idx // 3, idx % 3]
            ax.imshow(matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
            ax.set_title(title)
            ax.set_xticks(np.arange(n_items))
            ax.set_yticks(np.arange(n_items))
            ax.set_xticklabels(item_names, rotation=45, ha='right')
            ax.set_yticklabels(item_names)

            # Add text annotations for values; white text on the low end of
            # the colormap, black on the high end
            for i in range(n_items):
                for j in range(n_items):
                    ax.text(j, i, f'{matrix[i, j]:.2f}',
                            ha="center", va="center",
                            color="white" if matrix[i, j] < 0.5 else "black",
                            fontsize=9)

        plt.tight_layout()
        plt.show()

        # 3. TENNIS RACKET vs FRUITS ANALYSIS
        print("\n" + "="*80)
        print("TENNIS RACKET vs FRUITS ANALYSIS")
        print("="*80)

        if 'tennis racket' in item_names:
            tennis_idx = item_names.index('tennis racket')
            tennis_item = all_items[tennis_idx]

            print("\nComparing tennis racket with each fruit:")
            fruit_comparisons = []

            for item in all_items:
                if item['name'] != 'tennis racket':
                    # Calculate all cross-modal similarities
                    tennis_img_to_fruit_img = cosine_similarity_vectors(tennis_item['image_emb'], item['image_emb'])
                    tennis_img_to_fruit_multi = cosine_similarity_vectors(tennis_item['image_emb'], item['multimodal_emb'])
                    tennis_multi_to_fruit_img = cosine_similarity_vectors(tennis_item['multimodal_emb'], item['image_emb'])
                    tennis_multi_to_fruit_multi = cosine_similarity_vectors(tennis_item['multimodal_emb'], item['multimodal_emb'])
                    tennis_desc_to_fruit_desc = cosine_similarity_vectors(tennis_item['desc_emb'], item['desc_emb'])

                    fruit_comparisons.append({
                        'fruit': item['name'],
                        'img_to_img': tennis_img_to_fruit_img,
                        'img_to_multi': tennis_img_to_fruit_multi,
                        'multi_to_img': tennis_multi_to_fruit_img,
                        'multi_to_multi': tennis_multi_to_fruit_multi,
                        'desc_to_desc': tennis_desc_to_fruit_desc
                    })

                    print(f"\n  Tennis Racket -> {item['name'].capitalize()}:")
                    print(f"    Image -> Image:           {tennis_img_to_fruit_img:.4f}")
                    print(f"    Image -> Multimodal:      {tennis_img_to_fruit_multi:.4f}")
                    print(f"    Multimodal -> Image:      {tennis_multi_to_fruit_img:.4f}")
                    print(f"    Multimodal -> Multimodal: {tennis_multi_to_fruit_multi:.4f}")
                    print(f"    Desc -> Desc:             {tennis_desc_to_fruit_desc:.4f}")

            # Visualize tennis racket comparisons with bar chart only
            if fruit_comparisons:
                fig, ax = plt.subplots(figsize=(12, 6))

                # Bar chart of tennis racket similarities
                fruits = [comp['fruit'] for comp in fruit_comparisons]
                x = np.arange(len(fruits))
                width = 0.15

                metrics = ['img_to_img', 'img_to_multi', 'multi_to_img', 'multi_to_multi', 'desc_to_desc']
                colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

                for i, metric in enumerate(metrics):
                    values = [comp[metric] for comp in fruit_comparisons]
                    # Five bars per fruit, centred on the tick:
                    # offsets run from -2*width to +2*width.
                    ax.bar(x + i * width - 2 * width, values, width,
                           # e.g. 'img_to_multi' -> 'Img -> Multi'
                           label=metric.replace('_to_', ' -> ').title(),
                           color=colors[i])

                ax.set_xlabel('Fruit')
                ax.set_ylabel('Cosine Similarity')
                ax.set_title('Tennis Racket Similarity to Fruits (All Modalities)')
                ax.set_xticks(x)
                ax.set_xticklabels(fruits)
                ax.legend(loc='upper right')
                ax.grid(axis='y', alpha=0.3)

                # Default to the 0-0.8 range but widen it when the data falls
                # outside, so no bar is clipped.
                plotted_values = [comp[metric] for comp in fruit_comparisons for metric in metrics]
                ax.set_ylim(min(0.0, min(plotted_values)),
                            max(0.8, max(plotted_values) * 1.05))

                plt.tight_layout()
                plt.show()

        # 4. SUMMARY STATISTICS
        print("\n" + "="*80)
        print("SUMMARY STATISTICS")
        print("="*80)

        # Find the most similar pairs across modalities
        print("\nMost Similar Cross-Item Pairs (excluding self-similarities):")
        cross_similarities = []
        for i, item1 in enumerate(all_items):
            for j, item2 in enumerate(all_items):
                if i != j:  # Exclude self-similarities
                    cross_similarities.append({
                        'pair': f"{item1['name']} -> {item2['name']}",
                        'indices': (i, j),
                        'img_to_img': img_to_img_matrix[i, j],
                        'multi_to_multi': multi_to_multi_matrix[i, j],
                        'img_to_multi': img_to_multi_matrix[i, j]
                    })

        # Same-modality metrics are symmetric, so (a, b) and (b, a) carry the
        # same score; keep one direction to avoid listing a pair twice.
        symmetric_metrics = {'img_to_img', 'multi_to_multi'}

        # Sort by different metrics
        for metric in ['img_to_img', 'multi_to_multi', 'img_to_multi']:
            candidates = cross_similarities
            if metric in symmetric_metrics:
                candidates = [entry for entry in cross_similarities
                              if entry['indices'][0] < entry['indices'][1]]
            sorted_pairs = sorted(candidates, key=lambda x: x[metric], reverse=True)
            print(f"\n  Top 3 by {metric}:")
            for i, pair in enumerate(sorted_pairs[:3]):
                print(f"    {i+1}. {pair['pair']}: {pair[metric]:.4f}")
    else:
        print("\nNo item has description, image and multimodal embeddings; nothing to compare.")
else:
    print("Skipping cross-modal analysis: image and multimodal embeddings are both required.")

## Test 5: Semantic Search Demo

Let's demonstrate semantic search capabilities using the embeddings.

In [None]:
def semantic_search(query_embedding: List[float], 
                    candidate_embeddings: Dict[str, List[float]], 
                    top_k: int = 3) -> List[tuple]:
    """Rank candidates by cosine similarity to the query.

    Returns up to `top_k` (name, similarity) tuples, best match first.
    """
    scored = [
        (label, cosine_similarity_vectors(query_embedding, vector))
        for label, vector in candidate_embeddings.items()
    ]
    # Highest similarity first
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

# Create multimodal embeddings (text + image combinations).
# NOTE: this rebinds `multimodal_embeddings`, replacing the caption-keyed
# dictionary built in Test 3 with one keyed by "<description> | <name>".
print("Creating multimodal embeddings...")
multimodal_embeddings = {}

# Define text descriptions for each image
image_descriptions = {
    "apple": "A fresh red apple fruit",
    "banana": "A yellow banana fruit", 
    "orange": "An orange citrus fruit",
    "watermelon": "A large green watermelon",
    "peach": "Autumn red peaches",
    "tennis racket": "Tennis racket with balls"
}

# Generate multimodal embeddings
for name, image_url in sample_images.items():
    if name in image_descriptions:
        print(f"Processing {name}...")
        description = image_descriptions[name]
        embedding = get_multimodal_embedding(image_url, description)
        if embedding:
            multimodal_embeddings[f"{description} | {name}"] = embedding
            print(f"  :) Success")
        else:
            print(f"  :( Failed")

print(f"\nGenerated {len(multimodal_embeddings)} multimodal embeddings")

# Test multimodal semantic search
if multimodal_embeddings:
    print("\n" + "="*50)
    print("Multimodal Semantic Search Demo")
    print("="*50)
    
    # Test with text + image queries
    test_queries = [
        {
            "text": "healthy snack for athletes",
            "image": sample_images["apple"]  # Apple as reference image
        },
        {
            "text": "tropical fruit that's yellow", 
            "image": sample_images["banana"]  # Banana as reference
        },
        {
            "text": "sports equipment for outdoor games",
            "image": sample_images["tennis racket"]  # Tennis racket as reference
        },
        {
            "text": "juicy summer fruit",
            "image": sample_images["watermelon"]  # Watermelon as reference
        }
    ]
    
    for i, query in enumerate(test_queries, 1):
        print(f"\nQuery {i}: '{query['text']}'")
        # Show only the file name portion of the reference image URL
        print(f"Reference image: {query['image'].split('/')[-1]}")
        
        # Generate multimodal query embedding
        query_embedding = get_multimodal_embedding(query['image'], query['text'])
        
        if query_embedding:
            results = semantic_search(query_embedding, multimodal_embeddings, top_k=5)
            print("Top matches:")
            for j, (name, similarity) in enumerate(results, 1):
                print(f"  {j}. {name} (similarity: {similarity:.4f})")
        else:
            print("Failed to generate query embedding")
    
    # Cross-modal search: Text-only query against multimodal embeddings
    print("\n" + "="*50)
    print("Cross-modal Search (Text -> Multimodal)")
    print("="*50)
    
    text_only_queries = [
        "vitamin C rich citrus",
        "tennis and sports",
        "sweet red fruit",
        "large green fruit with seeds"
    ]
    
    for query_text in text_only_queries:
        print(f"\nText query: '{query_text}'")
        
        # Text-only queries are embedded with the plain text endpoint
        # (no image part) and compared against the multimodal embeddings.
        query_embedding = get_text_embedding(query_text)
        
        if query_embedding:
            results = semantic_search(query_embedding, multimodal_embeddings, top_k=3)
            print("Top matches:")
            for j, (name, similarity) in enumerate(results, 1):
                print(f"  {j}. {name} (similarity: {similarity:.4f})")
        else:
            print("Failed to generate query embedding")

## Test 6: Embedding Quality Analysis

Let's analyze the quality and characteristics of the embeddings.

In [None]:
def analyze_embedding_quality(embeddings_dict: Dict[str, List[float]], 
                             name: str = "Embeddings") -> None:
    """Analyze the quality and characteristics of embeddings.

    Prints the count, dimension and norm statistics of the embeddings, then
    the statistics of their pairwise cosine similarities, and plots the
    similarity distribution next to the similarity matrix.

    Args:
        embeddings_dict: Mapping from a label to its embedding vector.
        name: Label used in the printed report and plot titles.

    Returns:
        None. An empty mapping prints a notice and returns immediately; with
        fewer than two embeddings the pairwise statistics and plots are
        skipped, since no pairs exist.
    """
    if not embeddings_dict:
        print(f"No {name.lower()} available for analysis")
        return
    
    embeddings_list = list(embeddings_dict.values())
    embeddings_array = np.array(embeddings_list)
    # One L2 norm per embedding, computed once and reused for both statistics
    norms = [np.linalg.norm(emb) for emb in embeddings_list]
    
    print(f"\n{name} Quality Analysis:")
    print(f"  - Number of embeddings: {len(embeddings_list)}")
    print(f"  - Embedding dimension: {embeddings_array.shape[1]}")
    print(f"  - Mean embedding norm: {np.mean(norms):.6f}")
    print(f"  - Std embedding norm: {np.std(norms):.6f}")
    
    if len(embeddings_list) < 2:
        # With a single embedding there are no off-diagonal entries, and
        # min()/max() of an empty array would raise.
        print("  - Pairwise similarity: skipped (needs at least 2 embeddings)")
        return
    
    # Calculate pairwise similarities
    similarity_matrix = cosine_similarity(embeddings_array)
    
    # Remove diagonal (self-similarities)
    mask = ~np.eye(similarity_matrix.shape[0], dtype=bool)
    off_diagonal_similarities = similarity_matrix[mask]
    
    print(f"  - Mean pairwise similarity: {off_diagonal_similarities.mean():.6f}")
    print(f"  - Std pairwise similarity: {off_diagonal_similarities.std():.6f}")
    print(f"  - Min pairwise similarity: {off_diagonal_similarities.min():.6f}")
    print(f"  - Max pairwise similarity: {off_diagonal_similarities.max():.6f}")
    
    # Plot similarity distribution
    plt.figure(figsize=(10, 6))
    
    plt.subplot(1, 2, 1)
    plt.hist(off_diagonal_similarities, bins=30, alpha=0.7, edgecolor='black')
    plt.title(f'{name} - Pairwise Similarity Distribution')
    plt.xlabel('Cosine Similarity')
    plt.ylabel('Frequency')
    
    plt.subplot(1, 2, 2)
    plt.imshow(similarity_matrix, cmap='viridis', aspect='auto')
    plt.colorbar(label='Cosine Similarity')
    plt.title(f'{name} - Similarity Matrix')
    
    plt.tight_layout()
    plt.show()

# Run the quality report once per modality, in the order the tests produced them
analyze_embedding_quality(embeddings_dict=text_embeddings, name="Text Embeddings")
analyze_embedding_quality(embeddings_dict=image_embeddings, name="Image Embeddings")
analyze_embedding_quality(embeddings_dict=multimodal_embeddings, name="Multimodal Embeddings")