In [2]:
# Initialize Faiss vector store
import faiss
import numpy as np

In [4]:
# Parse embeddings from string to numpy arrays
import pandas as pd
import numpy as np

def parse_embedding(emb_str):
    """Convert a stringified embedding such as ``"[0.1 0.2 0.3]"`` into a 1-D float array.

    Values may be separated by whitespace (including newlines) and/or commas,
    and the text may carry surrounding whitespace around the brackets.

    Args:
        emb_str: Text form of the vector as read from the CSV, a missing
            value (NaN/None/empty string), or an already-parsed array.

    Returns:
        A 1-D ``np.ndarray`` of floats; an empty array when the value is
        missing or contains no numbers.

    Raises:
        ValueError: If a token cannot be parsed as a float.
    """
    # Already parsed (e.g. the column was converted earlier): pass it through.
    # pd.isna() on an array would return an array and break the check below.
    if isinstance(emb_str, np.ndarray):
        return emb_str
    if pd.isna(emb_str):
        return np.array([])
    # Trim outer whitespace first so strip('[]') actually sees the brackets,
    # then treat commas as plain separators.
    cleaned = emb_str.strip().strip('[]').replace(',', ' ')
    return np.array([float(token) for token in cleaned.split()], dtype=float)

# Read the CSV of precomputed embeddings, then decode both vector columns
# from their string form into NumPy arrays
df = pd.read_csv('../data/04_feature/embedded_art_dataset.csv')

for embedding_col in ('text_embedding', 'image_embedding'):
    df[embedding_col] = df[embedding_col].apply(parse_embedding)

# Pull the vectors out as plain lists and keep the descriptive fields as
# one dict per artwork (same row order as the embeddings)
text_embeddings = df['text_embedding'].to_list()
image_embeddings = df['image_embedding'].to_list()
metadata = df[['artist', 'title', 'year', 'dataset_source']].to_dict(orient='records')

# Quick look at what was loaded; an empty (missing) embedding is reported as 0
first_text = text_embeddings[0]
first_image = image_embeddings[0]
print(f"Loaded {len(df)} artworks")
print("Sample metadata:", metadata[0])
print("Text embedding shape:", first_text.shape if first_text.size > 0 else 0)
print("Image embedding shape:", first_image.shape if first_image.size > 0 else 0)

Loaded 1660 artworks
Sample metadata: {'artist': 'aaron siskind', 'title': 'acolman-1-1955', 'year': nan, 'dataset_source': 'WikiArt_ArtEmis'}
Text embedding shape: (128,)
Image embedding shape: (512,)


## Step 2: Prepare Data for Indexing

- **Combine Embeddings**: Concatenate text (128-dim) and image (512-dim) embeddings into 640-dim vectors for multimodal search. Pad with zeros if one is missing.
- **Normalize**: Apply L2 normalization for cosine similarity in vector search.
- **Metadata**: Already prepared as list of dicts with artist, title, year, dataset_source.

In [5]:
# Prepare data for indexing
import numpy as np

# Strategy: one multimodal vector per artwork laid out as [text | image].
# A missing (empty) modality is replaced by zeros of the expected width,
# so every row ends up with 128 + 512 entries.
combined_embeddings = np.array([
    np.concatenate([
        text_emb if text_emb.size > 0 else np.zeros(128),
        image_emb if image_emb.size > 0 else np.zeros(512),
    ])
    for text_emb, image_emb in zip(text_embeddings, image_embeddings)
])

# L2-normalize each row so that inner product equals cosine similarity.
# All-zero rows keep a divisor of 1 and therefore stay all-zero.
norms = np.linalg.norm(combined_embeddings, axis=1, keepdims=True)
norms = np.where(norms == 0, 1, norms)
normalized_embeddings = combined_embeddings / norms

# Metadata needs no further preparation; report what was built
first_row = normalized_embeddings[0]
print(f"Prepared {len(normalized_embeddings)} combined embeddings")
print("Combined embedding shape:", first_row.shape)
print("Sample normalized embedding norm:", np.linalg.norm(first_row))

Prepared 1660 combined embeddings
Combined embedding shape: (640,)
Sample normalized embedding norm: 1.0


## Step 3: Initialize Vector Store

- **Faiss Setup**: Use IndexFlatIP for exact cosine similarity search on 640-dim normalized embeddings.
- **Why Faiss**: Fast and lightweight for small-to-medium datasets such as these ~1.6k artworks. Faiss stores only the vectors, so the metadata is kept in a separate structure and looked up by result index.

In [6]:
# Build an exact (brute-force) inner-product index. The stored vectors are
# L2-normalized, so the inner product it returns is the cosine similarity.
n_vectors, dimension = normalized_embeddings.shape
index = faiss.IndexFlatIP(dimension)

# Cast to float32 before handing the vectors to Faiss
vectors_f32 = normalized_embeddings.astype(np.float32)
index.add(vectors_f32)

print(f"Faiss index initialized with {index.ntotal} vectors")
print(f"Index dimension: {dimension}")

Faiss index initialized with 1660 vectors
Index dimension: 640


## Step 4: Test Vector Search

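- **Query Example**: Search for the top 5 most similar artworks to the first one in the dataset. Because the query vector is itself stored in the index, the first hit is the query artwork with a score of 1.0.
- **Faiss Search**: Use index.search() to find nearest neighbors by cosine similarity. The value printed as "Distance" is the inner-product similarity, so higher means more similar.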

In [7]:
# Sanity check: query the index with the first stored vector and list its
# 5 best matches. The printed "Distance" is the inner-product score returned
# by the index, so larger values mean closer matches.
query_vector = normalized_embeddings[:1].astype('float32')  # keep 2-D shape (1, dim)
distances, indices = index.search(query_vector, k=5)

print("Query artwork:", metadata[0])
print("\nTop 5 similar artworks:")
for rank, (score, idx) in enumerate(zip(distances[0], indices[0]), start=1):
    print(f"{rank}. Distance: {score:.4f}")
    print(f"   Metadata: {metadata[idx]}")
    print()

Query artwork: {'artist': 'aaron siskind', 'title': 'acolman-1-1955', 'year': nan, 'dataset_source': 'WikiArt_ArtEmis'}

Top 5 similar artworks:
1. Distance: 1.0000
   Metadata: {'artist': 'aaron siskind', 'title': 'acolman-1-1955', 'year': nan, 'dataset_source': 'WikiArt_ArtEmis'}

2. Distance: 0.8372
   Metadata: {'artist': 'howard hodgkin', 'title': 'all-alone-in-the-museum-of-art-1979', 'year': nan, 'dataset_source': 'WikiArt_ArtEmis'}

3. Distance: 0.8356
   Metadata: {'artist': 'aaron siskind', 'title': 'uruapan-11-1955', 'year': nan, 'dataset_source': 'WikiArt_ArtEmis'}

4. Distance: 0.8344
   Metadata: {'artist': 'edward corbett', 'title': 'untitled-1951', 'year': nan, 'dataset_source': 'WikiArt_ArtEmis'}

5. Distance: 0.8255
   Metadata: {'artist': 'jay defeo', 'title': 'origin-1956', 'year': nan, 'dataset_source': 'WikiArt_ArtEmis'}



In [None]:
# Write the in-memory index to disk so later sessions can reload it
index_path = "art_embeddings.index"
faiss.write_index(index, index_path)
print(f"Faiss index saved to {index_path}")


Faiss index saved to art_embeddings.index


In [None]:
# To load later: index = faiss.read_index("art_embeddings.index")

## Step 5: Store Metadata

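- **Store Metadata**: Persist the metadata (artist, title, year, dataset_source) alongside the embeddings so that search results can be resolved to artworks.
- **Index Mapping**: Faiss returns only row positions, so keep a separate data structure (e.g., a dict or list) that maps those indices to metadata; its order must match the order in which the vectors were added.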

In [9]:
# Save metadata to JSON for persistence
import json
import math


def _json_safe(value):
    """Return None for NaN/inf floats (serialized as JSON null); other values unchanged."""
    if isinstance(value, float) and not math.isfinite(value):
        return None
    return value


# Missing CSV cells (e.g. 'year') are float NaN in the records. json.dump would
# write them as the bare token NaN, which is not valid JSON and is rejected by
# strict parsers. Serialize a sanitized copy and leave `metadata` itself untouched.
metadata_for_json = [
    {key: _json_safe(value) for key, value in record.items()}
    for record in metadata
]

with open("art_metadata.json", "w") as f:
    # allow_nan=False makes any non-finite value that slipped through fail
    # loudly instead of producing an invalid file
    json.dump(metadata_for_json, f, indent=4, allow_nan=False)

print("Metadata saved to art_metadata.json")
print(f"Total metadata entries: {len(metadata)}")

Metadata saved to art_metadata.json
Total metadata entries: 1660


In [None]:
# Example: Load index and metadata for querying
# index = faiss.read_index("art_embeddings.index")
# with open("art_metadata.json", "r") as f:
#     loaded_metadata = json.load(f)
# print(f"Loaded index with {index.ntotal} vectors and {len(loaded_metadata)} metadata entries")

## Step 6: Test Retrieval

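- **Query Function**: Implement a simple query function that encodes the input (a text query or an image embedding), computes similarity against the index, and returns the top-k results with metadata. The implementation below handles text queries; the image part of the query vector is zero-padded.
- **Example Query**: "Find art similar to Van Gogh's style" (encode the query text/image, then search).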

In [10]:
# Load models for encoding queries
from transformers import BertTokenizer, BertModel, CLIPProcessor, CLIPModel
import torch

# Checkpoints used to encode queries
BERT_CHECKPOINT = 'prajjwal1/bert-tiny'
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

# Text encoder: tokenizer + BERT model
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Image encoder: CLIP processor + model (loaded for image queries, if needed)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

def encode_text(text):
    """Embed `text` with the BERT model by mean-pooling its last hidden state.

    Args:
        text: Query text handed to the tokenizer (truncated to 128 tokens).

    Returns:
        NumPy array obtained by averaging ``last_hidden_state`` over dim 1
        and squeezing out singleton dimensions.
    """
    encoded = bert_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Inference only: skip gradient tracking
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

def query_art(query_text, top_k=5):
    """Return the artworks whose combined embedding best matches a text query.

    The query is encoded with `encode_text`, zero-padded in the image part so
    it lines up with the [text | image] vectors in the index, L2-normalized,
    and searched against the module-level Faiss `index`.

    Args:
        query_text: Free-text query.
        top_k: Maximum number of results to return. Values larger than the
            number of indexed vectors are clamped; values below 1 yield [].

    Returns:
        List of dicts ordered best match first, each with:
            'distance': score returned by the index (inner product, so higher
                means more similar for the IndexFlatIP built above),
            'metadata': the metadata record of the matched artwork.
    """
    # Never request more neighbours than the index holds: Faiss fills the
    # surplus slots with label -1, and metadata[-1] would silently return the
    # last artwork as a bogus hit.
    k = min(int(top_k), index.ntotal)
    if k < 1:
        return []

    # Encode query text
    text_emb = encode_text(query_text)

    # Pad with zeros for image part
    query_emb = np.concatenate([text_emb, np.zeros(512)])

    # Normalize (skip for an all-zero vector to avoid dividing by zero)
    norm = np.linalg.norm(query_emb)
    if norm > 0:
        query_emb = query_emb / norm

    # Search: Faiss expects a 2-D float32 array of shape (n_queries, dim)
    query_emb = query_emb.astype('float32').reshape(1, -1)
    distances, indices = index.search(query_emb, k=k)

    # Pair each score with its metadata record; a negative label means
    # "no result" and is skipped
    return [
        {'distance': score, 'metadata': metadata[idx]}
        for score, idx in zip(distances[0], indices[0])
        if idx >= 0
    ]

# Run one example text query and print a one-line summary per hit
example_query = "Find art similar to Van Gogh's style"
results = query_art(example_query, top_k=5)

print("Query results:")
for hit in results:
    info = hit['metadata']
    print(f"Distance: {hit['distance']:.4f}, Artist: {info['artist']}, Title: {info['title']}")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Query results:
Distance: 0.5785, Artist: hans hofmann, Title: ecstasy-1947
Distance: 0.5577, Artist: elaine de kooning, Title: glass-wall-1987
Distance: 0.5564, Artist: manabu mabe, Title: passage-de-fuego-1961
Distance: 0.5564, Artist: brett whiteley, Title: american-dream-1969
Distance: 0.5557, Artist: bui xuan phai, Title: abstract(4)


## Step 7: Persistence and Optimization

- **Persistence**: Persist the vector store to disk. ChromaDB does this automatically; FAISS requires explicitly saving the index.
- **Optimization**: If the dataset grows, switch to approximate nearest-neighbor search (e.g., a FAISS IVF index) for better performance.

In [11]:
# Persistence: Index and metadata are already saved
# Demonstrate loading
loaded_index = faiss.read_index("art_embeddings.index")
with open("art_metadata.json", "r") as f:
    loaded_metadata = json.load(f)
print(f"Loaded index with {loaded_index.ntotal} vectors and {len(loaded_metadata)} metadata entries")

# Search results are resolved by position, so the two files must stay in step
assert loaded_index.ntotal == len(loaded_metadata), "index and metadata are out of sync"

# Optimization: For larger datasets, use IVF (Inverted File) for approximate search
# Cast once and reuse for both training and adding
ivf_vectors = normalized_embeddings.astype('float32')

# Number of clusters. Faiss's k-means warns when it gets fewer than 39 training
# points per centroid, so cap nlist by the data size instead of hard-coding 100
# (100 clusters would need at least 3900 vectors; this dataset has 1660).
nlist = max(1, min(100, len(ivf_vectors) // 39))
quantizer = faiss.IndexFlatIP(dimension)  # Coarse quantizer that assigns vectors to clusters
ivf_index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_INNER_PRODUCT)

# Train the index (required for IVF), then add the vectors
ivf_index.train(ivf_vectors)
ivf_index.add(ivf_vectors)

# By default only 1 cluster is probed per query, which hurts recall;
# probe a few more (never more than exist)
ivf_index.nprobe = min(nlist, 8)

# Save optimized index
faiss.write_index(ivf_index, "art_embeddings_ivf.index")
print("IVF index trained and saved for approximate search")

# Note: For small datasets like this (1660), IndexFlatIP is faster and exact.
# Use IVF when dataset > 10k for better performance at slight accuracy cost.

Loaded index with 1660 vectors and 1660 metadata entries
IVF index trained and saved for approximate search
