In [9]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import requests
import os
import time

csv_path = './opendata-main/data/published_images.csv'
published_images = pd.read_csv(csv_path)

#Filter for primary images
primary_images = published_images[published_images['viewtype'] == 'primary']

alternate_images = published_images[published_images['viewtype'] == 'alternate']

os.makedirs('thumbnails', exist_ok=True)

failed_downloads = []

# download function with failure tracking
def download_image(row):
    image_url = row['iiifthumburl']
    image_id = row['depictstmsobjectid']
    image_path = f"thumbnails/{image_id}.jpg"

    if os.path.exists(image_path):
        # print(f"File already exists: {image_id}")
        return

    try:
        response = requests.get(image_url, timeout=10)
        with open(image_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {image_id}")
    except Exception as e:
        print(f"Failed to download {image_id}: {e}")
        failed_downloads.append({'iiifThumbURL': image_url, 'depictstmsobjectid': image_id})

# Download images using ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=20) as executor:
    for _, row in alternate_images.iterrows():
        executor.submit(download_image, row)

print(f"Failed downloads: {len(failed_downloads)}")
pd.DataFrame(failed_downloads).to_csv('failed_downloads.csv', index=False)


Failed downloads: 0


In [11]:
import os

downloaded_files = set(os.listdir('thumbnails'))

image_ids = set(published_images['depictstmsobjectid'].astype(str) + '.jpg')

missing_images = image_ids - downloaded_files
print(f"Missing images: {len(missing_images)}")

Missing images: 0


### Image Preprocessing

In [None]:
import os
from PIL import Image
from torchvision import transforms
import numpy as np

image_folder = 'thumbnails'

preprocessed_folder = 'preprocessed_data'
os.makedirs(preprocessed_folder, exist_ok=True)

preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Batch size for saving into .npz files
batch_size = 10000
image_list = os.listdir(image_folder)

# Processing images in batches
for batch_start in range(0, len(image_list), batch_size):
    batch_end = min(batch_start + batch_size, len(image_list))
    current_batch = image_list[batch_start:batch_end]
    
    print(f"Processing batch: {batch_start} to {batch_end}...")
    batch_data = {}
    
    for image_name in current_batch:
        try:
            image_path = os.path.join(image_folder, image_name)
            image = Image.open(image_path).convert('RGB')
            preprocessed_image = preprocess(image).numpy()
            
            # Add to batch data
            image_id = image_name.replace('.jpg', '')  # Removing file extension
            batch_data[image_id] = preprocessed_image
            print(f"Preprocessed: {image_name}")
        except Exception as e:
            print(f"Failed to preprocess {image_name}: {e}")
    
    # Saving batch as .npz file with unique name
    batch_save_path = os.path.join(preprocessed_folder, f"batch_{batch_start}_{batch_end}.npz")
    np.savez_compressed(batch_save_path, **batch_data)
    print(f"Saved batch to: {batch_save_path}")

print("Preprocessing and batch saving complete!")

Processing batch: 0 to 10000...
Preprocessed: 0.jpg
Preprocessed: 1.jpg
Preprocessed: 10.jpg
Preprocessed: 100.jpg
Preprocessed: 1000.jpg
Preprocessed: 10000.jpg
Preprocessed: 10001.jpg
Preprocessed: 100015.jpg
Preprocessed: 100018.jpg
Preprocessed: 10002.jpg
Preprocessed: 10003.jpg
Preprocessed: 10004.jpg
Preprocessed: 100049.jpg
Preprocessed: 10005.jpg
Preprocessed: 10006.jpg
Preprocessed: 100067.jpg
Preprocessed: 100069.jpg
Preprocessed: 10007.jpg
Preprocessed: 100070.jpg
Preprocessed: 100078.jpg
Preprocessed: 10008.jpg
Preprocessed: 10009.jpg
Preprocessed: 1001.jpg
Preprocessed: 10010.jpg
Preprocessed: 10011.jpg
Preprocessed: 10013.jpg
Preprocessed: 100137.jpg
Preprocessed: 10014.jpg
Preprocessed: 10015.jpg
Preprocessed: 10016.jpg
Preprocessed: 10017.jpg
Preprocessed: 10018.jpg
Preprocessed: 10019.jpg
Preprocessed: 1002.jpg
Preprocessed: 10020.jpg
Preprocessed: 10021.jpg
Preprocessed: 100210.jpg
Preprocessed: 100211.jpg
Preprocessed: 10022.jpg
Preprocessed: 10023.jpg
Preprocessed: 

In [6]:
import pandas as pd

#f iltering necessary columns from the datasets
images_csv = pd.read_csv(
    './opendata-main/data/published_images.csv',
    usecols=['depictstmsobjectid', 'iiifthumburl']
)
objects_csv = pd.read_csv(
    './opendata-main/data/objects.csv',
    usecols=['objectid', 'title', 'medium', 'classification']
)
terms_csv = pd.read_csv(
    './opendata-main/data/objects_terms.csv',
    usecols=['objectid', 'term']
)
constituents_csv = pd.read_csv(
    './opendata-main/data/constituents.csv',
    usecols=['constituentid', 'preferreddisplayname']
)
objects_constituents_csv = pd.read_csv(
    './opendata-main/data/objects_constituents.csv',
    usecols=['objectid', 'constituentid']
)

# Merging images with objects metadata
images_with_objects = images_csv.merge(
    objects_csv, left_on='depictstmsobjectid', right_on='objectid', how='left'
)
# Saving intermediate result to free memory
images_with_objects.to_csv('temp_images_with_objects.csv', index=False)

# Reloading intermediate result for the next merge
images_with_objects = pd.read_csv('temp_images_with_objects.csv')

# Merging with terms metadata
images_with_terms = images_with_objects.merge(
    terms_csv, left_on='objectid', right_on='objectid', how='left'
)
# Saving intermediate result again
images_with_terms.to_csv('temp_images_with_terms.csv', index=False)

# Reloading intermediate result for the next merge
images_with_terms = pd.read_csv('temp_images_with_terms.csv')

# Merging objects_constituents with constituents to include contributor names
objects_constituents_with_names = objects_constituents_csv.merge(
    constituents_csv, left_on='constituentid', right_on='constituentid', how='left'
)

# Merging the enriched metadata with contributors
final_metadata = images_with_terms.merge(
    objects_constituents_with_names, left_on='objectid', right_on='objectid', how='left'
)

# Saving the final enriched dataset
final_metadata.to_csv('enriched_metadata.csv', index=False)
print("Metadata linkage complete. Saved to 'enriched_metadata.csv'.")


Metadata linkage complete. Saved to 'enriched_metadata.csv'.


In [7]:
import pandas as pd

# Loading the enriched dataset
enriched_metadata = pd.read_csv('enriched_metadata.csv')

print(f"Total rows before removing duplicates: {enriched_metadata.shape[0]}")

enriched_metadata = enriched_metadata.drop_duplicates()

# Saving the cleaned dataset
enriched_metadata.to_csv('enriched_metadata_cleaned.csv', index=False)

print(f"Total rows after removing duplicates: {enriched_metadata.shape[0]}")
print("Cleaned dataset saved as 'enriched_metadata_cleaned.csv'.")


Total rows before removing duplicates: 3078746
Total rows after removing duplicates: 2042912
Cleaned dataset saved as 'enriched_metadata_cleaned.csv'.


### integrating metadata with preprocessed_data(.npz files)

In [11]:
npz_path = './preprocessed_data/batch_0_10000.npz'
npz_data = np.load(npz_path)
image_ids = list(npz_data.keys())

enriched_metadata = pd.read_csv('enriched_metadata_cleaned.csv')
metadata_ids = enriched_metadata['depictstmsobjectid'].astype(str).unique()

# Checking intersection of ids
matching_ids = set(image_ids).intersection(set(metadata_ids))
print(f"Number of matching IDs: {len(matching_ids)}")


Number of matching IDs: 10000


In [19]:
enriched_metadata['depictstmsobjectid'].astype(str)

0           17387
1           17387
2           17387
3           17387
4           17387
            ...  
2042907    165884
2042908    165884
2042909    165884
2042910    165884
2042911    165884
Name: depictstmsobjectid, Length: 2042912, dtype: object

In [22]:
filtered_metadata = enriched_metadata[enriched_metadata['depictstmsobjectid'].isin(image_ids)]
filtered_metadata

Unnamed: 0,iiifthumburl,depictstmsobjectid,objectid,title,medium,classification,term,constituentid,preferreddisplayname


In [28]:
import os
npz_path = './preprocessed_data/batch_0_10000.npz'
print(f"File size: {os.path.getsize(npz_path)} bytes")


File size: 1335838792 bytes


In [32]:
import os
npz_path = './preprocessed_data/batch_10000_20000.npz'
print(f"File size: {os.path.getsize(npz_path)} bytes")


File size: 1321692432 bytes


In [1]:
import os
import numpy as np
import pandas as pd

npz_folder = './preprocessed_data'
metadata_path = 'enriched_metadata_cleaned.csv'
combined_npz_folder = './combined_npz_files'
os.makedirs(combined_npz_folder, exist_ok=True)
combined_metadata_csv = 'combined_metadata.csv'
BATCH_SIZE = 10000     # Number of images per combined npz file
SUB_CHUNK_SIZE = 1000  # Number of keys to process at once from each npz file

enriched_metadata = pd.read_csv(metadata_path)
enriched_metadata['depictstmsobjectid'] = enriched_metadata['depictstmsobjectid'].astype(str)

if os.path.exists(combined_metadata_csv):
    os.remove(combined_metadata_csv)

metadata_cols = enriched_metadata.columns.tolist()
pd.DataFrame(columns=metadata_cols).to_csv(combined_metadata_csv, index=False)

current_images = {}      # will accumulate image tensors here (as a dict: key --> tensor)
current_metadata = []    # And corresponding metadata rows (as dicts)

global_image_counter = 0  # Total images processed across files
batch_counter = 0         # used For naming combined npz files

npz_files = sorted([f for f in os.listdir(npz_folder) if f.endswith('.npz')])
for npz_file in npz_files:
    npz_path = os.path.join(npz_folder, npz_file)
    print(f"Processing: {npz_file}")
    
    try:
        npz_data = np.load(npz_path)
    except Exception as e:
        print(f"Error loading {npz_file}: {e}")
        continue

    image_ids = list(npz_data.keys())
    print(f"Loaded {len(image_ids)} image IDs from {npz_file}")
    
    for i in range(0, len(image_ids), SUB_CHUNK_SIZE):
        sub_chunk_ids = image_ids[i : i + SUB_CHUNK_SIZE]
        sub_chunk_metadata = enriched_metadata[enriched_metadata['depictstmsobjectid'].isin(sub_chunk_ids)]
        print(f"Processing sub-chunk {i} to {i+len(sub_chunk_ids)}: found {len(sub_chunk_metadata)} matching metadata rows.")
        
        for image_id in sub_chunk_ids:
            if image_id in sub_chunk_metadata['depictstmsobjectid'].values:
                try:
                    image_tensor = npz_data[image_id]
                except Exception as e:
                    print(f"Error processing image ID {image_id} in {npz_file}: {e}")
                    continue

                # Accumulating the image tensor and its metadata
                current_images[image_id] = image_tensor
                metadata_row = sub_chunk_metadata[sub_chunk_metadata['depictstmsobjectid'] == image_id].iloc[0].to_dict()
                current_metadata.append(metadata_row)
                global_image_counter += 1

                # If reached the batch size, flush the current batch
                if global_image_counter % BATCH_SIZE == 0:
                    batch_name = f"combined_batch_{batch_counter}.npz"
                    batch_path = os.path.join(combined_npz_folder, batch_name)
                    print(f"Flushing batch {batch_counter} with {len(current_images)} images.")
                    np.savez_compressed(batch_path, **current_images)
                    
                    pd.DataFrame(current_metadata).to_csv(combined_metadata_csv, mode='a', header=False, index=False)
                    
                    # Reset accumulators for the next batch
                    current_images = {}
                    current_metadata = []
                    batch_counter += 1

    del npz_data

# Flushing any remaining images after processing all files
if len(current_images) > 0:
    batch_name = f"combined_batch_{batch_counter}.npz"
    batch_path = os.path.join(combined_npz_folder, batch_name)
    print(f"Flushing final batch {batch_counter} with {len(current_images)} images.")
    np.savez_compressed(batch_path, **current_images)
    pd.DataFrame(current_metadata).to_csv(combined_metadata_csv, mode='a', header=False, index=False)

print("Integration complete. Combined metadata saved to", combined_metadata_csv)


Processing: batch_0_10000.npz
Loaded 10000 image IDs from batch_0_10000.npz
Processing sub-chunk 0 to 1000: found 18914 matching metadata rows.
Processing sub-chunk 1000 to 2000: found 19011 matching metadata rows.
Processing sub-chunk 2000 to 3000: found 23577 matching metadata rows.
Processing sub-chunk 3000 to 4000: found 18895 matching metadata rows.
Processing sub-chunk 4000 to 5000: found 20211 matching metadata rows.
Processing sub-chunk 5000 to 6000: found 21696 matching metadata rows.
Processing sub-chunk 6000 to 7000: found 20399 matching metadata rows.
Error processing image ID 120704 in batch_0_10000.npz: Error -3 while decompressing data: invalid block type
Error processing image ID 120705 in batch_0_10000.npz: Bad magic number for file header
Error processing image ID 120706 in batch_0_10000.npz: Bad magic number for file header
Processing sub-chunk 7000 to 8000: found 7790 matching metadata rows.
Processing sub-chunk 8000 to 9000: found 18635 matching metadata rows.
Proc

### training

In [8]:
!pip install git+https://github.com/openai/CLIP.git


Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to c:\users\kiit\appdata\local\temp\pip-req-build-490pdmtk
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git 'C:\Users\KIIT\AppData\Local\Temp\pip-req-build-490pdmtk'

[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
import numpy as np
import torch
import clip

device = "cuda" if torch.cuda.is_available() else "cpu"

model, _ = clip.load("ViT-B/32", device=device)
model.eval()

# normalization parameters defined
# These are the means and stds used when we initially preprocessed the images
imagenet_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
imagenet_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
# CLIP expects its own normalization:
clip_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(3, 1, 1)
clip_std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(3, 1, 1)

def convert_to_clip_image(image_np):
    im_tensor = torch.from_numpy(image_np)
    # Unnormalize the image (recover approximate [0,1] range)
    im_denorm = im_tensor * imagenet_std + imagenet_mean
    # Re-normalize for CLIP
    im_clip = (im_denorm - clip_mean) / clip_std
    return im_clip

combined_npz_folder = './combined_npz_files'
mini_batch_size = 256

embedding_list = []

combined_npz_files = sorted([f for f in os.listdir(combined_npz_folder) if f.endswith('.npz')])
print(f"Found {len(combined_npz_files)} combined npz files.")

for npz_file in combined_npz_files:
    npz_path = os.path.join(combined_npz_folder, npz_file)
    print(f"\nExtracting features from {npz_file}")
    
    try:
        data = np.load(npz_path)
    except Exception as e:
        print(f"Error loading {npz_file}: {e}")
        continue

    image_ids = list(data.keys())
    print(f"Loaded {len(image_ids)} image IDs from {npz_file}")
    
    batch_images = []
    batch_ids = []
    
    for idx, image_id in enumerate(image_ids):
        try:
            # Get the raw image tensor (numpy array)
            image_np = data[image_id]
            # Convert to CLIP-compatible tensor
            im_clip = convert_to_clip_image(image_np)
            batch_images.append(im_clip)
            batch_ids.append(image_id)
        except Exception as e:
            print(f"Could not process image {image_id}: {e}")
            continue

        # If mini batch is full or reached end, processing it
        if (len(batch_images) == mini_batch_size) or (idx == len(image_ids) - 1):
            # Stacking the images; shape becomes (batch_size, 3, 224, 224)
            batch_tensor = torch.stack(batch_images).to(device)
            with torch.no_grad():
                # Extract image embeddings
                embeddings = model.encode_image(batch_tensor)
                embeddings = embeddings.cpu().numpy()
            # Store the embeddings along with their image IDs
            for id_, emb in zip(batch_ids, embeddings):
                embedding_list.append((id_, emb))
            # Reset mini-batch accumulators
            batch_images = []
            batch_ids = []
    
    del data

all_ids = [item[0] for item in embedding_list]
all_embeddings = np.array([item[1] for item in embedding_list])
output_embedding_file = 'image_embeddings.npz'
np.savez_compressed(output_embedding_file, ids=all_ids, embeddings=all_embeddings)
print(f"\nSaved embeddings for {len(all_ids)} images to {output_embedding_file}.")


Found 12 combined npz files.

Extracting features from combined_batch_0.npz
Loaded 10000 image IDs from combined_batch_0.npz

Extracting features from combined_batch_1.npz
Loaded 10000 image IDs from combined_batch_1.npz

Extracting features from combined_batch_10.npz
Loaded 10000 image IDs from combined_batch_10.npz

Extracting features from combined_batch_11.npz
Loaded 4543 image IDs from combined_batch_11.npz

Extracting features from combined_batch_2.npz
Loaded 10000 image IDs from combined_batch_2.npz

Extracting features from combined_batch_3.npz
Loaded 10000 image IDs from combined_batch_3.npz

Extracting features from combined_batch_4.npz
Loaded 10000 image IDs from combined_batch_4.npz

Extracting features from combined_batch_5.npz
Loaded 10000 image IDs from combined_batch_5.npz

Extracting features from combined_batch_6.npz
Loaded 10000 image IDs from combined_batch_6.npz

Extracting features from combined_batch_7.npz
Loaded 10000 image IDs from combined_batch_7.npz

Extract

### similarity computation

In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

embedding_file = 'image_embeddings.npz'
data = np.load(embedding_file)
image_ids = data['ids']
embeddings = data['embeddings']

def find_similar_paintings(query_id, embeddings, image_ids, top_n=5):
    query_index = np.where(image_ids == query_id)[0]
    if len(query_index) == 0:
        return f"Image ID {query_id} not found in the dataset."
    query_embedding = embeddings[query_index[0]].reshape(1, -1)

    similarity_scores = cosine_similarity(query_embedding, embeddings)[0]
    
    # Get top N indices based on similarity scores
    top_indices = similarity_scores.argsort()[::-1][1:top_n+1]  # Exclude the query painting
    top_similar = [(image_ids[idx], similarity_scores[idx]) for idx in top_indices]
    
    return top_similar

query_image_id = '2'  # Replace with an actual id from the dataset
top_similar_paintings = find_similar_paintings(query_image_id, embeddings, image_ids, top_n=5)
print(f"Top similar paintings for {query_image_id}:")
for similar_id, score in top_similar_paintings:
    print(f"ID: {similar_id}, Similarity Score: {score}")


Top similar paintings for 2:
ID: 4, Similarity Score: 0.9651362895965576
ID: 12143, Similarity Score: 0.8947922587394714
ID: 344, Similarity Score: 0.891901969909668
ID: 3, Similarity Score: 0.891572117805481
ID: 46054, Similarity Score: 0.8845522403717041


### generating metadata embedding

In [7]:
import pandas as pd
from sentence_transformers import SentenceTransformer

metadata_file = 'combined_metadata.csv'
metadata_df = pd.read_csv(metadata_file)

# Pretrained sentencetransformer for text embeddings
text_model = SentenceTransformer('all-MiniLM-L6-v2')

text_columns = ['title', 'medium', 'term']
metadata_texts = metadata_df[text_columns].fillna('').agg(' '.join, axis=1)  # Combine text columns

# Generate metadata embeddings
metadata_embeddings = text_model.encode(metadata_texts.tolist(), show_progress_bar=True)
print(f"Generated metadata embeddings for {metadata_df.shape[0]} rows.")

Batches:   0%|          | 0/3580 [00:00<?, ?it/s]

Generated metadata embeddings for 114543 rows.


### combining metadata and visual embeddings

In [23]:
metadata_df['depictstmsobjectid'] = metadata_df['depictstmsobjectid'].astype(str)
visual_ids = visual_data['ids'].astype(str)

common_ids = set(visual_ids).intersection(metadata_df['depictstmsobjectid'])
print(f"Found {len(common_ids)} common IDs.")

filtered_metadata = metadata_df[metadata_df['depictstmsobjectid'].isin(common_ids)]
filtered_visual_ids = [vid for vid in visual_ids if vid in common_ids]
filtered_visual_embeddings = [visual_embeddings[i] for i, vid in enumerate(visual_ids) if vid in common_ids]

combined_embeddings = []
combined_ids = []

for idx, row in filtered_metadata.iterrows():
    image_id = row['depictstmsobjectid']
    visual_idx = filtered_visual_ids.index(image_id)
    multimodal_embedding = np.concatenate((filtered_visual_embeddings[visual_idx], metadata_embeddings[idx]))
    combined_embeddings.append(multimodal_embedding)
    combined_ids.append(image_id)

np.savez_compressed('multimodal_embeddings.npz', ids=combined_ids, embeddings=combined_embeddings)
print(f"Saved multimodal embeddings for {len(combined_ids)} paintings.")


Found 114543 common IDs.
Saved multimodal embeddings for 114543 paintings.


### similarity search, finding top n similar images

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

multimodal_data = np.load('multimodal_embeddings.npz')
multimodal_ids = multimodal_data['ids']
multimodal_embeddings = multimodal_data['embeddings']

def find_similar_multimodal(query_id, embeddings, ids, top_n=5):
    query_idx = np.where(ids == query_id)[0]
    if len(query_idx) == 0:
        return f"Image ID {query_id} not found."
    query_embedding = embeddings[query_idx[0]].reshape(1, -1)

    similarity_scores = cosine_similarity(query_embedding, embeddings)[0]

    # Get top N indices based on similarity scores
    top_indices = similarity_scores.argsort()[::-1][1:top_n+1]  # Excluding the query painting itself
    return [(ids[idx], similarity_scores[idx]) for idx in top_indices]

query_id = '16190'  # Replace with an actual painting id
top_similar = find_similar_multimodal(query_id, multimodal_embeddings, multimodal_ids, top_n=5)
print(f"Top multimodal matches for {query_id}:")
for similar_id, score in top_similar:
    print(f"ID: {similar_id}, Similarity Score: {score}")


Top multimodal matches for 16190:
ID: 18107, Similarity Score: 0.9289279580116272
ID: 16632, Similarity Score: 0.9231486916542053
ID: 17800, Similarity Score: 0.9200115203857422
ID: 16621, Similarity Score: 0.918602466583252
ID: 19704, Similarity Score: 0.916595458984375


### tuning weights of visual and metadata

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [8]:
# Step 1: generate metadata embeddings
def generate_metadata_embeddings(metadata_file, text_columns, model_name='all-MiniLM-L6-v2'):
    metadata_df = pd.read_csv(metadata_file)
    
    metadata_texts = metadata_df[text_columns].fillna('').agg(' '.join, axis=1)
    
    text_model = SentenceTransformer(model_name)
    
    metadata_embeddings = text_model.encode(metadata_texts.tolist(), show_progress_bar=True)
    
    print(f"Generated metadata embeddings for {metadata_df.shape[0]} rows.")
    return metadata_embeddings, metadata_df

metadata_file = 'combined_metadata.csv'
text_columns = ['title', 'medium', 'term']
metadata_embeddings, metadata_df = generate_metadata_embeddings(metadata_file, text_columns)

Batches:   0%|          | 0/3580 [00:00<?, ?it/s]

Generated metadata embeddings for 114543 rows.


In [9]:
# Step 2: Combine Metadata and Visual Embeddings
def combine_embeddings(visual_embedding_file, metadata_embeddings, metadata_df, id_column='depictstmsobjectid', visual_weight=0.7, metadata_weight=0.3):
    visual_data = np.load(visual_embedding_file)
    visual_ids = visual_data['ids'].astype(str)  #ensuring ids are strings
    visual_embeddings = visual_data['embeddings']
    
    #ensuring metadata ids are strings
    metadata_df['depictstmsobjectid'] = metadata_df['depictstmsobjectid'].astype(str)
    
    # Finding common IDs
    common_ids = set(visual_ids).intersection(metadata_df['depictstmsobjectid'])
    print(f"Found {len(common_ids)} common IDs.")
    
    # Filtering metadata and visual embeddings to include only common ids
    filtered_metadata = metadata_df[metadata_df['depictstmsobjectid'].isin(common_ids)]
    filtered_visual_ids = [vid for vid in visual_ids if vid in common_ids]
    filtered_visual_embeddings = [visual_embeddings[i] for i, vid in enumerate(visual_ids) if vid in common_ids]
    
    combined_embeddings = []
    combined_ids = []

    for idx, row in filtered_metadata.iterrows():
        image_id = row['depictstmsobjectid']
        visual_idx = filtered_visual_ids.index(image_id)
        
        visual_embedding = normalize(filtered_visual_embeddings[visual_idx].reshape(1, -1), norm='l2')[0]
        metadata_embedding = normalize(metadata_embeddings[idx].reshape(1, -1), norm='l2')[0]
        
        # Applyin weights and combine embeddings
        multimodal_embedding = np.concatenate(
            (visual_embedding * visual_weight, metadata_embedding * metadata_weight)
        )
        combined_embeddings.append(multimodal_embedding)
        combined_ids.append(image_id)
    
    print(f"Combined multimodal embeddings for {len(combined_ids)} paintings.")
    return np.array(combined_embeddings), combined_ids

visual_embedding_file = 'image_embeddings.npz'
visual_weight = 0.7
metadata_weight = 0.3
combined_embeddings, combined_ids = combine_embeddings(
    visual_embedding_file, metadata_embeddings, metadata_df, visual_weight=visual_weight, metadata_weight=metadata_weight
)

output_embedding_file = 'multimodal_embeddings_finetuned.npz'
np.savez_compressed(output_embedding_file, ids=combined_ids, embeddings=combined_embeddings)
print(f"Saved multimodal embeddings to {output_embedding_file}.")

Found 114543 common IDs.
Combined multimodal embeddings for 114543 paintings.
Saved multimodal embeddings to multimodal_embeddings_finetuned.npz.


In [10]:
# Step 3: Perform Similarity Search
def find_similar_multimodal(query_id, embeddings, ids, top_n=5):
    ids = np.array(ids)

    query_idx = np.where(ids == query_id)[0]
    if len(query_idx) == 0:  #handle missing query ID
        return []
    query_idx = query_idx[0]  #extracting scalar index if there's only one match

    query_embedding = embeddings[query_idx].reshape(1, -1)
    similarity_scores = cosine_similarity(query_embedding, embeddings)[0]

    # Get top N indices based on similarity scores
    top_indices = similarity_scores.argsort()[::-1][1:top_n+1]  # Exclude the query painting itself
    return [(ids[idx], similarity_scores[idx]) for idx in top_indices]


query_id = '16190'  # Replace with an actual painting id
top_similar = find_similar_multimodal(query_id, combined_embeddings, combined_ids, top_n=5)
print(f"Top multimodal matches for {query_id}:")
for similar_id, score in top_similar:
    print(f"ID: {similar_id}, Similarity Score: {score}")

Top multimodal matches for 16190:
ID: 18107, Similarity Score: 0.8965755701065063
ID: 25625, Similarity Score: 0.8778117895126343
ID: 26883, Similarity Score: 0.8773630857467651
ID: 19704, Similarity Score: 0.8735079169273376
ID: 17621, Similarity Score: 0.8729293346405029


## similarity search, finding pairwise similarity data

In [3]:
import pandas as pd

def save_labels_incrementally(metadata_df, group_columns, output_csv):
    # Combine columns to form a unique group key
    metadata_df[group_columns] = metadata_df[group_columns].fillna('')  # Replace NaN with empty string
    metadata_df['group_key'] = metadata_df[group_columns].astype(str).agg('_'.join, axis=1)

    with open(output_csv, 'w') as f:
        f.write('painting_id_1,painting_id_2,label\n')  # Header row

    groups = metadata_df.groupby('group_key')
    for _, group in groups:
        ids = group['depictstmsobjectid'].tolist()
        label_data = []
        for i, id1 in enumerate(ids):
            for id2 in ids[i + 1:]:             # Pair paintings within the same group
                label_data.append({'painting_id_1': id1, 'painting_id_2': id2, 'label': 1})

        pd.DataFrame(label_data).to_csv(output_csv, mode='a', index=False, header=False)

metadata_file = 'combined_metadata.csv'
metadata_df = pd.read_csv(metadata_file)
output_csv = 'labels_metadata.csv'

save_labels_incrementally(metadata_df, group_columns=['medium', 'classification', 'term'], output_csv=output_csv)
print(f"Labels saved incrementally to {output_csv}.")

Labels saved incrementally to labels_metadata.csv.


In [4]:
import faiss
import numpy as np
import pandas as pd

def generate_predictions_and_save_faiss(embeddings, ids, output_csv, top_n=10):
    embeddings = embeddings.astype('float32')

    # Create Faiss index for L2 (Euclidean distance)
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)

    with open(output_csv, 'w') as f:
        f.write('painting_id_1,painting_id_2,prediction\n')  # Header row

    # Searching for top-N nearest neighbors
    distances, indices = index.search(embeddings, top_n + 1)
    for i, neighbors in enumerate(indices):
        prediction_data = []
        for neighbor_idx, distance in zip(neighbors[1:], distances[i][1:]):  # Exclude self-match
            similarity = 1 / (1 + distance)  # Convert distance to similarity
            prediction_data.append({
                'painting_id_1': ids[i],
                'painting_id_2': ids[neighbor_idx],
                'prediction': similarity
            })

        pd.DataFrame(prediction_data).to_csv(output_csv, mode='a', index=False, header=False)

multimodal_embeddings = np.load('multimodal_embeddings.npz')['embeddings']
multimodal_ids = np.load('multimodal_embeddings.npz')['ids']
output_csv = 'predictions.csv'
generate_predictions_and_save_faiss(multimodal_embeddings, multimodal_ids, output_csv, top_n=10)
print(f"Predictions saved incrementally to {output_csv}.")

Predictions saved incrementally to predictions.csv.


In [6]:
import json

def combine_labels_and_predictions_to_json(labels_csv, predictions_csv, output_json):
    labels_df = pd.read_csv(labels_csv)
    predictions_df = pd.read_csv(predictions_csv)

    # Merging on painting pairs
    combined_df = pd.merge(
        labels_df, predictions_df,
        how='outer', on=['painting_id_1', 'painting_id_2']
    )

    combined_data = {}
    for _, row in combined_df.iterrows():
        pair = (row['painting_id_1'], row['painting_id_2'])
        combined_data[str(pair)] = {
            "label": row['label'] if 'label' in row else None,
            "prediction": row['prediction'] if 'prediction' in row else None
        }
    
    with open(output_json, 'w') as f:
        json.dump(combined_data, f, indent=4)

combine_labels_and_predictions_to_json(
    'labels_metadata.csv', 'predictions.csv', 'labels_and_predictions.json'
)
print(f"Combined labels and predictions saved to labels_and_predictions.json.")

Combined labels and predictions saved to labels_and_predictions.json.


In [7]:
def evaluate_map_from_combined_file(file_path, k=10):
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    query_to_predictions = {}
    query_to_labels = {}
    
    for pair, values in data.items():
        pair = eval(pair)  # Convert string keys back to tuples
        painting_1, painting_2 = pair
        if values["label"] is not None and values["prediction"] is not None:
            if painting_1 not in query_to_predictions:
                query_to_predictions[painting_1] = []
                query_to_labels[painting_1] = []
            query_to_predictions[painting_1].append((painting_2, values["prediction"]))
            query_to_labels[painting_1].append(values["label"])
    
    # Calculate Average Precision for each query
    average_precisions = []
    for query, results in query_to_predictions.items():
        # Sort results by predicted scores (descending)
        sorted_results = sorted(results, key=lambda x: x[1], reverse=True)[:k]
        relevant_labels = [query_to_labels[query][i] for i, _ in enumerate(sorted_results)]
        
        # Precision at each rank
        precisions = []
        num_relevant = 0
        for i, label in enumerate(relevant_labels):
            if label == 1:
                num_relevant += 1
                precisions.append(num_relevant / (i + 1))
        
        # Average Precision
        if precisions:
            average_precisions.append(sum(precisions) / len(precisions))
        else:
            average_precisions.append(0.0)
    
    # Mean Average Precision mAP
    return sum(average_precisions) / len(average_precisions)

combined_file = 'labels_and_predictions.json'
map_score = evaluate_map_from_combined_file(combined_file, k=5)
print("Mean Average Precision (mAP):", map_score)

Mean Average Precision (mAP): 0.2356752970985181


In [11]:
import json
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_f1_score_from_combined_file_with_cleaning(file_path, threshold=0.5):
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    y_true = []
    y_pred = []

    for pair, values in data.items():
        label = values["label"]
        prediction = values["prediction"]

        if label is None or label != label:  # Check for None or NaN
            label = 0  # Default to dissimilar

        if prediction is None or prediction != prediction:  # Check for None or NaN
            prediction = 0.0  # Default low similarity

        y_true.append(label)
        y_pred.append(int(prediction >= threshold))  # Apply threshold to predictions
    
    if not y_true or not y_pred:
        raise ValueError("No valid pairs found for evaluation. Please check your data.")

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return {"Precision": precision, "Recall": recall, "F1-Score": f1}

combined_file = 'labels_and_predictions.json'
metrics = evaluate_f1_score_from_combined_file_with_cleaning(combined_file, threshold=0.7)
print("Precision:", metrics["Precision"])
print("Recall:", metrics["Recall"])
print("F1-Score:", metrics["F1-Score"])

Precision: 0.2916666666666667
Recall: 5.827998771790887e-07
F1-Score: 1.1655974252951958e-06


In [14]:
import json
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def update_pairwise_similarity(painting_id_1, painting_id_2, embeddings, ids, labels_and_predictions_file):
    ids = np.array(ids)

    idx1 = np.where(ids == painting_id_1)[0]
    idx2 = np.where(ids == painting_id_2)[0]

    if len(idx1) == 0 or len(idx2) == 0:  # Handle missing ids
        return f"One or both painting IDs ({painting_id_1}, {painting_id_2}) not found."

    idx1, idx2 = idx1[0], idx2[0]

    similarity_score = float(cosine_similarity(
        embeddings[idx1].reshape(1, -1), embeddings[idx2].reshape(1, -1)
    )[0][0])  # Convert to standard Python float

    try:
        with open(labels_and_predictions_file, 'r') as f:
            labels_and_predictions = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        labels_and_predictions = {}

    # Formating the pair as a string to match JSON structure
    pair_key = str((painting_id_1, painting_id_2))

    existing_entry = labels_and_predictions.get(pair_key, {})
    label = existing_entry.get("label", None)  # Retrieve existing label or leave as None
    labels_and_predictions[pair_key] = {
        "label": label,
        "prediction": similarity_score
    }

    with open(labels_and_predictions_file, 'w') as f:
        json.dump(labels_and_predictions, f, indent=4)

    return labels_and_predictions[pair_key]

painting_id_1 = '16190'  # Replace with actual painting id
painting_id_2 = '17800'  # Replace with actual painting id

multimodal_embeddings = np.load('multimodal_embeddings.npz')['embeddings']
multimodal_ids = np.load('multimodal_embeddings.npz')['ids']

labels_and_predictions_file = 'labels_and_predictions.json'

result = update_pairwise_similarity(
    painting_id_1, painting_id_2, multimodal_embeddings, multimodal_ids, labels_and_predictions_file
)
print("Updated pairwise similarity data:", result)

Updated pairwise similarity data: {'label': None, 'prediction': 0.9200115203857422}
