In [1]:
import pandas as pd
import requests
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModel



In [2]:
# Image side: pull the CLIP weights and the matching preprocessor from one
# checkpoint id so the two can never drift apart.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# Text side: encoder and tokenizer are both resolved from the same checkpoint id.
TEXT_CHECKPOINT = "prajjwal1/bert-tiny"
text_model = AutoModel.from_pretrained(TEXT_CHECKPOINT)
text_tokenizer = AutoTokenizer.from_pretrained(TEXT_CHECKPOINT)

In [10]:
# Smoke test: embed the first dataset row with the text model before running
# the full batch. Uses text_model / text_tokenizer loaded in the previous cell.

# Load from local unified dataset
df = pd.read_csv('../data/03_primary/unified_art_dataset.csv')
sample = df.iloc[0]

# Build the input text from whichever text fields are actually present.
# pandas represents an empty CSV cell as float NaN, and `str + float` raises
# TypeError, so missing columns / NaN cells are skipped and everything else is
# coerced to str before joining.
text_fields = ('title', 'utterance', 'emotion_text')
text = " ".join(
    str(sample[field])
    for field in text_fields
    if field in sample.index and pd.notna(sample[field])
)
if not text:
    # Same placeholder the batch cell uses, so both cells embed empty rows alike
    text = "No description available"

# Generate text embedding
text_inputs = text_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
with torch.no_grad():
    # Mean over the token axis collapses the per-token vectors into one vector
    text_embedding = text_model(**text_inputs).last_hidden_state.mean(dim=1)

print("Text embedding shape:", text_embedding.shape)
print("Sample text:", text[:100])


Text embedding shape: torch.Size([1, 128])
Sample text: acolman-1-1955 I like the woods and as this looks like a graveyard, it soothes. I like the woods and


In [None]:
# Batch generate text embeddings for 10k samples.
# Uses text_model / text_tokenizer loaded earlier in the notebook.

# Load from local unified dataset
df = pd.read_csv('../data/03_primary/unified_art_dataset.csv')
df_subset = df.head(10000).copy()  # 10k samples

# Only consult the text columns that exist in this dataset
text_fields = [col for col in ('title', 'utterance', 'emotion_text') if col in df_subset.columns]

text_embeddings = []
# Count iterations explicitly so progress reporting does not depend on the
# DataFrame index happening to be 0..n-1.
for count, (_, row) in enumerate(df_subset.iterrows(), start=1):
    # Skip NaN cells: formatting them into the string would inject the literal
    # word "nan" into the text and stop the empty-text fallback from ever firing.
    text = " ".join(str(row[col]) for col in text_fields if pd.notna(row[col])).strip()
    if not text:
        text = "No description available"

    inputs = text_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        # Mean over the token axis, then drop the leading batch axis of size 1
        embedding = text_model(**inputs).last_hidden_state.mean(dim=1).squeeze(0)

    # Keep a plain list of floats: to_csv then writes "[0.1, 0.2, ...]", which
    # json.loads / ast.literal_eval can read back, instead of a numpy repr
    # (whitespace-separated, line-wrapped) that needs custom parsing.
    text_embeddings.append(embedding.tolist())

    if count % 100 == 0:
        print(f"Processed {count} text embeddings")

# Build the column as an object Series so each cell holds one whole vector
df_subset['text_embedding'] = pd.Series(text_embeddings, index=df_subset.index, dtype=object)
print(f"Generated {len(text_embeddings)} text embeddings")

# Save text embedded dataset
df_subset.to_csv('../data/04_feature/text_embedded_art_dataset.csv', index=False)

Trying to download image from: https://uploads6.wikiart.org/images/aaron-siskind/acolman-1-1955.jpg
Response status: 200
Image downloaded successfully
Image embedding shape: torch.Size([1, 512])


In [None]:
# Generate CLIP image embeddings for a subset of artworks and save them.
# Relies on the imports in the first cell and on clip_model / clip_processor
# loaded near the top of the notebook (no need to load the weights twice).

# Load dataset (text-embedded dataset). The path uses the same '../data' root
# as the cell that writes this file, so the notebook runs end to end from one
# working directory.
df = pd.read_csv('../data/04_feature/text_embedded_art_dataset.csv')

# Filter for high-repetition artworks (image_name appears in at least 5 rows)
repetition_counts = df.groupby('image_name').size()
high_rep_artworks = repetition_counts[repetition_counts >= 5].index
filtered_df = df[df['image_name'].isin(high_rep_artworks)]

# Select 1k artworks: one row per image_name, ordered by style then artist.
# NOTE(review): because of the sort, head(1000) keeps the first 1000 artworks
# in style/artist order rather than a spread across styles — confirm that this
# is the intended notion of "diverse".
filtered_df = filtered_df.sort_values(['style', 'artist']).drop_duplicates(subset=['image_name']).head(1000)

WIKIART_IMAGE_ROOT = "https://uploads6.wikiart.org/images"

# Browser-like User-Agent sent with every request; loop-invariant, so built once.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}


def _wikiart_image_url(image_name):
    """Build the download URL for one `image_name` value.

    A name of the form "<folder>/<artist>_<title>.jpg" maps to
    "<root>/<artist>/<title>.jpg". Only the FIRST underscore separates artist
    from title, so underscores inside the title are kept rather than turned
    into extra path segments. Any other shape (no folder part, or no
    underscore in the file stem) falls back to "<root>/<image_name>" instead
    of raising.
    """
    parts = image_name.split('/')
    if len(parts) == 2:
        stem = parts[1]
        if stem.endswith('.jpg'):
            stem = stem[:-len('.jpg')]
        artist, separator, title = stem.partition('_')
        if separator:
            return f"{WIKIART_IMAGE_ROOT}/{artist}/{title}.jpg"
    return f"{WIKIART_IMAGE_ROOT}/{image_name}"


embeddings = []
successful_rows = []

for _, row in filtered_df.iterrows():
    image_name = row.get('image_name', '')
    # NaN (float) is truthy, so check the type as well as emptiness
    if not isinstance(image_name, str) or not image_name:
        print("No image_name")
        continue

    image_url = _wikiart_image_url(image_name)
    try:
        # The context manager releases the streamed connection on every path,
        # including non-200 responses and decode errors.
        with requests.get(image_url, stream=True, headers=headers, timeout=10) as response:
            if response.status_code != 200:
                print(f"Failed download: {response.status_code}")
                continue
            # response.raw skips requests' transfer decoding (gzip/deflate)
            # unless it is asked for explicitly.
            response.raw.decode_content = True
            # Image.open is lazy; convert() forces the full read while the
            # connection is still open and normalises the mode to RGB.
            image = Image.open(response.raw).convert("RGB")
        inputs = clip_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            # Drop the leading batch axis of size 1
            embedding = clip_model.get_image_features(**inputs).squeeze(0)
    except Exception as e:
        # Best effort: one bad image must not abort the whole run
        print(f"Error: {e}")
        continue

    # Plain list of floats so the CSV cell is "[0.1, 0.2, ...]" and can be read
    # back with json.loads / ast.literal_eval.
    embeddings.append(embedding.tolist())
    successful_rows.append(row)
    if len(successful_rows) % 10 == 0:
        print(f"Processed {len(successful_rows)} image embeddings")

# Save results (only rows whose image was downloaded and embedded)
final_df = pd.DataFrame(successful_rows)
final_df['image_embedding'] = pd.Series(embeddings, index=final_df.index, dtype=object)
final_df.to_csv('../data/04_feature/embedded_art_dataset.csv', index=False)