In [1]:
import json
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

In [4]:
# Photo metadata is read with lines=True, i.e. one JSON record per line.
url = "data/philly_restaurant_photos.json"
photos_dir = Path("data/photos")

photo_data = pd.read_json(url, lines=True)

# Derive the expected on-disk location of every picture from its photo_id.
photo_data['photo_path'] = [
    str(photos_dir / f"{photo_id}.jpg") for photo_id in photo_data['photo_id']
]

photo_data['photo_path'].head()

0    data/photos/Le9rMdT8YFlvqr431LctIQ.jpg
1    data/photos/zNzVcwnSJ4kvjFnANIsIRg.jpg
2    data/photos/J1rqVl8pAoMJtPfGA2HV9w.jpg
3    data/photos/56xUu0i5oOBj9GdZqIg9_w.jpg
4    data/photos/GZpflvLA8AvQ6zi8aerdHg.jpg
Name: photo_path, dtype: object

In [None]:
def get_image_embedding(image_path, model, processor):
    """Return the L2-normalised image embedding of a single image file.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image to embed.
    model :
        Object exposing ``get_image_features(**inputs)``; it is called under
        ``torch.no_grad()`` so no gradients are tracked.
    processor :
        Callable invoked as ``processor(images=image, return_tensors="pt")``;
        its result is unpacked as keyword arguments into the model call.

    Returns
    -------
    torch.Tensor
        The model's image features divided by their L2 norm along the last
        axis (the leading batch axis returned by the model is kept).

    Raises
    ------
    Any exception raised while opening or decoding the image propagates to
    the caller; nothing is caught here.
    """
    # Open inside a context manager so the file handle is released right
    # away; convert("RGB") reads the pixel data and yields an image that no
    # longer depends on the open file.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    # Prepare the image and turn it into PyTorch tensors.
    inputs = processor(images=image, return_tensors="pt")
    # Forward pass through the model without gradient tracking.
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
    # L2-normalise the embedding.
    return image_features / image_features.norm(p=2, dim=-1, keepdim=True)

In [None]:
def process_photos_by_label(df, model, processor):
    """Average image embeddings per (business_id, label) pair, one photo at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``business_id``, ``label`` and ``photo_path``.
    model, processor :
        Forwarded unchanged to ``get_image_embedding`` for every photo.

    Returns
    -------
    pandas.DataFrame
        Columns ``business_id``, ``label`` and ``embedding``. Each embedding
        is the mean of the group's per-photo embeddings, rescaled to unit L2
        norm. Groups in which no photo could be embedded are omitted. The
        three columns are present even when no row is produced.

    Notes
    -----
    A photo whose embedding raises is reported with ``print`` and skipped
    (best effort), so one unreadable file does not abort the whole run.
    """
    columns = ['business_id', 'label', 'embedding']
    results = []
    for (business_id, label), group in df.groupby(['business_id', 'label']):
        embeddings = []

        for photo_path in group['photo_path']:
            try:
                embedding = get_image_embedding(photo_path, model, processor)
                # Row 0 drops the leading batch axis of the returned tensor.
                embeddings.append(embedding[0].numpy())
            except Exception as e:
                print(f"Error processing image {photo_path}: {e}")
                continue

        if not embeddings:
            continue

        mean_emb = np.stack(embeddings).mean(axis=0)
        norm = np.linalg.norm(mean_emb)
        # Only rescale when the mean is non-zero, otherwise the division
        # would fill the vector with NaN.
        if norm > 0:
            mean_emb = mean_emb / norm
        results.append({
            'business_id': business_id,
            'label': label,
            'embedding': mean_emb
        })
    # Explicit columns keep the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=columns)


In [None]:
def get_image_embedding_batch(image_paths, model, processor, batch_size=32):
    """Embed a list of image files in mini-batches.

    Parameters
    ----------
    image_paths : list
        Image locations; the list is sliced into chunks of ``batch_size``.
    model :
        Object exposing ``get_image_features(**inputs)``; called under
        ``torch.no_grad()``.
    processor :
        Callable invoked as
        ``processor(images=images, return_tensors="pt", padding=True)``.
    batch_size : int, default 32
        Maximum number of images sent to the model per forward pass.

    Returns
    -------
    numpy.ndarray
        One L2-normalised embedding per image that could be loaded, in input
        order. When no image could be loaded the result is ``np.array([])``
        (shape ``(0,)``).

    Notes
    -----
    An image that cannot be opened or decoded is reported with ``print`` and
    skipped, so the output may have fewer rows than ``image_paths``.
    """
    all_embeddings = []
    for start in range(0, len(image_paths), batch_size):
        images = []
        for image_path in image_paths[start:start + batch_size]:
            try:
                # Decode eagerly and close the file immediately: keeping a
                # whole batch of lazily opened files would leak handles, and
                # decoding here lets a corrupt file be skipped rather than
                # fail later inside the processor.
                with Image.open(image_path) as raw_image:
                    images.append(raw_image.convert("RGB"))
            except Exception as e:
                print(f"Error loading image {image_path}: {e}")
                continue
        if not images:
            continue
        inputs = processor(images=images, return_tensors="pt", padding=True)
        with torch.no_grad():
            image_features = model.get_image_features(**inputs)
            # Normalise every row to unit L2 norm.
            image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
        all_embeddings.extend(image_features.numpy())
    return np.array(all_embeddings)

In [None]:
def process_photos_by_label_fast(df, model, processor, batch_size=32):
    """Average image embeddings per (business_id, label) pair using batched inference.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``business_id``, ``label`` and ``photo_path``.
    model, processor :
        Forwarded unchanged to ``get_image_embedding_batch``.
    batch_size : int, default 32
        Forwarded to ``get_image_embedding_batch``.

    Returns
    -------
    pandas.DataFrame
        Columns ``business_id``, ``label`` and ``embedding``. Each embedding
        is the mean of the group's embeddings, rescaled to unit L2 norm.
        Groups for which the batch helper returns no rows are omitted. The
        three columns are present even when no row is produced.
    """
    columns = ['business_id', 'label', 'embedding']
    results = []
    for (business_id, label), group in df.groupby(['business_id', 'label']):
        image_paths = group['photo_path'].tolist()
        embeddings = get_image_embedding_batch(image_paths, model, processor, batch_size)
        if embeddings.shape[0] == 0:
            continue

        mean_emb = embeddings.mean(axis=0)
        norm = np.linalg.norm(mean_emb)
        # Only rescale when the mean is non-zero, otherwise the division
        # would fill the vector with NaN.
        if norm > 0:
            mean_emb = mean_emb / norm
        results.append({
            'business_id': business_id,
            'label': label,
            'embedding': mean_emb
        })
    # Explicit columns keep the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=columns)

1. Pour chaque combinaison (business, label) :  
    1. Pour chaque photo de ce groupe :  
        - Extraire l'embedding de l'image  
        - Le convertir en tableau NumPy  
        - L'ajouter à la liste  
    2. Empiler tous les embeddings  
    3. Calculer la moyenne sur l'axe 0  
    4. Normaliser le vecteur moyen (norme L2)  
    5. Ajouter le résultat (business, label, embedding) à la liste des résultats  

In [16]:
# Allow slow model downloads from the Hugging Face Hub.
# NOTE(review): huggingface_hub appears to read this variable when it is first
# imported, and transformers is already imported in the first cell - confirm
# that setting it here still takes effect.
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '300'

# Model weights and preprocessing both come from the same checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint, use_safetensors=True)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

# One mean embedding per (business_id, label) pair, computed photo by photo.
embeddings_by_label = process_photos_by_label(photo_data, model, processor)

Error processing image /Users/teatoscanduplantier/Desktop/ENSAE/Machine_Learning/Projet/Machine-learning-for-python/Etude_photos/photos/_-TxKgOJ6Oy0MinS88ntXg.jpg: [Errno 2] No such file or directory: '/Users/teatoscanduplantier/Desktop/ENSAE/Machine_Learning/Projet/Machine-learning-for-python/Etude_photos/photos/_-TxKgOJ6Oy0MinS88ntXg.jpg'


In [20]:
# Plain-text preview of the first five (business_id, label) embeddings.
print(embeddings_by_label.head(n=5))

              business_id    label  \
0  -0M0b-XhtFagyLmsBtOe8w   inside   
1  -0TffRSXXIlBYVbb5AwfTg    drink   
2  -0TffRSXXIlBYVbb5AwfTg     food   
3  -0TffRSXXIlBYVbb5AwfTg   inside   
4  -0TffRSXXIlBYVbb5AwfTg  outside   

                                           embedding  
0  [0.009815368, -0.0013590974, 0.0011284378, -0....  
1  [-0.011883101, 0.03936019, 0.017815243, 0.0007...  
2  [0.013633062, 0.048887067, -0.0045891963, 0.01...  
3  [-0.018343521, 0.02150698, 0.0011638484, 0.002...  
4  [0.011479036, -0.057466, 0.06860343, -0.008981...  


In [21]:
# Persist the per-(business_id, label) embeddings in three forms.

# 1. The whole DataFrame as a pickle (only unpickle files you trust).
with open('embeddings_by_label.pkl', 'wb') as f:
    pickle.dump(embeddings_by_label, f)

# 2. The embeddings alone, stacked into a 2-D array with one row per DataFrame row.
embeddings_matrix = np.stack(embeddings_by_label['embedding'].to_numpy())
np.save('embeddings_matrix.npy', embeddings_matrix)

# 3. The identifying columns, written in the same row order as the matrix above.
embeddings_by_label[['business_id', 'label']].to_csv('embeddings_metadata.csv', index=False)

In [22]:
# Flat CSV export of the per-label table.
# NOTE(review): the 'embedding' cells hold arrays, which CSV can only store as
# their text representation - presumably this file is for inspection rather
# than for reloading the vectors; confirm.
embeddings_by_label.to_csv(path_or_buf='embeddings_by_label.csv', index=False)

In [23]:
# Weighting that adapts to whichever labels are actually present for a business.
def aggregate_to_business_normalized(embeddings_by_label_df, label_weights=None, default_weight=0.1):
    """Combine per-label embeddings into one unit-norm embedding per business.

    Parameters
    ----------
    embeddings_by_label_df : pandas.DataFrame
        Must provide the columns ``business_id``, ``label`` and ``embedding``
        (one array per row; all rows of a business must share the same shape).
    label_weights : mapping, optional
        Weight per label. When omitted, the built-in table is used
        (food 0.35, inside 0.35, outside 0.10, drink 0.10, menu 0.10).
        When given, it replaces the built-in table entirely.
    default_weight : float, default 0.1
        Weight applied to any label missing from the table.

    Returns
    -------
    pandas.DataFrame
        One row per business, in order of first appearance, with columns
        ``business_id``, ``embedding`` (float64, unit L2 norm), ``n_labels``
        and ``labels_present`` (labels joined by ``', '``). The columns are
        present even when the input has no rows.

    Notes
    -----
    The weighted sum is divided by the sum of the weights actually used, so
    a business lacking some labels is not penalised. If the weights sum to
    zero or the combined vector has zero norm, a zero vector is returned
    instead of NaN.
    """
    base_weights = {
        'food': 0.35,
        'inside': 0.35,
        'outside': 0.10,
        'drink': 0.10,
        'menu': 0.10
    }
    if label_weights is not None:
        base_weights = dict(label_weights)

    columns = ['business_id', 'embedding', 'n_labels', 'labels_present']
    business_embeddings = []

    # A single groupby pass replaces one boolean-mask scan per business;
    # sort=False keeps businesses in order of first appearance.
    for business_id, business_data in embeddings_by_label_df.groupby('business_id', sort=False):
        label_vectors = [np.asarray(emb) for emb in business_data['embedding']]
        # Size the float64 accumulator from the data rather than assuming 512.
        weighted_sum = np.zeros(label_vectors[0].shape)
        total_weight = 0

        for label, vector in zip(business_data['label'], label_vectors):
            weight = base_weights.get(label, default_weight)
            weighted_sum += vector * weight
            total_weight += weight  # sum of the weights actually used

        if total_weight > 0:
            # Weighted mean over the labels that are present.
            final_emb = weighted_sum / total_weight
            norm = np.linalg.norm(final_emb)
            if norm > 0:
                final_emb = final_emb / norm
        else:
            final_emb = np.zeros_like(weighted_sum)

        business_embeddings.append({
            'business_id': business_id,
            'embedding': final_emb,
            'n_labels': len(business_data),
            'labels_present': ', '.join(business_data['label'].values)
        })

    # Explicit columns keep the schema stable when there are no businesses.
    return pd.DataFrame(business_embeddings, columns=columns)

In [24]:
# Collapse the per-label embeddings into a single weighted vector per business,
# then show a plain-text preview of the first five rows.
embeddings_by_business = aggregate_to_business_normalized(embeddings_by_label)
print(embeddings_by_business.head(n=5))

              business_id                                          embedding  \
0  -0M0b-XhtFagyLmsBtOe8w  [0.009815367254057478, -0.00135909736167944, 0...   
1  -0TffRSXXIlBYVbb5AwfTg  [-0.0020635972460192956, 0.027889015327477534,...   
2  -0eUa8TsXFFy0FCxHYmrjg  [0.03804909371904837, -0.0007548362063656802, ...   
3  -1B9pP_CrRBJYPICE5WbRA  [-0.024990726315259223, 0.03304655852488704, 0...   
4  -1b2kNOowsPrPpBOK4lNkQ  [-0.009722805816285285, 0.021325755452536156, ...   

   n_labels                labels_present  
0         1                        inside  
1         4  drink, food, inside, outside  
2         1                          food  
3         2                  food, inside  
4         2                  food, inside  


In [25]:
# Flat CSV export of the per-business table.
# NOTE(review): the 'embedding' cells hold arrays, which CSV can only store as
# their text representation - confirm downstream readers expect that format.
embeddings_by_business.to_csv(path_or_buf='embeddings_by_business.csv', index=False)