In [21]:
from cifar_data import load_cifar_data
import torch
import torchvision
from torchvision import transforms
import torch.nn as nn
from tqdm import tqdm
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
import numpy as np  

In [3]:
# Load the raw CIFAR-10 arrays from disk, then convert them to tensors:
# X as float32 and y as long.
raw_features, raw_targets = load_cifar_data('../data/cifar')
X = torch.tensor(raw_features, dtype=torch.float32)
y = torch.tensor(raw_targets, dtype=torch.long)

In [4]:
# Reshape the flat CIFAR-10 rows, (3072,) each, into (N, 3, 32, 32) image tensors for ResNet18.
X = X.view(-1, 3, 32, 32)

# The ImageNet mean/std used below are expressed for pixel values in [0, 1] (the ResNet50
# pipeline in this notebook gets that range from ToTensor()). Rescale when the loaded values
# exceed 1 so the normalisation is meaningful.
# NOTE(review): assumes load_cifar_data returns unscaled [0, 255] pixels in that case — confirm.
if X.numel() > 0 and X.max() > 1.0:
    X = X / 255.0

# Channel-wise normalisation with the standard ImageNet statistics.
transform = transforms.Compose([
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# NOTE: this overwrites X, so re-running the cell without reloading the data
# would normalise the images a second time.
X = transform(X)

# Load the ImageNet-pretrained ResNet18. The explicit `weights=` enum replaces the
# deprecated `pretrained=True` flag and selects the same checkpoint.
resnet18 = torchvision.models.resnet18(
    weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1
)

# Drop the final fully connected layer: we want the features that feed the classifier,
# not the class scores.
resnet18 = nn.Sequential(*list(resnet18.children())[:-1])

# Run the images through the truncated ResNet18 to obtain their embeddings.
def get_embeddings(X, batch_size=256):
    """Compute embeddings for a tensor of images with the module-level `resnet18`.

    Images are forwarded in mini-batches rather than one at a time; the model is
    put in evaluation mode and gradients are disabled for the whole pass.

    Args:
        X: Image tensor whose first dimension indexes the images; it must contain
            at least one image and have the layout `resnet18` expects.
        batch_size: Number of images per forward pass. Must be >= 1. Results may
            differ from single-image inference only by floating-point rounding.

    Returns:
        A NumPy array with one flattened embedding row per input image.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Evaluation mode so that layers such as batch norm use their stored statistics.
    resnet18.eval()

    num_images = X.size(0)
    embeddings_list = []
    # A single no_grad context for the whole loop; tqdm still counts individual images.
    with torch.no_grad(), tqdm(total=num_images, desc="Processing Images", unit="image") as progress:
        for start in range(0, num_images, batch_size):
            batch = X[start:start + batch_size]
            features = resnet18(batch)
            # Flatten everything after the batch dimension into one feature vector per image.
            embeddings_list.append(torch.flatten(features, start_dim=1))
            progress.update(batch.size(0))

    # Stack the per-batch results into a single (num_images, features) array.
    embeddings = torch.cat(embeddings_list, dim=0).numpy()

    return embeddings




In [5]:
# `embeddings` will now hold the image representations, of shape (N, 512)
embeddings = get_embeddings(X)

Processing Images: 100%|██████████| 60000/60000 [02:23<00:00, 419.36image/s]


In [36]:
# Fit a StandardScaler on the embeddings, then produce their standardised copy.
scaler = StandardScaler().fit(embeddings)
embeddings_scaled = scaler.transform(embeddings)

In [6]:
# Build the reference dataset from the *scaled* embeddings so that the CSV matches the
# fitted `scaler` that is pickled alongside it, and matches what get_resnet50_embeddings()
# writes to the same file. Previously the unscaled `embeddings` were saved and
# `embeddings_scaled` was never used.
embeddings_df = pd.DataFrame(embeddings_scaled)
y_np = y.numpy()
embeddings_df['target'] = y_np

# Save the DataFrame to a CSV file.
# NOTE: get_resnet50_embeddings() writes to this same path and will overwrite it.
embeddings_df.to_csv('../data/ref_data.csv', index=False)

In [38]:
# Persist the ResNet embedding model and the fitted scaler as pickle artifacts.
for artifact_path, artifact in (
    ("../artifacts/resnet18_embedding.pkl", resnet18),
    ('../artifacts/scaler.pkl', scaler),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

In [22]:
def get_resnet50_embeddings(
    data_root='../data/cifar',
    batch_size=64,
    ref_data_path='../data/ref_data.csv',
    model_path="../artifacts/resnet50_embedding.pkl",
    scaler_path='../artifacts/scaler_resnet50.pkl',
):
    """Extract ResNet50 embeddings for CIFAR-10 and persist the data, model and scaler.

    The CIFAR-10 train and test splits are loaded through torchvision (downloaded
    into `data_root` if missing), resized to 224x224, normalised with the ImageNet
    statistics and passed through an ImageNet-pretrained ResNet50 whose
    classification head is replaced by an identity layer. Train and test
    embeddings are concatenated, standardised with a `StandardScaler` fit on the
    combined set, and written to `ref_data_path` together with a `target` column.
    The headless model (moved back to CPU) and the fitted scaler are pickled.

    Args:
        data_root: Directory used by `torchvision.datasets.CIFAR10`.
        batch_size: Batch size for both data loaders.
        ref_data_path: Destination CSV for the scaled embeddings and labels.
        model_path: Destination pickle for the embedding model.
        scaler_path: Destination pickle for the fitted scaler.

    Returns:
        None. All results are written to disk.
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize for ResNet50
        transforms.ToTensor(),          # Convert to a float tensor in [0, 1]
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet statistics
    ])

    # Load the train and test datasets from the same root directory.
    train_dataset = torchvision.datasets.CIFAR10(root=data_root, train=True, download=True, transform=transform)
    test_dataset = torchvision.datasets.CIFAR10(root=data_root, train=False, download=True, transform=transform)

    # shuffle=False keeps embeddings and labels in dataset order.
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Use Apple's MPS backend when available, otherwise fall back to the CPU.
    # torch.backends.mps.is_available() is the documented availability check.
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    # Explicit `weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is the
    # checkpoint that flag selected.
    resnet = torchvision.models.resnet50(
        weights=torchvision.models.ResNet50_Weights.IMAGENET1K_V1
    )
    resnet.fc = nn.Identity()  # Remove the classification head
    resnet = resnet.to(device)

    def extract_embeddings(dataloader, model, device):
        """Run `model` over `dataloader` and return (embeddings, labels) as NumPy arrays."""
        model.eval()
        embeddings = []
        labels = []
        with torch.no_grad():
            for images, targets in tqdm(dataloader):
                images = images.to(device)
                features = model(images)  # Extract features
                # Move results back to the CPU before converting to NumPy.
                embeddings.append(features.cpu().numpy())
                labels.append(targets.numpy())
        embeddings = np.concatenate(embeddings, axis=0)
        labels = np.concatenate(labels, axis=0)
        return embeddings, labels

    # Extract embeddings for the training and test datasets.
    train_embeddings, train_labels = extract_embeddings(train_loader, resnet, device)
    test_embeddings, test_labels = extract_embeddings(test_loader, resnet, device)

    # Concatenate train and test embeddings (train rows first).
    all_embeddings = np.concatenate((train_embeddings, test_embeddings), axis=0)
    all_labels = np.concatenate((train_labels, test_labels), axis=0)

    # Fit the scaler on the combined set and standardise the embeddings.
    scaler = StandardScaler()
    embeddings_scaled = scaler.fit_transform(all_embeddings)

    # Create the DataFrame and write it out.
    # NOTE: with the default path this overwrites the CSV written by the ResNet18 cells.
    embeddings_df = pd.DataFrame(embeddings_scaled)
    embeddings_df['target'] = all_labels
    embeddings_df.to_csv(ref_data_path, index=False)

    # Move the model back to the CPU before it is pickled.
    resnet.to('cpu')
    # Save the ResNet embedding model and the scaler.
    # NOTE(review): pickling a whole nn.Module stores references to its classes rather than
    # only the weights; the format is kept as-is because the consumers are not visible here.
    with open(model_path, "wb") as f:
        pickle.dump(resnet, f)

    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f)

In [23]:
# Run the full ResNet50 pipeline: extract the embeddings, then write the reference CSV
# and the pickled model/scaler artifacts.
get_resnet50_embeddings()

Files already downloaded and verified
Files already downloaded and verified


100%|██████████| 782/782 [08:27<00:00,  1.54it/s]
100%|██████████| 157/157 [01:50<00:00,  1.42it/s]
