In [11]:
import os
import sys
import shutil
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

# Add the sibling "../code" directory to the import search path; the vipm_*
# imports that follow presumably resolve from there — TODO confirm layout.
project_path = os.path.abspath("../code") 
sys.path.append(project_path)
from vipm_llabels import ResNet50FeatureExtractor
from vipm_image_retrieval import ImageRetrievalKNN, ImageRetrievalBestFit, ImageRetrievalKNNCentroids
from vipm_dataset_cleaner import DatasetCleaner

In [12]:
# Load the CSV file
def load_csv(csv_path):
    """Read a header-less, two-column CSV and return its columns as lists.

    The first column is exposed as ``image_name`` and the second as ``label``.

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A tuple ``(image_names, labels)`` of two plain Python lists, in file
        row order.
    """
    column_names = ['image_name', 'label']
    frame = pd.read_csv(csv_path, header=None, names=column_names)
    return tuple(frame[column].tolist() for column in column_names)

# Paths used throughout the notebook (all relative to the notebook's folder).
csv_path = '../dataset/train_small.csv'   
csv_unlabeled = '../dataset/train_unlabeled.csv'
indir = '../dataset/train_set'  # Adjust to where the images are located
outdir = '../llabels'  # Adjust to where the features are stored
# Create the output directory up front; a pre-existing directory is not an error.
os.makedirs(outdir, exist_ok=True)

In [13]:
# Read image names (and labels, where available) from the two CSV files.
image_names, labels = load_csv(csv_path)
image_names_unlabeled, _ = load_csv(csv_unlabeled)

# Both feature sets are obtained with the same directories and normalization,
# so the shared keyword arguments are stated once.
extractor = ResNet50FeatureExtractor()
_feature_kwargs = dict(indir=indir, outdir=outdir, normalize=True)

# get_features returns three values; only the first (the features) is kept.
features_unlabeled, _, _ = extractor.get_features(csv=csv_unlabeled, **_feature_kwargs)
features_small, _, _ = extractor.get_features(csv=csv_path, **_feature_kwargs)


Caricamento delle feature da ../features\train_unlabeled_resnet50_features_normalized.npz
Caricamento delle feature da ../features\train_small_resnet50_features_normalized.npz


In [14]:
def visualize_retrieved_images(images, indices, predictions, class_label, outdir, num_images=10, image_dir=None):
    """Show up to ``num_images`` retrieved images predicted as ``class_label``.

    Args:
        images: Sequence of image file names, indexable by the values stored
            in ``indices``.
        indices: Sequence mapping each position of ``predictions`` to an index
            into ``images``.
        predictions: Predicted class for each retrieved item; compared to
            ``class_label`` with ``==``.
        class_label: Class whose retrieved images should be displayed.
        outdir: Unused; kept so existing callers keep working.
        num_images: Maximum number of images to display.
        image_dir: Directory containing the image files. Defaults to the
            notebook-level ``indir`` when omitted.

    Returns:
        None. The images are drawn in a grid with 10 columns.
    """
    # Positions (into predictions/indices) whose prediction matches the class.
    selected_indices = [
        idx for idx, pred in enumerate(predictions) if pred == class_label
    ][:num_images]

    # Nothing to draw: bail out before creating a zero-row, zero-height figure.
    if not selected_indices:
        print(f"No images found for class {class_label}.")
        return

    if image_dir is None:
        image_dir = indir

    # Number of rows needed for a grid with at most num_cols images per row.
    num_cols = 10
    num_rows = (len(selected_indices) + num_cols - 1) // num_cols  # ceiling division

    plt.figure(figsize=(15, 5 * num_rows))
    for i, idx in enumerate(selected_indices):
        img_path = os.path.join(image_dir, images[indices[idx]])

        # Place the image in the grid; the file is closed once it is drawn.
        plt.subplot(num_rows, num_cols, i + 1)
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.axis('off')
        plt.title(f"Retrieved {i + 1}")

    plt.tight_layout()
    plt.show()

In [15]:
def visualize_discarted_images(images, indices, labels, class_label, outdir, num_images=10, image_dir=None):
    """Show up to ``num_images`` discarded images whose label is ``class_label``.

    Args:
        images: Sequence of image file names, indexable by the values stored
            in ``indices``.
        indices: Sequence mapping each position of ``labels`` to an index into
            ``images``.
        labels: Label of each discarded item; compared to ``class_label`` with
            ``==``.
        class_label: Class whose discarded images should be displayed.
        outdir: Unused; kept so existing callers keep working.
        num_images: Maximum number of images to display.
        image_dir: Directory containing the image files. Defaults to the
            notebook-level ``indir`` when omitted.

    Returns:
        None. The images are drawn in a grid with 10 columns.
    """
    # Positions (into labels/indices) whose label matches the requested class.
    selected_indices = [
        idx for idx, label in enumerate(labels) if label == class_label
    ][:num_images]

    # Nothing to draw: bail out before creating a zero-row, zero-height figure.
    if not selected_indices:
        print(f"No images found for class {class_label}.")
        return

    if image_dir is None:
        image_dir = indir

    # Number of rows needed for a grid with at most num_cols images per row.
    num_cols = 10
    num_rows = (len(selected_indices) + num_cols - 1) // num_cols  # ceiling division

    plt.figure(figsize=(15, 5 * num_rows))
    for i, idx in enumerate(selected_indices):
        img_path = os.path.join(image_dir, images[indices[idx]])

        # Place the image in the grid; the file is closed once it is drawn.
        plt.subplot(num_rows, num_cols, i + 1)
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.axis('off')
        plt.title(f"Discarted {i + 1}")

    plt.tight_layout()
    plt.show()

In [16]:
# Seed NumPy's global random generator so the cells below are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Centroidi 20

## 100% dataset

In [46]:
# --- Retrieval without cleaning -------------------------------------------
# NumPy copies of the name lists so they can be indexed with index arrays.
image_names_unlabeled_numpy = np.array(image_names_unlabeled)
image_names_numpy = np.array(image_names)

retrival_centroidi = ImageRetrievalKNNCentroids(queryset=features_unlabeled, dataset=features_small, dataset_labels=labels, n_image_per_class=20, algo='ball_tree')
indices_centroidi, centroidi_labels = retrival_centroidi.retrieve_images()

# --- Retrieval with cleaning ----------------------------------------------
# NOTE(review): presumably runs an isolation-forest outlier filter per class
# and returns a flat collection of accepted row indices — confirm against
# vipm_dataset_cleaner.
cleaner = DatasetCleaner(features=features_small, class_info=labels, clean_criterion=DatasetCleaner.clean_criterion_isolation_forest)
accepted_indices_by_class = cleaner.clean_dataset_by_class(contamination="auto")

# Everything that was not accepted counts as discarded. A set keeps the
# membership test constant-time instead of scanning the accepted indices
# once per sample.
_accepted_lookup = set(accepted_indices_by_class)
discarted_indices_by_class = [idx for idx in range(len(labels)) if idx not in _accepted_lookup]
labels_discarted = [labels[idx] for idx in discarted_indices_by_class]

# Features and labels restricted to the accepted samples.
features_small_filtered = np.array([features_small[idx] for idx in accepted_indices_by_class])
features_small_filtered_labels = np.array([labels[idx] for idx in accepted_indices_by_class])

retrival_centroidi_cleaned = ImageRetrievalKNNCentroids(queryset=features_unlabeled, dataset=features_small_filtered, dataset_labels=features_small_filtered_labels, n_image_per_class=20, algo='ball_tree')
# Query the retriever built on the cleaned dataset (not the uncleaned one),
# otherwise the "cleaned" results are just a copy of the uncleaned results.
indices_centroidi_cleaned, centroidi_labels_cleaned = retrival_centroidi_cleaned.retrieve_images()

In [47]:
# Retrieved unlabeled images with their assigned labels, using the retrieval
# results obtained without and with dataset cleaning.
retrieved_images, retrieved_labels = (
    image_names_unlabeled_numpy[indices_centroidi],
    centroidi_labels,
)
retrieved_images_cleaned, retrieved_labels_cleaned = (
    image_names_unlabeled_numpy[indices_centroidi_cleaned],
    centroidi_labels_cleaned,
)

# The labeled small set: complete, and restricted to the accepted indices.
small_images, small_labels = image_names_numpy, labels
small_images_filtered = image_names_numpy[accepted_indices_by_class]
small_labels_filtered = [labels[accepted_idx] for accepted_idx in accepted_indices_by_class]

In [49]:
import os
import pandas as pd
import shutil

# Create one output folder per combination of (small set, retrieved set).
folders = [
    'small_with_cleaned_retrieval',
    'filtered_small_with_retrieval',
    'small_with_retrieval',
    'filtered_small_with_cleaned_retrieval'
]
for folder in folders:
    os.makedirs(folder, exist_ok=True)

# Each entry: (small images, retrieved images, small labels, retrieved labels,
# destination folder).
combinations = [
    (small_images, retrieved_images_cleaned, small_labels, retrieved_labels_cleaned, 'small_with_cleaned_retrieval'),
    (small_images_filtered, retrieved_images, small_labels_filtered, retrieved_labels, 'filtered_small_with_retrieval'),
    (small_images, retrieved_images, small_labels, retrieved_labels, 'small_with_retrieval'),
    (small_images_filtered, retrieved_images_cleaned, small_labels_filtered, retrieved_labels_cleaned, 'filtered_small_with_cleaned_retrieval')
]

# Copy the images and write one labels CSV per combination.
for img_set1, img_set2, label_set1, label_set2, folder in combinations:
    # Concatenate the labeled and the retrieved parts, keeping images and
    # labels aligned by position.
    combined_images = list(img_set1) + list(img_set2)
    print("shape combined_images: ", len(combined_images))
    combined_labels = list(label_set1) + list(label_set2)
    print("shape combined_labels: ", len(combined_labels))

    # Copy every image from the source directory into the destination folder.
    for image_name in combined_images:
        shutil.copy(os.path.join(indir, image_name), folder)

    # Build the image/label table for this combination.
    df = pd.DataFrame({'image_name': combined_images, 'label': combined_labels})

    # Save the CSV inside the folder. A dedicated name is used so the
    # notebook-level `csv_path` (the training CSV) is not overwritten, which
    # would break a later re-run of the feature-loading cell.
    labels_csv_path = os.path.join(folder, f'{folder}_labels.csv')
    df.to_csv(labels_csv_path, index=False)

    print(f"Cartella '{folder}' creata con immagini e CSV.")


shape combined_images:  10040
shape combined_labels:  10040
Cartella 'small_with_cleaned_retrieval' creata con immagini e CSV.
shape combined_images:  9244
shape combined_labels:  9244
Cartella 'filtered_small_with_retrieval' creata con immagini e CSV.
shape combined_images:  10040
shape combined_labels:  10040
Cartella 'small_with_retrieval' creata con immagini e CSV.
shape combined_images:  9244
shape combined_labels:  9244
Cartella 'filtered_small_with_cleaned_retrieval' creata con immagini e CSV.
