# JULE

## Importación de librerías

In [13]:
import pandas as pd
import sys
from sklearn.model_selection import train_test_split
import time
import torch.optim as optim
import torch.nn as nn
import numpy as np
import os
import matplotlib.pyplot as plt
from pathlib import Path
import torch
import torchvision.transforms as transforms
from torchvision import models
from torch.utils.data import DataLoader,Dataset
from tqdm.notebook import tqdm # Progession bar
from PIL import Image
import torchvision.models as models
from torchvision.models import ResNet50_Weights
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

## Objetos y rutas

In [14]:

# Make the project's sibling `src` directory importable from this notebook.
src_path = os.path.abspath('../src')
if src_path not in sys.path:
    sys.path.append(src_path)

# Dataset layout: <data>/metadata holds the split CSVs,
# <data>/images and <data>/processed hold the image files.
data_dir = os.path.join('..', 'data')
metadata_dir = os.path.join(data_dir, 'metadata')
test_csv, train_csv, val_csv = (
    os.path.join(metadata_dir, csv_name)
    for csv_name in ('test_metadata.csv', 'train_metadata.csv', 'val_metadata.csv')
)

images_dir, processed_dir = (
    os.path.join(data_dir, subfolder) for subfolder in ('images', 'processed')
)
# Directory reserved for this experiment's model artifacts.
model_dir = os.path.join('..', 'models', 'jule')

In [15]:
def make_data_loaders_unsupervised(train_csv, val_csv, test_csv, processed_dir, images_dir, batch_size, image_size):
    """Build the train/val/test ``DataLoader`` objects from metadata CSV files.

    Each CSV is expected to hold the image file name (relative to the split's
    root directory) in its first column; all remaining columns are returned
    as a float32 label vector.

    Args:
        train_csv: Path to the training metadata CSV.
        val_csv: Path to the validation metadata CSV.
        test_csv: Path to the test metadata CSV.
        processed_dir: Root directory of the training images.
        images_dir: Root directory of the validation and test images.
        batch_size: Number of samples per batch for every loader.
        image_size: Images are resized to ``(image_size, image_size)``.

    Returns:
        A dict with keys ``'train'``, ``'val'`` and ``'test'`` mapping to the
        corresponding ``DataLoader``. Only the training loader shuffles.
    """
    from torch.utils.data import DataLoader, Dataset
    from PIL import Image

    class ImageDataset(Dataset):
        """Dataset yielding ``(image, labels)`` pairs described by a CSV file."""

        def __init__(self, csv_file, root_dir, transform=None):
            self.data = pd.read_csv(csv_file)
            self.root_dir = root_dir
            self.transform = transform

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            img_name = os.path.join(self.root_dir, self.data.iloc[idx, 0])
            # Open inside a context manager so the file handle is released
            # deterministically; convert() returns an independent, fully
            # loaded copy that outlives the `with` block.
            with Image.open(img_name) as raw_image:
                image = raw_image.convert('RGB')

            # Labels are every column after the first (the file name).
            labels = self.data.iloc[idx, 1:].values.astype(np.float32)

            if self.transform:
                image = self.transform(image)

            return image, labels

    # Presumably the ImageNet channel statistics, matching the pretrained
    # backbone used later in the notebook.
    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Training images are read from `processed_dir`; validation and test
    # images are read from `images_dir`.
    train_dataset = ImageDataset(train_csv, processed_dir, transform)
    val_dataset = ImageDataset(val_csv, images_dir, transform)
    test_dataset = ImageDataset(test_csv, images_dir, transform)

    dataloaders = {
        'train': DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        'val': DataLoader(val_dataset, batch_size=batch_size, shuffle=False),
        'test': DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    }
    return dataloaders


In [16]:
# Batch size 33 is a multiple of 3 — presumably so each training batch splits
# evenly into anchor/positive/negative thirds.
dataloaders = make_data_loaders_unsupervised(
    train_csv,
    val_csv,
    test_csv,
    processed_dir,
    images_dir,
    batch_size=33,
    image_size=224,
)

## CNN

In [17]:

class ResNet50Encoder(nn.Module):
    """ResNet-50 backbone followed by a linear projection to unit-norm embeddings.

    Args:
        embedding_dim: Size of the output embedding vector.
    """

    def __init__(self, embedding_dim=128):
        super().__init__()
        backbone = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        # Keep every child module except the last one (the classification head).
        backbone_layers = list(backbone.children())
        self.features = nn.Sequential(*backbone_layers[:-1])
        # Project the backbone's feature width down to the embedding size.
        self.fc = nn.Linear(backbone.fc.in_features, embedding_dim)
        self.normalize = nn.functional.normalize

    def forward(self, x):
        """Return L2-normalised embeddings for a batch of images ``x``."""
        pooled = self.features(x)
        flat = torch.flatten(pooled, 1)
        projected = self.fc(flat)
        return self.normalize(projected, p=2, dim=1)

In [18]:

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = ResNet50Encoder()
model = model.to(device)


## Función de pérdida

In [19]:
class MultiLabelTripletLoss(nn.Module):
    """Triplet margin loss on squared Euclidean distances.

    Args:
        margin: Minimum amount by which the anchor-negative distance should
            exceed the anchor-positive distance.
    """

    def __init__(self, margin=0.2):
        super().__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        """Return the batch-mean hinge loss over (anchor, positive, negative) rows."""
        # Squared L2 distance of each anchor to its positive / negative sample.
        d_pos = (anchor - positive).pow(2).sum(dim=1)
        d_neg = (anchor - negative).pow(2).sum(dim=1)
        # Hinge: only triplets violating the margin contribute.
        hinge = torch.relu(d_pos - d_neg + self.margin)
        return hinge.mean()



## Clustering

In [20]:
class DynamicClusterer:
    """Agglomerative clustering of embeddings using cosine distance.

    Args:
        n_clusters: Number of clusters to produce.
        linkage: Linkage criterion passed to ``AgglomerativeClustering``.
    """

    def __init__(self, n_clusters, linkage='average'):
        self.n_clusters, self.linkage = n_clusters, linkage

    def fit_predict(self, embeddings):
        """Cluster ``embeddings`` and return one cluster label per row."""
        # Cosine distance is 1 minus cosine similarity; it is handed to the
        # clusterer as a precomputed pairwise matrix.
        pairwise_distances = 1 - cosine_similarity(embeddings)
        hierarchical = AgglomerativeClustering(
            n_clusters=self.n_clusters,
            metric='precomputed',
            linkage=self.linkage,
        )
        return hierarchical.fit_predict(pairwise_distances)
def perform_clustering_dynamic(embeddings, n_clusters=14):
    """Cluster ``embeddings`` into ``n_clusters`` groups with a ``DynamicClusterer``.

    Returns:
        The cluster label assigned to each embedding row.
    """
    return DynamicClusterer(n_clusters=n_clusters).fit_predict(embeddings)

## Entrenamiento

In [21]:
def train_model_unsupervised(model, dataloader, optimizer, criterion, device, num_epochs=10, n_clusters=14):
    """Alternate between clustering the embeddings and triplet-loss training.

    For every epoch the function:
      1. embeds the whole ``dataloader`` and clusters the embeddings into
         ``n_clusters`` groups, printing the cluster size distribution;
      2. iterates over ``dataloader`` again, splits each batch of embeddings
         into three equal chunks used as anchor / positive / negative, and
         takes one optimizer step per batch;
      3. saves ``model.state_dict()`` to
         ``model_{n_clusters}_epoch_{epoch}.pth`` (a relative path).

    Args:
        model: Network mapping an image batch to embeddings.
        dataloader: Iterable yielding ``(images, labels)``; labels are ignored.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Callable ``criterion(anchor, positive, negative) -> loss``.
        device: Device the image batches are moved to.
        num_epochs: Number of clustering + training rounds.
        n_clusters: Number of clusters requested in the clustering step.

    Returns:
        None. The model is updated in place and checkpoints are written.
    """
    model.train()
    for epoch in range(num_epochs):
        # Clustering pass: the embeddings are only consumed as NumPy arrays,
        # so run it without autograd to avoid building a graph per batch.
        all_embeddings = []
        with torch.no_grad():
            for imgs, _ in dataloader:
                imgs = imgs.to(device)
                all_embeddings.append(model(imgs).cpu().numpy())

        all_embeddings = np.vstack(all_embeddings)
        cluster_labels = perform_clustering_dynamic(all_embeddings, n_clusters=n_clusters)
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Cluster distribution: {np.bincount(cluster_labels)}")

        # NOTE(review): `cluster_labels` is only reported above; the triplets
        # below are formed purely from batch position, not from cluster
        # membership. Confirm whether cluster-based mining was intended.
        for imgs, _ in dataloader:
            imgs = imgs.to(device)
            embeddings = model(imgs)
            batch_size = embeddings.size(0)
            divisible_batch_size = (batch_size // 3) * 3
            if divisible_batch_size < 3:
                continue  # Skip batches that are too small to form a triplet

            # Truncate so the batch splits evenly into three chunks.
            embeddings = embeddings[:divisible_batch_size]
            anchor, positive, negative = torch.chunk(embeddings, 3)

            loss = criterion(anchor, positive, negative)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        torch.save(model.state_dict(), f"model_{n_clusters}_epoch_{epoch + 1}.pth")

In [22]:
# Training setup: triplet loss, Adam optimizer, and the training split.
dataloader = dataloaders['train']
criterion = MultiLabelTripletLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

train_model_unsupervised(
    model,
    dataloader,
    optimizer,
    criterion,
    device,
    num_epochs=5,
    n_clusters=14,
)



In [None]:

def calculate_nmi(dataloader, model, device, n_clusters=14):
    """Cluster the model's embeddings and score them against the true labels.

    Args:
        dataloader: Data to embed and evaluate.
        model: Embedding network.
        device: Device used for inference.
        n_clusters: Number of clusters requested from the clustering step.

    Returns:
        The normalized mutual information between the true labels and the
        predicted cluster labels (also printed with four decimals).
    """
    # Imported locally: the notebook's import cell does not provide this name.
    from sklearn.metrics import normalized_mutual_info_score

    # Step 1: Generate embeddings
    # NOTE(review): `load_model_and_generate_embeddings` is not defined or
    # imported in the visible notebook — presumably it lives under ../src;
    # confirm it is imported before running this cell.
    embeddings, true_labels = load_model_and_generate_embeddings(dataloader, model, device)

    # Step 2: Perform clustering on embeddings, using the clustering helper
    # this notebook defines.
    cluster_labels = perform_clustering_dynamic(embeddings, n_clusters=n_clusters)

    # Step 3: Compute NMI
    nmi_score = normalized_mutual_info_score(true_labels, cluster_labels)
    print(f"NMI Score: {nmi_score:.4f}")
    return nmi_score




In [None]:
# Index the dict of loaders; `dataloader` is the single training DataLoader
# and cannot be subscripted with a split name.
calculate_nmi(dataloaders['test'], model, device, n_clusters=14)