In [None]:

import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score


In [None]:

class LazyDataset(Dataset):
    """Dataset that loads images from a single directory on demand.

    Only the file names are kept in memory; each image is opened and
    converted to RGB when it is indexed.

    Args:
        image_dir: Directory whose regular files are treated as images.
        transform: Optional callable applied to the RGB ``PIL.Image`` before
            it is returned. Whatever the callable returns is returned as-is.
    """

    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        # os.listdir gives entries in arbitrary order, so sort to make the
        # index -> file mapping reproducible between runs and machines.
        # Sub-directories are skipped because they cannot be opened as images.
        self.image_filenames = sorted(
            name
            for name in os.listdir(image_dir)
            if os.path.isfile(os.path.join(image_dir, name))
        )
        self.transform = transform

    def __len__(self):
        """Return the number of files found in ``image_dir``."""
        return len(self.image_filenames)

    def __getitem__(self, idx):
        """Load the ``idx``-th file as an RGB image and apply ``transform``."""
        img_path = os.path.join(self.image_dir, self.image_filenames[idx])
        # The context manager closes the underlying file handle right away;
        # convert() returns an independent, fully loaded image.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        return image


In [None]:

class SimCLRTransform:
    """Callable that turns one image into two augmented views.

    The pipeline is: random resized crop to ``size``, random horizontal flip,
    colour jitter applied with probability 0.8, random grayscale with
    probability 0.2, a Gaussian blur with a fixed kernel size of 9 (always
    applied), and conversion to a tensor.

    Args:
        size: Output size passed to ``RandomResizedCrop``.
    """

    def __init__(self, size=224):
        color_jitter = transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)
        augmentations = [
            transforms.RandomResizedCrop(size),
            transforms.RandomHorizontalFlip(),
            transforms.RandomApply([color_jitter], p=0.8),
            transforms.RandomGrayscale(p=0.2),
            transforms.GaussianBlur(kernel_size=9),
            transforms.ToTensor(),
        ]
        self.transform = transforms.Compose(augmentations)

    def __call__(self, x):
        """Run the pipeline twice on ``x`` and return both results as a pair."""
        # Two separate calls, so each view draws its own random parameters.
        first_view = self.transform(x)
        second_view = self.transform(x)
        return first_view, second_view


In [None]:

class SimCLRModel(nn.Module):
    """Encoder plus projection head for SimCLR-style contrastive training.

    Args:
        base_encoder: Callable that builds the backbone when called with
            ``pretrained=False``. The backbone must expose its classifier as
            an ``fc`` linear layer (as torchvision ResNets do); that layer is
            replaced by an identity so the encoder outputs pooled features.
        projection_dim: Size of the projected embedding returned by
            ``forward``.
    """

    def __init__(self, base_encoder=models.resnet18, projection_dim=128):
        super().__init__()
        self.encoder = base_encoder(pretrained=False)
        # Read the feature width from the backbone instead of hard-coding 512,
        # so wider encoders (e.g. resnet50 with 2048 features) also work.
        feature_dim = self.encoder.fc.in_features
        self.encoder.fc = nn.Identity()
        self.projector = nn.Sequential(
            nn.Linear(feature_dim, feature_dim),
            nn.ReLU(),
            nn.Linear(feature_dim, projection_dim)
        )

    def forward(self, x):
        """Return L2-normalised projections for the input batch ``x``."""
        h = self.encoder(x)
        z = self.projector(h)
        return F.normalize(z, dim=1)


In [None]:

def nt_xent_loss(z1, z2, temperature=0.5):
    """Normalised temperature-scaled cross-entropy (NT-Xent) loss.

    Row ``i`` of ``z1`` and row ``i`` of ``z2`` form a positive pair; every
    other row in the concatenated batch acts as a negative.

    Args:
        z1: 2-D tensor of embeddings for the first view, one row per sample.
        z2: Tensor of the same shape as ``z1`` for the second view.
        temperature: Divisor applied to the cosine similarities.

    Returns:
        Scalar tensor: the mean, over all ``2 * batch`` anchors, of the
        negative log-probability assigned to the anchor's positive.

    Raises:
        ValueError: If ``z1`` and ``z2`` differ in shape or are empty.
    """
    if z1.shape != z2.shape:
        raise ValueError(
            f"z1 and z2 must have the same shape, got {tuple(z1.shape)} and {tuple(z2.shape)}"
        )
    if z1.size(0) == 0:
        raise ValueError("nt_xent_loss requires a non-empty batch")

    # Cosine similarity as a matmul of unit vectors; this avoids materialising
    # the (N, N, D) broadcast that F.cosine_similarity would create.
    z = F.normalize(torch.cat([z1, z2], dim=0), dim=1)
    N = z.size(0)
    sim = (z @ z.t()) / temperature

    # Rows i and i + N/2 come from the same source sample.
    pair_ids = torch.arange(N // 2, device=z.device).repeat(2)
    positives = pair_ids.unsqueeze(0) == pair_ids.unsqueeze(1)

    # Remove self-similarity so each row holds N - 1 candidates, exactly one
    # of which is the positive.
    off_diagonal = ~torch.eye(N, dtype=torch.bool, device=z.device)
    sim = sim[off_diagonal].view(N, -1)
    positives = positives[off_diagonal].view(N, -1)

    # Take the log-probability of the positive only. Applying log() to the
    # label-masked probabilities would evaluate log(0) for every negative and
    # make the loss infinite.
    log_prob = F.log_softmax(sim, dim=1)
    return -log_prob[positives].mean()


In [None]:

def train_simclr(model, dataloader, optimizer, device, epochs=100):
    """Train ``model`` with the NT-Xent loss on pairs of augmented views.

    Args:
        model: Module mapping a batch of images to embeddings.
        dataloader: Iterable yielding ``(x1, x2)`` batches, the two views of
            the same images.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.
        epochs: Number of passes over ``dataloader``.

    Returns:
        List with the mean batch loss of each epoch, in order.

    Raises:
        ValueError: If ``dataloader`` yields no batches in an epoch (for
            example ``drop_last=True`` with fewer samples than one batch).
    """
    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        # Count batches explicitly so the average is well defined and loaders
        # without __len__ are supported too.
        num_batches = 0
        for (x1, x2) in dataloader:
            x1, x2 = x1.to(device), x2.to(device)
            z1, z2 = model(x1), model(x2)
            loss = nt_xent_loss(z1, z2)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            raise ValueError(
                "dataloader yielded no batches; check the dataset size, "
                "batch_size and drop_last settings"
            )
        mean_loss = total_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}: Loss = {mean_loss:.4f}")
    return epoch_losses


In [None]:

def extract_embeddings(model, dataloader, device):
    """Run ``model.encoder`` over every batch and collect the outputs.

    The model is switched to eval mode (and left in it) and gradients are
    disabled while iterating.

    Args:
        model: Object exposing ``eval()`` and an ``encoder`` callable.
        dataloader: Iterable of input batches (tensors).
        device: Device each batch is moved to before encoding.

    Returns:
        NumPy array with the encoder outputs of all batches concatenated
        along the first axis, in iteration order.
    """
    model.eval()
    with torch.no_grad():
        # Move each result back to the CPU right away so only one batch of
        # activations lives on the device at a time.
        encoded_batches = [
            model.encoder(batch.to(device)).cpu() for batch in dataloader
        ]
    stacked = torch.cat(encoded_batches)
    return stacked.numpy()


In [None]:

def cluster_embeddings(embeddings, n_clusters, random_state=None):
    """Cluster embeddings with KMeans and score the resulting partition.

    Args:
        embeddings: Array of feature vectors, one row per sample.
        n_clusters: Number of clusters to fit.
        random_state: Seed forwarded to ``KMeans``. Pass an integer to make
            the cluster assignment (and hence both scores) reproducible; the
            default ``None`` keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(y_pred, sil_score, ch_score)``: the cluster label per sample,
        the silhouette score and the Calinski-Harabasz score of that labelling.
    """
    kmeans = KMeans(n_clusters=n_clusters, n_init='auto', random_state=random_state)
    y_pred = kmeans.fit_predict(embeddings)
    sil_score = silhouette_score(embeddings, y_pred)
    ch_score = calinski_harabasz_score(embeddings, y_pred)
    return y_pred, sil_score, ch_score

# Dataset Blood Cell

In [None]:
# Blood-cell experiment: contrastive pre-training followed by KMeans
# clustering of the encoder features.
IMAGE_DIR = "blood_cell/blood_cell/segmenter"

GPU = True

LEARNING_RATE = 3e-4

EPOCHS = 100

# Fall back to the CPU when CUDA was requested but is not available, instead
# of failing later when the model is moved to the device.
device = torch.device("cuda" if GPU and torch.cuda.is_available() else "cpu")
print("Using device:", device)
transform = SimCLRTransform()
dataset = LazyDataset(IMAGE_DIR, transform=transform)
# drop_last=True discards the final partial batch of each epoch.
dataloader = DataLoader(dataset, batch_size=256, shuffle=True, num_workers=4, drop_last=True)

model = SimCLRModel().to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

train_simclr(model, dataloader, optimizer, device, epochs=EPOCHS)

# Extract embeddings without augmentations: plain resize + tensor conversion.
eval_transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
eval_dataset = LazyDataset(IMAGE_DIR, transform=eval_transform)
eval_loader = DataLoader(eval_dataset, batch_size=128, shuffle=False)

embeddings = extract_embeddings(model, eval_loader, device)
# NOTE(review): n_clusters=10 is hard-coded; confirm it matches the number of
# groups expected in this dataset.
y_pred, sil_score, ch_score = cluster_embeddings(embeddings, n_clusters=10)
print("Silhouette Score:", sil_score)
print("Calinski-Harabasz Score:", ch_score)


# Dataset CIFAR

In [None]:
# CIFAR experiment: contrastive pre-training followed by KMeans clustering
# of the encoder features.
# NOTE(review): the standard "cifar-10-batches-py" archive holds pickled batch
# files rather than individual images, and LazyDataset opens every directory
# entry with PIL. Confirm this folder really contains image files.
IMAGE_DIR = "cifar-10/cifar-10/cifar-10-batches-py"

GPU = True

LEARNING_RATE = 3e-4

EPOCHS = 100

# Fall back to the CPU when CUDA was requested but is not available, instead
# of failing later when the model is moved to the device.
device = torch.device("cuda" if GPU and torch.cuda.is_available() else "cpu")
print("Using device:", device)
transform = SimCLRTransform()
dataset = LazyDataset(IMAGE_DIR, transform=transform)
# drop_last=True discards the final partial batch of each epoch.
dataloader = DataLoader(dataset, batch_size=256, shuffle=True, num_workers=4, drop_last=True)

model = SimCLRModel().to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

train_simclr(model, dataloader, optimizer, device, epochs=EPOCHS)

# Extract embeddings without augmentations: plain resize + tensor conversion.
eval_transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
eval_dataset = LazyDataset(IMAGE_DIR, transform=eval_transform)
eval_loader = DataLoader(eval_dataset, batch_size=128, shuffle=False)

embeddings = extract_embeddings(model, eval_loader, device)
y_pred, sil_score, ch_score = cluster_embeddings(embeddings, n_clusters=10)
print("Silhouette Score:", sil_score)
print("Calinski-Harabasz Score:", ch_score)