In [47]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision import models
import numpy as np
import pandas as pd
import sys
from sklearn.model_selection import train_test_split
import time
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader,Dataset
from tqdm.notebook import tqdm # Progession bar
from PIL import Image


In [48]:
def make_data_loaders(train_csv, val_csv, test_csv, images_dir, batch_size, image_size):
    from torch.utils.data import DataLoader, Dataset
    from PIL import Image

    class ImageDataset(Dataset):
        def __init__(self, csv_file, root_dir, transform=None):
            self.data = pd.read_csv(csv_file)
            self.root_dir = root_dir
            self.transform = transform

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            img_name = os.path.join(self.root_dir, self.data.iloc[idx, 0])
            image = Image.open(img_name).convert('RGB')
            if self.transform:
                image = self.transform(image)
            return image

    transform = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_dataset = ImageDataset(train_csv, images_dir, transform)
    val_dataset = ImageDataset(val_csv, images_dir, transform)
    test_dataset = ImageDataset(test_csv, images_dir, transform)

    dataloaders = {
        'train': DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        'val': DataLoader(val_dataset, batch_size=batch_size, shuffle=False),
        'test': DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    }
    return dataloaders


In [49]:

train_csv = "C:/Users/RSCBAL04/Documents/GitHub/ChestX-ray8_classification/data/metadata/train_metadata.csv"
val_csv ="C:/Users/RSCBAL04/Documents/GitHub/ChestX-ray8_classification/data/metadata/val_metadata.csv"
test_csv =train_csv
images_dir = "C:/Users/RSCBAL04/Documents/GitHub/ChestX-ray8_classification/data/images"
dataloaders = make_data_loaders(train_csv, val_csv, test_csv, images_dir, batch_size=16, image_size=224)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [50]:
class DeepClusterModel(nn.Module):
    def __init__(self, base_model):
        super(DeepClusterModel, self).__init__()
        self.features = nn.Sequential(*list(base_model.children())[:-1])
        self.fc = nn.Linear(base_model.fc.in_features, 128)  # Dimensión del espacio de características

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

In [51]:

from torchvision.models import ResNet50_Weights
def initialize_model():
    base_model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
    model = DeepClusterModel(base_model)
    return model

In [52]:
from sklearn.cluster import KMeans
def run_kmeans(features, num_clusters):
    kmeans = KMeans(n_clusters=num_clusters, n_init=20, verbose=1)
    assignments = kmeans.fit_predict(features)
    return assignments

In [53]:
def train_deepcluster(model, train_dataloader,val_loader, num_clusters, num_epochs):
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    criterion = nn.CrossEntropyLoss()
    train_losses = []
    val_losses = []
    for epoch in range(num_epochs):
        model.train()
        features = []
        print(features)
        for inputs in train_dataloader:
           
            inputs = inputs.cuda()
            outputs = model(inputs)
           
            features.append(outputs.detach().cpu().numpy())

        features = np.vstack(features)
        assignments = run_kmeans(features, num_clusters)

        model.train()
        for batch_idx, inputs in enumerate(train_dataloader):
            inputs = inputs.cuda()
            optimizer.zero_grad()
            outputs = model(inputs)
            batch_assignments = torch.tensor(assignments[batch_idx * inputs.size(0):(batch_idx + 1) * inputs.size(0)], dtype=torch.long).cuda()
            loss = criterion(outputs, batch_assignments)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_losses.append(train_loss / len(train_dataloader))

        # Calcular la pérdida de validación
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs in val_loader:
                inputs = inputs.cuda()
                outputs = model(inputs)
                batch_assignments = torch.tensor(assignments[:inputs.size(0)], dtype=torch.long).cuda()
                loss = criterion(outputs, batch_assignments)
                val_loss += loss.item()
        
        val_losses.append(val_loss / len(val_loader))

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]}, Validation Loss: {val_losses[-1]}")

    return train_losses, val_losses
        




In [55]:
for i in dataloaders['train']:
    print(i)
    break

for i in dataloaders['val']:
    print(i)
    break

tensor([[[[-1.9124, -1.8953, -1.8953,  ..., -1.8782, -1.8782, -1.8782],
          [-1.8953, -1.8782, -1.8782,  ..., -1.8610, -1.8610, -1.8782],
          [-1.8782, -1.8782, -1.8610,  ..., -1.8610, -1.8610, -1.8782],
          ...,
          [-1.8953, -1.8782, -1.8610,  ..., -1.7583, -1.9980, -1.9809],
          [-1.9124, -1.8782, -1.8610,  ..., -1.7925, -1.9980, -1.9809],
          [-1.9295, -1.8953, -1.8782,  ..., -1.8439, -2.0152, -1.9980]],

         [[-1.8256, -1.8081, -1.8081,  ..., -1.7906, -1.7906, -1.7906],
          [-1.8081, -1.7906, -1.7906,  ..., -1.7731, -1.7731, -1.7906],
          [-1.7906, -1.7906, -1.7731,  ..., -1.7731, -1.7731, -1.7906],
          ...,
          [-1.8081, -1.7906, -1.7731,  ..., -1.6681, -1.9132, -1.8957],
          [-1.8256, -1.7906, -1.7731,  ..., -1.7031, -1.9132, -1.8957],
          [-1.8431, -1.8081, -1.7906,  ..., -1.7556, -1.9307, -1.9132]],

         [[-1.5953, -1.5779, -1.5779,  ..., -1.5604, -1.5604, -1.5604],
          [-1.5779, -1.5604, -

In [57]:
model = initialize_model().cuda()
train_deepcluster(model, dataloaders['train'], dataloaders['val'], num_clusters=10, num_epochs=5)

[]


NameError: name 'dataloader' is not defined

In [41]:
torch.save(model.state_dict(), 'deepcluster_model.pth')

In [43]:
import numpy as np


np.save('assignments.npy', assignments)

NameError: name 'assignments' is not defined

In [40]:
import matplotlib.pyplot as plt

def visualize_clusters(images, assignments, num_clusters=10):
    fig, axes = plt.subplots(num_clusters, 10, figsize=(20, 20))
    for cluster in range(num_clusters):
        cluster_indices = [i for i, a in enumerate(assignments) if a == cluster]
        for i, idx in enumerate(cluster_indices[:10]):
            img = images[idx].permute(1, 2, 0).numpy()  # Convertir tensor a numpy array
            axes[cluster, i].imshow(img)
            axes[cluster, i].axis('off')
    plt.show()

# Obtener imágenes y asignaciones de clúster
images = [traindataset[i] for i in range(len(dataset))]
assignments = run_kmeans(features, num_clusters=10)

# Visualizar clústeres
visualize_clusters(images, assignments)

NameError: name 'dataset' is not defined