In [5]:
from torchvision import transforms
from torchvision.datasets import ImageFolder
import os
import copy
import torch
import numpy as np
from torch import nn
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, ConcatDataset, Subset
from torchvision import transforms
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
# import wandb

In [6]:
class ImageDataLoader:
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])
        self.dataset = ImageFolder(self.data_dir, transform=self.transform, target_transform=self._get_class_name)
        # if self.selected_classes is not None:
        #     self._filter_classes()
        # self.dataloader = torch.utils.data.DataLoader(self.dataset, batch_size=self.batch_size, shuffle=self.shuffle)

    def __len__(self):
        return len(self.dataset)

    def __iter__(self):
        return iter(self.dataloader)

    def _get_class_name(self, index):
        return index

    # def _filter_classes(self):
    #     self.dataset.samples = [sample for sample in self.dataset.samples if self._get_class_from_image_path(sample[0]) in self.selected_classes]
    #     self.dataset.targets = [self.dataset.class_to_idx[self._get_class_from_image_path(sample[0])] for sample in self.dataset.samples]
        # self.dataset.classes = self.selected_classes
        
    # def _get_class_from_image_path(self, image_path):
    #     path = os.path.dirname(image_path)
    #     folders = path.split("/")
    #     # print(folders)
    #     return folders[3]
        


# Example usage:

data_dir = './data/binary/hiper_normal/'

data_loader = ImageDataLoader(data_dir)

In [115]:
# print(np.unique(trainloader.dataset.dataset.targets, return_counts=True))
# print(np.unique(testloader.dataset.dataset.targets, return_counts=True))
# print(train_subset.dataset.samples)


def initialize_wandb(inputs):
    wandb.init(name=inputs.name, project=inputs.PROJECT, entity=inputs.ENTITY)
    wandb.config = inputs.wandb

def compute_metrics(outputs, labels):
    # convert outputs to the predicted classes
    _, pred = torch.max(outputs, 1)

    # compare predictions to true label
    total = len(labels)
    true_postives = pred.eq(labels.data.view_as(pred)).sum().item()
    accuracy = true_postives / len(labels)

    return {
        'tp': true_postives,
        'accuracy': accuracy,
        'total': total
    }

def train_step(model, train_loader, optimizer, scheduler, criterion, scaler, device):
    running_loss, tp, total = 0, 0, 0
    for imgs, labels in train_loader:
        # put model in training mode
        model.train()
        # send images and labels to device
        imgs, labels = imgs.to(device), labels.to(device)

        # feedforward and loss with mixed-precision
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            # TODO: check if this output is logits, probabilities or log of probabilities
            outputs = model(imgs)
            loss = criterion(outputs, labels)

        # sum up the loss
        running_loss += loss.item() * len(imgs)

        # backpropagation with mixed precision training
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Update learning rate
        scheduler.step()

        metrics = compute_metrics(outputs, labels)
        tp += metrics['tp']
        total += metrics['total']

    accuracy = tp / total
    print(f'Training loss: {running_loss / len(train_loader):.5f}')
    print(f'Training accuracy: {100*accuracy:.2f} (%)')

    # wandb log
    wandb.log({
        'train_loss': running_loss / len(train_loader),
        'train_accuracy': accuracy
    })

def val_step(model, testloader, criterion, device):
    pass


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("Saving checkpoint...")
    torch.save(state, filename)


def load_checkpoint(checkpoint_path, model, optimizer, device):
    # TODO: add and check device
    checkpoint = torch.load(checkpoint_path)

    # load variables
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    step = checkpoint['step']

    return step

In [51]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# inputs = Inputs() #Load parameters

# initialize_wandb(inputs)

data_dir = './data/binary/hiper_normal/'

data_loader = ImageDataLoader(data_dir)

# print(inputs)
# print(f'\nModel is {inputs.model_name}')



# model = get_classification_model(inputs.model_name, 2)


# optimizer and scheduler
# optimizer = inputs.OPTIMIZER(model.parameters(), lr=inputs.lr)
# warmup_steps = len(dataloaders['train']) * inputs.WARMUP_EPOCHS
# scheduler = linear_warmup(optimizer, warmup_steps)

k_folds = 5
num_epochs = 1
loss_function = nn.CrossEntropyLoss()

# For fold results
results = {}

# Set fixed random number seed
torch.manual_seed(42)

train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.RandomResizedCrop(224),
    transforms.ColorJitter(brightness=0.4, contrast=0.4,saturation=0.4, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# Define the K-fold Cross Validator


# Start print
print('--------------------------------')

# if load_model:
#     step = load_checkpoint(load_model, model, optimizer, device)
# else:
#     step = 0


# for more than 1 GPU
if torch.cuda.device_count() > 1:
    print(f"Let's use {torch.cuda.device_count()} GPUs!\n")
    model = nn.DataParallel(model)
else:
    print('Using a single GPU\n')
# model.to(device)



skfold = StratifiedKFold(n_splits=k_folds, shuffle=True)
binary_labels = [sample[1] for sample in data_loader.dataset.samples]

for fold, (train_ids, test_ids) in enumerate(skfold.split(data_loader.dataset, binary_labels)):
# K-fold Cross Validation model evaluation


    print(f'FOLD {fold +1}')
    print('--------------------------------')


    train_subset = Subset(data_loader.dataset, train_ids)
    train_subset.transform = train_transform
    

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=50, 
        shuffle=True 
        )
    

    test_subset = Subset(data_loader.dataset, test_ids) 
    test_subset.transform = test_transform
    testloader = torch.utils.data.DataLoader(
        test_subset,
        batch_size=50, 
        shuffle=True
        )

    # max_val_accuracy, min_val_loss = 0, 10000000000
    # for epoch in range(1, num_epochs+1):

    #     # Print epoch
    #     print(f'Epoch {epoch}/{num_epochs}')

    #     # Set current loss value
    #     train_step(model, trainloader, optimizer, scheduler, criterion, scaler, device)
    #     val_accuracy, val_loss = val_step(model, testloader, criterion, device)





    #     if max_val_accuracy < val_accuracy:
    #         print(f'Accuracy increased from {max_val_accuracy:.4f}' + \
    #               f' to {val_accuracy:.4f} ({epoch}/{num_epochs})')

    #         max_val_accuracy = val_accuracy
    #         if save_model:
    #             checkpoint = {
    #                 'state_dict': model.state_dict(),
    #                 'optimizer': optimizer.state_dict(),
    #                 'step': step
    #             }
    #             save_checkpoint(checkpoint, filename=f'checkpoint-{name}-max-acc.pth.tar')

    #     if min_val_loss > val_loss:
    #         print(f'Validation loss decreased from {min_val_loss:.2f}' + \
    #               f' to {val_loss:.2f} ({epoch}/{epochs})')

    #         min_val_loss = val_loss
    #         if save_model:
    #             checkpoint = {
    #                 'state_dict': model.state_dict(),
    #                 'optimizer': optimizer.state_dict(),
    #                 'step': step
    #             }
    #             save_checkpoint(checkpoint, filename=f'checkpoint-{name}-min-loss.pth.tar')

--------------------------------
Using a single GPU

FOLD 1
--------------------------------
FOLD 2
--------------------------------
FOLD 3
--------------------------------
FOLD 4
--------------------------------
FOLD 5
--------------------------------


In [13]:
my_transformation = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize((224, 224)),
])

skfold = StratifiedKFold(n_splits=5, shuffle=True)

binary_labels = [sample[1] for sample in data_loader.dataset.samples]
for fold, (train_ids, test_ids) in enumerate(skfold.split(data_loader.dataset, binary_labels)):

    train_subset = Subset(data_loader.dataset, train_ids)
    train_subset.transform = my_transformation
    sampler = get_balanced_dataset_sampler(data_loader, train_ids, train_subset)
    train_loader = DataLoader(train_subset, batch_size=50, sampler=sampler)

    test_subset = Subset(data_loader.dataset, test_ids) 
    test_subset.transform = my_transformation
    test_loader = DataLoader(test_subset, batch_size=50, shuffle=True )
    
    break

In [18]:
with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device='cuda')
        y = y.to(device='cuda')
        print(y)

tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
        1, 0], device='cuda:0')
tensor([0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
        1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1,
        1, 0], device='cuda:0')
tensor([0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
        0, 0], device='cuda:0')
tensor([0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
        0, 0], device='cuda:0')
tensor([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
        1, 1], device='cuda:0')
tensor([1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,

In [29]:
from torchvision.utils import save_image
my_transformation = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize((224, 224)),
            transforms.Pad(64, padding_mode='reflect'),
            transforms.RandomHorizontalFlip(), 
            transforms.RandomVerticalFlip(),
            transforms.RandomRotation(20), 
            transforms.Normalize(mean=[0.5, 0.5, 0.5],std=[0.5, 0.5, 0.5]),
])

train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
    # transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    # transforms.RandomAffine(degrees=10, translate=(0.1,0.1), scale=(0.9,1.1), shear=10),
    transforms.Pad(padding=(10, 0), fill=0, padding_mode='symmetric'),
    transforms.ToTensor(),
])

test_dataset = ImageFolder('./data/test/', transform=train_transform)


for idx, (img, label) in enumerate(test_dataset):
    save_image(img, f'./data/results/img_{idx}.png')

In [59]:

binary_labels = [sample[1] for sample in trainloader.dataset.dataset.samples]
np.unique(binary_labels, return_counts=True)

(array([0, 1]), array([2043, 1570]))

In [80]:
sum(np.unique(np.array(binary_labels), return_counts=True)[1]) /np.unique(np.array(binary_labels), return_counts=True)[1]



array([1.76847773, 2.30127389])

In [91]:
binary_labels = [sample[1] for sample in data_loader.dataset.samples]
sum(np.unique(np.array(binary_labels)[train_ids], return_counts=True)[1] ) /np.unique(np.array(binary_labels)[train_ids], return_counts=True)[1] 

array([1.76819572, 2.30175159])

In [10]:
def get_balanced_dataset_sampler(data_loader, train_ids, train_subset):


    binary_labels = [sample[1] for sample in data_loader.dataset.samples]
    class_weights = 1 / np.unique(np.array(binary_labels)[train_ids], return_counts=True)[1] 

    sample_weights = [0] * len(train_ids)


    for idx, (data, label) in enumerate(train_subset):
        class_weight = class_weights[label]
        sample_weights[idx] = class_weight

    sampler = WeightedRandomSampler(
        sample_weights, num_samples=len(sample_weights), replacement=True
    )
    return sampler

In [2]:
import torch
torch.cuda.is_available()
torch.zeros(1).cuda()

AssertionError: Torch not compiled with CUDA enabled

In [82]:
sum(np.unique(np.array(binary_labels)[test_ids], return_counts=True)[1] ) /np.unique(np.array(binary_labels)[test_ids], return_counts=True)[1] 

array([1.76960784, 2.29936306])

In [3]:
import torch
import torchvision.datasets as datasets
import os
from torch.utils.data import WeightedRandomSampler, DataLoader
import torchvision.transforms as transforms
import torch.nn as nn

# Methods for dealing with imbalanced datasets:
# 1. Oversampling (probably preferable)
# 2. Class weighting


# trainloader = torch.utils.data.DataLoader(
#     train_subset,
#     batch_size=50, 
#     shuffle=True 
#     )


def get_loader(root_dir, batch_size):
    my_transforms = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Resize((224, 224)),
            transforms.Pad(64, padding_mode='reflect'),
            transforms.RandomHorizontalFlip(), 
            transforms.RandomVerticalFlip(),
            transforms.RandomRotation(20), 
            transforms.Normalize(mean=[0.5, 0.5, 0.5],std=[0.5, 0.5, 0.5]),

        ]
    )

    dataset = datasets.ImageFolder(root=root_dir, transform=my_transforms)
    subdirectories = dataset.classes
    class_weights = []

    # loop through each subdirectory and calculate the class weight
    # that is 1 / len(files) in that subdirectory
    print(f'subdirectories :{subdirectories}')
    for subdir in subdirectories:
        print(f'subdirectories :{subdir}')
        files = os.listdir(os.path.join(root_dir, subdir))
        print(f'len(files) :{len(files)}')
        class_weights.append(1 / len(files))

    print(f'class_weights:{class_weights}')
    sample_weights = [0] * len(dataset)
    print(f'sample_weights : {sample_weights}')

    for idx, (data, label) in enumerate(dataset):
        class_weight = class_weights[label]
        sample_weights[idx] = class_weight

    print(f'sample_weights : {sample_weights}')
    sampler = WeightedRandomSampler(
        sample_weights, num_samples=len(sample_weights), replacement=True
    )

    loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
    # loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return loader


def main():
    loader = get_loader(root_dir="./data/raw/normal/", batch_size=5)

    num_retrievers = 0
    num_elkhounds = 0
    for epoch in range(10):
        # print(f'Epoch {epoch+1}')
        for data, labels in loader:
            # print(labels)
            num_retrievers += torch.sum(labels == 0)
            num_elkhounds += torch.sum(labels == 1)

    # print(num_retrievers.item())
    # print(num_elkhounds.item())


if __name__ == "__main__":
    main()

subdirectories :['AZAN', 'HE', 'PAMS', 'PAS', 'PICRO']
subdirectories :AZAN
len(files) :126
subdirectories :HE
len(files) :870
subdirectories :PAMS
len(files) :189
subdirectories :PAS
len(files) :294
subdirectories :PICRO
len(files) :93
class_weights:[0.007936507936507936, 0.0011494252873563218, 0.005291005291005291, 0.003401360544217687, 0.010752688172043012]
sample_weights : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0



KeyboardInterrupt: 

In [12]:
print(50 + 100)
print(79 + 71)
print(62 + 88)

150
150
150


In [None]:
                                transforms.ToPILImage(),
                                  transforms.Pad(64, padding_mode='reflect'),
                                  transforms.RandomHorizontalFlip(), 
                                  transforms.RandomVerticalFlip(),
                                  transforms.RandomRotation(20), 
                                  transforms.ToTensor(),
                                  transforms.Normalize(mean=[0.5, 0.5, 0.5],std=[0.5, 0.5, 0.5])])

trans_valid = transforms.Compose([transforms.ToPILImage(),
                                  transforms.Pad(64, padding_mode='reflect'