In [40]:
import os
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torchvision import models
import numpy as np

### Load ResNet50 pre-trained model

In [41]:
# Build torchvision's ResNet50 with its ImageNet pre-trained weights
# (pretrained=True returns a model pre-trained on ImageNet).
resnet50_model = models.resnet50(pretrained=True)

# Print the module tree so the order and names of the layers can be inspected.
print(resnet50_model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [42]:
# Maps a human-readable stage name to the (negative) end index that is used
# when slicing list(model.children()) to cut a ResNet50 after that stage.
layers_dict = dict(
    zip(
        ("layer 1", "layer 2", "layer 3", "layer 4"),
        range(-5, -1),  # -5, -4, -3, -2
    )
)

In [43]:
# Truncated copy of the ImageNet ResNet50: keep only the child modules that come
# before the slice end stored for "layer 2", so the new model's output is the
# output of that intermediate layer.
resnet50_children = list(resnet50_model.children())
resnet50_intermediate_layer_model = nn.Sequential(
    *resnet50_children[:layers_dict["layer 2"]]
)

### Load ResNet50 MoCo model

In [44]:
# Load the MoCo pre-trained weights from disk into a dictionary.
# The directory where my checkpoints are saved.
checkpoints_directory = r"C:\Nikolaos Sintoris\Education\MEng CSE - UOI\Diploma Thesis\Checkpoints"

# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
# map_location="cpu" lets the checkpoint load on a machine without CUDA even if
# its tensors were saved from a GPU; the model below lives on the CPU anyway.
moco_checkpoint = torch.load(
    os.path.join(checkpoints_directory, 'moco_v2_800ep_pretrain.pth.tar'),
    map_location = "cpu",
)
moco_checkpoint_dict = moco_checkpoint['state_dict']

# Rename the MoCo keys so that they match the torchvision ResNet50 keys.
moco_key_prefix = "module.encoder_q."
for k in list(moco_checkpoint_dict.keys()):
    # Retain only the encoder_q weights, up to (not including) the embedding (fc) layer.
    if k.startswith(moco_key_prefix) and not k.startswith(moco_key_prefix + "fc"):
        # Re-insert the tensor under the key without the prefix.
        moco_checkpoint_dict[k[len(moco_key_prefix):]] = moco_checkpoint_dict[k]
    # Delete the original key, whether it was renamed or is unused.
    del moco_checkpoint_dict[k]

# Create a ResNet50 without pre-trained weights (pretrained=False); nothing is downloaded here.
resnet50_moco_model = models.resnet50(pretrained = False)

# Load the pre-trained weights from MoCo.
moco_load_result = resnet50_moco_model.load_state_dict(moco_checkpoint_dict, strict = False)

# strict=False would silently accept a badly renamed checkpoint, so verify that
# the only weights left uninitialised are those of the final fc layer.
assert set(moco_load_result.missing_keys) == {"fc.weight", "fc.bias"}, moco_load_result
assert not moco_load_result.unexpected_keys, moco_load_result

# Display the load report as the cell output.
moco_load_result

_IncompatibleKeys(missing_keys=['fc.weight', 'fc.bias'], unexpected_keys=[])

In [45]:
# Truncated copy of the MoCo ResNet50: keep only the child modules that come
# before the slice end stored for "layer 2", so the new model's output is the
# output of that intermediate layer.
resnet50_moco_children = list(resnet50_moco_model.children())
resnet50_moco_intermediate_layer_model = nn.Sequential(
    *resnet50_moco_children[:layers_dict["layer 2"]]
)

### Load ResNet50 SimCLR model

In [46]:
# Load the SimCLR pre-trained weights from disk into a dictionary.
# The directory where my checkpoints are saved.
checkpoints_directory = r"C:\Nikolaos Sintoris\Education\MEng CSE - UOI\Diploma Thesis\Checkpoints"

# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
# map_location="cpu" lets the checkpoint load on a machine without CUDA even if
# its tensors were saved from a GPU; the model below lives on the CPU anyway.
simclr_checkpoint = torch.load(
    os.path.join(checkpoints_directory, 'checkpoint_0040.pth.tar'),
    map_location = "cpu",
)
simclr_checkpoint_dict = simclr_checkpoint['state_dict']

# Rename the SimCLR keys so that they match the torchvision ResNet50 keys.
simclr_key_prefix = "backbone."
for k in list(simclr_checkpoint_dict.keys()):
    # Retain only the backbone weights, up to (not including) the fc layer.
    # The dotted prefix is tested so that exactly the characters being tested
    # are the ones stripped from the key.
    if k.startswith(simclr_key_prefix) and not k.startswith(simclr_key_prefix + "fc"):
        # Re-insert the tensor under the key without the prefix.
        simclr_checkpoint_dict[k[len(simclr_key_prefix):]] = simclr_checkpoint_dict[k]
    # Delete the original key, whether it was renamed or is unused.
    del simclr_checkpoint_dict[k]

# Create a ResNet50 without pre-trained weights (pretrained=False); nothing is downloaded here.
resnet50_simclr_model = models.resnet50(pretrained = False)

# Load the pre-trained weights from SimCLR.
simclr_load_result = resnet50_simclr_model.load_state_dict(simclr_checkpoint_dict, strict = False)

# strict=False would silently accept a badly renamed checkpoint, so verify that
# the only weights left uninitialised are those of the final fc layer.
assert set(simclr_load_result.missing_keys) == {"fc.weight", "fc.bias"}, simclr_load_result
assert not simclr_load_result.unexpected_keys, simclr_load_result

# Display the load report as the cell output.
simclr_load_result

_IncompatibleKeys(missing_keys=['fc.weight', 'fc.bias'], unexpected_keys=[])

In [47]:
# Truncated copy of the SimCLR ResNet50: keep only the child modules that come
# before the slice end stored for "layer 2", so the new model's output is the
# output of that intermediate layer.
resnet50_simclr_children = list(resnet50_simclr_model.children())
resnet50_simclr_intermediate_layer_model = nn.Sequential(
    *resnet50_simclr_children[:layers_dict["layer 2"]]
)

### Specify image transformations

In [48]:
# Pre-processing pipeline applied to every image that is loaded.
# transforms.Compose(): chains several transforms together.
# transforms.ToTensor(): converts a PIL Image or numpy.ndarray to a tensor.
# Only ToTensor() is used: the images are neither resized nor normalized.
# NOTE(review): pre-trained ResNet50 weights are commonly paired with a
# transforms.Normalize(mean, std) step - confirm that omitting it is intentional.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
    ]
)

### Store Embeddings

In [49]:
def averagePooling(my_tensor):
    """Global-average-pool a feature map over its two spatial dimensions.

    Args:
        my_tensor: Feature map whose last two dimensions are (height, width).
            The callers in this notebook pass a 3D (channels, height, width)
            tensor.

    Returns:
        For a 3D (C, H, W) input, a 2D tensor of shape (1, C) that holds the
        spatial mean of every channel.
    """
    # Use a kernel that covers the whole (H, W) plane. A square kernel built
    # from the width alone ignores the trailing rows when H > W and is rejected
    # by the pooling layer when H < W.
    height, width = my_tensor.shape[-2], my_tensor.shape[-1]
    temp_tensor = nn.AvgPool2d((height, width))(my_tensor)
    final_tensor = temp_tensor[:, :, -1]  # Drop the pooled width axis: (C, 1, 1) -> (C, 1).
    return final_tensor.T  # Transpose to a row vector: (C, 1) -> (1, C).

def averagePoolingNonSquare(my_tensor):
    """Average a feature map over its full (height, width) plane.

    The pooling kernel is sized from the last two dimensions of the input, so
    the height and width do not have to be equal.

    Args:
        my_tensor: Feature map whose last two dimensions are (height, width).
            The callers in this notebook pass a 3D (channels, height, width)
            tensor.

    Returns:
        For a 3D (C, H, W) input, a 2D tensor of shape (1, C) that holds the
        spatial mean of every channel.
    """
    rows, cols = my_tensor.shape[-2:]
    global_pool = nn.AvgPool2d(kernel_size=(rows, cols))
    pooled = global_pool(my_tensor)
    # Index away the pooled width axis, then transpose to get a row vector.
    return pooled[:, :, -1].T

### Create training data set

In [50]:
# Lookup table for all the possible categories: the key is the class index
# written as a string, the value is the integer label that gets stored.
labels_dict = {str(class_index): class_index for class_index in range(3)}

# Root directory under which all of the training images are saved.
train_dataset_directory = r"C:\Nikolaos Sintoris\Education\MEng CSE - UOI\Diploma Thesis\Scanning\Train dataset"

# About ImageFolder():
# The images must be sorted into folders, one folder per category.
# root: Root directory path.
# transform: A function/transform that takes in a PIL image and returns a transformed version.
training_dataset = torchvision.datasets.ImageFolder(
    root=train_dataset_directory,
    transform=transform,
)

In [51]:
# Total number of images found in the training folder.
number_of_training_data = len(training_dataset)

# Validation data is 30% of the images; the remaining 70% stay as training data.
number_of_validation_data = number_of_training_data - round(number_of_training_data * 0.7)
number_of_training_data = number_of_training_data - number_of_validation_data

# Seed used for shuffling. The train/validation split depends on the order in
# which the loader yields the images, so a fixed seed makes the split (and the
# files written from it) reproducible when the notebook is run top to bottom.
split_seed = 0
split_generator = torch.Generator().manual_seed(split_seed)

# Create a DataLoader that yields the images one at a time (batch_size = 1).
# shuffle=True: the images are visited in a random order (drawn from split_generator).
# NOTE: every pass over the loader advances split_generator, so re-run this
# cell before iterating again to get the same order.
trainloader = torch.utils.data.DataLoader(
    training_dataset,
    batch_size = 1,
    shuffle = True,
    generator = split_generator,
)

print("Number of training data: ", number_of_training_data)
print("Number of validation data: ", number_of_validation_data)

Number of training data:  8095
Number of validation data:  3469


In [52]:
# Buffers filled by this cell:
#   moco_* / simclr_*_data: one embedding per image, i.e. the average-pooled
#       output of the truncated MoCo / SimCLR ResNet50 for that image.
#   actual_training_labels / actual_validation_labels: the label of each image.
# They are zero-filled (instead of holding random values) so that a row that
# was never written is easy to recognise.
embedding_size = 512

#classifier_training_data = torch.zeros(number_of_training_data, embedding_size, dtype = torch.float)
actual_training_labels = torch.zeros(number_of_training_data, dtype = torch.long)

#classifier_validation_data = torch.zeros(number_of_validation_data, embedding_size, dtype = torch.float)
actual_validation_labels = torch.zeros(number_of_validation_data, dtype = torch.long)

moco_training_data = torch.zeros(number_of_training_data, embedding_size, dtype = torch.float)
moco_validation_data = torch.zeros(number_of_validation_data, embedding_size, dtype = torch.float)

simclr_training_data = torch.zeros(number_of_training_data, embedding_size, dtype = torch.float)
simclr_validation_data = torch.zeros(number_of_validation_data, embedding_size, dtype = torch.float)

# Put the truncated models in evaluation mode before extracting features.
resnet50_intermediate_layer_model.eval()
resnet50_moco_intermediate_layer_model.eval()
resnet50_simclr_intermediate_layer_model.eval()

train_index = 0
validation_index = 0
for image, label in trainloader:
    with torch.no_grad():
        # Take the output of the intermediate layer, drop the batch axis (the
        # loader yields one image per batch) and average-pool it to a vector.
        # Both models go through averagePoolingNonSquare so that their
        # embeddings are pooled the same way, whatever the image aspect ratio.
        #classifier_a = resnet50_intermediate_layer_model(image)
        #classifier_b = classifier_a[-1, :, :, :] # Convert from 4D to 3D tensor.
        #classifier_output = averagePoolingNonSquare(classifier_b)
        moco_a = resnet50_moco_intermediate_layer_model(image)
        moco_b = moco_a[-1, :, :, :] # Convert from 4D to 3D tensor.
        moco_output = averagePoolingNonSquare(moco_b)
        simclr_a = resnet50_simclr_intermediate_layer_model(image)
        simclr_b = simclr_a[-1, :, :, :] # Convert from 4D to 3D tensor.
        simclr_output = averagePoolingNonSquare(simclr_b)

    # The first number_of_training_data images that the loader yields are stored
    # as training data; every image after that is stored as validation data.
    if(train_index < number_of_training_data):
        #classifier_training_data[train_index] = classifier_output
        moco_training_data[train_index] = moco_output
        simclr_training_data[train_index] = simclr_output

        # Store the actual label of this training image.
        str_label = str(label.item())
        actual_training_labels[train_index] = labels_dict[str_label]

        train_index = train_index + 1
    else:
        #classifier_validation_data[validation_index] = classifier_output
        moco_validation_data[validation_index] = moco_output
        simclr_validation_data[validation_index] = simclr_output

        # Store the actual label of this validation image.
        str_label = str(label.item())
        actual_validation_labels[validation_index] = labels_dict[str_label]

        validation_index = validation_index + 1

# Report the shapes of everything that was collected, one section per model.
# To report the (currently disabled) classifier model as well, add
# ("ResNet50-Classifier", classifier_training_data, classifier_validation_data).
for model_name, training_embeddings, validation_embeddings in (
    ("ResNet50-MoCo", moco_training_data, moco_validation_data),
    ("ResNet50-SimCLR", simclr_training_data, simclr_validation_data),
):
    print(model_name)
    print("\tTrain Data: ")
    print("\t\tTraining data shape: ", training_embeddings.shape)
    print("\t\tTraining labels shape: ", actual_training_labels.shape)

    print("\tValidation Data: ")
    print("\t\tValidation data shape: ", validation_embeddings.shape)
    print("\t\tValidation labels shape: ", actual_validation_labels.shape)

ResNet50-MoCo
	Train Data: 
		Training data shape:  torch.Size([8095, 512])
		Training labels shape:  torch.Size([8095])
	Validation Data: 
		Validation data shape:  torch.Size([3469, 512])
		Validation labels shape:  torch.Size([3469])
ResNet50-SimCLR
	Train Data: 
		Training data shape:  torch.Size([8095, 512])
		Training labels shape:  torch.Size([8095])
	Validation Data: 
		Validation data shape:  torch.Size([3469, 512])
		Validation labels shape:  torch.Size([3469])


### Convert tensors to np arrays

In [53]:
# The csv writer works on numpy arrays, so every tensor gets a numpy counterpart
# (same name with an "_np" suffix). The classifier tensors are currently
# disabled, so they have no counterpart here.
actual_training_labels_np, actual_validation_labels_np = (
    label_tensor.numpy()
    for label_tensor in (actual_training_labels, actual_validation_labels)
)

moco_training_data_np, moco_validation_data_np = (
    embedding_tensor.numpy()
    for embedding_tensor in (moco_training_data, moco_validation_data)
)

simclr_training_data_np, simclr_validation_data_np = (
    embedding_tensor.numpy()
    for embedding_tensor in (simclr_training_data, simclr_validation_data)
)

### Store embeddings in csv files

In [55]:
# Store the embeddings in csv files, one output directory per model.
#classifier_embeddings_directory = r"C:\Nikolaos Sintoris\Education\MEng CSE - UOI\Diploma Thesis\Scanning\Train results\ResNet50 Classifier\Layer 1\Embeddings"
moco_embeddings_directory = r"C:\Nikolaos Sintoris\Education\MEng CSE - UOI\Diploma Thesis\Scanning\Train results\ResNet50 MoCo\Layer 2\Embeddings"
simclr_embeddings_directory = r"C:\Nikolaos Sintoris\Education\MEng CSE - UOI\Diploma Thesis\Scanning\Train results\ResNet50 SimCLR\Layer 2\Embeddings"

# exist_ok=True creates a directory only when it is missing, without a separate
# "does it exist?" check that could go stale before the directory is created.
#os.makedirs(classifier_embeddings_directory, exist_ok = True)
os.makedirs(moco_embeddings_directory, exist_ok = True)
os.makedirs(simclr_embeddings_directory, exist_ok = True)

# The four arrays that are saved for every model.
case_data = ["training_data", "actual_training_labels", "validation_data", "actual_validation_labels"]

#classifier_case_data_np_dict = {
#    "training_data": classifier_training_data_np,
#    "actual_training_labels": actual_training_labels_np,
#    "validation_data": classifier_validation_data_np,
#    "actual_validation_labels": actual_validation_labels_np
#}

# The label arrays are shared: both models were fed the same images.
moco_case_data_np_dict = {
    "training_data": moco_training_data_np,
    "actual_training_labels": actual_training_labels_np,
    "validation_data": moco_validation_data_np,
    "actual_validation_labels": actual_validation_labels_np
}

simclr_case_data_np_dict = {
    "training_data": simclr_training_data_np,
    "actual_training_labels": actual_training_labels_np,
    "validation_data": simclr_validation_data_np,
    "actual_validation_labels": actual_validation_labels_np
}

# File name of every array. The leading backslash is the path separator that
# gets appended to the directory, so these names are Windows-specific.
case_data_csv_dict = {
    "training_data": "\\training_data.csv",
    "actual_training_labels": "\\actual_training_labels.csv",
    "validation_data": "\\validation_data.csv",
    "actual_validation_labels": "\\actual_validation_labels.csv"
}

# Save every numpy array to a different csv file, model by model.
# To save the (currently disabled) classifier embeddings as well, add
# (classifier_embeddings_directory, classifier_case_data_np_dict) to the tuple.
for embeddings_directory, case_data_np_dict in (
    (moco_embeddings_directory, moco_case_data_np_dict),
    (simclr_embeddings_directory, simclr_case_data_np_dict),
):
    for current_data in case_data:
        np.savetxt(embeddings_directory + case_data_csv_dict[current_data], case_data_np_dict[current_data], delimiter = ',')