## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from PIL import Image
import torch
from torch import optim, nn
import torch.nn.functional as F
from torchvision import transforms
import torchvision
from torch.utils.data.dataset import Dataset
import matplotlib.pyplot as plt
import os.path
from os import path
from collections import OrderedDict
import time
import glob
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard


models_dir = os.path.expanduser('/extra_disk_1/trained_model/resnet50_dcgan_SGD')
model_name = 'resnet50_dcgan_SGD.pt'
model_path = os.path.join(models_dir, model_name)
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
    print("create models_dir: ", models_dir)

print('Model save/load location: {}'.format(model_path))

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Model save/load location: /extra_disk_1/trained_model/resnet50_dcgan_SGD/resnet50_dcgan_SGD.pt


## Custom Dataset Loading Class

In [2]:
label_list = ['Cardiomegaly','Emphysema','Effusion','Hernia','Nodule','Pneumothorax','Atelectasis','Pleural_Thickening','Mass','Edema','Consolidation',
              'Infiltration','Fibrosis','Pneumonia','No Finding']

In [3]:
def resolve_full_path(img_name):
    original_is_found = False
    dcgan_is_found = False
    
    # Read 1 image file
    folder_idx_range = 13
    img_path = ''
    for folder_idx in range(folder_idx_range):
        path_prefix = "/extra_disk_1/data/images_"
        path_suffix = "images/"
        cur_img_dir = path_prefix +str(folder_idx).zfill(3) +'/'
        img_folder_path = path.join(cur_img_dir, path_suffix)
        img_path = os.path.join(img_folder_path, img_name) 
        if(path.exists(img_path)):
            original_is_found = True
            break
            
    if(not original_is_found):
        # search in dcgan_image folder
        path_prefix = glob.glob("/extra_disk_1/code/medical_ip/NIH_code/DCGAN_NIH/dcgan_image/*")
        for path_folder in path_prefix:
            img_path = os.path.join(path_folder, img_name)
            if(os.path.exists(img_path)):
                dcgan_is_found = True
                break
        if not dcgan_is_found:
            raise Exception('Couldn\'t find: {} last:{}'.format(img_name, img_path))
    return img_path
        
    
class DatasetFromCSV(Dataset):
    def __init__(self, csv_path=None, data_frame=None, transform=None):
        if(csv_path is not None):
            self.data = pd.read_csv(csv_path).head(20)
        elif data_frame is not None:
            self.data = data_frame
        else:
            raise Exception('No csv path or data frame provided')
            
        self.data_len = len(self.data.index)            # csv data length
        self.image_names = np.array(self.data.loc[:,'Image Index'])  # image names
    
        self.labels = torch.zeros(self.data_len, 15)
        labels = self.data.loc[:,'Finding Labels'] #.map(lambda x: x.split('|'))
        self.multi_hot_encoding_label(labels)
    
        self.transform = transform
        
    def __len__(self):
        return self.data_len
    
    def __getitem__(self, index):
        # Read 1 image name
        img_name = self.image_names[index]
        img_path = resolve_full_path(img_name)
        img_as_img = Image.open(img_path)

        img_as_img = img_as_img.convert("RGB")
        # Transform image to tensor
        img_as_tensor = self.transform(img_as_img)

        # Read 1 label:
        image_label = self.labels[index]

        return img_as_tensor, image_label
    
    def multi_hot_encoding_label(self, labels):
            for i,label in enumerate(labels):
                for idx in range(len(label_list)):
                    if label_list[idx] in label:
                        self.labels[i][idx] = 1

## Resample imbalanced dataset

In [4]:
# Iterate through all_labels to calculate each case's weight
def calculate_weight(data):
    D_single_weight = calculate_single_label_weight(data)
    weight = torch.zeros(data.shape[0])
    
    all_labels = data.loc[:,'Finding Labels'].map(lambda x: x.split('|'))
    for i, labels in enumerate(all_labels):
        for ii, label in enumerate(labels):
            weight[i] += D_single_weight[label]
    
    return weight

def calculate_single_label_weight(data):
    # Calculate single label weight
    D_sorted = count_label(data)
    D_single_weight = D_sorted.copy()
    for i, label in enumerate(D_single_weight.keys()):
        D_single_weight[label] = 1.0/D_single_weight[label]*1e5
        
    return D_single_weight

def count_label(data):
    D_label_count = dict()
    all_labels = data.loc[:,'Finding Labels'].map(lambda x: x.split('|'))
    for i,labels in enumerate(all_labels):
        for ii, label in enumerate(labels):
            D_label_count[label] = D_label_count.get(label, 0) + 1
    D = D_label_count
    D_sorted = OrderedDict(sorted(D.items(), key=lambda x: x[1], reverse=True))
    
    return D_sorted

## Transform

In [5]:
# Define transforms
transform = transforms.Compose([transforms.Resize(256),
                                transforms.RandomResizedCrop(224),
                                transforms.RandomHorizontalFlip(), # randomly flip and rotate
                                transforms.RandomRotation(10),
                                transforms.ToTensor(),
                                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

## Define the model with Transfer learning

In [6]:
# Training different layers at 4 stages, each stage has epochs of 2, 3, 3, 2
def switch_stage(epoch, model):
    if epoch <= 2:
        stage=1
        print('----- STAGE 1 -----') # only training 'layer2', 'layer3', 'layer4' and 'fc'
        for name, param in model.named_parameters(): # all requires_grad by default, are True initially
            if ('layer2' in name) or ('layer3' in name) or ('layer4' in name) or ('fc' in name):
                param.requires_grad = True 
            else:
                param.requires_grad = False
    if 2 < epoch <= 5:
        stage=2
        print('\n----- STAGE 2 -----') # only training 'layer3', 'layer4' and 'fc'
        for name, param in model.named_parameters(): 
            if ('layer3' in name) or ('layer4' in name) or ('fc' in name):
                param.requires_grad = True 
            else:
                param.requires_grad = False
    if 5 < epoch <= 8:
        stage=3
        print('\n----- STAGE 3 -----') # only training  'layer4' and 'fc'
        for name, param in model.named_parameters(): 
            if ('layer4' in name) or ('fc' in name):
                param.requires_grad = True 
            else:
                param.requires_grad = False
    if 8 < epoch <= 10:
        stage=4
        print('\n----- STAGE 4 -----') # only training  'layer4' and 'fc'
        for name, param in model.named_parameters(): 
            if 'fc' in name:
                param.requires_grad = True 
            else:
                param.requires_grad = False
    log_stage = open('log_stage.txt','a')
    log_stage.write(f"---------------------------stage: [{stage}]--------------------------\n")
    for name, param in model.named_parameters(): # all requires_grad by default, are True initially
        log_stage.write('{}: {}\n'.format(name, param.requires_grad))
    log_stage.close()

In [7]:
from torchvision import models

# Use GPU if it's available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ",device)

def create_model():
    
    # Import pre-trained resnet
    model = models.resnet50(pretrained=False)

    # Freeze parameters so we don't backprop through them
    for param in model.parameters():
        param.requires_grad = True
#     print('----- STAGE 1 -----') # only training 'layer2', 'layer3', 'layer4' and 'fc'
#     for name, param in model.named_parameters(): # all requires_grad by default, are True initially
#         if ('layer2' in name) or ('layer3' in name) or ('layer4' in name) or ('fc' in name):
#             param.requires_grad = True 
#         else:
#             param.requires_grad = False

 
    # Change output to classfiy 14 conditioins + nothing.
    # Change a new classifier
    model.fc = nn.Sequential(
        nn.Linear(model.fc.in_features, 256),
        nn.ReLU(),
        nn.Dropout(0.2),
        nn.Linear(256, 15)
    )
    
    return model
model = create_model()
# print(model)

Using device:  cuda


### Specify Loss function and Optimizer

In [8]:
criterion = nn.BCEWithLogitsLoss()

learning_rate = 0.001
# Only train the classifier parameters, feature parameters are frozen
# optimizer = optim.Adam(model.fc.parameters(), lr=learning_rate)
# optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, momentum=0.9)

model = model.to(device);
writer = SummaryWriter(f'run/k_fold_resnet50_SGD/training')


# Kfold prep

In [9]:
from sklearn.model_selection import KFold

# Define sampler to resample the imbalanced dataset
train_dataset_entry = pd.read_csv("/extra_disk_1/code/medical_ip/Multi_Label_Dataloader_and_Classifier/traindata_paul.csv")
valid_dataset_entry = pd.read_csv("/extra_disk_1/code/medical_ip/Multi_Label_Dataloader_and_Classifier/valdata_paul.csv")

dcgan_Cardiomegaly_entry = pd.read_csv("/extra_disk_1/code/medical_ip/NIH_code/DCGAN_NIH/dcgan_image_csv/dcgan_Cardiomegaly.csv")
dcgan_Consolidation_entry = pd.read_csv("/extra_disk_1/code/medical_ip/NIH_code/DCGAN_NIH/dcgan_image_csv/dcgan_Consolidation.csv")
dcgan_Emphysema_entry = pd.read_csv("/extra_disk_1/code/medical_ip/NIH_code/DCGAN_NIH/dcgan_image_csv/dcgan_Emphysema.csv")
dcgan_Pleural_Thickening_entry = pd.read_csv("/extra_disk_1/code/medical_ip/NIH_code/DCGAN_NIH/dcgan_image_csv/dcgan_Pleural_Thickening.csv")
dcgan_Pneumothorax_entry = pd.read_csv("/extra_disk_1/code/medical_ip/NIH_code/DCGAN_NIH/dcgan_image_csv/dcgan_Pneumothorax.csv")

n_splits = 5
kf = KFold(n_splits = n_splits, shuffle = True, random_state = 2)

# non_test_set = pd.concat([train_dataset_entry, valid_dataset_entry], axis=0)
non_test_set = pd.concat([train_dataset_entry, valid_dataset_entry, dcgan_Cardiomegaly_entry,
                          dcgan_Consolidation_entry, dcgan_Emphysema_entry, dcgan_Pleural_Thickening_entry,
                          dcgan_Pneumothorax_entry], axis=0, ignore_index=True, join='inner')

print(non_test_set.head)

<bound method NDFrame.head of                     Image Index       Finding Labels
0              00000002_000.png           No Finding
1              00000003_000.png               Hernia
2              00000003_001.png               Hernia
3              00000003_002.png               Hernia
4              00000003_003.png  Hernia|Infiltration
...                         ...                  ...
94585  Pneumothorax_019_045.png         Pneumothorax
94586  Pneumothorax_019_046.png         Pneumothorax
94587  Pneumothorax_019_047.png         Pneumothorax
94588  Pneumothorax_019_048.png         Pneumothorax
94589  Pneumothorax_019_049.png         Pneumothorax

[94590 rows x 2 columns]>


## Train the model

In [None]:
from torch.autograd import Variable

# number of epochs to train the model
n_epochs = 10
k =1
valid_losses = []
train_losses = []
for  train_index, valid_index in kf.split(non_test_set):
    train = non_test_set.iloc[train_index]
    valid =  non_test_set.iloc[valid_index]
    # Define custom data loader
    train_dataset = DatasetFromCSV(data_frame=train,transform=transform)
    valid_dataset = DatasetFromCSV(data_frame=valid,transform=transform)
    batch_size_ = 10
    # Define two data loaders
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                    batch_size=batch_size_,
                                                    num_workers=4,
                                                    shuffle=True)

    valid_loader = torch.utils.data.DataLoader(dataset=valid_dataset,
                                                    batch_size=batch_size_,
                                                    num_workers=4,
                                                    shuffle=True)
    log_file = open('run/SGD/log.txt', 'a')
    log_file.write('Fold: {}/{} \n'.format(k, n_splits,))
    k = k+1
    log_file.close()

    # k fold setup before
    valid_loss_min = np.Inf # track change in validation loss
    writer.add_scalar('learning rate', learning_rate)
    for epoch in range(0, n_epochs):
        t0 = time.time()
        # keep track of training and validation loss
        train_loss = 0.0
        valid_loss = 0.0
        
        # switch stage
#         switch_stage(epoch, model)
        
        log_stage_epoch = open('run/SGD/log_stage_epoch.txt','a')
        log_stage_epoch.write(f"---------------------------epoch: [{epoch}]--------------------------\n")
        for name, param in model.named_parameters(): # all requires_grad by default, are True initially
            log_stage_epoch.write('{}: {}\n'.format(name, param.requires_grad))
        log_stage_epoch.close()
        
        ###################
        # train the model #
        ###################
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # move tensors to GPU if CUDA is available
            data = data.to(device)
            target = target.to(device)
            model = model.to(device)
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update training loss
            train_loss += loss.item()*data.size(0)

            # Print losses ocassionally and print to tensorboard
            if batch_idx % 100 == 0:
                train_loss_divided = train_loss/(batch_idx+1)
                log_file = open('run/SGD/log.txt', 'a')
                log_file.write(f'Epoch [{epoch}/{n_epochs}] Batch {batch_idx}/{len(train_loader)} Train_loss {train_loss_divided} \n')
                log_file.close()
                
                writer.add_scalar('loss', train_loss_divided, epoch*len(train_loader)+batch_idx)
#                 with torch.no_grad():
#                     img_grid_real = torchvision.utils.make_grid(data, normalize=True)
#                     writer.add_image("Train Lung Xray Images", img_grid_real)

        ######################    
        # validate the model #
        ######################
        model.eval()
        for batch_idx, (data, target) in enumerate(valid_loader):

            data = data.to(device)
            target = target.to(device)
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # calculate the batch loss
            loss = criterion(output, target)
            # update average validation loss 
            valid_loss += loss.item()*data.size(0)

        # calculate average losses
        train_loss = train_loss/len(train_loader)
        valid_loss = valid_loss/len(valid_loader)

        t1 = time.time()
        total = t1-t0
        
        # print training/validation statistics 
        log_file = open('run/SGD/log.txt', 'a')
        log_file.write('Epoch: {}/{} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} Duration seconds: {} \n'.format(
                        epoch, n_epochs, train_loss, valid_loss, total))
        log_file.close()
        
        writer.add_scalar('train_loss', train_loss, epoch)
        writer.add_scalar('valid_loss', valid_loss, epoch)


        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            log_file = open('run/SGD/log.txt', 'a')
            log_file.write('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ... \n'.format(valid_loss_min, valid_loss))
            log_file.close()
            
            torch.save(model.state_dict(), model_path)
            valid_loss_min = valid_loss
        writer.add_scalar('best_valid_loss_fold', valid_loss_min, epoch)
        
        log_file = open('run/SGD/log.txt', 'a')
        log_file.write(f'best_valid_loss_fold [{valid_loss_min}] Best_Epoch [{epoch}]')
        log_file.close()
        
    valid_losses.append(valid_loss_min)
    train_losses.append(train_loss)



# Test classficaton on a single image.

In [None]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
# Doing inference on cpu as it doesn't take much effort feel free to change.
# Had some trouble loading it on GPU. 
image, label = next(iter(valid_loader))
model = create_model()
model.load_state_dict(torch.load(model_path, map_location='cpu'))
model.eval()

data = image.to('cpu')
# forward pass: compute predicted outputs by passing inputs to the model
output = model(data)
output = output.to('cpu').detach().numpy()
target = label.to('cpu').detach().numpy()
cur = 7
print(np.shape(target))
print(np.shape(output))

print("Predicted: {}".format(output[cur]))
print("Predicted sigmoid: {}".format(sigmoid(output[cur])))

print("Actual: {}".format(target[cur]))

print("Predicted Max : {}".format(output[cur].max()))
print("Actual Max : {}".format(target[cur].max()))

print("Predicted Sigmoid Arg Max : {}".format(sigmoid(output[cur].argmax())))
print("Actual Arg Max : {}".format(target[cur].argmax()))


print(output[cur].max())
print("\nimage batch shape: ", image.shape)
print("single image shape: ", image[cur].shape)




# 1 channel image
img_1_channel = image.numpy()[cur][1]
print("img_1channel shape: ", img_1_channel.shape)
plt.figure()
plt.imshow(img_1_channel)

# 3 channel image
plt.figure()
img_3_channel = image[cur].permute(1, 2, 0)
plt.imshow(img_3_channel, cmap='cool')
print("img_3channel shape:", img_3_channel.shape)

# print label
print("labels:",label)
