In [2]:
%matplotlib inline
import os
import time
import copy
import pandas as pd
import numpy as np
from random import seed
from random import randint
import random
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from PIL import Image
from matplotlib import pyplot as plt
from sklearn.metrics import fbeta_score, precision_recall_fscore_support, multilabel_confusion_matrix
from sklearn import metrics
from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings("ignore")

torch.set_num_threads(5)

''' 
    type_of_dataset:
        a)1 --->>> 'bp'
        b)2 --->>> 'cc'
        c)3 --->>> 'mf'
        
    type_of_image:
        a)None
        b)charge
        c)hydropathy
        d)isoelectric
'''

type_of_image = 'None'
type_of_dataset = '1'
output_dim = 932
weights_path = 'weights_{}_{}.pth'.format(type_of_dataset, type_of_image)
if type_of_dataset == '1':
    output_dim = 932
    d = 'bp'
if type_of_dataset == '2':
    output_dim = 439
    d = 'cc'
if type_of_dataset == '3':
    output_dim = 589
    d = 'mf'

train_dir = 'Data_preprocessing/Examples/{}/{}'.format(d, type_of_image)    #provide path for generated voxel datas
labels_csv = '{}.csv'.format(d)                                             #provide the path for labels
resnet_weights_path = 'Resnet50.pth'                                        #provide path for pretrained resnet50 weigths

In [3]:
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
print(device)

BATCH_SIZE = 50
NUM_EPOCHS = 20
PERCENTILE = 99.7
LEARNING_RATE = 0.0001
DISABLE_TQDM = True

In [4]:
df = pd.read_csv(labels_csv)
attribute_dict = dict(zip(df.accession_no,df.labels))

In [5]:
data_transforms = transforms.Compose([
        transform.
        transforms.ToTensor(),
    ])

cuda:2


In [6]:
from torch.utils import data
class ImageData(data.Dataset):
    def __init__(self,df,dirpath,transform,test = False):
        self.df = df
        self.test = test
        self.dirpath = dirpath
        self.conv_to_tensor = transform
        #image data 
        if not self.test:
            self.image_arr = np.asarray(str(self.dirpath)+'/'+self.df.iloc[:, 0])
        else:
            self.image_arr = np.asarray(str(self.dirpath)+'/'+self.df.iloc[:, 0])
        
        #labels data
        if not self.test:
             self.label_df = self.df.iloc[:,1]
        
        # Calculate length of df
        self.data_len = len(self.df.index)

    def __len__(self):
        return self.data_len
    
    def __getitem__(self, idx):
        image_name = self.image_arr[idx]
        img = Image.open(image_name)
        img = img.convert(mode = 'RGB')
        img_tensor = self.conv_to_tensor(img)
        if not self.test:
            image_labels = self.label_df[idx]
            label_tensor = torch.zeros((1, output_dim))
            image_labels = [int(x) for x in image_labels.split(',')]
            for i, label in enumerate(image_labels):
                label_tensor[0, i] = label
            image_label = torch.tensor(label_tensor,dtype= torch.float32)
            return (img_tensor,image_label.squeeze())
        return (img_tensor)

In [7]:
df = pd.read_csv(labels_csv)

from sklearn.model_selection import train_test_split
train_df,val_df = train_test_split(df, test_size=0.20)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
print(f"Validation_Data Length: {len(val_df)}\nTrain_Data Length: {len(train_df)}")

In [8]:
# Train dataset
train_dataset = ImageData(train_df,train_dir,data_transforms)
train_loader = data.DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE,shuffle=False)

# validation dataset
val_dataset = ImageData(val_df,train_dir,data_transforms)
val_loader = data.DataLoader(dataset=val_dataset,batch_size=BATCH_SIZE,shuffle=False)

dataloaders_dict = {'train':train_loader, 'val':val_loader}

In [9]:
features, labels = next(iter(train_loader))
print(f'Train Features: {features.shape}\nTrain Labels: {labels.shape}')
print()
features, labels = next(iter(val_loader))
print(f'Validation Features: {features.shape}\nValidation Labels: {labels.shape}')
print()

In [10]:
resnet_cls = models.resnet50()
resnet_cls.load_state_dict(torch.load(resnet_weights_path))

class AvgPool(nn.Module):
    def forward(self, x):
        return F.avg_pool2d(x, x.shape[2:])
    
class ResNet50(nn.Module):
    def __init__(self,num_outputs):
        super(ResNet50,self).__init__()
        self.resnet = resnet_cls
        layer4 = self.resnet.layer4
        self.resnet.layer4 = nn.Sequential(nn.Dropout(0.5), layer4)
        self.resnet.avgpool = AvgPool()
        self.resnet.fc = nn.Linear(2048, 1024)
        
        self.fc1 = nn.Linear(1024, 256)
        self.fc2 = nn.Linear(256, num_outputs)
        self.bn1 = nn.BatchNorm1d(1024)
        self.bn2 = nn.BatchNorm1d(256)
    
        for param in self.bn2.parameters():
            param.requires_grad = True

        for param in self.resnet.parameters():
            param.requires_grad = False

        for param in self.resnet.layer4.parameters():
            param.requires_grad = True

        for param in self.resnet.fc.parameters():
            param.requires_grad = True
            
        for param in self.fc1.parameters():
            param.requires_grad = True
            
        for param in self.bn1.parameters():
            param.requires_grad = True
            
        for param in self.fc2.parameters():
            param.requires_grad = True
            
    def forward(self,x):
        out = self.bn1(self.resnet(x))
        out = self.bn2(F.relu(self.fc1(out)))
        out = F.sigmoid(self.fc2(out))
        return out
    
NeuralNet = ResNet50(num_outputs = output_dim)

Validation_Data Length: 2222
 Train_Data Length: 8888


In [None]:
NeuralNet

In [11]:
total_params = sum(p.numel() for p in NeuralNet.parameters())
print(f'{total_params:,} total parameters.')
total_trainable_params = sum(p.numel() for p in NeuralNet.parameters() if p.requires_grad)
print(f'{total_trainable_params:,} training parameters.')

In [12]:
print("TRAINING")
print("training examples: ",len(train_dataset))
print("batch size: ",BATCH_SIZE)
print("batches available: ",len(train_loader))
print()
print("VALIDATION")
print("validation examples: ",len(val_dataset))
print("batch size: ",BATCH_SIZE)
print("batches available: ",len(val_loader))
print()

Train Features: torch.Size([50, 3, 224, 224])
Train Labels: torch.Size([50, 932])

Validation Features: torch.Size([50, 3, 224, 224])
Validation Labels: torch.Size([50, 932])



In [13]:
NeuralNet = NeuralNet.to(device)
optimizer = optim.Adam(NeuralNet.parameters(),lr = LEARNING_RATE)
loss_func = torch.nn.BCEWithLogitsLoss()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,patience = 2)
best_loss = np.inf
best_f_score = np.inf
best_precision = np.inf
best_recall = np.inf

In [14]:
prec = []
recall = []
f = []
losss = []
val_f = []
val_loss = []
val_prec = []
val_recall = []

def store(phase,p,r,fs,l):
    if phase == 'train':
        prec.append(p)
        recall.append(r)
        f.append(fs)
        losss.append(l)
    else:
        val_prec.append(p)
        val_recall.append(r)
        val_f.append(fs)
        val_loss.append(l)
        
def calc(l,f,p,r,length):
    loss = l/length
    pre = p/length
    fs = f/length
    re = r/length
    
    return loss,fs,pre,re

def result(epoch, NUM_EPOCHS,phase,epoch_loss,epoch_f_loss,epoch_precision,epoch_recall,elapsed_time):
    print("\tPhase: {}\n\t\t Epoch: {}/{} | {}_loss:{:.8f} | f_score:{:.8f} | precision:{:.8f} | recall:{:.8f} | Time: {:.4f}s".format(phase,
                                                                              epoch+1,
                                                                              NUM_EPOCHS,
                                                                              phase,
                                                                              epoch_loss,
                                                                              epoch_f_score,
                                                                              epoch_precision,
                                                                              epoch_recall,
                                                                              elapsed_time))

ResNet50(
  (resnet): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace)
        (downsample): Sequential(
          (0): Conv2

In [15]:
for epoch in range(NUM_EPOCHS):
    for phase in ['train', 'val']:
        start_time = time.time()
        if phase == 'train':
            NeuralNet.train()
        else:
            NeuralNet.eval()

        running_loss = 0.0
        running_f_score = 0.0
        running_precision = 0.0
        running_recall = 0.0

        for images_batch, labels_batch in tqdm(dataloaders_dict[phase],disable = DISABLE_TQDM):
            images_batch = images_batch.to(device)
            labels_batch = labels_batch.to(device)

            optimizer.zero_grad()

            with torch.set_grad_enabled(phase == 'train'):
                pred_batch = NeuralNet(images_batch)
                _, preds = torch.max(pred_batch.data, 1)
                loss = loss_func(pred_batch,labels_batch)
            if phase == 'train':
                loss.backward()
                optimizer.step()

            labels_cpu = labels_batch.cpu().detach().numpy()
            pred_cpu = pred_batch.cpu().detach().numpy()

#             print(metrics.multilabel_confusion_matrix(labels_cpu, pred_cpu>0.5, samplewise = True))

            temp_precision, temp_recall, temp_f_score, _ = precision_recall_fscore_support(
                                                                labels_cpu, pred_cpu > 0.1, beta=0.5, average='samples')

            running_loss += loss.item() * images_batch.size(0)
            running_precision += (temp_precision * len(images_batch))
            running_recall += (temp_recall * len(images_batch))
            running_f_score += (temp_f_score * len(images_batch))

        epoch_loss = running_loss / len(dataloaders_dict[phase].dataset)
        epoch_f_score = running_f_score / len(dataloaders_dict[phase].dataset)
        epoch_precision = running_precision / len(dataloaders_dict[phase].dataset)
        epoch_recall = running_recall / len(dataloaders_dict[phase].dataset)
        
        store(phase,epoch_precision,epoch_recall,epoch_f_score,epoch_loss)

        if phase == 'val' and epoch_f_score < best_f_score:
#             print("model val_loss Improved from {:.8f} to {:.8f}".format(best_loss,epoch_loss))
            best_f_score = epoch_f_score
            best_precision = epoch_precision
            best_recall = epoch_recall
            best_loss = epoch_loss
            best_model_wts = copy.deepcopy(NeuralNet.state_dict())
            torch.save(NeuralNet.state_dict(), weights_path)

        if phase == 'val':
            scheduler.step(epoch_loss)

        elapsed_time = time.time()-start_time
        result(epoch, NUM_EPOCHS,phase,epoch_loss,epoch_f_score,epoch_precision,epoch_recall,elapsed_time)


26,110,692 total parameters.
17,567,396 training parameters.


In [16]:
resnet_cls = models.resnet50()
resnet_cls.load_state_dict(torch.load(resnet_weights_path))

class AvgPool(nn.Module):
    def forward(self, x):
        return F.avg_pool2d(x, x.shape[2:])
    
class ResNet50(nn.Module):
    def __init__(self,num_outputs):
        super(ResNet50,self).__init__()
        self.resnet = resnet_cls
        layer4 = self.resnet.layer4
        self.resnet.layer4 = nn.Sequential(nn.Dropout(0.5), layer4)
        self.resnet.avgpool = AvgPool()
        self.resnet.fc = nn.Linear(2048, 1024)
        
        self.fc1 = nn.Linear(1024, 256)
        self.fc2 = nn.Linear(256, num_outputs)
        self.bn1 = nn.BatchNorm1d(1024)
        self.bn2 = nn.BatchNorm1d(256)
    
        for param in self.bn2.parameters():
            param.requires_grad = True

        for param in self.resnet.parameters():
            param.requires_grad = False

        for param in self.resnet.layer4.parameters():
            param.requires_grad = True

        for param in self.resnet.fc.parameters():
            param.requires_grad = True
            
        for param in self.fc1.parameters():
            param.requires_grad = True
            
        for param in self.bn1.parameters():
            param.requires_grad = True
            
        for param in self.fc2.parameters():
            param.requires_grad = True
            
    def forward(self,x):
        out = self.bn1(self.resnet(x))
        out = self.bn2(F.relu(self.fc1(out)))
        out = F.sigmoid(self.fc2(out))
        return out
    
NeuralNet = ResNet50(num_outputs = output_dim)
NeuralNet.load_state_dict(torch.load(weights_path))

TRAINING
training examples:  8888
batch size:  50
batches available:  178

VALIDATION
validation examples:  2222
batch size:  50
batches available:  45



In [17]:
newmodel = torch.nn.Sequential(*(list(NeuralNet.children())[:-3]))
print(newmodel)

In [18]:
features = []
acc = []

for img_name in tqdm(os.listdir(train_dir)):
    img = Image.open(train_dir + '/' + img_name)
    img = img.convert(mode = 'RGB')
    b = transforms.ToTensor()
    img_tensor = b(img)
    img_tensor = img_tensor.unsqueeze(0)
    pred = newmodel(img_tensor)
    temp = pred.detach().numpy()
    features.append(temp)
    acc.append(img_name.strip().split('_')[0])
res_df = pd.DataFrame({'accession': acc,'features':features})

res_df.to_pickle('best_features_{}_{}.pkl'.format(d, type_of_image))      #provide the path from where your multipred code will access the features

In [19]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

y = [i for i in range(1, NUM_EPOCHS)]

fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, recall,label = 'train_recall')
ax.plot(y, val_recall,'r',label='val_recall')
plt.title('recall_{}_None'.format(d))
plt.ylabel('Recall')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()
fig.savefig('both_recall_{}_None.png'.format(d))
plt.close()

In [None]:
fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, recall,label = 'train_recall')
plt.title('train_recall_{}_None'.format(d))
plt.ylabel('Recall')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()
fig.savefig('train_recall_{}_None.png'.format(d))
plt.close()

fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, val_recall,'r',label='val_recall')
plt.title('val_recall_{}_None'.format(d))
plt.ylabel('Recall')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()
fig.savefig('validation_recall_{}_None.png'.format(d))
plt.close()

In [20]:
fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, prec, label = 'train_precision')
ax.plot(y, val_prec, 'r',label = 'val_precision')
plt.title('precision_{}_None'.format(d))
plt.ylabel('Precision')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()
fig.savefig('both_precision_{}_None.png'.format(d))
plt.close()

	Phase: train
		 Epoch: 1/20 | train_loss:0.93893044 | f_score:0.05156340 | precision:0.04205003 | recall:0.99985554 | Time: 449.4322s
	Phase: val
		 Epoch: 1/20 | val_loss:0.89410764 | f_score:0.05088794 | precision:0.04148140 | recall:1.00000000 | Time: 126.2740s
	Phase: train
		 Epoch: 2/20 | train_loss:0.83552606 | f_score:0.06448353 | precision:0.05451973 | recall:0.94874575 | Time: 438.9258s
	Phase: val
		 Epoch: 2/20 | val_loss:0.77176658 | f_score:0.18679073 | precision:0.17248018 | recall:0.52176741 | Time: 201.2233s
	Phase: train
		 Epoch: 3/20 | train_loss:0.78037728 | f_score:0.19140890 | precision:0.20151631 | recall:0.58327598 | Time: 988.2677s
	Phase: val
		 Epoch: 3/20 | val_loss:0.74110300 | f_score:0.29630298 | precision:0.36361765 | recall:0.29232750 | Time: 111.9107s
	Phase: train
		 Epoch: 4/20 | train_loss:0.75449432 | f_score:0.27229700 | precision:0.33967494 | recall:0.37852795 | Time: 983.7621s
	Phase: val
		 Epoch: 4/20 | val_loss:0.72867415 | f_score:0.317612

In [None]:
fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, prec, label = 'train_precision')
plt.title('train_precision_{}_None'.format(d))
plt.ylabel('Precision')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()
fig.savefig('train_precision_{}_None.png'.format(d))
plt.close()

fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, val_prec, 'r',label = 'val_precision')
plt.title('val_precision_{}_None'.format(d))
plt.ylabel('Precision')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()
fig.savefig('validation_precision_{}_None.png'.format(d))
plt.close()

In [None]:
y = [i for i in range(1,21)]

fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, f,label = 'train_f_score')
ax.plot(y, val_f, 'r',label = 'val_f_score')
plt.title('f_score_{}_None'.format(d))
plt.ylabel('F_score')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()
fig.savefig('both_f_score_{}_None.png'.format(d))
plt.close()

In [None]:
y = [i for i in range(1,21)]

fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, f,label = 'train_f_score')
plt.title('train_f_score_{}_None'.format(d))
plt.ylabel('F_score')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()
fig.savefig('train_f_score_{}_None.png'.format(d))
plt.close()

fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, val_f, 'r',label = 'val_f_score')
plt.title('val_fscore_{}_None'.format(d))
plt.ylabel('F_score')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()
fig.savefig('validation_fscore_{}_None.png'.format(d))
plt.close()

In [None]:
y = [i for i in range(1,21)]

fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, losss, label = 'train_loss')
ax.plot(y, val_loss, 'r', label = 'val_loss')
plt.title('loss_{}_None'.format(d))
plt.ylabel('Loss')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()
fig.savefig('both_loss_{}_None.png'.format(d))
plt.close()

In [21]:
y = [i for i in range(1,21)]

fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, losss, label = 'train_loss')
plt.title('train_loss_{}_None'.format(d))
plt.ylabel('Loss')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()
fig.savefig('train_loss_{}_None.png'.format(d))
plt.close()

fig = plt.figure()
ax = plt.subplot(111)
ax.plot(y, val_loss, 'r',label = 'val_loss')
plt.title('val_loss_{}_None'.format(d))
plt.ylabel('Loss')
plt.xlabel('Number of Epochs')
ax.legend()
plt.show()

fig.savefig('validation_loss_{}_None.png'.format(d))
plt.close()

IncompatibleKeys(missing_keys=[], unexpected_keys=[])