In [1]:
import json
import pandas as pd
import os
import sys
import pickle
import numpy as np
from random import random
import math
import torch
import torchvision 
import torch.nn.functional as F  
import torchvision.datasets as datasets  
import torchvision.transforms as transforms  
from torch import optim  
from torch import nn  
from torch.utils.data import DataLoader  
from tqdm import tqdm  
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
# Absolute location of the dataset folder for this experiment (presumably a
# Google Drive mount inside Colab — adjust when running elsewhere).
parentPath = '/content/drive/MyDrive/PhD/Fragle_TSS/Tested_Algorithms/Large Bin Based Modeling/Dataset/Sig_10_Mb_10_Splits'
# Make it the working directory so the relative paths used in later cells
# ('train_samples.pkl', '../../meta_info_files/...') resolve against it.
os.chdir(parentPath)

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Notebook-wide placeholders. They are (re)assigned for every split by the
# cross-validation cell and read as globals by LoadDataset, metric_calc,
# train and make_csv.
model = None
criterion = None
optimizer = None

# Row indices (into the train/test arrays) that belong to the current split.
train_indices = []
test_indices = []

# Mean / standard deviation of the training targets of the current split.
TF_mean = None
TF_std = None

# Sample arrays and the scaling array derived from the training samples.
train_samples = None
test_samples = None
max_arr = None

# Per-sample metadata records loaded from the pickle files.
train_meta_info = None
test_meta_info = None

# Load the patient-wise split definition. Later cells index it as
# dic['train'][N], dic['val'][N] and dic['test'][N] for split N.
# The context manager closes the file handle as soon as parsing is done
# (the previous bare open() left it open for the life of the kernel).
with open('../../meta_info_files/split_patient_wise.json') as json_file:
    dic = json.load(json_file)

# Remember the working directory that the relative paths were resolved from.
myPath = os.getcwd()

# Cut-offs used by metric_calc to turn predictions into positive/negative
# calls; they are fractions (reported as thr*100 percent in the printouts).
thresholds = [0.05, 0.03, 0.02, 0.01, 0.005, 0.001]

# One result list per threshold; the cross-validation cell appends one value
# per split to each list.
MAE_dic = {}
SN_dic = {}
SP_dic = {}
for thr in thresholds:
    MAE_dic[thr], SN_dic[thr], SP_dic[thr] = [], [], []

In [4]:
# Each pickle stores a dict with a 'meta' entry (per-sample records) and a
# 'samples' entry (the feature data).
# NOTE: pickle.load can execute arbitrary code — only open files you trust.
with open('train_samples.pkl', 'rb') as train_file:
    loaded_dict = pickle.load(train_file)
train_meta_info, train_samples = loaded_dict['meta'], loaded_dict['samples']

with open('test_samples.pkl', 'rb') as test_file:
    loaded_dict = pickle.load(test_file)
test_meta_info, test_samples = loaded_dict['meta'], loaded_dict['samples']

# Drop the reference to the last unpickled container.
loaded_dict = {}

In [5]:
# Rescale every (sample, bin) row so that its values along the last axis sum to 1.
# NOTE(review): a row whose sum is 0 yields NaN/inf here — confirm the data
# never contains all-zero rows.
sums = np.sum(train_samples, axis=2)
train_samples = np.divide(train_samples, np.expand_dims(sums, axis=2))

# Same normalisation for the test samples, using their own row sums.
sums = np.sum(test_samples, axis=2)
test_samples = np.divide(test_samples, np.expand_dims(sums, axis=2))

In [6]:
class LoadDataset(Dataset):
    """Index-based view over the notebook-global train or test arrays.

    The dataset does not own any data: items are read from the module-level
    ``train_samples``/``train_meta_info`` (or the ``test_*`` counterparts) and
    scaled with the module-level ``max_arr``, ``TF_mean`` and ``TF_std`` at
    access time, so those globals must be set before items are fetched.
    """

    def __init__(self, indices, train):
        # indices: row positions in the selected arrays; train: True selects
        # the training arrays, anything else the test arrays.
        self.indices = indices
        self.is_train = train

    def __len__(self):
        """Number of rows exposed by this dataset."""
        return len(self.indices)

    def __getitem__(self, index):
        """Return ``(features, normalised_target, raw_target)`` as float tensors."""
        row = self.indices[index]

        # Choose the backing arrays for this dataset's mode.
        if self.is_train == True:
            meta_table = train_meta_info
            sample_table = train_samples
        else:
            meta_table = test_meta_info
            sample_table = test_samples

        # The target is the last field of the metadata record.
        raw_target = torch.tensor(meta_table[row][-1])
        # Standardise with the training statistics; the small constant avoids
        # a division by zero when the standard deviation is 0.
        scaled_target = torch.tensor((meta_table[row][-1] - TF_mean)/ (TF_std + 0.0000001))
        # Element-wise scaling of the features by max_arr.
        features = torch.tensor(sample_table[row]/ max_arr)

        return (features.float(), scaled_target.float(), raw_target.float())

In [7]:
class VGG_convnet(nn.Module):
    """VGG-style 1D convolutional regressor.

    Input is a ``(batch, bin_no, feature_no)`` tensor. Each position is first
    embedded to 128 channels by three linear layers, a learned positional
    embedding is added, four blocks of two ``Conv1d`` layers plus a
    ``LayerNorm`` over the ``bin_no`` axis follow, the result is averaged over
    positions and reduced to one value per sample by three linear layers.

    Args:
        bin_no: number of positions per sample (length of the sequence axis).
        feature_no: number of features per position.
    """

    def __init__(self, bin_no = 282, feature_no = 156):

        super(VGG_convnet, self).__init__()

        # Per-position embedding layers; each *_rw vector holds the learnable
        # per-channel weights of the extra term in the activation (see
        # _weighted_relu).
        self.begin_linear1 = nn.Linear(feature_no, 64)
        self.begin_linear1_rw = torch.nn.Parameter(torch.randn(64))
        self.begin_linear2 = nn.Linear(64, 96)
        self.begin_linear2_rw = torch.nn.Parameter(torch.randn(96))
        self.begin_linear3 = nn.Linear(96, 128)
        self.begin_linear3_rw = torch.nn.Parameter(torch.randn(128))

        # Learned positional embedding (one 128-vector per position) and the
        # learnable smoothing constant used by the training loss.
        self.pos_emb1D = torch.nn.Parameter(torch.randn(bin_no, 128))
        self.const = torch.nn.Parameter(torch.tensor(0.01))

        # block 1:
        self.conv1a = nn.Conv1d(in_channels=128, out_channels=32, kernel_size=3, padding=1)
        self.conv1b = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=3, padding=1)
        self.LN1 = nn.LayerNorm(bin_no)

        # block 2:
        self.conv2a = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.conv2b = nn.Conv1d(in_channels=64, out_channels=64, kernel_size=3, padding=1)
        self.LN2 = nn.LayerNorm(bin_no)

        # block 3:
        self.conv3a =  nn.Conv1d(in_channels=64, out_channels=96, kernel_size=3, padding=1)
        self.conv3b =  nn.Conv1d(in_channels=96, out_channels=96, kernel_size=3, padding=1)
        self.LN3 = nn.LayerNorm(bin_no)

        # block 4:
        self.conv4a = nn.Conv1d(in_channels=96, out_channels=128, kernel_size=3, padding=1)
        self.conv4b = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=3, padding=1)
        self.LN4 = nn.LayerNorm(bin_no)

        # linear layers:
        self.end_linear1 = nn.Linear(128, 128)
        self.end_linear1_rw = torch.nn.Parameter(torch.randn(128))
        self.end_linear2 = nn.Linear(128, 64)
        self.end_linear2_rw = torch.nn.Parameter(torch.randn(64))
        self.end_linear3 = nn.Linear(64, 1)

    @staticmethod
    def _weighted_relu(x, weight):
        """Return ``relu(x) + weight * x * relu(x)`` with the ReLU computed once."""
        activated = F.relu(x)
        return activated + weight * x * activated

    def forward(self, x, targets, isTrain):
        """Run the network and, in training mode, compute the loss.

        Args:
            x: float tensor of shape ``(batch, bin_no, feature_no)``.
            targets: tensor of shape ``(batch,)``; only read when ``isTrain``
                is truthy (callers pass ``None`` for inference).
            isTrain: when truthy the loss is computed, otherwise it is ``None``.

        Returns:
            ``(y, final_loss)`` where ``y`` has shape ``(batch, 1)`` and
            ``final_loss`` is a 0-d tensor or ``None``.
        """
        x = self._weighted_relu(self.begin_linear1(x), self.begin_linear1_rw)
        x = self._weighted_relu(self.begin_linear2(x), self.begin_linear2_rw)
        x = self._weighted_relu(self.begin_linear3(x), self.begin_linear3_rw)

        x = x + self.pos_emb1D
        x = x.permute(0, 2, 1) # (batch, 128, bin_no): channels first for Conv1d

        # block 1:
        x = F.relu(self.conv1a(x)) # 32 channels
        x = F.relu(self.conv1b(x))
        x = self.LN1(x)

        # block 2:
        x = F.relu(self.conv2a(x)) # 64 channels
        x = F.relu(self.conv2b(x))
        x = self.LN2(x)

        # block 3:
        x = F.relu(self.conv3a(x)) # 96 channels
        x = F.relu(self.conv3b(x))
        x = self.LN3(x)

        # block 4:
        x = F.relu(self.conv4a(x)) # 128 channels
        x = F.relu(self.conv4b(x))
        x = self.LN4(x)

        # Average over the position axis -> (batch, 128)
        y = torch.mean(x, dim=2)

        # linear layers:
        y = self._weighted_relu(self.end_linear1(y), self.end_linear1_rw)
        y = self._weighted_relu(self.end_linear2(y), self.end_linear2_rw)
        y = self.end_linear3(y)

        # Drop only the trailing singleton axis. A bare squeeze() would also
        # remove the batch axis for a batch of one, making scores 0-d and
        # breaking scores.size(0) below (e.g. on a final batch of size 1).
        scores = y.squeeze(-1)

        final_loss = None
        if isTrain:
            # Squared batch mean of (const + |error|) / (const + target).
            # NOTE(review): train() passes the z-scored targets, so the
            # denominator can be zero or negative — confirm this is intended.
            numerator = self.const + torch.abs(scores - targets)
            denominator = self.const + targets
            loss = torch.sum(numerator/denominator)/ scores.size(0)
            final_loss = loss * loss

        return y, final_loss

In [8]:
def metric_calc(loader, thr):
    """Evaluate the global ``model`` on ``loader`` at decision threshold ``thr``.

    Predictions are mapped back to the original target scale with the global
    ``TF_std``/``TF_mean`` before being compared with the raw targets.

    Args:
        loader: iterable yielding ``(features, normalised_target, raw_target)``
            batches; the normalised target is ignored.
        thr: cut-off applied to the de-normalised prediction.

    Returns:
        ``(MAE, sensitivity, specificity)``. A metric whose denominator is
        zero (no samples / no positives / no negatives) is returned as
        ``nan`` instead of raising ``ZeroDivisionError``.

    Note:
        The model is switched to eval mode and left there; callers that keep
        training afterwards must call ``model.train()`` themselves.
    """
    running_MAE = 0
    num_samples = 0
    TP, TN, FP, FN = 0, 0, 0, 0
    model.eval()
    with torch.no_grad():
        for data, _, targets in loader:
            # Get data to cuda if possible
            data = data.to(device=device)

            # forward
            scores, _ = model(data, None, False)
            scores = scores.view(targets.size())

            # One host transfer per batch instead of one .item() call per element.
            score_values = scores.tolist()
            target_values = targets.tolist()

            # TP, TN, FP, FN
            for raw_score, truth in zip(score_values, target_values):
                pred = raw_score * TF_std + TF_mean
                running_MAE += abs(pred - truth)
                num_samples += 1
                if truth > 0.0 and truth < thr:
                    # Positives below the threshold count towards the MAE only.
                    continue
                elif pred > thr and truth > 0.0:
                    TP += 1
                elif pred <= thr and truth > 0.0:
                    FN += 1
                elif pred > thr and truth == 0.0:
                    FP += 1
                elif pred <= thr and truth == 0.0:
                    TN += 1

    undefined = float('nan')
    MAE_final = running_MAE / num_samples if num_samples else undefined
    SN_final = TP / (TP + FN) if (TP + FN) else undefined
    SP_final = TN / (TN + FP) if (TN + FP) else undefined

    return MAE_final, SN_final, SP_final

In [9]:
def train(train_loader, num_epochs=70, eval_every=5):
    """Train the global ``model`` with the global ``optimizer``.

    Args:
        train_loader: iterable yielding ``(features, normalised_target,
            raw_target)`` batches; the raw target is ignored during training.
        num_epochs: number of passes over ``train_loader`` (default 70).
        eval_every: every ``eval_every``-th epoch (starting with epoch 0) the
            model is evaluated on the notebook-global ``test_loader`` for each
            value in ``thresholds`` and the metrics are printed.
    """
    for epoch in range(num_epochs):
        # metric_calc() leaves the model in eval mode, so re-enable training
        # mode explicitly at the start of every epoch.
        model.train()

        for data, targets, _ in train_loader:
            # Get data to cuda if possible
            data = data.to(device=device)
            targets = targets.to(device=device)

            # forward: the model computes its own loss when isTrain is True
            _, loss = model(data, targets, True)

            # backward
            optimizer.zero_grad()
            loss.backward()

            # gradient descent or adam step
            optimizer.step()

        if epoch % eval_every == 0:
            print(f'epoch {epoch}')
            for thr in thresholds:
                MAE, SN, SP = metric_calc(test_loader, thr)
                print(f'threshold: {thr*100}%, MAE: {MAE}, Sensitivity: {SN}, Specificity: {SP}')

In [10]:
def make_csv(split_no, max_arr, output_dir=None):
    """Write the per-sample test predictions of one split to a CSV file.

    Reads the notebook globals ``model``, ``test_indices``, ``test_meta_info``,
    ``test_samples``, ``TF_std``, ``TF_mean`` and ``device``.

    Args:
        split_no: number used in the file name (``test<split_no>.csv``).
        max_arr: array the test features are divided by before inference.
        output_dir: directory to write into; defaults to the original
            hard-coded Drive folder. It is created if it does not exist.

    The CSV has the columns ``Sample_ID``, ``Pred_Fraction`` and
    ``True_Fraction``; an empty test split yields a header-only file.
    """
    if output_dir is None:
        output_dir = '/content/drive/MyDrive/PhD/Fragle_TSS/Tested_Algorithms/Large Bin Based Modeling/10_split_csv/'

    rows = []
    model.eval()
    with torch.no_grad():
        for index in test_indices:
            meta = test_meta_info[index]

            # The division already allocates a new array, so no copy is needed.
            dataX = torch.tensor(test_samples[index] / max_arr)
            dataX = torch.unsqueeze(dataX, dim=0).float()  # add the batch axis

            score, _ = model(dataX.to(device), None, False)
            # Map the prediction back to the original target scale.
            prediction = score.item() * TF_std + TF_mean

            # meta[1] is the sample name, meta[-1] the true target.
            rows.append([meta[1], prediction, meta[-1]])

    os.makedirs(output_dir, exist_ok=True)
    filePath = os.path.join(output_dir, 'test' + str(split_no) + '.csv')
    my_df = pd.DataFrame(rows, columns=['Sample_ID', 'Pred_Fraction', 'True_Fraction'])
    my_df.to_csv(filePath, index=False)

In [None]:
# Cross-validation over the 10 predefined splits: train on train+val, then
# export the test predictions and record the test metrics of every split.
for N in range(10):
  split_train = dic['train'][N] + dic['val'][N]
  split_test = dic['test'][N]
  # Sets give O(1) membership tests below (assumes the identifiers are
  # hashable, e.g. strings — the same field is written out as Sample_ID).
  train_ids, test_ids = set(split_train), set(split_test)

  # Field 1 of a metadata record is matched against the split lists;
  # the last field is the regression target.
  train_indices = [i for i in range(len(train_meta_info)) if train_meta_info[i][1] in train_ids]
  test_indices = [i for i in range(len(test_meta_info)) if test_meta_info[i][1] in test_ids]
  train_y = np.array([train_meta_info[i][-1] for i in train_indices])

  # Scaling statistics come from the training portion only.
  # NOTE(review): a feature whose training maximum is 0 makes LoadDataset
  # divide by zero — confirm the data cannot contain such features.
  max_arr = np.max(train_samples[train_indices, :,:], axis=0)
  TF_mean = np.mean(train_y)
  TF_std = np.std(train_y)

  train_data = LoadDataset(train_indices, True)
  test_data = LoadDataset(test_indices, False)
  train_loader = DataLoader(dataset=train_data, batch_size=32, shuffle=True)
  test_loader = DataLoader(dataset=test_data, batch_size=32)

  # initializing model for split N
  model = VGG_convnet().to(device)
  criterion = nn.L1Loss()  # not used by train(); the model computes its own loss
  optimizer = optim.Adam(model.parameters(), lr=0.0001)

  # train the model
  print(f'Split no. {N}')
  train(train_loader)

  # A leftover `break` used to sit here. It stopped after the first split and
  # made the export and metric bookkeeping below unreachable, so the summary
  # cell averaged empty lists.

  # Optional: persist the weights of this split.
  # model_path = '../../Experiments/Models/model' + str(N) + '.pt'
  # torch.save(model.state_dict(), model_path)
  make_csv(N+1, max_arr)

  for thr in thresholds:
    MAE, SN, SP = metric_calc(test_loader, thr)
    print(f'threshold: {thr*100}%, MAE: {MAE}, Sensitivity: {SN}, Specificity: {SP}')
    MAE_dic[thr].append(MAE)
    SN_dic[thr].append(SN)
    SP_dic[thr].append(SP)
  print()
  print()

In [None]:
# Summarise the per-split results: for every threshold print the mean, median
# and standard deviation (rounded to 3 decimals) of each recorded metric.
metric_tables = (('MAE', MAE_dic), ('Sensitivity', SN_dic), ('Specificity', SP_dic))

for thr in thresholds:
    print(f'At threshold of {thr*100}%')

    for label, table in metric_tables:
        per_split = table[thr]
        avg = round(np.mean(per_split), 3)
        mid = round(np.median(per_split), 3)
        spread = round(np.std(per_split), 3)
        print(f'{label} mean: {avg}, median: {mid}, std: {spread}')

    print()