In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset,DataLoader
from scipy import stats
from torch import cuda

  _dtype_to_storage = {data_type(0).dtype: data_type for data_type in _storages}


In [2]:
hidden = 16
layer = 1
modelpth = './model/Cpf1/' #
modelname = 'CRISPR-OTE1.0'  #CRISPR-OTE2.0

In [3]:
# 数据封装
class Dataset(Dataset):
    def __init__(self, csv_file = r"./data/csv/Data set HT 1-1.csv", 
                 nrows = 15000, seqs_file = r"./data/SEQs/1-1/SEQs.npy", 
                 feas_file = r"./data/new_fea/HT1-1_fea_datas.csv"):
        
        self.df = pd.read_csv(csv_file, sep = ",", header = 1, usecols = [9,11], nrows = nrows, 
                                names = ['Indel freqeuncy','Tm34'], 
                                dtype = {'Indel freqeuncy': np.float64}) 
        
        self.seqs = np.load(seqs_file) # batch,50,4
        self.feas = pd.read_csv(feas_file, sep = ",", header = 0, index_col=0, nrows = nrows, dtype = np.float64) # batch,44
        self.new_feas = self.feas.iloc[:,[40, 41, 42, 43]].values  # 4RNAfold
        print(self.new_feas.shape)
        self.Tms = self.df.iloc[:, -1].values 
        self.IFs = self.df['Indel freqeuncy'].values 
        self.IFs = self.IFs.reshape(-1, 1) 
        self.Tms = self.Tms.reshape(-1, 1) 
    
    def __len__(self):
            return len(self.seqs)

    def __getitem__(self, index):
        X = torch.from_numpy(self.seqs).float()
        Y = torch.from_numpy(self.IFs).float()
        T = torch.from_numpy(self.Tms).float()
        R = torch.from_numpy(self.new_feas).float()
        return X[index],Y[index],T[index],R[index]

In [4]:
testDatasetHT12 = Dataset(csv_file = r"./data/csv/Data set HT 1-2.csv", nrows = 1292, 
              seqs_file = r"./data/SEQs/1-2/SEQs.npy", feas_file = r"./data/new_fea/HT1-2_fea_datas.csv")
testDatasetHT2 = Dataset(csv_file = r"./data/csv/Data set HT 2.csv", nrows = 2963, 
              seqs_file = r"./data/SEQs/2/SEQs.npy", feas_file = r"./data/new_fea/HT2_fea_datas.csv")
testDatasetHT3 = Dataset(csv_file = r"./data/csv/Data set HT 3.csv", nrows = 1251, 
              seqs_file = r"./data/SEQs/3/SEQs.npy", feas_file = r"./data/new_fea/HT3_fea_datas.csv")

(1292, 4)
(2963, 4)
(1251, 4)


In [5]:
# 网络结构
class CRISPR_OTE(nn.Module):
    def __init__(self):
        super(CRISPR_OTE, self).__init__()
        
        self.lstm = nn.LSTM(4, hidden, layer, batch_first = True, bidirectional = True)
        self.layerl = nn.Sequential(nn.Dropout(p=0.3), nn.Linear(34*2*hidden, 80), nn.ReLU(True)) 
        
        self.conv1 = nn.Sequential(nn.Conv1d(in_channels = 4, out_channels = 80, kernel_size = 5), # stide 1
                      nn.ReLU(),nn.AvgPool1d(kernel_size = 2)) # stride kernel_size
        self.layerc = nn.Sequential(nn.Dropout(p=0.3), nn.Linear(80*15, 80), nn.ReLU(True)) # (34-5+1)/2

        # CRISPR-OTE2.0
#         self.fc_Tm = nn.Linear(5, 30)
        
        # CRISPR-OTE1.0
        self.fc_Tm = nn.Linear(1, 30)   
        
        self.layer1 = nn.Sequential(nn.Dropout(p=0.3), nn.Linear(190, 80), nn.ReLU(True))
        self.layer4 = nn.Sequential(nn.Dropout(p=0.3), nn.Linear(80, 1))


    def forward(self, x, T, R):
        x50 = x
        
        # LSTM
        if  cuda.is_available():
            h0 = torch.zeros(2*layer, x.size(0), hidden).cuda() 
            c0 = torch.zeros(2*layer, x.size(0), hidden).cuda()
            indices34 = torch.linspace(6, 39, steps = 34).long().cuda()
        else:
            h0 = torch.zeros(2*layer, x.size(0), hidden)
            c0 = torch.zeros(2*layer, x.size(0), hidden)
            indices34 = torch.linspace(6, 39, steps = 34).long()

        x34 = torch.index_select(x, 1, indices34, out=None)  # batch,34,4
        out34, (hn34, cn34)  = self.lstm(x34, (h0, c0))  

        x_l = out34 # batch,34,hidden*direction 
        x_l = x_l.contiguous().view(x_l.size(0), -1)   # batch,34*16*2 
        x_l = self.layerl(x_l)   # batch,80   
        
        # CNN
        x34 = x34.permute(0,2,1) # batch,4,34
        x_c = self.conv1(x34)  # batch,80,(34-5+1)/2
        x_c = x_c.view(x_c.size(0), -1)  # batch,1200,
        x_c = self.layerc(x_c)  # batch,80
        
        # CRISPR-OTE2.0
#         feas = torch.cat([T, R], dim = 1)
    
        # CRISPR-OTE1.0
        feas = T
        
        feas = self.fc_Tm(feas)
        x = torch.cat([x_l, x_c, feas], dim = 1) # batch,190      
        x = self.layer1(x)
        x = self.layer4(x)
        return x


if  cuda.is_available():
    model = CRISPR_OTE().cuda()
else:
    model = CRISPR_OTE()
print(model)

CRISPR_OTE(
  (lstm): LSTM(4, 16, batch_first=True, bidirectional=True)
  (layerl): Sequential(
    (0): Dropout(p=0.3, inplace=False)
    (1): Linear(in_features=1088, out_features=80, bias=True)
    (2): ReLU(inplace=True)
  )
  (conv1): Sequential(
    (0): Conv1d(4, 80, kernel_size=(5,), stride=(1,))
    (1): ReLU()
    (2): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
  )
  (layerc): Sequential(
    (0): Dropout(p=0.3, inplace=False)
    (1): Linear(in_features=1200, out_features=80, bias=True)
    (2): ReLU(inplace=True)
  )
  (fc_Tm): Linear(in_features=1, out_features=30, bias=True)
  (layer1): Sequential(
    (0): Dropout(p=0.3, inplace=False)
    (1): Linear(in_features=190, out_features=80, bias=True)
    (2): ReLU(inplace=True)
  )
  (layer4): Sequential(
    (0): Dropout(p=0.3, inplace=False)
    (1): Linear(in_features=80, out_features=1, bias=True)
  )
)


In [6]:
# 测试
def test(model, test_loader):
    model.eval()
    with torch.no_grad():
        test_output = []
        test_label = []
        for k,(inputs, labels, T, R) in enumerate(test_loader):
            length = len(test_loader)
            if  cuda.is_available():
                inputs = inputs.cuda()
                T = T.cuda()
                R = R.cuda()
                outputs = model(inputs, T, R).cpu()
            else:
                outputs = model(inputs, T, R)

#             r,p = stats.spearmanr(labels.detach().numpy(),outputs.detach().numpy())
            test_output.extend(outputs.squeeze().detach().numpy())
            test_label.extend(labels.squeeze().detach().numpy())

        test_spear_r, test_spear_p = stats.spearmanr(test_output, test_label)
        test_pear_r, test_pear_p = stats.pearsonr(test_output, test_label)
        print("test_spr {:.4f}, test_pear_r {:.4f}".format(test_spear_r, test_pear_r))

    return(test_spear_r, test_pear_r)

In [7]:
checkpoint = torch.load(modelpth + modelname + '.pth', map_location='cpu')
model.load_state_dict(checkpoint['model'])
run_time = checkpoint['time']
run_minloss = checkpoint['min_loss']
run_spr = checkpoint['spr']
run_pear = checkpoint['pear']
print('time {}, minloss {:.4f}, spr {:.4f}, pear {:.4f}'.format(run_time, run_minloss, run_spr, run_pear))  

time 10770.481737613678, minloss 417.7994, spr 0.7656, pear 0.7747


In [12]:
test_loaderHT12 = DataLoader(dataset = testDatasetHT12,batch_size = 1292,num_workers = 0,pin_memory = True)
_, _ = test(model,test_loaderHT12)

RuntimeError: Numpy is not available

In [None]:
test_loaderHT2 = DataLoader(dataset = testDatasetHT2,batch_size = 2963,num_workers = 0,pin_memory = True)
_, _ = test(model,test_loaderHT2)

In [None]:
test_loaderHT3 = DataLoader(dataset = testDatasetHT3,batch_size = 1251,num_workers = 0,pin_memory = True)
_, _ = test(model,test_loaderHT3)