In [1]:
import math
import re
import torch
from torch import nn
from torch.nn import functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load the benchmark table; the `Sequence` and `Label` columns are the only ones read here.
data = pd.read_csv(r'基准数据集.csv')
labels = data['Label'].values
sequences = data['Sequence']

In [3]:
# Matches one nucleotide letter (A/G/C/T) in either case; anything else is skipped.
pat = re.compile('[AGCTagct]')


def pre_process(text):
    """Return the nucleotide letters found in ``text`` as a list of lowercase characters.

    Characters that are not A, G, C or T (in either case) are dropped; the
    relative order of the remaining letters is preserved.
    """
    return [base.lower() for base in pat.findall(text)]

# Tokenise every raw sequence into a list of lowercase nucleotide letters.
x = sequences.apply(pre_process)

In [4]:
# Vocabulary of nucleotide letters; a letter's position in the list is its integer index.
word_list = ['a', 'g', 'c', 't']

word_index = {base: position for position, base in enumerate(word_list)}

In [5]:
# Encode each token list as integer indices; tokens missing from word_index map to 4.
text = x.apply(lambda tokens: [word_index.get(token, 4) for token in tokens])

In [6]:
# Every encoded sequence is forced to exactly `text_len` entries: longer ones
# are truncated, shorter ones are right-padded with index 4.
text_len = 1200

pad_text = np.array(
    [seq[:text_len] + [4] * max(0, text_len - len(seq)) for seq in text]
)

In [7]:
# Hold out 30% of the samples for evaluation. The split is seeded so that a
# fresh "Restart & Run All" reproduces the same train/test partition.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    pad_text, labels, test_size=0.3, random_state=SPLIT_SEED
)

In [8]:
class Mydataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs encoded sequences with their labels."""

    def __init__(self, text_list, label_list):
        """Keep references to the encoded sequences and the matching labels."""
        self.text_list, self.label_list = text_list, label_list

    def __len__(self):
        """Return the number of samples, taken from the sequence container."""
        return len(self.text_list)

    def __getitem__(self, index):
        """Return ``(sequence, label)`` for ``index``; the sequence is a ``LongTensor``."""
        sequence = torch.LongTensor(self.text_list[index])
        return sequence, self.label_list[index]

batch_size = 32

# Wrap each split in a dataset, then in a loader; only the training loader shuffles.
train_ds, test_ds = Mydataset(x_train, y_train), Mydataset(x_test, y_test)

train_dl = torch.utils.data.DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
test_dl = torch.utils.data.DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)

In [9]:
# Model hyper-parameters: the GRU hidden width, and the per-step input feature
# count (one feature per entry of word_list).
hidden_size = 30
one_hot_dim = len(word_list)
print('hidden_size:', hidden_size)

class Net(nn.Module):
    """GRU-based two-class classifier over one-hot encoded token sequences.

    Args:
        word_list: Vocabulary; only its length is used. Token indices
            ``0 .. len(word_list) - 1`` are one-hot encoded, and the extra index
            ``len(word_list)`` is encoded as an all-zero vector.
        one_hot_dim: Input feature size of the GRU. Must equal ``len(word_list)``
            because that is the width ``forward`` produces.
        hidden_size: Hidden state size of the GRU.
        num_layers: Number of stacked GRU layers.

    Raises:
        ValueError: If ``one_hot_dim`` differs from ``len(word_list)``.
    """

    def __init__(self, word_list, one_hot_dim, hidden_size, num_layers=1):
        super().__init__()

        # Keep the vocabulary size on the instance so forward() does not depend
        # on a module-level `word_list` happening to exist.
        self.vocab_size = len(word_list)
        if one_hot_dim != self.vocab_size:
            raise ValueError(
                f'one_hot_dim ({one_hot_dim}) must equal len(word_list) ({self.vocab_size})'
            )

        self.gru = nn.GRU(one_hot_dim, hidden_size, num_layers)
        self.linear1 = nn.Linear(hidden_size, 128)
        self.linear2 = nn.Linear(128, 2)

    def forward(self, inputs):
        """Return class logits of shape ``(batch, 2)``.

        Args:
            inputs: Integer tensor of shape ``(batch, seq_len)`` holding token
                indices in ``0 .. vocab_size`` (inclusive).
        """
        # One extra class is allocated and then sliced away, so the highest
        # index contributes a zero vector instead of a one-hot row.
        x = F.one_hot(inputs, num_classes=self.vocab_size + 1)
        x = x[:, :, :-1]
        # nn.GRU defaults to (seq_len, batch, features) ordering.
        x = x.permute(1, 0, 2)
        x = x.float()
        x, _ = self.gru(x)
        # Pool the GRU outputs by summing over the sequence dimension.
        x = torch.sum(x, dim=0)
        x = F.relu(self.linear1(x))
        x = self.linear2(x)
        return x

hidden_size: 30


In [10]:
# Pick the device once instead of hard-coding 'cuda': use the GPU when one is
# visible to PyTorch, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = Net(word_list, one_hot_dim, hidden_size)
model = model.to(device)

# `loss` is read as a module-level name by fit(), so the name is kept as is.
loss = nn.CrossEntropyLoss()
loss = loss.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [11]:
def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _run_epoch(model, criterion, batches, device, optimizer=None):
    """Run ``model`` once over ``batches`` and accumulate loss and confusion counts.

    When ``optimizer`` is given, every batch also performs a backward pass and
    an optimizer step; otherwise gradients are disabled for the whole pass.

    Returns:
        dict with keys ``loss`` (sum of per-batch losses), ``correct``,
        ``total``, ``TP``, ``TN``, ``FP`` and ``FN``.
    """
    stats = {'loss': 0.0, 'correct': 0, 'total': 0, 'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}
    training = optimizer is not None

    with torch.set_grad_enabled(training):
        for x, y in batches:
            y = y.long()
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss_value = criterion(logits, y)

            if training:
                optimizer.zero_grad()
                loss_value.backward()
                optimizer.step()

            # The bookkeeping below never needs gradients.
            y_pred = torch.argmax(logits.detach(), dim=1)
            hit = y_pred == y
            positive = y == 1
            negative = y == 0
            stats['correct'] += hit.sum().item()
            stats['TP'] += (hit & positive).sum().item()
            stats['FN'] += (~hit & positive).sum().item()
            stats['FP'] += (~hit & negative).sum().item()
            stats['TN'] += (hit & negative).sum().item()
            stats['total'] += len(y)
            stats['loss'] += loss_value.item()

    return stats


def _epoch_metrics(stats, num_batches):
    """Turn the accumulated counts of one pass into per-epoch metrics.

    Every ratio with a zero denominator is reported as 0.0 instead of raising
    ``ZeroDivisionError`` (this happens, for example, when one class is never
    predicted during an epoch).
    """
    tp, tn, fp, fn = stats['TP'], stats['TN'], stats['FP'], stats['FN']

    recall = _safe_div(tp, tp + fn)
    precision = _safe_div(tp, tp + fp)
    mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return {
        'loss': _safe_div(stats['loss'], num_batches),
        'accuracy': _safe_div(stats['correct'], stats['total']),
        # Matthews correlation coefficient: (TP*TN - FP*FN) / sqrt(...).
        'MCC': _safe_div(tp * tn - fp * fn, mcc_denominator),
        'SE': recall,
        'SPC': _safe_div(tn, tn + fp),
        'PPV': precision,
        'NPV': _safe_div(tn, tn + fn),
        'F1': _safe_div(2 * precision * recall, precision + recall),
    }


def fit(model, optimizer, train_dl, test_dl, loss_fn=None):
    """Train ``model`` for one epoch, then evaluate it, and return the metrics of both passes.

    Args:
        model: Module mapping a batch of inputs to logits with one column per class.
        optimizer: Optimizer stepping ``model``'s parameters.
        train_dl: Sized iterable of ``(x, y)`` batches used for training.
        test_dl: Sized iterable of ``(x, y)`` batches used for evaluation.
        loss_fn: Criterion called as ``loss_fn(logits, y)``. When omitted, the
            module-level ``loss`` object is used.

    Returns:
        A 16-tuple of floats. Note that F1 sits at a different position in the
        train half and the test half:

        ``(tr_loss, tr_accuracy, tr_MCC, tr_SE, tr_F1, tr_SPC, tr_PPV, tr_NPV,
        te_loss, te_accuracy, te_MCC, te_SE, te_SPC, te_PPV, te_F1, te_NPV)``

        ``loss`` is the mean of the per-batch losses. Label 1 is treated as the
        positive class and label 0 as the negative class.
    """
    criterion = loss if loss_fn is None else loss_fn

    # Send each batch to wherever the model's parameters live rather than
    # assuming a GPU is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Training pass.
    model.train()
    tr = _epoch_metrics(
        _run_epoch(model, criterion, train_dl, device, optimizer), len(train_dl)
    )

    # Evaluation pass (no parameter updates, no gradients).
    model.eval()
    te = _epoch_metrics(_run_epoch(model, criterion, test_dl, device), len(test_dl))

    return (
        tr['loss'], tr['accuracy'], tr['MCC'], tr['SE'],
        tr['F1'], tr['SPC'], tr['PPV'], tr['NPV'],
        te['loss'], te['accuracy'], te['MCC'], te['SE'],
        te['SPC'], te['PPV'], te['F1'], te['NPV'],
    )

In [12]:
# Per-epoch metric histories, one independent list per metric.
# Training split:
tr_loss, tr_accuracy, tr_MCC = [], [], []
tr_SE, tr_SPC, tr_PPV, tr_NPV = [], [], [], []
tr_AUC, tr_F1 = [], []

# Evaluation split:
te_loss, te_accuracy, te_MCC = [], [], []
te_SE, te_SPC, te_PPV, te_NPV = [], [], [], []
te_AUC, te_F1 = [], []
from sklearn import metrics

In [13]:
epochs = 80

for epoch in range(epochs):
    print(f'{epoch} : ',end = '')
    # fit() returns the train metrics followed by the test metrics; F1 sits at
    # a different position in each half, so unpack strictly by position.
    (
        epoch_tr_loss, epoch_tr_accuracy, epoch_tr_MCC, epoch_tr_SE,
        epoch_tr_F1, epoch_tr_SPC, epoch_tr_PPV, epoch_tr_NPV,
        epoch_te_loss, epoch_te_accuracy, epoch_te_MCC, epoch_te_SE,
        epoch_te_SPC, epoch_te_PPV, epoch_te_F1, epoch_te_NPV,
    ) = fit(model, optimizer, train_dl, test_dl)

    # Record this epoch's training metrics (fit() returns no AUC, so tr_AUC stays empty).
    for history, value in (
        (tr_loss, epoch_tr_loss),
        (tr_accuracy, epoch_tr_accuracy),
        (tr_MCC, epoch_tr_MCC),
        (tr_SE, epoch_tr_SE),
        (tr_SPC, epoch_tr_SPC),
        (tr_PPV, epoch_tr_PPV),
        (tr_NPV, epoch_tr_NPV),
        (tr_F1, epoch_tr_F1),
    ):
        history.append(value)

    # Finish the progress line started above with the test accuracy.
    print(epoch_te_accuracy)

    # Record this epoch's test metrics (te_AUC stays empty for the same reason).
    for history, value in (
        (te_loss, epoch_te_loss),
        (te_accuracy, epoch_te_accuracy),
        (te_MCC, epoch_te_MCC),
        (te_SE, epoch_te_SE),
        (te_SPC, epoch_te_SPC),
        (te_PPV, epoch_te_PPV),
        (te_NPV, epoch_te_NPV),
        (te_F1, epoch_te_F1),
    ):
        history.append(value)

0 : 0.7958333333333333
1 : 0.8152083333333333
2 : 0.8295833333333333
3 : 0.8310416666666667
4 : 0.799375
5 : 0.8264583333333333
6 : 0.80125
7 : 0.8291666666666667
8 : 0.8241666666666667
9 : 0.815625
10 : 0.7970833333333334
11 : 0.801875
12 : 0.8329166666666666
13 : 0.793125
14 : 0.839375
15 : 0.8110416666666667
16 : 0.7947916666666667
17 : 0.8460416666666667
18 : 0.8233333333333334
19 : 0.8379166666666666
20 : 0.6975
21 : 0.8185416666666666
22 : 0.8172916666666666
23 : 0.8477083333333333
24 : 0.8395833333333333
25 : 0.848125
26 : 0.7814583333333334
27 : 0.8364583333333333
28 : 0.8214583333333333
29 : 0.8289583333333334
30 : 0.824375
31 : 0.8460416666666667
32 : 0.765
33 : 0.84
34 : 0.755625
35 : 0.8391666666666666
36 : 0.8525
37 : 0.8525
38 : 0.8039583333333333
39 : 0.8264583333333333
40 : 0.8322916666666667
41 : 0.8495833333333334
42 : 0.8529166666666667
43 : 0.8433333333333334
44 : 0.854375
45 : 0.8297916666666667
46 : 0.8385416666666666
47 : 0.8122916666666666
48 : 0.853541666666666

In [14]:
column_name = ['loss', 'accuracy', 'MCC', 'SE', 'SPC', 'PPV', 'NPV','F1']

# Convert each training history to a Series (the names are rebound, as before).
tr_loss = pd.Series(tr_loss)
tr_accuracy = pd.Series(tr_accuracy)
tr_MCC = pd.Series(tr_MCC)
tr_SE = pd.Series(tr_SE)
tr_SPC = pd.Series(tr_SPC)
tr_PPV = pd.Series(tr_PPV)
tr_NPV = pd.Series(tr_NPV)
tr_F1 = pd.Series(tr_F1)

# One row per recorded epoch, one column per metric.
tr_result = pd.concat([tr_loss, tr_accuracy, tr_MCC, tr_SE, tr_SPC, tr_PPV, tr_NPV,tr_F1], axis=1)
tr_result.columns = column_name
# Label rows with 1-based epoch numbers so the training table lines up with
# te_result, which is also indexed from 1. The length comes from the data so
# this still works when fewer (or more) than `epochs` rows were recorded.
tr_result.index = list(range(1, len(tr_result) + 1))

In [15]:
column_name = ['loss', 'accuracy', 'MCC', 'SE', 'SPC', 'PPV', 'NPV','F1']

# Convert each test history to a Series (the names are rebound, as before).
te_loss = pd.Series(te_loss)
te_accuracy = pd.Series(te_accuracy)
te_MCC = pd.Series(te_MCC)
te_SE = pd.Series(te_SE)
te_SPC = pd.Series(te_SPC)
te_PPV = pd.Series(te_PPV)
te_NPV = pd.Series(te_NPV)
te_F1= pd.Series(te_F1)

# One row per recorded epoch, one column per metric.
te_result = pd.concat([te_loss, te_accuracy, te_MCC, te_SE, te_SPC, te_PPV, te_NPV,te_F1], axis=1)
te_result.columns = column_name
# 1-based epoch numbers. The length is taken from the table itself rather than
# from `epochs`, so an interrupted or repeated training loop no longer causes
# a length-mismatch error here.
te_result.index = list(range(1, len(te_result) + 1))

In [16]:
# Best test accuracy reached across all recorded epochs.
max(te_result['accuracy'])

0.8614583333333333

In [30]:
# Write the per-epoch metric tables for both splits to CSV files in the working directory.
for frame, csv_path in (
    (tr_result, r'one_hot_GRU_train_result.csv'),
    (te_result, r'one_hot_GRU_valid_result.csv'),
):
    frame.to_csv(csv_path)