In [14]:
import math
import re
import torch
from torch import nn
from torch.nn import functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [15]:
# Read the benchmark dataset and pull out the two columns used downstream:
# the raw nucleotide strings and their class labels (as a NumPy array).
data = pd.read_csv(r'./基准数据集.csv')
sequences, labels = data['Sequence'], data['Label'].values
data

Unnamed: 0,Meta,Sequence,Label
0,lcl|Athaliana_AT5G45455.1,GAAATTCTTTGGAGCTCAGTGGCCCAACAAGATTATAATCCGAAAA...,1
1,lcl|Athaliana_AT2G44925.1,AAGGAAAAAAAAAAAGGAATTTCGTTTCTCTTGGTGTTAAAAGGAG...,1
2,lcl|Athaliana_AT3G51660.1,GTCAGATTTGAAACTCAAAGATATTATTTCAAAAATTGTTCACATC...,1
3,lcl|Athaliana_AT4G06655.1,ATGAACTTGCCTTTCGATGACAAAATCAAACTCAGATTCCGACTAC...,1
4,URS0000291A56_3702,GGCATCCCGTCCTTAATTGGTCC,1
...,...,...,...
15995,VIT_201s0010g02000.1,ATGGGCTCAATAGCAGGGAATTATGGTGCATGCATTTTTGTGGCAG...,0
15996,VIT_208s0007g01060.2,ATGGATTCCTCTCGCGAGTTCGTCAAGGACGTCAAGCGTGTCATTG...,0
15997,VIT_214s0060g01630.2,ATGGCAACTTTTGCCAAACCAGAGAATGCTTTGAAGCGAGCTGAAG...,0
15998,VIT_208s0007g07570.2,ATGGGGGCGAGTCGAAAACTACAAGGCGAGATTGACAGGGTTCTGA...,0


In [16]:
# Matches a single nucleotide letter in either case; everything else is ignored.
pat = re.compile('[AGCTagct]')


def pre_process(text):
    """Split a sequence string into a list of lower-case nucleotide letters.

    Characters other than A/G/C/T (in either case) are dropped, so the
    returned list may be shorter than ``text``.
    """
    return [base.lower() for base in pat.findall(text)]

# Tokenise every sequence into a list of lower-case nucleotide characters.
x = sequences.apply(pre_process)

In [17]:
# Nucleotide vocabulary; a letter's position in the list is its integer code.
word_list = ['a', 'g', 'c', 't']

# Lookup table from nucleotide letter to integer code.
word_index = {letter: code for code, letter in enumerate(word_list)}

In [18]:
# Encode each token list as integer codes; any token missing from word_index
# falls back to code 4.
text = x.apply(lambda tokens: [word_index.get(token, 4) for token in tokens])

In [19]:
# Fixed sequence length fed to the network.
text_len = 1000

# Right-pad short sequences with code 4 and truncate long ones, so that every
# row has exactly text_len entries. A non-positive repeat count yields an empty
# pad, so long sequences are only affected by the final slice.
pad_text = np.array(
    [(codes + [4] * (text_len - len(codes)))[:text_len] for codes in text]
)

In [20]:
# Hold out 30% of the samples for validation. A fixed random_state makes the
# split, and therefore every metric computed below, reproducible across kernel
# restarts; without it each run evaluates on a different subset.
x_train, x_test, y_train, y_test = train_test_split(
    pad_text, labels, test_size=0.3, random_state=42
)

In [21]:
class Mydataset(torch.utils.data.Dataset):
    """Map-style dataset pairing integer-encoded sequences with their labels."""

    def __init__(self, text_list, label_list):
        # Both containers are indexed with the same position in __getitem__.
        self.text_list, self.label_list = text_list, label_list

    def __len__(self):
        """Number of samples, taken from the sequence container."""
        return len(self.text_list)

    def __getitem__(self, index):
        """Return ``(sequence as LongTensor, label)`` for the given position."""
        return torch.LongTensor(self.text_list[index]), self.label_list[index]

# Number of samples per mini-batch for both loaders.
batch_size = 32

train_ds, test_ds = Mydataset(x_train, y_train), Mydataset(x_test, y_test)

# Only the training loader shuffles; the validation loader keeps a fixed order.
train_dl = torch.utils.data.DataLoader(
    dataset=train_ds, batch_size=batch_size, shuffle=True
)
test_dl = torch.utils.data.DataLoader(
    dataset=test_ds, batch_size=batch_size, shuffle=False
)

In [22]:
# Model hyper-parameters: GRU hidden width and the width of one one-hot vector
# (one slot per nucleotide in word_list).
hidden_size = 20
one_hot_dim = len(word_list)
print('hidden_size:', hidden_size)

class Net(nn.Module):
    """Bidirectional GRU classifier over one-hot encoded token sequences.

    Args:
        word_list: Vocabulary; its length fixes the number of real token codes.
            The code equal to ``len(word_list)`` is treated as padding.
        one_hot_dim: Input feature size of the GRU. It must equal
            ``len(word_list)``, because that is the width of the one-hot
            vectors produced in ``forward``.
        hidden_size: Hidden size of each GRU direction.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, word_list, one_hot_dim, hidden_size, num_layers=1):
        super().__init__()

        # Keep the vocabulary size on the instance so that forward() does not
        # depend on a module-level ``word_list`` variable being defined.
        self.vocab_size = len(word_list)

        self.rnn = nn.GRU(one_hot_dim, hidden_size, num_layers, bidirectional=True)
        self.linear1 = nn.Linear(2 * hidden_size, 128)
        self.linear2 = nn.Linear(128, 2)

    def forward(self, inputs):
        """Return class logits of shape (batch_size, 2).

        Args:
            inputs: Integer tensor of shape (batch_size, seq_len) holding codes
                in ``[0, len(word_list)]``.
        """
        # One extra class is reserved for the padding code ...
        x = F.one_hot(inputs, num_classes=self.vocab_size + 1)
        # ... and dropped again, so padding positions become all-zero vectors.
        x = x[..., :-1]  # (batch_size, seq_len, one_hot_dim)
        x = x.permute(1, 0, 2)  # (seq_len, batch_size, one_hot_dim)
        x = x.float()
        x, _ = self.rnn(x)  # GRU outputs: (seq_len, batch_size, 2 * hidden_size)
        x = torch.sum(x, dim=0)  # sum over time steps
        x = F.relu(self.linear1(x))
        return self.linear2(x)

hidden_size: 20


In [23]:
# Build the network and the loss on the GPU, then hand the network's
# parameters to Adam.
model = Net(word_list, one_hot_dim, hidden_size).to('cuda')

loss = nn.CrossEntropyLoss().to('cuda')

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)

In [24]:
def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _epoch_metrics(total_loss, num_batches, correct, total, tp, tn, fp, fn):
    """Turn one epoch's accumulated counts into a dict of scalar metrics."""
    precision = _safe_div(tp, tp + fp)
    recall = _safe_div(tp, tp + fn)
    mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return {
        'loss': _safe_div(total_loss, num_batches),
        'accuracy': _safe_div(correct, total),
        # Matthews correlation coefficient: (TP*TN - FP*FN) / sqrt(...).
        'MCC': _safe_div(tp * tn - fp * fn, mcc_denominator),
        'SE': recall,  # sensitivity is the same ratio as recall
        'SPC': _safe_div(tn, tn + fp),
        'PPV': precision,  # positive predictive value is the same ratio as precision
        'NPV': _safe_div(tn, tn + fn),
        'F1': _safe_div(2 * precision * recall, precision + recall),
    }


def _run_epoch(model, data_loader, loss_fn, device, optimizer=None):
    """Run one pass over ``data_loader``; train when an optimizer is given."""
    training = optimizer is not None
    model.train(training)

    correct = total = 0
    tp = tn = fp = fn = 0
    total_loss = 0.0
    num_batches = 0

    with torch.set_grad_enabled(training):
        for x, y in data_loader:
            y = y.long()
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss_value = loss_fn(logits, y)

            if training:
                optimizer.zero_grad()
                loss_value.backward()
                optimizer.step()

            # Metrics use the predictions made before the parameter update.
            y_pred = torch.argmax(logits.detach(), dim=1)
            hit = y_pred == y
            positive = y == 1
            negative = y == 0
            correct += hit.sum().item()
            tp += (hit & positive).sum().item()
            fn += (~hit & positive).sum().item()
            fp += (~hit & negative).sum().item()
            tn += (hit & negative).sum().item()
            total += len(y)
            total_loss += loss_value.item()
            num_batches += 1

    return _epoch_metrics(total_loss, num_batches, correct, total, tp, tn, fp, fn)


def fit(model, optimizer, train_dl, test_dl, loss_fn=None, device=None):
    """Train for one epoch, then evaluate, and return the metrics of both passes.

    Args:
        model: Network mapping a batch of inputs to logits of shape (batch, 2).
        optimizer: Optimizer that updates ``model``'s parameters.
        train_dl: Iterable of ``(x, y)`` training batches.
        test_dl: Iterable of ``(x, y)`` validation batches.
        loss_fn: Loss callable ``loss_fn(logits, y)``. Defaults to the
            module-level ``loss`` object.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter, or ``'cuda'`` if the model has none.

    Returns:
        A 16-tuple in this order:
        ``(tr_loss, tr_accuracy, tr_MCC, tr_SE, tr_F1, tr_SPC, tr_PPV, tr_NPV,
        te_loss, te_accuracy, te_MCC, te_SE, te_SPC, te_PPV, te_F1, te_NPV)``.
        Label 1 is the positive class. Any ratio whose denominator is zero
        (for example when the model predicts a single class) is reported as
        0.0 instead of raising ``ZeroDivisionError``.
    """
    if loss_fn is None:
        loss_fn = loss
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else 'cuda'

    # Training pass (updates the parameters).
    tr = _run_epoch(model, train_dl, loss_fn, device, optimizer=optimizer)
    # Validation pass (eval mode, no gradients).
    te = _run_epoch(model, test_dl, loss_fn, device)

    return (
        tr['loss'], tr['accuracy'], tr['MCC'], tr['SE'], tr['F1'],
        tr['SPC'], tr['PPV'], tr['NPV'],
        te['loss'], te['accuracy'], te['MCC'], te['SE'], te['SPC'],
        te['PPV'], te['F1'], te['NPV'],
    )

In [25]:
# Per-epoch metric histories for the training set; each name gets its own
# fresh, empty list.
(tr_loss, tr_accuracy, tr_MCC, tr_SE, tr_SPC,
 tr_PPV, tr_NPV, tr_AUC, tr_F1) = ([] for _ in range(9))

# Per-epoch metric histories for the validation set.
(te_loss, te_accuracy, te_MCC, te_SE, te_SPC,
 te_PPV, te_NPV, te_AUC, te_F1) = ([] for _ in range(9))
from sklearn import metrics

In [26]:
epochs = 80

for epoch in range(epochs):
    # Print the epoch index first; the validation accuracy is appended to the
    # same output line once the epoch has finished.
    print(f'{epoch} : ', end='')

    # The unpacking order must mirror the order in which fit() returns values.
    (
        epoch_tr_loss, epoch_tr_accuracy, epoch_tr_MCC, epoch_tr_SE,
        epoch_tr_F1, epoch_tr_SPC, epoch_tr_PPV, epoch_tr_NPV,
        epoch_te_loss, epoch_te_accuracy, epoch_te_MCC, epoch_te_SE,
        epoch_te_SPC, epoch_te_PPV, epoch_te_F1, epoch_te_NPV,
    ) = fit(model, optimizer, train_dl, test_dl)

    # Training-set history (AUC is not computed, so tr_AUC stays empty).
    tr_loss.append(epoch_tr_loss)
    tr_accuracy.append(epoch_tr_accuracy)
    tr_MCC.append(epoch_tr_MCC)
    tr_SE.append(epoch_tr_SE)
    tr_SPC.append(epoch_tr_SPC)
    tr_PPV.append(epoch_tr_PPV)
    tr_NPV.append(epoch_tr_NPV)
    tr_F1.append(epoch_tr_F1)

    # Validation-set history (AUC is not computed, so te_AUC stays empty).
    te_loss.append(epoch_te_loss)
    te_accuracy.append(epoch_te_accuracy)
    te_MCC.append(epoch_te_MCC)
    te_SE.append(epoch_te_SE)
    te_SPC.append(epoch_te_SPC)
    te_PPV.append(epoch_te_PPV)
    te_NPV.append(epoch_te_NPV)
    te_F1.append(epoch_te_F1)

    print(epoch_te_accuracy)

0 : 0.813125
1 : 0.7889583333333333
2 : 0.8435416666666666
3 : 0.79625
4 : 0.8297916666666667
5 : 0.8322916666666667
6 : 0.8133333333333334
7 : 0.83875
8 : 0.8435416666666666
9 : 0.8079166666666666
10 : 0.756875
11 : 0.8447916666666667
12 : 0.8208333333333333
13 : 0.8172916666666666
14 : 0.7895833333333333
15 : 0.8514583333333333
16 : 0.8479166666666667
17 : 0.7089583333333334
18 : 0.8260416666666667
19 : 0.8366666666666667
20 : 0.8402083333333333
21 : 0.8452083333333333
22 : 0.8502083333333333
23 : 0.8333333333333334
24 : 0.8435416666666666
25 : 0.8410416666666667
26 : 0.8541666666666666
27 : 0.85
28 : 0.8422916666666667
29 : 0.85375
30 : 0.8439583333333334
31 : 0.84625
32 : 0.84375
33 : 0.8477083333333333
34 : 0.803125
35 : 0.8352083333333333
36 : 0.8366666666666667
37 : 0.855
38 : 0.8416666666666667
39 : 0.8558333333333333
40 : 0.8191666666666667
41 : 0.8520833333333333
42 : 0.84875
43 : 0.8239583333333333
44 : 0.81875
45 : 0.85125
46 : 0.8508333333333333
47 : 0.8591666666666666
48 

In [27]:
column_name = ['loss', 'accuracy', 'MCC', 'SE', 'SPC', 'PPV', 'NPV','F1']

# Rebind every training history to a pandas Series, in column order.
(tr_loss, tr_accuracy, tr_MCC, tr_SE,
 tr_SPC, tr_PPV, tr_NPV, tr_F1) = map(
    pd.Series,
    (tr_loss, tr_accuracy, tr_MCC, tr_SE, tr_SPC, tr_PPV, tr_NPV, tr_F1),
)

# One row per epoch, one column per metric.
tr_result = pd.concat(
    [tr_loss, tr_accuracy, tr_MCC, tr_SE, tr_SPC, tr_PPV, tr_NPV, tr_F1],
    axis=1,
)
tr_result.columns = column_name

In [28]:
column_name = ['loss', 'accuracy', 'MCC', 'SE', 'SPC', 'PPV', 'NPV','F1']

# Rebind every validation history to a pandas Series, in column order.
(te_loss, te_accuracy, te_MCC, te_SE,
 te_SPC, te_PPV, te_NPV, te_F1) = map(
    pd.Series,
    (te_loss, te_accuracy, te_MCC, te_SE, te_SPC, te_PPV, te_NPV, te_F1),
)

# One row per epoch, one column per metric. The index stays 0-based.
te_result = pd.concat(
    [te_loss, te_accuracy, te_MCC, te_SE, te_SPC, te_PPV, te_NPV, te_F1],
    axis=1,
)
te_result.columns = column_name

In [29]:
# Highest validation accuracy among the recorded epochs.
max(te_result.accuracy)

0.8770833333333333

In [31]:
# Write the per-epoch metric tables to CSV files in the working directory.
tr_result.to_csv(path_or_buf=r'one_hot_BiGRU_train_result.csv')
te_result.to_csv(path_or_buf=r'one_hot_BiGRU_valid_result.csv')