In [1]:
import math
import re
import torch
from torch import nn
from torch.nn import functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# Load the benchmark CSV and pull out the two columns used below:
# `Sequence` (kept as a pandas Series) and `Label` (converted to a NumPy array).
data = pd.read_csv(r'基准数据集.csv')
sequences = data['Sequence']
labels = data['Label'].values

In [3]:
# Matches a single nucleotide letter in either case.
pat = re.compile('[AGCTagct]')


def pre_process(text):
    """Return the nucleotide letters found in ``text`` as lowercase characters.

    Characters other than A/C/G/T (upper or lower case) are dropped; the
    remaining letters keep their original order.
    """
    return [base.lower() for base in pat.findall(text)]

# Tokenise every sequence into a list of lowercase nucleotide characters.
x = sequences.apply(pre_process)

In [4]:
# Nucleotide vocabulary; a base's position in this list is its integer id.
word_list = ['a', 'g', 'c', 't']

# Lookup table from base character to integer id.
word_index = {base: idx for idx, base in enumerate(word_list)}

In [5]:
# Replace every base by its integer id; any token missing from word_index gets id 4.
text = x.apply(lambda tokens: [word_index.get(token, 4) for token in tokens])


In [6]:
text_len = 1200

# Give every sequence exactly text_len ids: shorter ones are right-padded with
# id 4, longer ones are cut off after text_len ids. The result is a 2-D array.
pad_text = np.array([(ids + [4] * (text_len - len(ids)))[:text_len] for ids in text])

In [7]:
# Hold out 30% of the samples for evaluation. The fixed random_state makes the
# split identical on every Restart & Run All, so the reported test metrics always
# refer to the same held-out samples.
x_train, x_test, y_train, y_test = train_test_split(
    pad_text, labels, test_size=0.3, random_state=42
)

In [8]:
class Mydataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each encoded sequence with its label."""

    def __init__(self, text_list, label_list):
        # Both containers are indexed with the same integer position.
        self.label_list = label_list
        self.text_list = text_list

    def __len__(self):
        """Number of samples, taken from the sequence container."""
        return len(self.text_list)

    def __getitem__(self, index):
        """Return ``(LongTensor of ids, label)`` for the sample at ``index``."""
        return torch.LongTensor(self.text_list[index]), self.label_list[index]

batch_size = 16

# Wrap each split in a dataset, then batch it. Only the training loader
# shuffles; the test loader keeps the samples in their stored order.
train_ds = Mydataset(text_list=x_train, label_list=y_train)
test_ds = Mydataset(text_list=x_test, label_list=y_test)

train_dl = torch.utils.data.DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
test_dl = torch.utils.data.DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False)


In [9]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input, then apply dropout.

    Args:
        d_model: size of the last (feature) dimension of the input. Odd values
            are supported.
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions for which encodings are precomputed.

    The encodings are stored in the non-trainable buffer ``pe`` with shape
    ``(max_len, 1, d_model)``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=text_len):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term  # one column per sine/cosine frequency

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd-indexed column than
        # frequencies, so the cosine part is trimmed to fit instead of raising
        # a shape-mismatch error.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        # Insert a singleton middle axis so the table broadcasts over axis 1 of the input.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Add the encodings for the first ``x.size(0)`` positions to ``x``.

        Axis 0 of ``x`` is used as the position axis and its last axis must
        have size ``d_model``.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [10]:
one_hot_dim = len(word_list)


class Net(nn.Module):
    """Transformer-encoder classifier over one-hot encoded id sequences.

    ``forward`` takes a 2-D integer tensor of ids (axis 0 is swapped to axis 1
    before encoding) and returns two logits per row of the input.
    """

    def __init__(self):
        super().__init__()
        # Submodules are created in a fixed order so that parameter
        # initialisation consumes the random stream the same way every time.
        self.pos = PositionalEncoding(one_hot_dim)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=one_hot_dim, nhead=2)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=3)
        self.fc1 = nn.Linear(in_features=one_hot_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=2)

    def forward(self, inputs):
        # One-hot with one extra class for id len(word_list); dropping that last
        # channel leaves positions carrying that id as all-zero vectors.
        one_hot = F.one_hot(inputs, num_classes=len(word_list) + 1)[:, :, :-1]

        # Swap the first two axes so positions come first, then add positional
        # encodings and run the encoder. No padding mask is passed.
        seq_first = one_hot.permute(1, 0, 2)
        encoded = self.transformer_encoder(self.pos(seq_first))

        # Sum over the position axis, then classify with the two-layer head.
        pooled = encoded.sum(dim=0)
        hidden = F.relu(self.fc1(pooled))
        return self.fc2(hidden)

In [11]:
# Pick the device once: the GPU when CUDA is available, otherwise the CPU, so
# this cell no longer fails on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Net().to(device)

# Cross-entropy over the two output logits; kept under the name `loss`
# because fit() looks it up by that name.
loss = nn.CrossEntropyLoss().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [12]:
def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _confusion_metrics(tp, tn, fp, fn):
    """Derive ``(MCC, SE, SPC, PPV, NPV, F1)`` from binary confusion-matrix counts."""
    mcc_denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    # Matthews correlation coefficient: (TP*TN - FP*FN) / sqrt(product of marginals).
    mcc = _safe_div(tp * tn - fp * fn, mcc_denominator)
    se = _safe_div(tp, tp + fn)    # sensitivity, identical to recall
    spc = _safe_div(tn, tn + fp)   # specificity
    ppv = _safe_div(tp, tp + fp)   # positive predictive value, identical to precision
    npv = _safe_div(tn, tn + fn)   # negative predictive value
    f1 = _safe_div(2 * ppv * se, ppv + se)
    return mcc, se, spc, ppv, npv, f1


def _run_epoch(model, batches, criterion, device, optimizer=None):
    """Run one pass over ``batches``; weights are updated only when ``optimizer`` is given.

    Returns ``(mean batch loss, correct, total, TP, TN, FP, FN)``.
    """
    training = optimizer is not None
    model.train(training)  # train mode when updating weights, eval mode otherwise

    loss_sum = 0.0
    correct = total = tp = tn = fp = fn = 0

    with torch.set_grad_enabled(training):
        for x, y in batches:
            x, y = x.to(device), y.long().to(device)
            logits = model(x)
            loss_value = criterion(logits, y)

            if training:
                optimizer.zero_grad()
                loss_value.backward()
                optimizer.step()

            predicted = logits.detach().argmax(dim=1)
            hit = predicted == y
            positive = y == 1
            negative = y == 0

            correct += hit.sum().item()
            tp += (hit & positive).sum().item()
            fn += (~hit & positive).sum().item()
            fp += (~hit & negative).sum().item()
            tn += (hit & negative).sum().item()
            total += len(y)
            loss_sum += loss_value.item()

    return loss_sum / len(batches), correct, total, tp, tn, fp, fn


def fit(model, optimizer, train_dl, test_dl, loss_fn=None):
    """Train ``model`` for one epoch on ``train_dl``, then evaluate it on ``test_dl``.

    Label 1 is treated as the positive class and label 0 as the negative class.

    Args:
        model: network returning one logit per class for a batch of inputs.
        optimizer: optimizer updating ``model``'s parameters.
        train_dl: iterable of ``(inputs, labels)`` batches that supports ``len()``.
        test_dl: iterable of ``(inputs, labels)`` batches that supports ``len()``.
        loss_fn: callable ``loss_fn(logits, labels)``. When omitted, the
            notebook-level ``loss`` object is used.

    Returns:
        A 16-tuple in this exact order (note that F1 sits at a different
        position in the two halves):
        train loss, accuracy, MCC, SE, F1, SPC, PPV, NPV,
        test loss, accuracy, MCC, SE, SPC, PPV, F1, NPV.
        Loss is the mean of the per-batch losses. A ratio whose denominator is
        zero (for example PPV when no sample was predicted positive) is
        reported as 0.0 instead of raising ``ZeroDivisionError``.

    Raises:
        ZeroDivisionError: if a loader yields no batches.
    """
    criterion = loss if loss_fn is None else loss_fn

    # Batches are moved to wherever the model lives instead of a hard-coded device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Training pass.
    epoch_tr_loss, tr_correct, tr_total, *tr_counts = _run_epoch(
        model, train_dl, criterion, device, optimizer
    )
    epoch_tr_accuracy = tr_correct / tr_total
    (epoch_tr_MCC, epoch_tr_SE, epoch_tr_SPC,
     epoch_tr_PPV, epoch_tr_NPV, epoch_tr_F1) = _confusion_metrics(*tr_counts)

    # Evaluation pass (no gradient tracking, no weight updates).
    epoch_te_loss, te_correct, te_total, *te_counts = _run_epoch(
        model, test_dl, criterion, device
    )
    epoch_te_accuracy = te_correct / te_total
    (epoch_te_MCC, epoch_te_SE, epoch_te_SPC,
     epoch_te_PPV, epoch_te_NPV, epoch_te_F1) = _confusion_metrics(*te_counts)

    return (
        epoch_tr_loss, epoch_tr_accuracy, epoch_tr_MCC, epoch_tr_SE,
        epoch_tr_F1, epoch_tr_SPC, epoch_tr_PPV, epoch_tr_NPV,
        epoch_te_loss, epoch_te_accuracy, epoch_te_MCC, epoch_te_SE,
        epoch_te_SPC, epoch_te_PPV, epoch_te_F1, epoch_te_NPV,
    )

In [13]:
# Per-epoch metric histories. Each name is bound to its own empty list.
# The two AUC lists are created here but nothing in the visible cells fills them.
tr_loss, tr_accuracy, tr_MCC = [], [], []
tr_SE, tr_SPC, tr_PPV = [], [], []
tr_NPV, tr_AUC, tr_F1 = [], [], []

te_loss, te_accuracy, te_MCC = [], [], []
te_SE, te_SPC, te_PPV = [], [], []
te_NPV, te_AUC, te_F1 = [], [], []
from sklearn import metrics

In [14]:
epochs = 80

for epoch in range(epochs):
    # Print the epoch number first; the test accuracy is appended to the same
    # line once the epoch has finished.
    print(f'{epoch} : ', end='')

    # The unpacking order must mirror fit()'s return order exactly.
    (
        epoch_tr_loss, epoch_tr_accuracy, epoch_tr_MCC, epoch_tr_SE,
        epoch_tr_F1, epoch_tr_SPC, epoch_tr_PPV, epoch_tr_NPV,
        epoch_te_loss, epoch_te_accuracy, epoch_te_MCC, epoch_te_SE,
        epoch_te_SPC, epoch_te_PPV, epoch_te_F1, epoch_te_NPV,
    ) = fit(model, optimizer, train_dl, test_dl)

    # Training-set histories.
    tr_loss.append(epoch_tr_loss)
    tr_accuracy.append(epoch_tr_accuracy)
    tr_MCC.append(epoch_tr_MCC)
    tr_SE.append(epoch_tr_SE)
    tr_SPC.append(epoch_tr_SPC)
    tr_PPV.append(epoch_tr_PPV)
    tr_NPV.append(epoch_tr_NPV)
    tr_F1.append(epoch_tr_F1)
    # Disabled: tr_AUC.append(epoch_tr_AUC) -- fit() returns no AUC value.

    # Test-set histories.
    te_loss.append(epoch_te_loss)
    te_accuracy.append(epoch_te_accuracy)
    print(epoch_te_accuracy)
    te_MCC.append(epoch_te_MCC)
    te_SE.append(epoch_te_SE)
    te_SPC.append(epoch_te_SPC)
    te_PPV.append(epoch_te_PPV)
    te_NPV.append(epoch_te_NPV)
    te_F1.append(epoch_te_F1)
    # Disabled: te_AUC.append(epoch_te_AUC) -- fit() returns no AUC value.

0 : 0.5495833333333333
1 : 0.5070833333333333
2 : 0.7533333333333333
3 : 0.75625
4 : 0.7310416666666667
5 : 0.7354166666666667
6 : 0.6233333333333333
7 : 0.7210416666666667
8 : 0.7583333333333333
9 : 0.7358333333333333
10 : 0.7222916666666667
11 : 0.7691666666666667
12 : 0.7820833333333334
13 : 0.7604166666666666
14 : 0.6904166666666667
15 : 0.77125
16 : 0.7679166666666667
17 : 0.74375
18 : 0.7675
19 : 0.72625
20 : 0.7508333333333334
21 : 0.7547916666666666
22 : 0.775625
23 : 0.7845833333333333
24 : 0.7616666666666667
25 : 0.748125
26 : 0.7472916666666667
27 : 0.746875
28 : 0.700625
29 : 0.7529166666666667
30 : 0.6697916666666667
31 : 0.7135416666666666
32 : 0.7660416666666666
33 : 0.5891666666666666
34 : 0.7158333333333333
35 : 0.508125
36 : 0.5260416666666666
37 : 0.6383333333333333
38 : 0.5414583333333334
39 : 0.559375
40 : 0.7595833333333334
41 : 0.7327083333333333
42 : 0.74
43 : 0.7758333333333334
44 : 0.77
45 : 0.77
46 : 0.7564583333333333
47 : 0.7295833333333334
48 : 0.643333333

In [15]:
column_name = ['loss', 'accuracy', 'MCC', 'SE', 'SPC', 'PPV', 'NPV','F1']

# Turn every training history into a pandas Series. The original names are
# rebound, so from here on they refer to Series objects rather than lists.
tr_loss, tr_accuracy, tr_MCC, tr_SE, tr_SPC, tr_PPV, tr_NPV, tr_F1 = map(
    pd.Series,
    (tr_loss, tr_accuracy, tr_MCC, tr_SE, tr_SPC, tr_PPV, tr_NPV, tr_F1),
)

# One row per epoch, one column per metric, in the order given by column_name.
tr_result = pd.concat(
    [tr_loss, tr_accuracy, tr_MCC, tr_SE, tr_SPC, tr_PPV, tr_NPV, tr_F1],
    axis=1,
)
tr_result.columns = column_name

In [16]:
column_name = ['loss', 'accuracy', 'MCC', 'SE', 'SPC', 'PPV', 'NPV','F1']

# Turn every test history into a pandas Series. The original names are
# rebound, so from here on they refer to Series objects rather than lists.
te_loss, te_accuracy, te_MCC, te_SE, te_SPC, te_PPV, te_NPV, te_F1 = map(
    pd.Series,
    (te_loss, te_accuracy, te_MCC, te_SE, te_SPC, te_PPV, te_NPV, te_F1),
)

# One row per epoch, one column per metric, in the order given by column_name.
te_result = pd.concat(
    [te_loss, te_accuracy, te_MCC, te_SE, te_SPC, te_PPV, te_NPV, te_F1],
    axis=1,
)
te_result.columns = column_name
# Disabled: te_result.index = [*range(1, epochs + 1)]

In [17]:
# Highest test-set accuracy recorded in any epoch.
max(te_result.accuracy)

0.785

In [18]:
# Write the per-epoch training and test metric tables to CSV files
# (relative paths, so they are created in the current working directory).
tr_result.to_csv(r'one_hot_TF_train_result.csv')
te_result.to_csv(r'one_hot_TF_valid_result.csv')