# 1. Original model prediction

In [26]:
import os
import sys
import csv
import time
from collections import deque
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from data_loader import define_dataloader, load_embedding, load_data_split
from utils import str2bool, timeSince, get_performance_batchiter, print_performance, write_blackbox_output_batchiter
import data_io_tf


class Args:
    """Fixed settings that stand in for command-line arguments.

    ``Original_model_prediction`` instantiates this class, reads the
    device, embedding, padding, length and batch-size fields itself, and
    hands the whole object to ``Net(embedding_matrix, args)``.
    """

    # --- model -----------------------------------------------------------
    # Only 'attention' is accepted by the caller; the remaining values are
    # presumably consumed inside Net -- confirm against attention.py.
    model = 'attention'
    heads = 5
    lin_size = 1024
    drop_rate = 0.25

    # --- input encoding --------------------------------------------------
    blosum = None  # forwarded to load_embedding()
    padding = "mid"
    max_len_pep = 22
    max_len_tcr = 20
    batch_size = 32

    # --- device / reproducibility ----------------------------------------
    cuda = True
    seed = 1039

    # --- training schedule (not read by the prediction function) ---------
    mode = 'train'
    epoch = 20
    min_epoch = 1
    early_stop = True
    lr = 0.001
    save_model = True

    # --- data split (not read by the code in this cell) ------------------
    indepfile = None
    n_fold = 5
    idx_test_fold = 0
    idx_val_fold = -1
    # split_type = 'tcr'

def Original_model_prediction(testfile_path, modelfile_path, result_path):
    """Score a test CSV with a saved checkpoint and write the predictions.

    Args:
        testfile_path: CSV file read with pandas. It must provide the
            columns 'Epitope', 'CDR3B' and 'Affinity'.
        modelfile_path: Path of the state dict that is loaded into the
            network.
        result_path: Prefix of the output file. Predictions are written to
            ``result_path + 'probability.csv'`` (plain string
            concatenation; no path separator is inserted).

    Raises:
        ValueError: If ``args.model`` is not 'attention'.
    """
    import pandas as pd

    args = Args()

    # Pick the device once. CUDA is only used when it is both requested and
    # actually present, so the function also runs on CPU-only machines.
    cuda_available = torch.cuda.is_available()
    if cuda_available and not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    if args.cuda and not cuda_available:
        print("WARNING: CUDA was requested but is not available, running on CPU")
    device = torch.device('cuda' if args.cuda and cuda_available else 'cpu')

    testfile = pd.read_csv(testfile_path)
    print(testfile.index)
    x_pep = testfile['Epitope'].values
    x_tcr = testfile['CDR3B'].values
    y = testfile['Affinity'].values

    embedding_matrix = load_embedding(args.blosum)

    test_loader = define_dataloader(x_pep, x_tcr, y,
                                    maxlen_pep=args.max_len_pep,
                                    maxlen_tcr=args.max_len_tcr,
                                    padding=args.padding,
                                    batch_size=args.batch_size, device=device)

    if args.model == 'attention':
        from attention import Net
    else:
        raise ValueError('unknown model name')

    model = Net(embedding_matrix, args).to(device)
    # map_location lets a checkpoint saved on GPU be restored on CPU.
    model.load_state_dict(torch.load(modelfile_path, map_location=device))

    print('[PREDICT] ----------------')
    perf_test = get_performance_batchiter(test_loader['loader'], model, device)
    print_performance(perf_test)

    # The context manager closes the file even if writing a batch fails.
    with open(result_path + 'probability.csv', 'w', newline='') as wf_open1:
        wf1 = csv.writer(wf_open1, delimiter=',')
        wf1.writerow(['Epitope', 'CDR3B', 'y_true', 'y_pred', 'y_prob'])
        write_blackbox_output_batchiter(test_loader, model, wf1, device, ifscore=True)


In [None]:
# Score ../data/test.csv with the checkpoint in ../Original_model/.
# result_path acts as a filename prefix: the function appends
# 'probability.csv' to it without inserting a path separator.
testfile_path = "../data/test.csv"
modelfile_path = "../Original_model/original.ckpt"
result_path = "../result_path/Original_model_prediction"
Original_model_prediction(
    testfile_path=testfile_path,
    modelfile_path=modelfile_path,
    result_path=result_path,
)


# 2. Model retraining

In [20]:
import os
import sys
import csv
import time
from collections import deque
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import precision_recall_curve, auc
from data_loader import define_dataloader, load_embedding
from utils import get_performance_batchiter, print_performance, write_blackbox_output_batchiter
import data_io_tf

class Args:
    """Fixed settings that stand in for command-line arguments.

    ``Model_retraining`` instantiates this class, reads the schedule,
    device, seed and data-loader fields itself, and hands the whole object
    to ``Net(embedding_matrix, args)``.
    """

    # --- model -----------------------------------------------------------
    # Only 'attention' is accepted by the caller; the remaining values are
    # presumably consumed inside Net -- confirm against attention.py.
    model = 'attention'
    heads = 5
    lin_size = 1024
    drop_rate = 0.25

    # --- input encoding --------------------------------------------------
    blosum = None  # forwarded to load_embedding()
    padding = "mid"
    max_len_pep = 22
    max_len_tcr = 20
    batch_size = 32

    # --- device / reproducibility ----------------------------------------
    cuda = True
    seed = 1039

    # --- training schedule -----------------------------------------------
    mode = 'train'  # training only runs when this is 'train'
    epoch = 50  # upper bound on the number of epochs
    min_epoch = 1  # early stopping is only considered after this epoch
    early_stop = True
    lr = 0.001  # Adam learning rate
    save_model = True  # gates both the prediction CSV and the checkpoint

    # --- data split (not read by the code in this cell) ------------------
    indepfile = None
    n_fold = 5
    idx_test_fold = 0
    idx_val_fold = -1

def Model_retraining(trainfile_path, testfile_path, save_model_path, result_path):
    """Train a fresh model, report test metrics and save the outputs.

    Training only happens when ``args.mode == 'train'``; the prediction CSV
    and the checkpoint are only written when ``args.save_model`` is set.

    Args:
        trainfile_path: CSV with the training pairs.
        testfile_path: CSV with the evaluation pairs. Both files must
            provide the columns 'Epitope', 'CDR3B' and 'Affinity'.
        save_model_path: Destination of the trained state dict.
        result_path: Prefix of the prediction file. Predictions are written
            to ``result_path + 'probability.csv'`` (plain string
            concatenation; no path separator is inserted).

    Raises:
        ValueError: If ``args.model`` is not 'attention'.
    """
    import pandas as pd

    PRINT_EVERY_EPOCH = 1

    args = Args()

    # Pick the device once. CUDA is only used when it is both requested and
    # actually present, so the function also runs on CPU-only machines.
    cuda_available = torch.cuda.is_available()
    print("gpu enabled" if cuda_available else "no gpu")
    device = torch.device('cuda' if args.cuda and cuda_available else 'cpu')

    def train(model, device, train_loader, optimizer, epoch):
        """Run one optimisation pass over train_loader."""
        model.train()
        loss = None  # stays None when the loader yields no batch
        for batch in train_loader:
            x_pep, x_tcr, y = batch.X_pep.to(device), batch.X_tcr.to(device), batch.y.to(device)
            optimizer.zero_grad()
            yhat = model(x_pep, x_tcr)
            y = y.unsqueeze(-1).expand_as(yhat)
            loss = F.binary_cross_entropy(yhat, y)
            loss.backward()
            optimizer.step()

        # "== 0" so the message appears every PRINT_EVERY_EPOCH epochs; the
        # previous "== 1" test could never be true with a period of 1.
        # The value shown is the loss of the last batch of the epoch.
        if loss is not None and epoch % PRINT_EVERY_EPOCH == 0:
            print('[TRAIN] Epoch {} Loss {:.4f}'.format(epoch, loss.item()))

    def evaluate_model(model, device, test_loader):
        """Return the area under the precision-recall curve on test_loader."""
        model.eval()
        y_true = []
        y_scores = []
        with torch.no_grad():
            for batch in test_loader:
                x_pep, x_tcr, y = batch.X_pep.to(device), batch.X_tcr.to(device), batch.y.to(device)
                yhat = model(x_pep, x_tcr)
                # Flatten so the metric receives 1-D vectors (yhat appears
                # to carry a trailing axis, given the unsqueeze/expand_as
                # in train() -- confirm against Net).
                y_true.extend(y.reshape(-1).cpu().numpy())
                y_scores.extend(yhat.reshape(-1).cpu().numpy())

        precision, recall, _ = precision_recall_curve(y_true, y_scores)
        return auc(recall, precision)

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    embedding_matrix = load_embedding(args.blosum)

    # Stack train and test rows and remember which positions belong to
    # which split: train rows get 0..n-1, test rows n..n+m-1.
    trainfile = pd.read_csv(trainfile_path)
    trainfile = trainfile.reset_index().rename(columns={'index': 'idx'})
    testfile = pd.read_csv(testfile_path)
    testfile['idx'] = testfile.index + len(trainfile)
    alldata = pd.concat([trainfile, testfile], axis=0)
    idx_train = trainfile['idx'].tolist()
    idx_test = testfile['idx'].tolist()
    x_pep = alldata['Epitope'].values
    x_tcr = alldata['CDR3B'].values
    y = alldata['Affinity'].values

    train_loader = define_dataloader(x_pep[idx_train], x_tcr[idx_train], y[idx_train],
                                     args.max_len_pep, args.max_len_tcr,
                                     padding=args.padding,
                                     batch_size=args.batch_size, device=device)
    # The test loader reuses the lengths reported by the training loader.
    test_loader = define_dataloader(x_pep[idx_test], x_tcr[idx_test], y[idx_test],
                                    maxlen_pep=train_loader['pep_length'],
                                    maxlen_tcr=train_loader['tcr_length'],
                                    padding=args.padding,
                                    batch_size=args.batch_size, device=device)

    if args.model == 'attention':
        from attention import Net
    else:
        raise ValueError('unknown model name')

    model = Net(embedding_matrix, args).to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    if args.mode == 'train':
        # Sliding window over the most recent test losses. It is seeded
        # with sys.maxsize, which keeps the average change huge until that
        # sentinel has been pushed out of the window, so early stopping
        # cannot trigger before the window holds real losses only.
        # NOTE(review): the stopping signal is the test-set loss and the
        # threshold of 10 is kept from the original code -- confirm both
        # are intended.
        lossArraySize = 10
        lossArray = deque([sys.maxsize], maxlen=lossArraySize)
        for epoch in range(1, args.epoch + 1):
            train(model, device, train_loader['loader'], optimizer, epoch)
            perf_test = get_performance_batchiter(test_loader['loader'], model, device)
            lossArray.append(perf_test['loss'])
            average_loss_change = sum(np.abs(np.diff(lossArray))) / lossArraySize
            auprc = evaluate_model(model, device, test_loader['loader'])
            print(f'[EPOCH {epoch}] AUPRC: {auprc:.4f}')

            if epoch > args.min_epoch and average_loss_change < 10 and args.early_stop:
                break

        print('[TEST ] {} ----------------'.format(epoch))
        perf_test = get_performance_batchiter(test_loader['loader'], model, device)
        print_performance(perf_test)

        if args.save_model:
            # The context manager closes the file even if writing fails.
            with open(result_path + 'probability.csv', 'w', newline='') as wf_open1:
                wf1 = csv.writer(wf_open1, delimiter=',')
                wf1.writerow(['Epitope', 'CDR3B', 'y_true', 'y_pred', 'y_prob'])
                write_blackbox_output_batchiter(test_loader, model, wf1, device, ifscore=True)

            torch.save(model.state_dict(), save_model_path)


In [None]:
# Retrain on ../data/train.csv, evaluate on ../data/test.csv and store the
# checkpoint under ../Retraining_model/.
# result_path acts as a filename prefix: the function appends
# 'probability.csv' to it without inserting a path separator.
trainfile_path = "../data/train.csv"
testfile_path = "../data/test.csv"
save_modle_path = "../Retraining_model/Retraining_model.ckpt"
result_path = "../result_path/Retraining_model_prediction"
Model_retraining(
    trainfile_path=trainfile_path,
    testfile_path=testfile_path,
    save_model_path=save_modle_path,
    result_path=result_path,
)


# 3. Retraining model prediction

In [24]:
import os
import sys
import csv
import time
from collections import deque
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from data_loader import define_dataloader, load_embedding, load_data_split
from utils import str2bool, timeSince, get_performance_batchiter, print_performance, write_blackbox_output_batchiter
import data_io_tf


class Args:
    """Fixed settings that stand in for command-line arguments.

    ``Retraining_model_prediction`` instantiates this class, reads the
    device, embedding, padding, length and batch-size fields itself, and
    hands the whole object to ``Net(embedding_matrix, args)``.
    """

    # --- model -----------------------------------------------------------
    # Only 'attention' is accepted by the caller; the remaining values are
    # presumably consumed inside Net -- confirm against attention.py.
    model = 'attention'
    heads = 5
    lin_size = 1024
    drop_rate = 0.25

    # --- input encoding --------------------------------------------------
    blosum = None  # forwarded to load_embedding()
    padding = "mid"
    max_len_pep = 22
    max_len_tcr = 20
    batch_size = 32

    # --- device / reproducibility ----------------------------------------
    cuda = True
    seed = 1039

    # --- training schedule (not read by the prediction function) ---------
    mode = 'train'
    epoch = 20
    min_epoch = 1
    early_stop = True
    lr = 0.001
    save_model = True

    # --- data split (not read by the code in this cell) ------------------
    indepfile = None
    n_fold = 5
    idx_test_fold = 0
    idx_val_fold = -1
    # split_type = 'tcr'

def Retraining_model_prediction(testfile_path, modelfile_path, result_path):
    """Score a CSV with the retrained checkpoint and write the predictions.

    Args:
        testfile_path: CSV file read with pandas. It must provide the
            columns 'Epitope', 'CDR3B' and 'Affinity'.
        modelfile_path: Path of the state dict that is loaded into the
            network.
        result_path: Prefix of the output file. Predictions are written to
            ``result_path + 'probability.csv'`` (plain string
            concatenation; no path separator is inserted).

    Raises:
        ValueError: If ``args.model`` is not 'attention'.
    """
    import pandas as pd

    args = Args()

    # Pick the device once. CUDA is only used when it is both requested and
    # actually present, so the function also runs on CPU-only machines.
    cuda_available = torch.cuda.is_available()
    if cuda_available and not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    if args.cuda and not cuda_available:
        print("WARNING: CUDA was requested but is not available, running on CPU")
    device = torch.device('cuda' if args.cuda and cuda_available else 'cpu')

    testfile = pd.read_csv(testfile_path)
    print(testfile.index)
    x_pep = testfile['Epitope'].values
    x_tcr = testfile['CDR3B'].values
    y = testfile['Affinity'].values

    embedding_matrix = load_embedding(args.blosum)

    test_loader = define_dataloader(x_pep, x_tcr, y,
                                    maxlen_pep=args.max_len_pep,
                                    maxlen_tcr=args.max_len_tcr,
                                    padding=args.padding,
                                    batch_size=args.batch_size, device=device)

    if args.model == 'attention':
        from attention import Net
    else:
        raise ValueError('unknown model name')

    model = Net(embedding_matrix, args).to(device)
    # map_location lets a checkpoint saved on GPU be restored on CPU.
    model.load_state_dict(torch.load(modelfile_path, map_location=device))

    print('[PREDICT] ----------------')
    perf_test = get_performance_batchiter(test_loader['loader'], model, device)
    print_performance(perf_test)

    # The context manager closes the file even if writing a batch fails.
    with open(result_path + 'probability.csv', 'w', newline='') as wf_open1:
        wf1 = csv.writer(wf_open1, delimiter=',')
        wf1.writerow(['Epitope', 'CDR3B', 'y_true', 'y_pred', 'y_prob'])
        write_blackbox_output_batchiter(test_loader, model, wf1, device, ifscore=True)


In [None]:
# Score ../data/Validation.csv with the retrained checkpoint.
# result_path acts as a filename prefix: the function appends
# 'probability.csv' to it without inserting a path separator.
# NOTE(review): this prefix is identical to the one used by the retraining
# cell, so this run overwrites the prediction file written there --
# confirm that is intended.
testfile_path = "../data/Validation.csv"
modelfile_path = "../Retraining_model/Retraining_model.ckpt"
result_path = "../result_path/Retraining_model_prediction"
Retraining_model_prediction(
    testfile_path=testfile_path,
    modelfile_path=modelfile_path,
    result_path=result_path,
)