# 1. Original model prediction

In [1]:
import numpy as np
import pandas as pd
import collections
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from SETE import *
from itertools import cycle
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from scipy import interp
from sklearn.base import clone
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cross_decomposition import CCA

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import GradientBoostingClassifier

import warnings

warnings.filterwarnings('ignore')

import collections
from collections import Counter

def kemr_count(df, k_size):
    """Count the k-mers of every CDR3B sequence in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``ID`` column (used as the row label of the result)
        and a ``CDR3B`` column holding the sequences.
    k_size : int
        Length of the k-mers to extract.

    Returns
    -------
    pandas.DataFrame
        One row per ``ID`` and one column per k-mer observed in at least one
        sequence. A cell holds how often that k-mer occurs in the sequence
        and is NaN where it does not occur.
    """
    id_to_sequence = df[["ID", "CDR3B"]].set_index("ID").to_dict(orient='dict')["CDR3B"]

    def count_windows(sequence):
        # Slide a window of width k_size over the sequence; a sequence shorter
        # than k_size produces no window at all.
        windows = [sequence[start:start + k_size]
                   for start in range(len(sequence) - k_size + 1)]
        return dict(Counter(windows))

    counts_by_id = {seq_id: count_windows(sequence)
                    for seq_id, sequence in id_to_sequence.items()}
    # The outer keys become columns, so transpose to get one row per ID.
    return pd.DataFrame(counts_by_id).T

def blast(df, count_size):
    """Build an all-zero k-mer count table with one row per ``df.ID``.

    The columns enumerate every k-mer over the 20-letter amino-acid alphabet
    ``'GAVLIPFYWSTCMNQDEKRH'``, with the first position varying slowest. The
    k-mer length is derived from ``count_size``, which must therefore be a
    power of 20 (20 -> 1-mers, 400 -> 2-mers, 8000 -> 3-mers, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``ID`` column becomes the index of the result.
    count_size : int
        Number of k-mer columns to create.

    Returns
    -------
    pandas.DataFrame
        Float frame of zeros with shape ``(len(df), count_size)``.

    Raises
    ------
    ValueError
        If ``count_size`` is not ``20 ** k`` for some ``k >= 1``.
    """
    alphabet = 'GAVLIPFYWSTCMNQDEKRH'

    # Find k such that len(alphabet) ** k == count_size.
    k_size, n_columns = 0, 1
    while n_columns < count_size:
        n_columns *= len(alphabet)
        k_size += 1
    if k_size == 0 or n_columns != count_size:
        raise ValueError(
            "count_size must be a power of %d (e.g. 8000 for 3-mers), got %r"
            % (len(alphabet), count_size)
        )

    # Grow the k-mers one position at a time; the existing prefix varies
    # slowest, which gives the same order as nested loops over the alphabet.
    columns = ['']
    for _ in range(k_size):
        columns = [prefix + residue for prefix in columns for residue in alphabet]

    count = pd.DataFrame(np.zeros((len(df), count_size)), columns=columns)
    count.index = df.ID
    return count

def replacement(df1, df2):
    """Overwrite the columns of ``df2`` that also exist in ``df1``.

    Columns found in only one of the two frames are ignored: those only in
    ``df2`` keep their current values, those only in ``df1`` are not copied.
    ``df2`` is modified in place and is also the return value.
    """
    shared_columns = list(set(df1.columns).intersection(set(df2.columns)))
    incoming = df1.loc[:, shared_columns]
    df2.loc[:, shared_columns] = incoming
    return df2

def k_mer(df, k_size, count_size):
    """Return the k-mer count matrix of ``df`` as a NumPy array.

    The observed counts from ``kemr_count`` are copied into the all-zero
    table produced by ``blast`` (one row per ``ID``, ``count_size`` columns),
    and every cell left without a value is set to 0.
    """
    observed_counts = kemr_count(df, k_size)
    zero_table = blast(df, count_size)
    filled_table = replacement(observed_counts, zero_table).fillna(0)
    return np.array(filled_table)


In [2]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, matthews_corrcoef, precision_score,recall_score
import pickle
def pca_analyse(X_train, X_test, rate=0.8):
    """Fit a PCA on ``X_train`` and project both data sets with it.

    ``rate`` is passed unchanged to ``PCA`` as ``n_components``. Returns the
    tuple ``(projected_train, projected_test)``.
    """
    reducer = PCA(n_components=rate)
    reducer.fit(X_train)
    projected_train = reducer.transform(X_train)
    projected_test = reducer.transform(X_test)
    return projected_train, projected_test

def Original_model_prediction(trainfile_path,testfile_path,save_model_path,result_path):
    """Train one gradient-boosting model per epitope and save its test predictions.

    Only training rows whose ``Epitope`` is ``GILGFVFTL``, ``GLCTLVAML`` or
    ``NLVPMVATV`` are used. For each of these epitopes the 3-mer count
    features of the ``CDR3B`` sequences are reduced by a PCA fitted on the
    training rows, a classifier is fitted on the ``Affinity`` labels, pickled,
    and applied to the test rows of the same epitope.

    Parameters
    ----------
    trainfile_path, testfile_path : str
        CSV files providing the ``Epitope``, ``CDR3B`` and ``Affinity`` columns.
    save_model_path : str
        Prefix of the model files; each classifier is written to
        ``save_model_path + <epitope> + 'model.pkl'``.
    result_path : str
        Prefix of the result file, written to ``result_path + 'probability.csv'``
        with the columns ``Epitope``, ``CDR3B``, ``y_true``, ``y_pred`` and
        ``y_prob``.

    Returns
    -------
    None
    """
    train = pd.read_csv(trainfile_path)
    train = train[train['Epitope'].isin(['GILGFVFTL', 'GLCTLVAML', 'NLVPMVATV'])]
    test = pd.read_csv(testfile_path)
    epitope = pd.unique(train['Epitope'])
    probab_list = []
    for i in epitope:
        # Shuffle reproducibly, then number the rows: k_mer labels its rows by ID.
        df_train = train[train['Epitope'] == i].sample(frac=1, random_state=42)
        df_train['ID'] = range(1, len(df_train) + 1)
        X_train = k_mer(df_train, 3, 8000)  # 8000 = 20 ** 3 possible 3-mers
        y_train = df_train['Affinity'].values
        df_test = test[test['Epitope'] == i].sample(frac=1, random_state=42)
        df_test['ID'] = range(1, len(df_test) + 1)
        X_test = k_mer(df_test, 3, 8000)
        y_test = df_test['Affinity'].values
        # `loss` is deliberately left at its default: the old spelling
        # "deviance" is rejected by current scikit-learn releases, while the
        # default selects that same loss in old and new releases alike.
        classifier = OneVsRestClassifier(
            GradientBoostingClassifier(
                learning_rate=0.1,
                min_samples_leaf=20,
                max_features='sqrt',
                subsample=0.8,
                random_state=10,
                n_estimators=70,
                max_depth=10,
                min_samples_split=60))
        X_train, X_test = pca_analyse(X_train, X_test, 0.8)
        classifier.fit(X_train, y_train)
        # NOTE(review): only the classifier is persisted. The PCA fitted above
        # is not, so the pickle alone cannot be applied to new raw features.
        with open(save_model_path+i+'model.pkl', 'wb') as file:
            pickle.dump(classifier, file)
        y_pred = classifier.predict(X_test)
        # Last predict_proba column; presumably the positive class when
        # Affinity is a 0/1 label - confirm against the data.
        y_prob = classifier.predict_proba(X_test)[:, -1]
        probab_list.append({
            'Epitope': [i] * len(df_test['CDR3B']),
            'CDR3B': df_test['CDR3B'],
            'y_true': y_test,
            'y_pred': y_pred,
            'y_prob': y_prob,
        })
    # Write the table once, after all epitopes are processed, instead of
    # rebuilding and rewriting the whole file on every iteration.
    if probab_list:
        # Each cell holds one sequence per epitope; explode turns them into rows.
        probability = pd.DataFrame(probab_list).apply(pd.Series.explode)
        probability.to_csv(result_path+'probability.csv')


In [None]:
# Input data for the original model.
# NOTE(review): the training path was previously assigned to `testfile_path`
# (leaving `trainfile_path` undefined) and lacked the separator after
# "./data" - confirm the file really is ./data/Original_train.csv.
trainfile_path="./data/Original_train.csv"
testfile_path="./data/test.csv"
# Both values below are prefixes: the function appends
# "<epitope>model.pkl" and "probability.csv" respectively.
modelfile_path="./Original_model/original"
result_path="./result_path/Original_model_prediction"
Original_model_prediction(trainfile_path,testfile_path,modelfile_path,result_path)


# 2. Model retraining

In [1]:
import numpy as np
import pandas as pd
import collections
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from SETE import *
from itertools import cycle
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from scipy import interp
from sklearn.base import clone
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cross_decomposition import CCA

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import GradientBoostingClassifier

import warnings

warnings.filterwarnings('ignore')

In [2]:
import collections
from collections import Counter

def kemr_count(df, k_size):
    """Count the k-mers of every CDR3B sequence in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``ID`` column (used as the row label of the result)
        and a ``CDR3B`` column holding the sequences.
    k_size : int
        Length of the k-mers to extract.

    Returns
    -------
    pandas.DataFrame
        One row per ``ID`` and one column per k-mer observed in at least one
        sequence. A cell holds how often that k-mer occurs in the sequence
        and is NaN where it does not occur.
    """
    id_to_sequence = df[["ID", "CDR3B"]].set_index("ID").to_dict(orient='dict')["CDR3B"]

    def count_windows(sequence):
        # Slide a window of width k_size over the sequence; a sequence shorter
        # than k_size produces no window at all.
        windows = [sequence[start:start + k_size]
                   for start in range(len(sequence) - k_size + 1)]
        return dict(Counter(windows))

    counts_by_id = {seq_id: count_windows(sequence)
                    for seq_id, sequence in id_to_sequence.items()}
    # The outer keys become columns, so transpose to get one row per ID.
    return pd.DataFrame(counts_by_id).T

def blast(df, count_size):
    """Build an all-zero k-mer count table with one row per ``df.ID``.

    The columns enumerate every k-mer over the 20-letter amino-acid alphabet
    ``'GAVLIPFYWSTCMNQDEKRH'``, with the first position varying slowest. The
    k-mer length is derived from ``count_size``, which must therefore be a
    power of 20 (20 -> 1-mers, 400 -> 2-mers, 8000 -> 3-mers, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``ID`` column becomes the index of the result.
    count_size : int
        Number of k-mer columns to create.

    Returns
    -------
    pandas.DataFrame
        Float frame of zeros with shape ``(len(df), count_size)``.

    Raises
    ------
    ValueError
        If ``count_size`` is not ``20 ** k`` for some ``k >= 1``.
    """
    alphabet = 'GAVLIPFYWSTCMNQDEKRH'

    # Find k such that len(alphabet) ** k == count_size.
    k_size, n_columns = 0, 1
    while n_columns < count_size:
        n_columns *= len(alphabet)
        k_size += 1
    if k_size == 0 or n_columns != count_size:
        raise ValueError(
            "count_size must be a power of %d (e.g. 8000 for 3-mers), got %r"
            % (len(alphabet), count_size)
        )

    # Grow the k-mers one position at a time; the existing prefix varies
    # slowest, which gives the same order as nested loops over the alphabet.
    columns = ['']
    for _ in range(k_size):
        columns = [prefix + residue for prefix in columns for residue in alphabet]

    count = pd.DataFrame(np.zeros((len(df), count_size)), columns=columns)
    count.index = df.ID
    return count

def replacement(df1, df2):
    """Overwrite the columns of ``df2`` that also exist in ``df1``.

    Columns found in only one of the two frames are ignored: those only in
    ``df2`` keep their current values, those only in ``df1`` are not copied.
    ``df2`` is modified in place and is also the return value.
    """
    shared_columns = list(set(df1.columns).intersection(set(df2.columns)))
    incoming = df1.loc[:, shared_columns]
    df2.loc[:, shared_columns] = incoming
    return df2

def k_mer(df, k_size, count_size):
    """Return the k-mer count matrix of ``df`` as a NumPy array.

    The observed counts from ``kemr_count`` are copied into the all-zero
    table produced by ``blast`` (one row per ``ID``, ``count_size`` columns),
    and every cell left without a value is set to 0.
    """
    observed_counts = kemr_count(df, k_size)
    zero_table = blast(df, count_size)
    filled_table = replacement(observed_counts, zero_table).fillna(0)
    return np.array(filled_table)


In [11]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, matthews_corrcoef, precision_score,recall_score
import pickle
def pca_analyse(X_train, X_test, rate=0.8):
    """Fit a PCA on ``X_train`` and project both data sets with it.

    ``rate`` is passed unchanged to ``PCA`` as ``n_components``. Returns the
    tuple ``(projected_train, projected_test)``.
    """
    reducer = PCA(n_components=rate)
    reducer.fit(X_train)
    projected_train = reducer.transform(X_train)
    projected_test = reducer.transform(X_test)
    return projected_train, projected_test

def Model_retraining(trainfile_path,testfile_path,save_model_path,result_path):
    """Retrain one gradient-boosting model per test epitope and save its predictions.

    For every epitope present in the test file, the 3-mer count features of
    the ``CDR3B`` sequences are reduced by a PCA fitted on the training rows
    of that epitope, a classifier is fitted on the ``Affinity`` labels, and
    the test rows of the same epitope are scored.

    Parameters
    ----------
    trainfile_path, testfile_path : str
        CSV files providing the ``Epitope``, ``CDR3B`` and ``Affinity`` columns.
    save_model_path : str
        Prefix of the pickle files. The classifier is written to
        ``save_model_path + <epitope> + 'model.pkl'`` and the fitted PCA to
        ``save_model_path + <epitope> + 'pca.pkl'``.
    result_path : str
        Prefix of the result file, written to ``result_path + 'probability.csv'``
        with the columns ``Epitope``, ``CDR3B``, ``y_true``, ``y_pred`` and
        ``y_prob``.

    Returns
    -------
    None
    """
    train = pd.read_csv(trainfile_path)
    test = pd.read_csv(testfile_path)
    epitope = pd.unique(test['Epitope'])
    probab_list = []
    for i in epitope:
        # Shuffle reproducibly, then number the rows: k_mer labels its rows by ID.
        df_train = train[train['Epitope'] == i].sample(frac=1, random_state=42)
        df_train['ID'] = range(1, len(df_train) + 1)
        X_train = k_mer(df_train, 3, 8000)  # 8000 = 20 ** 3 possible 3-mers
        y_train = df_train['Affinity'].values
        df_test = test[test['Epitope'] == i].sample(frac=1, random_state=42)
        df_test['ID'] = range(1, len(df_test) + 1)
        X_test = k_mer(df_test, 3, 8000)
        y_test = df_test['Affinity'].values
        # `loss` is deliberately left at its default: the old spelling
        # "deviance" is rejected by current scikit-learn releases, while the
        # default selects that same loss in old and new releases alike.
        classifier = OneVsRestClassifier(
            GradientBoostingClassifier(
                learning_rate=0.1,
                min_samples_leaf=20,
                max_features='sqrt',
                subsample=0.8,
                random_state=10,
                n_estimators=70,
                max_depth=10,
                min_samples_split=60))
        # Keep a handle on the fitted PCA (same settings as
        # pca_analyse(X_train, X_test, 0.8)) so it can be persisted below.
        pca = PCA(n_components=0.8).fit(X_train)
        X_train, X_test = pca.transform(X_train), pca.transform(X_test)
        classifier.fit(X_train, y_train)
        with open(save_model_path+i+'model.pkl', 'wb') as file:
            pickle.dump(classifier, file)
        # The classifier only understands features in this PCA space, so the
        # projection is stored next to it for later use on new data.
        with open(save_model_path+i+'pca.pkl', 'wb') as file:
            pickle.dump(pca, file)
        y_pred = classifier.predict(X_test)
        # Last predict_proba column; presumably the positive class when
        # Affinity is a 0/1 label - confirm against the data.
        y_prob = classifier.predict_proba(X_test)[:, -1]
        probab_list.append({
            'Epitope': [i] * len(df_test['CDR3B']),
            'CDR3B': df_test['CDR3B'],
            'y_true': y_test,
            'y_pred': y_pred,
            'y_prob': y_prob,
        })
    # Write the table once, after all epitopes are processed, instead of
    # rebuilding and rewriting the whole file on every iteration.
    if probab_list:
        # Each cell holds one sequence per epitope; explode turns them into rows.
        probability = pd.DataFrame(probab_list).apply(pd.Series.explode)
        probability.to_csv(result_path+'probability.csv')


In [None]:
# Input data for the retraining run.
trainfile_path ="./data/train.csv"
testfile_path="./data/test.csv"
# Both values below are prefixes: the function appends
# "<epitope>model.pkl" and "probability.csv" respectively.
save_modle_path="./Retraining_model/Retraining_model"
# NOTE(review): section 3 uses this same result prefix, so its
# probability.csv overwrites the one written here.
result_path="./result_path/Retraining_model_prediction"
Model_retraining(
    trainfile_path=trainfile_path,
    testfile_path=testfile_path,
    save_model_path=save_modle_path,
    result_path=result_path,
)


# 3. Retraining model prediction

In [3]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, matthews_corrcoef, precision_score, recall_score
import pickle
import pandas as pd

def pca_analyse(X_train, X_test, rate=0.9):
    """Fit a PCA on ``X_train`` and project both data sets with it.

    ``rate`` is passed unchanged to ``PCA`` as ``n_components``. Returns the
    tuple ``(projected_train, projected_test)``.
    """
    reducer = PCA(n_components=rate)
    reducer.fit(X_train)
    projected_train = reducer.transform(X_train)
    projected_test = reducer.transform(X_test)
    return projected_train, projected_test
def Retraining_model_prediction(testfile_path, modelfile_path, result_path):
    """Score a data set with previously pickled per-epitope classifiers.

    For every epitope in the file, the classifier stored at
    ``modelfile_path + <epitope> + 'model.pkl'`` is loaded and applied to the
    PCA-reduced 3-mer count features of the ``CDR3B`` sequences.

    The PCA projection must be the one the classifier was trained with. If a
    pickled, already fitted PCA exists at
    ``modelfile_path + <epitope> + 'pca.pkl'`` it is used. Otherwise the
    function falls back to fitting a new PCA on the data being scored; this
    legacy behaviour yields a different feature space (and possibly a
    different number of components) than the one seen during training, so
    the predictions may fail or be unreliable.

    Relies on ``k_mer`` and ``PCA`` being available in the session.

    Parameters
    ----------
    testfile_path : str
        CSV file providing the ``Epitope``, ``CDR3B`` and ``Affinity`` columns.
    modelfile_path : str
        Prefix of the pickle files described above.
    result_path : str
        Prefix of the result file, written to ``result_path + 'probability.csv'``
        with the columns ``Epitope``, ``CDR3B``, ``y_true``, ``y_pred`` and
        ``y_prob``.

    Returns
    -------
    None
    """
    import os

    test = pd.read_csv(testfile_path)
    epitope = pd.unique(test['Epitope'])
    probab_list = []
    for i in epitope:
        # Shuffle reproducibly, then number the rows: k_mer labels its rows by ID.
        df_test = test[test['Epitope'] == i].sample(frac=1, random_state=42)
        df_test['ID'] = range(1, len(df_test) + 1)
        X_test = k_mer(df_test, 3, 8000)  # 8000 = 20 ** 3 possible 3-mers
        y_test = df_test['Affinity'].values

        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load model files that come from a trusted source.
        model_path = modelfile_path + i + 'model.pkl'
        with open(model_path, 'rb') as file:
            classifier = pickle.load(file)

        pca_path = modelfile_path + i + 'pca.pkl'
        if os.path.exists(pca_path):
            # Reuse the projection fitted at training time.
            with open(pca_path, 'rb') as file:
                pca = pickle.load(file)
            X_test = pca.transform(X_test)
        else:
            # Legacy fallback: PCA fitted on the data being scored (see docstring).
            _, X_test = pca_analyse(X_test, X_test, 0.8)

        y_pred = classifier.predict(X_test)
        # Last predict_proba column; presumably the positive class when
        # Affinity is a 0/1 label - confirm against the data.
        y_prob = classifier.predict_proba(X_test)[:, -1]
        probab = {
            'Epitope': [i] * len(df_test['CDR3B']),
            'CDR3B': df_test['CDR3B'],
            'y_true': y_test,
            'y_pred': y_pred,
            'y_prob': y_prob
        }
        probab_list.append(probab)
    # Each cell holds one sequence per epitope; explode turns them into rows.
    probability = pd.DataFrame(probab_list)
    probability = probability.apply(pd.Series.explode)
    probability.to_csv(result_path + 'probability.csv', index=False)


In [None]:
# Data set to score with the retrained models.
testfile_path="./data/Validation.csv"
# Prefix of the pickled models; "<epitope>model.pkl" is appended per epitope.
modelfile_path="./Retraining_model/Retraining_model"
# NOTE(review): section 2 uses this same result prefix, so this run
# overwrites the probability.csv written there.
result_path="./result_path/Retraining_model_prediction"
Retraining_model_prediction(
    testfile_path=testfile_path,
    modelfile_path=modelfile_path,
    result_path=result_path,
)
