In [1]:
# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Helpers
import sys
from datetime import datetime
from scipy.sparse import csr_matrix,save_npz,load_npz
import pickle
from classifiers import TCRs_selection,TCRs_selection_ttest,t_test

from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve,log_loss
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,cross_validate
from sklearn.metrics import classification_report
from sklearn.model_selection import LeaveOneOut,KFold,StratifiedKFold
from sklearn.decomposition import PCA,KernelPCA
from scipy.stats import fisher_exact,ttest_ind

from classifiers import MAP_estimator,cal_p_value,TCRs_selection,LOOCV_MAP

# Algorithm
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning for the rest of the session
import warnings
warnings.simplefilter('ignore')

# Folder (relative to this notebook) from which the pickled tables are read
data_path = '../data/'

In [2]:
# Load the two per-sample tables from data_path.
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary
# code - only use it on pickles from a trusted source.
count_df, freq_df = (
    pd.read_pickle(data_path + file_name)
    for file_name in ('count_df.pkl', 'freq_df.pkl')
)
# Every column other than the two metadata columns is treated as one TCR
TCRs = count_df.drop(labels=['sample_name', 'phenotype_status'], axis='columns').columns.values
len(TCRs)

34063

In [11]:
# Feature matrix: every column of count_df except the two metadata columns.
X = count_df.drop(labels=['sample_name', 'phenotype_status'], axis='columns')
# Target: the phenotype label of each sample.
y = count_df.loc[:, 'phenotype_status']

In [8]:
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV

In [16]:
# L1-penalised logistic regression that picks its regularisation strength by
# built-in cross-validation; liblinear is a solver that supports the L1 penalty.
lrcv = LogisticRegressionCV(
    solver='liblinear',
    penalty='l1',
    random_state=0,
)

In [17]:
# Fit on the full count matrix (X, y); the cell output is the repr of the
# estimator that fit() returns.
lrcv.fit(X,y)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=0,
           refit=True, scoring=None, solver='liblinear', tol=0.0001,
           verbose=0)

In [18]:
# Regularisation strength C selected by the cross-validation run inside lrcv.fit
lrcv.C_

array([1291.54966501])

In [22]:
# Leave-one-out CV of an L1-penalised logistic regression on the count matrix.
# TCRs_selection (third argument 0.2 - presumably a significance cut-off) is
# re-run on the training samples of every fold, so the held-out sample takes no
# part in choosing the features.
# C=1300 is the rounded lrcv.C_ reported above.
# NOTE(review): lrcv was fitted on all samples, so C itself has already seen
# every held-out sample; the scores printed below are optimistic in that respect.
# The solver is spelled out: liblinear supports the L1 penalty, whereas
# scikit-learn releases whose default solver is lbfgs reject penalty='l1'.
lr = LogisticRegression(penalty='l1', random_state=0, C=1300, solver='liblinear')
sample_name = []
y_true = []
y_pred = []
y_proba = []

TCRs = count_df.drop(['sample_name','phenotype_status'],axis=1).columns.values

kf = LeaveOneOut()
for train_index, test_index in kf.split(count_df):
    # This cell only reads the fold frames, so count_df is sliced directly
    # instead of being deep-copied in every fold.
    train_cv, test_cv = count_df.iloc[train_index], count_df.iloc[test_index]
    test_name = test_cv['sample_name'].tolist()[0]
    test_label = test_cv['phenotype_status'].values[0]
    print(test_name)

    asso_TCRs = TCRs_selection(train_cv, TCRs, 0.2)
    print('Length of phenotype_associated TCRs: ', len(asso_TCRs))

    lr.fit(train_cv[asso_TCRs], train_cv['phenotype_status'])
    test_features = test_cv[asso_TCRs]
    test_pred = lr.predict(test_features)[0]
    # Column 1 of predict_proba belongs to the second entry of lr.classes_
    test_prob = lr.predict_proba(test_features)[:, 1][0]

    sample_name.append(test_name)
    y_true.append(test_label)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:', test_label, 'y_pred:', test_pred, 'posterior_c1:', test_prob)
    print()

print('cv auroc:', roc_auc_score(y_true, y_proba))
print('cv log_loss:', log_loss(y_true, y_proba))

RA47
Length of phenotype_associated TCRs:  69
y_true: 1 y_pred: 0 posterior_c1: 9.496148640311485e-06

HC9
Length of phenotype_associated TCRs:  31
y_true: 0 y_pred: 1 posterior_c1: 0.9999999890331742

RA29
Length of phenotype_associated TCRs:  65
y_true: 1 y_pred: 1 posterior_c1: 0.9437996197183972

RA8
Length of phenotype_associated TCRs:  69
y_true: 1 y_pred: 0 posterior_c1: 1.4134705722194307e-05

RA63
Length of phenotype_associated TCRs:  69
y_true: 1 y_pred: 1 posterior_c1: 0.9999999999451128

RA33
Length of phenotype_associated TCRs:  67
y_true: 1 y_pred: 1 posterior_c1: 0.9999999843437475

HC17
Length of phenotype_associated TCRs:  33
y_true: 0 y_pred: 1 posterior_c1: 0.9961375843270169

HC3
Length of phenotype_associated TCRs:  33
y_true: 0 y_pred: 0 posterior_c1: 0.22281788642999736

RA17
Length of phenotype_associated TCRs:  70
y_true: 1 y_pred: 1 posterior_c1: 0.9999999993891129

RA46
Length of phenotype_associated TCRs:  61
y_true: 1 y_pred: 1 posterior_c1: 0.9999999999930

Length of phenotype_associated TCRs:  62
y_true: 1 y_pred: 1 posterior_c1: 0.9999999946319

HC16
Length of phenotype_associated TCRs:  30
y_true: 0 y_pred: 0 posterior_c1: 0.25233090048796647

cv auroc: 0.3942307692307692
cv log_loss: 5.107024306718105


In [21]:
# Same leave-one-out evaluation of the L1-penalised logistic regression, with
# TCRs_selection called with 0.3 instead of 0.2 as its third argument.
# The solver is named explicitly because liblinear supports the L1 penalty,
# while scikit-learn releases that default to lbfgs reject penalty='l1'.
lr = LogisticRegression(penalty='l1', random_state=0, C=1300, solver='liblinear')
sample_name = []
y_true = []
y_pred = []
y_proba = []

TCRs = count_df.drop(['sample_name','phenotype_status'],axis=1).columns.values

kf = LeaveOneOut()
for train_index, test_index in kf.split(count_df):
    # The fold frames are only read here, so no per-fold deep copy of count_df
    train_cv, test_cv = count_df.iloc[train_index], count_df.iloc[test_index]
    test_name = test_cv['sample_name'].tolist()[0]
    test_label = test_cv['phenotype_status'].values[0]
    print(test_name)

    # Selection uses the training samples only
    asso_TCRs = TCRs_selection(train_cv, TCRs, 0.3)
    print('Length of phenotype_associated TCRs: ', len(asso_TCRs))

    lr.fit(train_cv[asso_TCRs], train_cv['phenotype_status'])
    test_features = test_cv[asso_TCRs]
    test_pred = lr.predict(test_features)[0]
    # Column 1 of predict_proba belongs to the second entry of lr.classes_
    test_prob = lr.predict_proba(test_features)[:, 1][0]

    sample_name.append(test_name)
    y_true.append(test_label)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:', test_label, 'y_pred:', test_pred, 'posterior_c1:', test_prob)
    print()

print('cv auroc:', roc_auc_score(y_true, y_proba))
print('cv log_loss:', log_loss(y_true, y_proba))

RA47
Length of phenotype_associated TCRs:  164
y_true: 1 y_pred: 0 posterior_c1: 0.0009869857139717165

HC9
Length of phenotype_associated TCRs:  182
y_true: 0 y_pred: 1 posterior_c1: 0.9999972333307218

RA29
Length of phenotype_associated TCRs:  155
y_true: 1 y_pred: 0 posterior_c1: 0.00555551952086437

RA8
Length of phenotype_associated TCRs:  175
y_true: 1 y_pred: 0 posterior_c1: 0.007787865850652314

RA63
Length of phenotype_associated TCRs:  177
y_true: 1 y_pred: 1 posterior_c1: 0.9999389717186408

RA33
Length of phenotype_associated TCRs:  170
y_true: 1 y_pred: 0 posterior_c1: 1.0794659457856468e-05

HC17
Length of phenotype_associated TCRs:  193
y_true: 0 y_pred: 1 posterior_c1: 0.8798309271067021

HC3
Length of phenotype_associated TCRs:  188
y_true: 0 y_pred: 1 posterior_c1: 0.9991179593425606

RA17
Length of phenotype_associated TCRs:  178
y_true: 1 y_pred: 1 posterior_c1: 0.999999182016522

RA46
Length of phenotype_associated TCRs:  171
y_true: 1 y_pred: 1 posterior_c1: 0.99

Length of phenotype_associated TCRs:  164
y_true: 1 y_pred: 1 posterior_c1: 0.999999997991937

RA76
Length of phenotype_associated TCRs:  167
y_true: 1 y_pred: 1 posterior_c1: 0.9995455928202254

HC16
Length of phenotype_associated TCRs:  194
y_true: 0 y_pred: 1 posterior_c1: 0.9995046576738814

cv auroc: 0.37884615384615394
cv log_loss: 4.093722751676792


In [3]:
# Random forest reused by the leave-one-out cells below.
#   - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself was
#     removed from later scikit-learn releases.
#   - min_impurity_split=None (the old default) is no longer passed: the parameter
#     was removed from later scikit-learn releases and passing it fails there.
# Every other setting is kept explicit so the model does not depend on
# version-specific defaults.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=10,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    bootstrap=True,
    oob_score=True,
    class_weight=None,
    n_jobs=-1,
    random_state=0,
    verbose=0,
    warm_start=False,
)

In [4]:
# Preview the first five samples of the count table
count_df.head(n=5)

Unnamed: 0,sample_name,phenotype_status,"(CAAGGSSYEQYF, TCRBV07, TCRBV07-08, 01, TCRBJ02, TCRBJ02-07, 01)","(CACLPGQTSYEQYF, TCRBV30, TCRBV30-01, 01, TCRBJ02, TCRBJ02-07, 01)","(CACRGEGGNTIYF, TCRBV30, TCRBV30-01, 01, TCRBJ01, TCRBJ01-03, 01)","(CAFGQEGQPQHF, TCRBV30, TCRBV30-01, 01, TCRBJ01, TCRBJ01-05, 01)","(CAGETAEAFF, TCRBV06, TCRBV06-01, 01, TCRBJ01, TCRBJ01-01, 01)","(CAGGMNTEAFF, TCRBV30, TCRBV30-01, 01, TCRBJ01, TCRBJ01-01, 01)","(CAGGNTEAFF, TCRBV02, TCRBV02-01, 01, TCRBJ01, TCRBJ01-01, 01)","(CAGGRAGGTDTQYF, TCRBV07, TCRBV07-09, null, TCRBJ02, TCRBJ02-03, 01)",...,"(CSVWTGSSYEQYF, TCRBV29, TCRBV29-01, 01, TCRBJ02, TCRBJ02-07, 01)","(CSVYLPGQGNGGYGYTF, TCRBV29, TCRBV29-01, 01, TCRBJ01, TCRBJ01-02, 01)","(CSVYMNTEAFF, TCRBV29, TCRBV29-01, 01, TCRBJ01, TCRBJ01-01, 01)","(CSVYNNEQFF, TCRBV29, TCRBV29-01, 01, TCRBJ02, TCRBJ02-01, 01)","(CSVYRDYEQYF, TCRBV29, TCRBV29-01, 01, TCRBJ02, TCRBJ02-07, 01)","(CSVYTGELFF, TCRBV29, TCRBV29-01, 01, TCRBJ02, TCRBJ02-02, 01)","(CTGLHETQYF, TCRBV27, TCRBV27-01, 01, TCRBJ02, TCRBJ02-05, 01)","(CVTRDFYEQYF, TCRBV02, TCRBV02-01, 01, TCRBJ02, TCRBJ02-07, 01)","(RASSPFPTNQAASSNQPQHF, TCRBV07, TCRBV07-03, 03, TCRBJ01, TCRBJ01-05, 01)","(RASSSTSGGPSWNDEQFF, TCRBV07, TCRBV07-03, 03, TCRBJ02, TCRBJ02-01, 01)"
0,RA47,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HC9,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,2
2,RA29,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,RA8,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,RA63,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### No feature selection

In [5]:
# Leave-one-out CV of the random forest (clf) on every TCR column of count_df,
# i.e. without any feature selection.

# Per-sample results, one entry per held-out sample
sample_name = []
y_true = []
y_pred = []
y_proba = []

TCRs = count_df.drop(['sample_name','phenotype_status'],axis=1).columns.values
# No selection step: the feature set is the same for every fold
asso_TCRs = TCRs

kf = LeaveOneOut()
for train_index, test_index in kf.split(count_df):
    # The fold frames are only read, so count_df is sliced directly rather than
    # deep-copied (all TCR columns) once per fold.
    train_cv, test_cv = count_df.iloc[train_index], count_df.iloc[test_index]
    test_name = test_cv['sample_name'].tolist()[0]
    test_label = test_cv['phenotype_status'].values[0]
    print(test_name)

    clf.fit(train_cv[asso_TCRs], train_cv['phenotype_status'])
    test_features = test_cv[asso_TCRs]
    test_pred = clf.predict(test_features)[0]
    # Column 1 of predict_proba belongs to the second entry of clf.classes_
    test_prob = clf.predict_proba(test_features)[:, 1][0]

    sample_name.append(test_name)
    y_true.append(test_label)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:', test_label, 'y_pred:', test_pred, 'posterior_c1:', test_prob)
    print()

print('cv auroc:', roc_auc_score(y_true, y_proba))
print('cv log_loss:', log_loss(y_true, y_proba))

RA47
y_true: 1 y_pred: 1 posterior_c1: 0.83

HC9
y_true: 0 y_pred: 1 posterior_c1: 0.87

RA29
y_true: 1 y_pred: 1 posterior_c1: 0.8198275862068966

RA8
y_true: 1 y_pred: 1 posterior_c1: 0.9498275862068966

RA63
y_true: 1 y_pred: 1 posterior_c1: 0.94

RA33
y_true: 1 y_pred: 1 posterior_c1: 0.9198412698412699

HC17
y_true: 0 y_pred: 1 posterior_c1: 0.73

HC3
y_true: 0 y_pred: 1 posterior_c1: 0.86

RA17
y_true: 1 y_pred: 1 posterior_c1: 0.969666295884316

RA46
y_true: 1 y_pred: 1 posterior_c1: 0.8698387096774194

HC6
y_true: 0 y_pred: 1 posterior_c1: 0.76

RA26
y_true: 1 y_pred: 1 posterior_c1: 0.85

RA15
y_true: 1 y_pred: 1 posterior_c1: 0.91

RA53
y_true: 1 y_pred: 1 posterior_c1: 0.8596606269772792

RA32
y_true: 1 y_pred: 1 posterior_c1: 0.8898245614035087

RA40
y_true: 1 y_pred: 1 posterior_c1: 0.9498360655737704

HC8
y_true: 0 y_pred: 1 posterior_c1: 0.88

RA51
y_true: 1 y_pred: 1 posterior_c1: 0.84

RA68
y_true: 1 y_pred: 1 posterior_c1: 0.8398214285714286

RA21
y_true: 1 y_pred: 1 

#### Feature selection based on Fisher exact test

In [6]:
# Leave-one-out CV of the random forest (clf) on count_df, with TCRs_selection
# (third argument 0.3) re-run on the training samples of every fold.

# Per-sample results, one entry per held-out sample
sample_name = []
y_true = []
y_pred = []
y_proba = []

TCRs = count_df.drop(['sample_name','phenotype_status'],axis=1).columns.values

kf = LeaveOneOut()
for train_index, test_index in kf.split(count_df):
    # The fold frames are only read here, so no per-fold deep copy of count_df
    train_cv, test_cv = count_df.iloc[train_index], count_df.iloc[test_index]
    test_name = test_cv['sample_name'].tolist()[0]
    test_label = test_cv['phenotype_status'].values[0]
    print(test_name)

    asso_TCRs = TCRs_selection(train_cv, TCRs, 0.3)
    print('Length of phenotype_associated TCRs: ', len(asso_TCRs))

    # The model is fitted on the selected TCR columns only
    clf.fit(train_cv[asso_TCRs], train_cv['phenotype_status'])
    test_features = test_cv[asso_TCRs]
    test_pred = clf.predict(test_features)[0]
    # Column 1 of predict_proba belongs to the second entry of clf.classes_
    test_prob = clf.predict_proba(test_features)[:, 1][0]

    sample_name.append(test_name)
    y_true.append(test_label)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:', test_label, 'y_pred:', test_pred, 'posterior_c1:', test_prob)
    print()

print('cv auroc:', roc_auc_score(y_true, y_proba))
print('cv log_loss:', log_loss(y_true, y_proba))

RA47
Length of phenotype_associated TCRs:  164
y_true: 1 y_pred: 1 posterior_c1: 0.6203822171832128

HC9
Length of phenotype_associated TCRs:  182
y_true: 0 y_pred: 1 posterior_c1: 0.5965568031800779

RA29
Length of phenotype_associated TCRs:  155
y_true: 1 y_pred: 1 posterior_c1: 0.6326772699041977

RA8
Length of phenotype_associated TCRs:  175
y_true: 1 y_pred: 0 posterior_c1: 0.37218024006453176

RA63
Length of phenotype_associated TCRs:  177
y_true: 1 y_pred: 1 posterior_c1: 0.5660014632161906

RA33
Length of phenotype_associated TCRs:  170
y_true: 1 y_pred: 1 posterior_c1: 0.6928672522670203

HC17
Length of phenotype_associated TCRs:  193
y_true: 0 y_pred: 1 posterior_c1: 0.6632388801734455

HC3
Length of phenotype_associated TCRs:  188
y_true: 0 y_pred: 1 posterior_c1: 0.6893378921003861

RA17
Length of phenotype_associated TCRs:  178
y_true: 1 y_pred: 1 posterior_c1: 0.5870056076010293

RA46
Length of phenotype_associated TCRs:  171
y_true: 1 y_pred: 1 posterior_c1: 0.7361938544

y_true: 1 y_pred: 1 posterior_c1: 0.766128104501644

RA75
Length of phenotype_associated TCRs:  164
y_true: 1 y_pred: 1 posterior_c1: 0.8735554756690185

RA76
Length of phenotype_associated TCRs:  167
y_true: 1 y_pred: 1 posterior_c1: 0.7731888440280231

HC16
Length of phenotype_associated TCRs:  194
y_true: 0 y_pred: 1 posterior_c1: 0.7091214242256187

cv auroc: 0.27999999999999997
cv log_loss: 0.7193908815257406


In [6]:
# Leave-one-out CV of the random forest (clf) on count_df, with TCRs_selection
# (third argument 0.2) re-run on the training samples of every fold.

# Per-sample results, one entry per held-out sample
sample_name = []
y_true = []
y_pred = []
y_proba = []

TCRs = count_df.drop(['sample_name','phenotype_status'],axis=1).columns.values

kf = LeaveOneOut()
for train_index, test_index in kf.split(count_df):
    # The fold frames are only read here, so no per-fold deep copy of count_df
    train_cv, test_cv = count_df.iloc[train_index], count_df.iloc[test_index]
    test_name = test_cv['sample_name'].tolist()[0]
    test_label = test_cv['phenotype_status'].values[0]
    print(test_name)

    asso_TCRs = TCRs_selection(train_cv, TCRs, 0.2)
    print('Length of phenotype_associated TCRs: ', len(asso_TCRs))

    # The model is fitted on the selected TCR columns only
    clf.fit(train_cv[asso_TCRs], train_cv['phenotype_status'])
    test_features = test_cv[asso_TCRs]
    test_pred = clf.predict(test_features)[0]
    # Column 1 of predict_proba belongs to the second entry of clf.classes_
    test_prob = clf.predict_proba(test_features)[:, 1][0]

    sample_name.append(test_name)
    y_true.append(test_label)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:', test_label, 'y_pred:', test_pred, 'posterior_c1:', test_prob)
    print()

print('cv auroc:', roc_auc_score(y_true, y_proba))
print('cv log_loss:', log_loss(y_true, y_proba))

RA47
Length of phenotype_associated TCRs:  69
y_true: 1 y_pred: 1 posterior_c1: 0.5666676136940176

HC9
Length of phenotype_associated TCRs:  31
y_true: 0 y_pred: 1 posterior_c1: 0.6461407870649709

RA29
Length of phenotype_associated TCRs:  65
y_true: 1 y_pred: 1 posterior_c1: 0.5654590646851942

RA8
Length of phenotype_associated TCRs:  69
y_true: 1 y_pred: 0 posterior_c1: 0.30810029548018153

RA63
Length of phenotype_associated TCRs:  69
y_true: 1 y_pred: 1 posterior_c1: 0.6463414891313686

RA33
Length of phenotype_associated TCRs:  67
y_true: 1 y_pred: 1 posterior_c1: 0.7607860457171353

HC17
Length of phenotype_associated TCRs:  33
y_true: 0 y_pred: 1 posterior_c1: 0.6262651458634906

HC3
Length of phenotype_associated TCRs:  33
y_true: 0 y_pred: 1 posterior_c1: 0.7050244675398258

RA17
Length of phenotype_associated TCRs:  70
y_true: 1 y_pred: 1 posterior_c1: 0.6269031753160018

RA46
Length of phenotype_associated TCRs:  61
y_true: 1 y_pred: 1 posterior_c1: 0.7251669742537656

HC

y_true: 1 y_pred: 1 posterior_c1: 0.9044951714951714

RA76
Length of phenotype_associated TCRs:  62
y_true: 1 y_pred: 1 posterior_c1: 0.7899134870370971

HC16
Length of phenotype_associated TCRs:  30
y_true: 0 y_pred: 0 posterior_c1: 0.36891062644976147

cv auroc: 0.33307692307692305
cv log_loss: 0.7598337086303928


## Feature selection: intersection of Fisher exact test and t-test

In [None]:
# Leave-one-out CV of the random forest (clf) on freq_df, using only the TCRs
# picked by BOTH selectors (TCRs_selection and TCRs_selection_ttest), each
# re-run on the training samples of every fold.

# Per-sample results, one entry per held-out sample
sample_name = []
y_true = []
y_pred = []
y_proba = []

# Candidate TCR columns. 'total_TCRs' is excluded when present, as the other
# freq_df cells do - presumably it is a per-sample total, not a TCR (confirm).
TCRs = (freq_df.drop(['sample_name','phenotype_status'],axis=1)
               .drop(['total_TCRs'],axis=1,errors='ignore')
               .columns.values)

kf = LeaveOneOut()
for train_index, test_index in kf.split(freq_df):
    train_cv, test_cv = freq_df.iloc[train_index], freq_df.iloc[test_index]
    test_name = test_cv['sample_name'].tolist()[0]
    test_label = test_cv['phenotype_status'].values[0]
    print(test_name)

    asso_TCRs1 = TCRs_selection(train_cv, TCRs, 0.2)
    asso_TCRs2 = TCRs_selection_ttest(train_cv, TCRs, {'threshold':0.02})
    print('Length of phenotype_associated TCRs: ','Fisher:',len(asso_TCRs1),'t-test:',len(asso_TCRs2))

    # Intersect while keeping the order of asso_TCRs1. Converting a set back to
    # a list gives a hash-dependent column order, and the forest's per-split
    # feature sampling depends on column positions, so results would differ
    # between kernels despite random_state=0.
    ttest_hits = set(asso_TCRs2)
    asso_TCRs = []
    for tcr in asso_TCRs1:
        if tcr in ttest_hits:
            asso_TCRs.append(tcr)
            ttest_hits.discard(tcr)  # a repeated name is taken only once
    print('Length of the intersection:', len(asso_TCRs))

    clf.fit(train_cv[asso_TCRs], train_cv['phenotype_status'])
    test_features = test_cv[asso_TCRs]
    test_pred = clf.predict(test_features)[0]
    # Column 1 of predict_proba belongs to the second entry of clf.classes_
    test_prob = clf.predict_proba(test_features)[:, 1][0]

    sample_name.append(test_name)
    y_true.append(test_label)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:', test_label, 'y_pred:', test_pred, 'posterior_c1:', test_prob)
    print()

print('cv auroc:', roc_auc_score(y_true, y_proba))
print('cv log_loss:', log_loss(y_true, y_proba))

RA47
Length of phenotype_associated TCRs:  Fisher: 69 t-test: 85
Length of the intersection: 48
y_true: 1 y_pred: 1 posterior_c1: 0.5081539649835483

HC9
Length of phenotype_associated TCRs:  Fisher: 31 t-test: 93
Length of the intersection: 25
y_true: 0 y_pred: 1 posterior_c1: 0.6403034736640675

RA29
Length of phenotype_associated TCRs:  Fisher: 65 t-test: 83
Length of the intersection: 46
y_true: 1 y_pred: 0 posterior_c1: 0.44841686970974853

RA8
Length of phenotype_associated TCRs:  Fisher: 69 t-test: 92
Length of the intersection: 50
y_true: 1 y_pred: 1 posterior_c1: 0.5649911472914405

RA63
Length of phenotype_associated TCRs:  Fisher: 69 t-test: 92
Length of the intersection: 50
y_true: 1 y_pred: 1 posterior_c1: 0.7889682443077872

RA33
Length of phenotype_associated TCRs:  Fisher: 67 t-test: 89
Length of the intersection: 48
y_true: 1 y_pred: 1 posterior_c1: 0.951547619047619

HC17
Length of phenotype_associated TCRs:  Fisher: 33 t-test: 98
Length of the intersection: 27
y_true

Length of phenotype_associated TCRs:  Fisher: 61 t-test: 83
Length of the intersection: 44
y_true: 1 y_pred: 1 posterior_c1: 0.7608837147988919

RA82
Length of phenotype_associated TCRs:  Fisher: 67 t-test: 89
Length of the intersection: 49
y_true: 1 y_pred: 1 posterior_c1: 0.8234984827146214

RA57


### New feature: sum of associated TCRs

In [5]:
# One-fold trial of the extra feature 'sum_asso_TCRs' (row sum of the selected
# TCR frequencies) on freq_df. The loop stops after the first fold - presumably
# a quick check that the pipeline runs; confirm before removing the break.

# Start from empty result lists so this trial does not append to the results of
# a previously executed experiment.
sample_name = []
y_true = []
y_pred = []
y_proba = []

# Candidate TCR columns, defined here so the cell does not depend on whichever
# cell last assigned TCRs. 'total_TCRs' is excluded when present - presumably
# it is a per-sample total, not a TCR (confirm).
TCRs = (freq_df.drop(['sample_name','phenotype_status'],axis=1)
               .drop(['total_TCRs'],axis=1,errors='ignore')
               .columns.values)

kf = LeaveOneOut()
for train_index, test_index in kf.split(freq_df):
    train_cv, test_cv = freq_df.iloc[train_index], freq_df.iloc[test_index]

    # drop() returns new frames, so the column added below stays fold-local
    train = train_cv.drop(['sample_name','phenotype_status'],axis=1)
    test = test_cv.drop(['sample_name','phenotype_status'],axis=1)

    # list(): the selector may return an array-like (another cell calls
    # .tolist() on its result); "array + [str]" would not be list concatenation.
    asso_TCRs = list(TCRs_selection(train_cv, TCRs, 0.2))
    train['sum_asso_TCRs'] = train[asso_TCRs].sum(axis=1)
    test['sum_asso_TCRs'] = test[asso_TCRs].sum(axis=1)

    asso_TCRs = asso_TCRs + ['sum_asso_TCRs']

    clf.fit(train[asso_TCRs], train_cv['phenotype_status'])
    test_features = test[asso_TCRs]
    test_pred = clf.predict(test_features)[0]
    # Column 1 of predict_proba belongs to the second entry of clf.classes_
    test_prob = clf.predict_proba(test_features)[:, 1][0]

    test_label = test_cv['phenotype_status'].values[0]
    sample_name.append(test_cv['sample_name'].tolist()[0])
    y_true.append(test_label)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:', test_label, 'y_pred:', test_pred, 'posterior_c1:', test_prob)
    print()
    break

In [13]:
# Row sums of the selected TCR columns for the training samples left over from
# the last executed fold, taken once through .loc and once through plain column
# selection, followed by the mean of that sum within each phenotype class.
new1 = train_cv.loc[:, asso_TCRs].sum(axis=1)
new2 = train_cv[asso_TCRs].sum(axis=1)
p = pd.DataFrame({'status': train_cv['phenotype_status'], 'new_feature': new1})
print(
    p.loc[p['status'] == 0, 'new_feature'].mean(),
    p.loc[p['status'] == 1, 'new_feature'].mean(),
)

5.734544729829541e-05 0.0008260088911752312


In [13]:
# Leave-one-out CV of the random forest (clf) on freq_df, using the selected
# TCR frequencies plus one extra feature, 'sum_asso_TCRs': the row sum of the
# selected TCR columns, computed separately inside every fold.

# Per-sample results, one entry per held-out sample
sample_name = []
y_true = []
y_pred = []
y_proba = []

TCRs = freq_df.drop(['sample_name','phenotype_status','total_TCRs'],axis=1).columns.values

kf = LeaveOneOut()
for train_index, test_index in kf.split(freq_df):
    # Explicit copies of the fold slices: the new column is added to these
    # fold-local frames and never to freq_df.
    train_cv = freq_df.iloc[train_index].copy()
    test_cv = freq_df.iloc[test_index].copy()
    test_name = test_cv['sample_name'].tolist()[0]
    test_label = test_cv['phenotype_status'].values[0]
    print(test_name)

    asso_TCRs = list(TCRs_selection(train_cv, TCRs, 0.2))

    # Build the summed feature from this fold's selection before it is printed
    # and used as a model input.
    train_cv['sum_asso_TCRs'] = train_cv[asso_TCRs].sum(axis=1)
    test_cv['sum_asso_TCRs'] = test_cv[asso_TCRs].sum(axis=1)

    print('Mean of the sum_asso_TCRs: ','Class 0:',train_cv[train_cv['phenotype_status']==0]['sum_asso_TCRs'].mean(),
          'Class 1:',train_cv[train_cv['phenotype_status']==1]['sum_asso_TCRs'].mean())
    print('Sum_asso_TCRs of the testing sample:',test_cv['sum_asso_TCRs'].tolist()[0])

    asso_TCRs = asso_TCRs + ['sum_asso_TCRs']

    clf.fit(train_cv[asso_TCRs], train_cv['phenotype_status'])
    test_features = test_cv[asso_TCRs]
    test_pred = clf.predict(test_features)[0]
    # Column 1 of predict_proba belongs to the second entry of clf.classes_
    test_prob = clf.predict_proba(test_features)[:, 1][0]

    sample_name.append(test_name)
    y_true.append(test_label)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:', test_label, 'y_pred:', test_pred, 'posterior_c1:', test_prob)
    print()

print('cv auroc:', roc_auc_score(y_true, y_proba))
print('cv log_loss:', log_loss(y_true, y_proba))

RA47
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008260088911752312
Sum_asso_TCRs of the testing sample: 0.00023917723032767282
y_true: 1 y_pred: 1 posterior_c1: 0.5223180531116328

HC9
Mean of the sum_asso_TCRs:  Class 0: 4.6174552679196166e-05 Class 1: 0.0004869688251373103
Sum_asso_TCRs of the testing sample: 0.00033110390040394673
y_true: 0 y_pred: 1 posterior_c1: 0.9113932257960655

RA29
Mean of the sum_asso_TCRs:  Class 0: 5.4034408294255926e-05 Class 1: 0.0007951761368263883
Sum_asso_TCRs of the testing sample: 0.00017570832418185812
y_true: 1 y_pred: 1 posterior_c1: 0.5190735652730691

RA8
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.000827469578691439
Sum_asso_TCRs of the testing sample: 0.00015790304752881732
y_true: 1 y_pred: 0 posterior_c1: 0.42130515567756677

RA63
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008266545421992621
Sum_asso_TCRs of the testing sample: 0.00041245617653124357
y_true: 1

y_true: 1 y_pred: 1 posterior_c1: 0.8868330384212736

RA52
Mean of the sum_asso_TCRs:  Class 0: 5.4034408294255926e-05 Class 1: 0.0007341484694501092
Sum_asso_TCRs of the testing sample: 0.00075937832228016
y_true: 1 y_pred: 1 posterior_c1: 0.9708095238095238

RA60
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0007773728428655968
Sum_asso_TCRs of the testing sample: 0.0005345211581291759
y_true: 1 y_pred: 1 posterior_c1: 0.8427535277433857

RA28
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008327432504859127
Sum_asso_TCRs of the testing sample: 0.00046389361373125093
y_true: 1 y_pred: 1 posterior_c1: 0.7386373983411696

RA2
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008352781643693926
Sum_asso_TCRs of the testing sample: 0.00030165912518853697
y_true: 1 y_pred: 1 posterior_c1: 0.5841519834301219

RA74
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.000799027710356233
Sum_asso_TCRs of the

y_true: 1 y_pred: 1 posterior_c1: 0.8887157319608593

HC2
Mean of the sum_asso_TCRs:  Class 0: 6.06545220006922e-05 Class 1: 0.0005769031351501941
Sum_asso_TCRs of the testing sample: 0.00033536789858474745
y_true: 0 y_pred: 1 posterior_c1: 0.994079254079254

RA65
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008042342128106347
Sum_asso_TCRs of the testing sample: 0.0005718097779472029
y_true: 1 y_pred: 1 posterior_c1: 0.9281859416389335

RA75
Mean of the sum_asso_TCRs:  Class 0: 5.4034408294255926e-05 Class 1: 0.000740373247829268
Sum_asso_TCRs of the testing sample: 0.0007725653461521954
y_true: 1 y_pred: 1 posterior_c1: 0.9827777777777778

RA76
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0007644859877473392
Sum_asso_TCRs of the testing sample: 0.00017164435290078955
y_true: 1 y_pred: 1 posterior_c1: 0.5041907178341143

HC16
Mean of the sum_asso_TCRs:  Class 0: 4.965985689397455e-05 Class 1: 0.00047320344199084535
Sum_asso_TCRs of th

In [10]:
# Leave-one-out CV of the random forest (clf) on freq_df, fitted on the selected
# TCR frequencies only.
# NOTE(review): the summed feature 'sum_asso_TCRs' is not a model input in this
# cell - presumably this is the reference run for the cell that appends it to
# the feature list; confirm. The per-fold writes of that unused column were
# removed; they did not affect the fit or the printed scores.

# Per-sample results, one entry per held-out sample
sample_name = []
y_true = []
y_pred = []
y_proba = []

TCRs = freq_df.drop(['sample_name','phenotype_status','total_TCRs'],axis=1).columns.values

kf = LeaveOneOut()
for train_index, test_index in kf.split(freq_df):
    # The fold frames are only read here, so no per-fold deep copy of freq_df
    train_cv, test_cv = freq_df.iloc[train_index], freq_df.iloc[test_index]
    test_name = test_cv['sample_name'].tolist()[0]
    test_label = test_cv['phenotype_status'].values[0]
    print(test_name)

    # Selection uses the training samples only
    asso_TCRs = TCRs_selection(train_cv, TCRs, 0.2)

    clf.fit(train_cv[asso_TCRs], train_cv['phenotype_status'])
    test_features = test_cv[asso_TCRs]
    test_pred = clf.predict(test_features)[0]
    # Column 1 of predict_proba belongs to the second entry of clf.classes_
    test_prob = clf.predict_proba(test_features)[:, 1][0]

    sample_name.append(test_name)
    y_true.append(test_label)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:', test_label, 'y_pred:', test_pred, 'posterior_c1:', test_prob)
    print()

print('cv auroc:', roc_auc_score(y_true, y_proba))
print('cv log_loss:', log_loss(y_true, y_proba))

RA47
y_true: 1 y_pred: 0 posterior_c1: 0.4559243242160052

HC9
y_true: 0 y_pred: 1 posterior_c1: 0.5476924953305642

RA29
y_true: 1 y_pred: 0 posterior_c1: 0.45157156172347307

RA8
y_true: 1 y_pred: 0 posterior_c1: 0.4344244698052342

RA63
y_true: 1 y_pred: 1 posterior_c1: 0.7158866604858796

RA33
y_true: 1 y_pred: 1 posterior_c1: 0.9745454545454545

HC17
y_true: 0 y_pred: 0 posterior_c1: 0.4862684078541365

HC3
y_true: 0 y_pred: 1 posterior_c1: 0.6173276693652499

RA17
y_true: 1 y_pred: 1 posterior_c1: 0.5887854115192951

RA46
y_true: 1 y_pred: 1 posterior_c1: 0.7277714377437766

HC6
y_true: 0 y_pred: 1 posterior_c1: 0.7040144856241488

RA26
y_true: 1 y_pred: 1 posterior_c1: 0.7026920565878476

RA15
y_true: 1 y_pred: 1 posterior_c1: 0.8094983948887419

RA53
y_true: 1 y_pred: 1 posterior_c1: 0.8304122810200124

RA32
y_true: 1 y_pred: 1 posterior_c1: 0.5818559949210795

RA40
y_true: 1 y_pred: 0 posterior_c1: 0.2642936404254288

HC8
y_true: 0 y_pred: 1 posterior_c1: 0.758694520026563

RA