In [1]:
# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Helpers
import sys
from datetime import datetime
from scipy.sparse import csr_matrix,save_npz,load_npz
import pickle
from classifiers import TCRs_selection

from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve,log_loss
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,cross_validate
from sklearn.metrics import classification_report
from sklearn.model_selection import LeaveOneOut,KFold,StratifiedKFold
from sklearn.decomposition import PCA,KernelPCA
from scipy.stats import fisher_exact,ttest_ind

from classifiers import MAP_estimator,cal_p_value,TCRs_selection,LOOCV_MAP

# Algorithm
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
   

In [2]:
def t_test(pos,neg):
    t_stat, pvalue = ttest_ind(pos,neg,equal_var=False)
    if t_stat > 0:
        pvalue = pvalue/2
    else:
        pvalue = 1-(pvalue/2)
    
    return pvalue

def TCRs_selection_ttest(train,kwargs):
    
    TCRs = train.drop(['sample_name','phenotype_status','total_TCRs'],axis=1).columns.values # a list of TCR candidates

    # the subdf of different phenotype_status(negative/positive)
    neg_df = train[train['phenotype_status']==0]
    pos_df = train[train['phenotype_status']==1]

    TCRs_pvalue = dict() # init dict
    for tcr in TCRs: # for each TCR in the list of TCR candidates
        
        pos = pos_df[tcr]
        neg = neg_df[tcr]

        if np.mean(pos)!=0 or np.mean(neg)!=0: # if does not occur in any classes, not to add to the dict 
            TCRs_pvalue[tcr] = [np.mean(pos),np.mean(neg)] # add the TCR incidence to dict
            p_value = t_test(pos,neg)
            TCRs_pvalue[tcr].append(p_value) # store p-value

    p_value_df = pd.DataFrame.from_dict(TCRs_pvalue,orient='index') # generate df from TCR incidence dict
    p_value_df.columns = ['mean(pos)','mean(neg)','p_value'] # rename df

    threshold = kwargs['threshold']
    TCRs_asso = p_value_df[p_value_df.p_value<=threshold].T.columns.values # select a list of pehnotype_associated TCRs
    return TCRs_asso


In [3]:
count_df = pd.read_pickle('../data/'+'count_df.pkl')
freq_df = pd.read_pickle('../data/'+'freq_df.pkl')
TCRs = count_df.drop(['sample_name','phenotype_status'],axis=1).columns.values

In [4]:
clf = RandomForestClassifier(bootstrap=True, class_weight= None, criterion='gini',
                                 max_depth=10, max_features='auto', max_leaf_nodes=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=1, min_samples_split=2,
                                 min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                                 oob_score=True, random_state=0, verbose=0, warm_start=False)

## Feature selection: Intersection of Fisher and ttest 

In [6]:
# Init lists
sample_name = []
y_true = []
y_pred = []
y_proba = [] 

TCRs = freq_df.drop(['sample_name','phenotype_status','total_TCRs'],axis=1).columns.values

kf = LeaveOneOut()
for train_index,test_index in kf.split(freq_df):
    train_cv, test_cv = freq_df.iloc[train_index], freq_df.iloc[test_index]
    print(test_cv['sample_name'].tolist()[0])
    
    asso_TCRs1 = TCRs_selection(train_cv,TCRs,0.2)
    asso_TCRs2 = TCRs_selection_ttest(train_cv,{'threshold':0.02})
    print('Length of phenotype_associated TCRs: ','Fisher:',len(asso_TCRs),'t-test:',len(asso_TCRs2))

    asso_TCRs = list(set(asso_TCRs1).intersection(set(asso_TCRs2)))
    print('Length of the intersection:',len(asso_TCRs))

    clf.fit(train_cv[asso_TCRs],train_cv['phenotype_status'])
    test_pred = clf.predict(test_cv[asso_TCRs])[0]
    test_prob = clf.predict_proba(test_cv[asso_TCRs])[:,1][0]

    sample_name.append(test_cv['sample_name'].tolist()[0])
    y_true.append(test_cv['phenotype_status'].values[0])
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    
    print('y_true:',test_cv['phenotype_status'].values[0],'y_pred:',test_pred,'posterior_c1:',test_prob)
    print()

print('cv auroc:',roc_auc_score(y_true,y_proba))
print('cv log_loss:',log_loss(y_true,y_proba))

RA47
Length of phenotype_associated TCRs:  Fisher: 69 t-test: 85
Length of the intersection: 48
y_true: 1 y_pred: 0 posterior_c1: 0.4787729671691872

HC9
Length of phenotype_associated TCRs:  Fisher: 31 t-test: 93
Length of the intersection: 25
y_true: 0 y_pred: 1 posterior_c1: 0.6712093846257818

RA29
Length of phenotype_associated TCRs:  Fisher: 65 t-test: 83
Length of the intersection: 46
y_true: 1 y_pred: 0 posterior_c1: 0.4031953583988941

RA8
Length of phenotype_associated TCRs:  Fisher: 69 t-test: 92
Length of the intersection: 50
y_true: 1 y_pred: 0 posterior_c1: 0.4947268387601035

RA63
Length of phenotype_associated TCRs:  Fisher: 69 t-test: 92
Length of the intersection: 50
y_true: 1 y_pred: 1 posterior_c1: 0.824347799289112

RA33
Length of phenotype_associated TCRs:  Fisher: 67 t-test: 89
Length of the intersection: 48
y_true: 1 y_pred: 1 posterior_c1: 0.963103448275862

HC17
Length of phenotype_associated TCRs:  Fisher: 33 t-test: 98
Length of the intersection: 27
y_true: 

Length of phenotype_associated TCRs:  Fisher: 61 t-test: 83
Length of the intersection: 44
y_true: 1 y_pred: 1 posterior_c1: 0.6391895595781659

RA82
Length of phenotype_associated TCRs:  Fisher: 67 t-test: 89
Length of the intersection: 49
y_true: 1 y_pred: 1 posterior_c1: 0.8523900390056737

RA57
Length of phenotype_associated TCRs:  Fisher: 65 t-test: 83
Length of the intersection: 45
y_true: 1 y_pred: 1 posterior_c1: 0.6853933912241561

RA72
Length of phenotype_associated TCRs:  Fisher: 64 t-test: 83
Length of the intersection: 44
y_true: 1 y_pred: 1 posterior_c1: 0.7342634473246694

RA18
Length of phenotype_associated TCRs:  Fisher: 70 t-test: 93
Length of the intersection: 50
y_true: 1 y_pred: 1 posterior_c1: 0.6514063267878801

HC20
Length of phenotype_associated TCRs:  Fisher: 33 t-test: 104
Length of the intersection: 26
y_true: 0 y_pred: 1 posterior_c1: 0.99

RA14
Length of phenotype_associated TCRs:  Fisher: 70 t-test: 93
Length of the intersection: 50
y_true: 1 y_pred: 0 po

### New feature: sum of associated TCRs

In [5]:
kf = LeaveOneOut()
for train_index,test_index in kf.split(freq_df):
    train_cv, test_cv = freq_df.iloc[train_index], freq_df.iloc[test_index]

    train = train_cv.drop(['sample_name','phenotype_status'],axis=1)
    test = test_cv.drop(['sample_name','phenotype_status'],axis=1)
    
    asso_TCRs = TCRs_selection(train_cv,TCRs,0.2)
    train['sum_asso_TCRs'] = train[asso_TCRs].sum(axis=1)
    test['sum_asso_TCRs'] = test[asso_TCRs].sum(axis=1)
    
    asso_TCRs = asso_TCRs + ['sum_asso_TCRs']
    
    clf.fit(train[asso_TCRs],train_cv['phenotype_status'])
    test_pred = clf.predict(test[asso_TCRs])[0]
    test_prob = clf.predict_proba(test[asso_TCRs])[:,1][0]

    sample_name.append(test_cv['sample_name'].tolist()[0])
    y_true.append(test_cv['phenotype_status'].values[0])
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    
    print('y_true:',test_cv['phenotype_status'].values[0],'y_pred:',test_pred,'posterior_c1:',test_prob)
    print()
    break

In [13]:
new1 = train_cv.loc[:,asso_TCRs].sum(axis=1)
new2 = train_cv[asso_TCRs].sum(axis=1)
p = pd.DataFrame({'status':train_cv['phenotype_status'],'new_feature':new1})
print(p[p['status']==0]['new_feature'].mean(),p[p['status']==1]['new_feature'].mean())

5.734544729829541e-05 0.0008260088911752312


In [13]:
# Init lists
sample_name = []
y_true = []
y_pred = []
y_proba = [] 

TCRs = freq_df.drop(['sample_name','phenotype_status','total_TCRs'],axis=1).columns.values

kf = LeaveOneOut()
for train_index,test_index in kf.split(freq_df):
    data = freq_df.copy(deep=True)
    train_cv, test_cv = data.iloc[train_index], data.iloc[test_index]
    print(test_cv['sample_name'].tolist()[0])
    
    asso_TCRs = TCRs_selection(train_cv,TCRs,0.2)
    train_cv['sum_asso_TCRs'] = train_cv[asso_TCRs].sum(axis=1)
    test_cv['sum_asso_TCRs'] = test_cv[asso_TCRs].sum(axis=1)
    
    print('Mean of the sum_asso_TCRs: ','Class 0:',train_cv[train_cv['phenotype_status']==0]['sum_asso_TCRs'].mean(),
          'Class 1:',train_cv[train_cv['phenotype_status']==1]['sum_asso_TCRs'].mean())
    print('Sum_asso_TCRs of the testing sample:',test_cv['sum_asso_TCRs'].tolist()[0])
    
    asso_TCRs = asso_TCRs.tolist()+['sum_asso_TCRs']
    
    clf.fit(train_cv[asso_TCRs],train_cv['phenotype_status'])
    test_pred = clf.predict(test_cv[asso_TCRs])[0]
    test_prob = clf.predict_proba(test_cv[asso_TCRs])[:,1][0]

    sample_name.append(test_cv['sample_name'].tolist()[0])
    y_true.append(test_cv['phenotype_status'].values[0])
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    
    print('y_true:',test_cv['phenotype_status'].values[0],'y_pred:',test_pred,'posterior_c1:',test_prob)
    print()

print('cv auroc:',roc_auc_score(y_true,y_proba))
print('cv log_loss:',log_loss(y_true,y_proba))

RA47
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008260088911752312
Sum_asso_TCRs of the testing sample: 0.00023917723032767282
y_true: 1 y_pred: 1 posterior_c1: 0.5223180531116328

HC9
Mean of the sum_asso_TCRs:  Class 0: 4.6174552679196166e-05 Class 1: 0.0004869688251373103
Sum_asso_TCRs of the testing sample: 0.00033110390040394673
y_true: 0 y_pred: 1 posterior_c1: 0.9113932257960655

RA29
Mean of the sum_asso_TCRs:  Class 0: 5.4034408294255926e-05 Class 1: 0.0007951761368263883
Sum_asso_TCRs of the testing sample: 0.00017570832418185812
y_true: 1 y_pred: 1 posterior_c1: 0.5190735652730691

RA8
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.000827469578691439
Sum_asso_TCRs of the testing sample: 0.00015790304752881732
y_true: 1 y_pred: 0 posterior_c1: 0.42130515567756677

RA63
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008266545421992621
Sum_asso_TCRs of the testing sample: 0.00041245617653124357
y_true: 1

y_true: 1 y_pred: 1 posterior_c1: 0.8868330384212736

RA52
Mean of the sum_asso_TCRs:  Class 0: 5.4034408294255926e-05 Class 1: 0.0007341484694501092
Sum_asso_TCRs of the testing sample: 0.00075937832228016
y_true: 1 y_pred: 1 posterior_c1: 0.9708095238095238

RA60
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0007773728428655968
Sum_asso_TCRs of the testing sample: 0.0005345211581291759
y_true: 1 y_pred: 1 posterior_c1: 0.8427535277433857

RA28
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008327432504859127
Sum_asso_TCRs of the testing sample: 0.00046389361373125093
y_true: 1 y_pred: 1 posterior_c1: 0.7386373983411696

RA2
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008352781643693926
Sum_asso_TCRs of the testing sample: 0.00030165912518853697
y_true: 1 y_pred: 1 posterior_c1: 0.5841519834301219

RA74
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.000799027710356233
Sum_asso_TCRs of the

y_true: 1 y_pred: 1 posterior_c1: 0.8887157319608593

HC2
Mean of the sum_asso_TCRs:  Class 0: 6.06545220006922e-05 Class 1: 0.0005769031351501941
Sum_asso_TCRs of the testing sample: 0.00033536789858474745
y_true: 0 y_pred: 1 posterior_c1: 0.994079254079254

RA65
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008042342128106347
Sum_asso_TCRs of the testing sample: 0.0005718097779472029
y_true: 1 y_pred: 1 posterior_c1: 0.9281859416389335

RA75
Mean of the sum_asso_TCRs:  Class 0: 5.4034408294255926e-05 Class 1: 0.000740373247829268
Sum_asso_TCRs of the testing sample: 0.0007725653461521954
y_true: 1 y_pred: 1 posterior_c1: 0.9827777777777778

RA76
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0007644859877473392
Sum_asso_TCRs of the testing sample: 0.00017164435290078955
y_true: 1 y_pred: 1 posterior_c1: 0.5041907178341143

HC16
Mean of the sum_asso_TCRs:  Class 0: 4.965985689397455e-05 Class 1: 0.00047320344199084535
Sum_asso_TCRs of th

In [10]:
# Init lists
sample_name = []
y_true = []
y_pred = []
y_proba = [] 

TCRs = freq_df.drop(['sample_name','phenotype_status','total_TCRs'],axis=1).columns.values

kf = LeaveOneOut()
for train_index,test_index in kf.split(freq_df):
    data = freq_df.copy(deep=True)
    train_cv, test_cv = data.iloc[train_index], data.iloc[test_index]
    print(test_cv['sample_name'].tolist()[0])
    
    asso_TCRs = TCRs_selection(train_cv,TCRs,0.2)
    train_cv['sum_asso_TCRs'] = train_cv[asso_TCRs].sum(axis=1)
    test_cv['sum_asso_TCRs'] = test_cv[asso_TCRs].sum(axis=1)
    
    clf.fit(train_cv[asso_TCRs],train_cv['phenotype_status'])
    test_pred = clf.predict(test_cv[asso_TCRs])[0]
    test_prob = clf.predict_proba(test_cv[asso_TCRs])[:,1][0]

    sample_name.append(test_cv['sample_name'].tolist()[0])
    y_true.append(test_cv['phenotype_status'].values[0])
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    
    print('y_true:',test_cv['phenotype_status'].values[0],'y_pred:',test_pred,'posterior_c1:',test_prob)
    print()

print('cv auroc:',roc_auc_score(y_true,y_proba))
print('cv log_loss:',log_loss(y_true,y_proba))

RA47
y_true: 1 y_pred: 0 posterior_c1: 0.4559243242160052

HC9
y_true: 0 y_pred: 1 posterior_c1: 0.5476924953305642

RA29
y_true: 1 y_pred: 0 posterior_c1: 0.45157156172347307

RA8
y_true: 1 y_pred: 0 posterior_c1: 0.4344244698052342

RA63
y_true: 1 y_pred: 1 posterior_c1: 0.7158866604858796

RA33
y_true: 1 y_pred: 1 posterior_c1: 0.9745454545454545

HC17
y_true: 0 y_pred: 0 posterior_c1: 0.4862684078541365

HC3
y_true: 0 y_pred: 1 posterior_c1: 0.6173276693652499

RA17
y_true: 1 y_pred: 1 posterior_c1: 0.5887854115192951

RA46
y_true: 1 y_pred: 1 posterior_c1: 0.7277714377437766

HC6
y_true: 0 y_pred: 1 posterior_c1: 0.7040144856241488

RA26
y_true: 1 y_pred: 1 posterior_c1: 0.7026920565878476

RA15
y_true: 1 y_pred: 1 posterior_c1: 0.8094983948887419

RA53
y_true: 1 y_pred: 1 posterior_c1: 0.8304122810200124

RA32
y_true: 1 y_pred: 1 posterior_c1: 0.5818559949210795

RA40
y_true: 1 y_pred: 0 posterior_c1: 0.2642936404254288

HC8
y_true: 0 y_pred: 1 posterior_c1: 0.758694520026563

RA