In [5]:
# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Helpers
import sys
from datetime import datetime
from scipy.sparse import csr_matrix,save_npz,load_npz
import pickle
from classifiers import TCRs_selection,TCRs_selection_ttest,t_test

from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve,log_loss
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,cross_validate
from sklearn.metrics import classification_report
from sklearn.model_selection import LeaveOneOut,KFold,StratifiedKFold
from sklearn.decomposition import PCA,KernelPCA
from sklearn.feature_selection import RFE
from scipy.stats import fisher_exact,ttest_ind

from classifiers import MAP_estimator,cal_p_value,TCRs_selection,LOOCV_MAP

# Algorithm
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Relative directory that holds the pickled input tables.
data_path = '../data/'

In [6]:
# Load the two pickled tables from data_path.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
count_df, freq_df = (
    pd.read_pickle(data_path + pickle_name)
    for pickle_name in ('count_df.pkl', 'freq_df.pkl')
)

# Every column of count_df other than the two metadata columns is kept as a
# candidate TCR name.
meta_cols = ['sample_name', 'phenotype_status']
TCRs = count_df.drop(meta_cols, axis=1).columns.values
len(TCRs)

34063

In [3]:
# Random forest used by the cross-validation cells below.
#
# Two arguments of the original call are no longer accepted by current
# scikit-learn and have been adjusted without changing the model:
#   * min_impurity_split=None was the default and the parameter has since been
#     removed, so it is simply not passed any more.
#   * max_features='auto' meant sqrt(n_features) for classifiers; 'sqrt' is the
#     spelling accepted by both old and new releases.
# bootstrap=True is kept explicit because oob_score=True requires it.
clf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=10,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    bootstrap=True,
    oob_score=True,
    class_weight=None,
    n_jobs=-1,        # use all available cores
    random_state=0,   # fixed seed for repeatable forests
    verbose=0,
    warm_start=False,
)

#### Feature selection: Fisher's exact test + RFE

In [None]:
# Leave-one-out evaluation: TCRs_selection on the training fold, then RFE
# (80 features kept, one eliminated per step) wrapped around `clf`.

# Per-fold results, one entry per held-out sample
sample_name = []
y_true = []
y_pred = []
y_proba = []

TCRs = count_df.drop(['sample_name','phenotype_status'],axis=1).columns.values

kf = LeaveOneOut()
for train_index,test_index in kf.split(count_df):
    # Selecting rows by position with index arrays produces new frames, so
    # count_df is never modified and no per-fold deep copy of it is needed.
    train_cv, test_cv = count_df.iloc[train_index], count_df.iloc[test_index]
    held_out = test_cv['sample_name'].tolist()[0]
    truth = test_cv['phenotype_status'].values[0]
    print(held_out)

    # Feature pre-selection only ever sees the training fold.
    # (0.4 is passed straight to TCRs_selection -- presumably its threshold; confirm in classifiers.py)
    asso_TCRs = TCRs_selection(train_cv,TCRs,0.4)
    print('Length of phenotype_associated TCRs: ',len(asso_TCRs))

    # n_features_to_select is keyword-only on current scikit-learn releases.
    rfe = RFE(clf,n_features_to_select=80,step=1).fit(train_cv[asso_TCRs],train_cv['phenotype_status'])
    test_pred = rfe.predict(test_cv[asso_TCRs])[0]
    test_prob = rfe.predict_proba(test_cv[asso_TCRs])[:,1][0]

    sample_name.append(held_out)
    y_true.append(truth)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:',truth,'y_pred:',test_pred,'posterior_c1:',test_prob)
    print()

# Metrics pooled over all held-out samples
print('cv auroc:',roc_auc_score(y_true,y_proba))
print('cv log_loss:',log_loss(y_true,y_proba))

RA47
Length of phenotype_associated TCRs:  529
y_true: 1 y_pred: 1 posterior_c1: 0.5838172669974011

HC9
Length of phenotype_associated TCRs:  542
y_true: 0 y_pred: 1 posterior_c1: 0.6794607302440506

RA29
Length of phenotype_associated TCRs:  510
y_true: 1 y_pred: 1 posterior_c1: 0.7559489179619493

RA8
Length of phenotype_associated TCRs:  546
y_true: 1 y_pred: 0 posterior_c1: 0.4496627883490492

RA63
Length of phenotype_associated TCRs:  540
y_true: 1 y_pred: 1 posterior_c1: 0.6535613331007883

RA33
Length of phenotype_associated TCRs:  527
y_true: 1 y_pred: 1 posterior_c1: 0.8133370464277664

HC17
Length of phenotype_associated TCRs:  569
y_true: 0 y_pred: 1 posterior_c1: 0.6147809732788815

HC3
Length of phenotype_associated TCRs:  553
y_true: 0 y_pred: 1 posterior_c1: 0.5954601009610002

RA17
Length of phenotype_associated TCRs:  549
y_true: 1 y_pred: 1 posterior_c1: 0.5885703431212884

RA46
Length of phenotype_associated TCRs:  508
y_true: 1 y_pred: 1 posterior_c1: 0.85346648086

## Feature selection: intersection of Fisher's exact test and t-test

In [4]:
# Leave-one-out evaluation using only the TCRs picked by BOTH selection
# helpers (TCRs_selection and TCRs_selection_ttest) on the training fold.

# Per-fold results, one entry per held-out sample
sample_name = []
y_true = []
y_pred = []
y_proba = []

TCRs = freq_df.drop(['sample_name','phenotype_status'],axis=1).columns.values

kf = LeaveOneOut()
for train_index,test_index in kf.split(freq_df):
    train_cv, test_cv = freq_df.iloc[train_index], freq_df.iloc[test_index]
    held_out = test_cv['sample_name'].tolist()[0]
    truth = test_cv['phenotype_status'].values[0]
    print(held_out)

    asso_TCRs1 = TCRs_selection(train_cv,TCRs,0.2)
    asso_TCRs2 = TCRs_selection_ttest(train_cv,TCRs,{'threshold':0.02})
    print('Length of phenotype_associated TCRs: ','Fisher:',len(asso_TCRs1),'t-test:',len(asso_TCRs2))

    # Sort the intersection: iterating a set of strings has no stable order
    # across Python sessions, and the forest's per-split feature sampling
    # depends on column order, so an unsorted list makes results
    # irreproducible even with a fixed random_state.
    asso_TCRs = sorted(set(asso_TCRs1) & set(asso_TCRs2))
    print('Length of the intersection:',len(asso_TCRs))

    clf.fit(train_cv[asso_TCRs],train_cv['phenotype_status'])
    test_pred = clf.predict(test_cv[asso_TCRs])[0]
    test_prob = clf.predict_proba(test_cv[asso_TCRs])[:,1][0]

    sample_name.append(held_out)
    y_true.append(truth)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:',truth,'y_pred:',test_pred,'posterior_c1:',test_prob)
    print()

# Metrics pooled over all held-out samples
print('cv auroc:',roc_auc_score(y_true,y_proba))
print('cv log_loss:',log_loss(y_true,y_proba))

RA47
Length of phenotype_associated TCRs:  Fisher: 75 t-test: 91
Length of the intersection: 53
y_true: 1 y_pred: 0 posterior_c1: 0.4982552359321503

HC9
Length of phenotype_associated TCRs:  Fisher: 33 t-test: 101
Length of the intersection: 27
y_true: 0 y_pred: 1 posterior_c1: 0.6811990396805713

RA29
Length of phenotype_associated TCRs:  Fisher: 73 t-test: 90
Length of the intersection: 52
y_true: 1 y_pred: 1 posterior_c1: 0.5315181400379161

RA8
Length of phenotype_associated TCRs:  Fisher: 78 t-test: 100
Length of the intersection: 57
y_true: 1 y_pred: 1 posterior_c1: 0.5763742892055475

RA63
Length of phenotype_associated TCRs:  Fisher: 78 t-test: 100
Length of the intersection: 57
y_true: 1 y_pred: 1 posterior_c1: 0.8197531693055179

RA33
Length of phenotype_associated TCRs:  Fisher: 74 t-test: 98
Length of the intersection: 54
y_true: 1 y_pred: 1 posterior_c1: 0.9284375

HC17
Length of phenotype_associated TCRs:  Fisher: 35 t-test: 107
Length of the intersection: 29
y_true: 0 y

### New feature: sum of associated TCRs

In [5]:
# Single-fold check of the extra 'sum_asso_TCRs' feature: the loop stops after
# the first leave-one-out split (see the `break`), so this cell only prints one
# prediction. It deliberately does not append to sample_name / y_true /
# y_pred / y_proba, which at this point still hold the previous experiment's
# complete results.
# Relies on `TCRs` and `clf` defined by earlier cells.
kf = LeaveOneOut()
for train_index,test_index in kf.split(freq_df):
    train_cv, test_cv = freq_df.iloc[train_index], freq_df.iloc[test_index]

    # Feature-only views of the fold (metadata columns removed)
    train = train_cv.drop(['sample_name','phenotype_status'],axis=1)
    test = test_cv.drop(['sample_name','phenotype_status'],axis=1)

    # list(...) accepts either a list or an array from the helper, so the
    # concatenation below always appends one column name instead of being
    # applied element-wise.
    asso_TCRs = list(TCRs_selection(train_cv,TCRs,0.2))
    train['sum_asso_TCRs'] = train[asso_TCRs].sum(axis=1)
    test['sum_asso_TCRs'] = test[asso_TCRs].sum(axis=1)

    # Keep asso_TCRs limited to real TCR columns so it can still be used to
    # index train_cv afterwards; the model input list gets its own name.
    feature_cols = asso_TCRs + ['sum_asso_TCRs']

    clf.fit(train[feature_cols],train_cv['phenotype_status'])
    test_pred = clf.predict(test[feature_cols])[0]
    test_prob = clf.predict_proba(test[feature_cols])[:,1][0]

    print('y_true:',test_cv['phenotype_status'].values[0],'y_pred:',test_pred,'posterior_c1:',test_prob)
    print()
    break

In [13]:
# Row-wise sum over the selected TCR columns of the current training fold,
# computed with two equivalent indexing forms.
new1 = train_cv.loc[:, asso_TCRs].sum(axis=1)
new2 = train_cv[asso_TCRs].sum(axis=1)

# Pair the summed value with the class label and print the per-class means
# (class 0 first, then class 1).
p = pd.DataFrame({'status': train_cv['phenotype_status'], 'new_feature': new1})
print(
    p.loc[p['status'] == 0, 'new_feature'].mean(),
    p.loc[p['status'] == 1, 'new_feature'].mean(),
)

5.734544729829541e-05 0.0008260088911752312


In [13]:
# Leave-one-out evaluation with the selected TCRs PLUS their row-wise sum
# ('sum_asso_TCRs') as an additional feature.

# Per-fold results, one entry per held-out sample
sample_name = []
y_true = []
y_pred = []
y_proba = []

# Candidate TCR columns = everything except metadata / helper columns.
# errors='ignore' lets the cell run whether or not the helper columns are
# present in freq_df, and listing 'sum_asso_TCRs' keeps a previously stored
# aggregate from being offered to the selection step as if it were a TCR.
TCRs = freq_df.drop(['sample_name','phenotype_status','total_TCRs','sum_asso_TCRs'],
                    axis=1,errors='ignore').columns.values

kf = LeaveOneOut()
for train_index,test_index in kf.split(freq_df):
    # Row selection by position returns new frames; freq_df stays untouched
    # without deep-copying it on every fold.
    train_cv, test_cv = freq_df.iloc[train_index], freq_df.iloc[test_index]
    held_out = test_cv['sample_name'].tolist()[0]
    truth = test_cv['phenotype_status'].values[0]
    print(held_out)

    asso_TCRs = TCRs_selection(train_cv,TCRs,0.2)

    # Build the aggregate inside the fold from the TCRs selected on the
    # training data, so the cell does not depend on a column created elsewhere.
    train_cv = train_cv.assign(sum_asso_TCRs=train_cv[asso_TCRs].sum(axis=1))
    test_cv = test_cv.assign(sum_asso_TCRs=test_cv[asso_TCRs].sum(axis=1))

    print('Mean of the sum_asso_TCRs: ','Class 0:',train_cv[train_cv['phenotype_status']==0]['sum_asso_TCRs'].mean(),
          'Class 1:',train_cv[train_cv['phenotype_status']==1]['sum_asso_TCRs'].mean())
    print('Sum_asso_TCRs of the testing sample:',test_cv['sum_asso_TCRs'].tolist()[0])

    # list(...) works for both list and array return values of the helper.
    asso_TCRs = list(asso_TCRs)+['sum_asso_TCRs']

    clf.fit(train_cv[asso_TCRs],train_cv['phenotype_status'])
    test_pred = clf.predict(test_cv[asso_TCRs])[0]
    test_prob = clf.predict_proba(test_cv[asso_TCRs])[:,1][0]

    sample_name.append(held_out)
    y_true.append(truth)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:',truth,'y_pred:',test_pred,'posterior_c1:',test_prob)
    print()

# Metrics pooled over all held-out samples
print('cv auroc:',roc_auc_score(y_true,y_proba))
print('cv log_loss:',log_loss(y_true,y_proba))

RA47
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008260088911752312
Sum_asso_TCRs of the testing sample: 0.00023917723032767282
y_true: 1 y_pred: 1 posterior_c1: 0.5223180531116328

HC9
Mean of the sum_asso_TCRs:  Class 0: 4.6174552679196166e-05 Class 1: 0.0004869688251373103
Sum_asso_TCRs of the testing sample: 0.00033110390040394673
y_true: 0 y_pred: 1 posterior_c1: 0.9113932257960655

RA29
Mean of the sum_asso_TCRs:  Class 0: 5.4034408294255926e-05 Class 1: 0.0007951761368263883
Sum_asso_TCRs of the testing sample: 0.00017570832418185812
y_true: 1 y_pred: 1 posterior_c1: 0.5190735652730691

RA8
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.000827469578691439
Sum_asso_TCRs of the testing sample: 0.00015790304752881732
y_true: 1 y_pred: 0 posterior_c1: 0.42130515567756677

RA63
Mean of the sum_asso_TCRs:  Class 0: 5.734544729829541e-05 Class 1: 0.0008266545421992621
Sum_asso_TCRs of the testing sample: 0.00041245617653124357
y_true: 1

In [10]:
# Leave-one-out evaluation on the selected TCR columns. The row-wise sum
# 'sum_asso_TCRs' is attached to both folds but, as in the original cell, is
# NOT part of the columns given to the classifier.
# NOTE(review): if the aggregate was meant to be a model input here, it has to
# be added to the column list used for fit/predict -- confirm the intent.

# Per-fold results, one entry per held-out sample
sample_name = []
y_true = []
y_pred = []
y_proba = []

# errors='ignore' lets the cell run whether or not the helper columns exist in
# freq_df; 'sum_asso_TCRs' is listed so a stored aggregate is never treated as
# a candidate TCR.
TCRs = freq_df.drop(['sample_name','phenotype_status','total_TCRs','sum_asso_TCRs'],
                    axis=1,errors='ignore').columns.values

kf = LeaveOneOut()
for train_index,test_index in kf.split(freq_df):
    # Row selection by position returns new frames, so the full-frame deep copy
    # that used to run on every fold is unnecessary.
    train_cv, test_cv = freq_df.iloc[train_index], freq_df.iloc[test_index]
    held_out = test_cv['sample_name'].tolist()[0]
    truth = test_cv['phenotype_status'].values[0]
    print(held_out)

    asso_TCRs = TCRs_selection(train_cv,TCRs,0.2)

    # .assign returns new frames instead of writing a column into a slice.
    train_cv = train_cv.assign(sum_asso_TCRs=train_cv[asso_TCRs].sum(axis=1))
    test_cv = test_cv.assign(sum_asso_TCRs=test_cv[asso_TCRs].sum(axis=1))

    clf.fit(train_cv[asso_TCRs],train_cv['phenotype_status'])
    test_pred = clf.predict(test_cv[asso_TCRs])[0]
    test_prob = clf.predict_proba(test_cv[asso_TCRs])[:,1][0]

    sample_name.append(held_out)
    y_true.append(truth)
    y_pred.append(test_pred)
    y_proba.append(test_prob)

    print('y_true:',truth,'y_pred:',test_pred,'posterior_c1:',test_prob)
    print()

# Metrics pooled over all held-out samples
print('cv auroc:',roc_auc_score(y_true,y_proba))
print('cv log_loss:',log_loss(y_true,y_proba))

RA47
y_true: 1 y_pred: 0 posterior_c1: 0.4559243242160052

HC9
y_true: 0 y_pred: 1 posterior_c1: 0.5476924953305642

RA29
y_true: 1 y_pred: 0 posterior_c1: 0.45157156172347307

RA8
y_true: 1 y_pred: 0 posterior_c1: 0.4344244698052342

RA63
y_true: 1 y_pred: 1 posterior_c1: 0.7158866604858796

RA33
y_true: 1 y_pred: 1 posterior_c1: 0.9745454545454545

HC17
y_true: 0 y_pred: 0 posterior_c1: 0.4862684078541365

HC3
y_true: 0 y_pred: 1 posterior_c1: 0.6173276693652499

RA17
y_true: 1 y_pred: 1 posterior_c1: 0.5887854115192951

RA46
y_true: 1 y_pred: 1 posterior_c1: 0.7277714377437766

HC6
y_true: 0 y_pred: 1 posterior_c1: 0.7040144856241488

RA26
y_true: 1 y_pred: 1 posterior_c1: 0.7026920565878476

RA15
y_true: 1 y_pred: 1 posterior_c1: 0.8094983948887419

RA53
y_true: 1 y_pred: 1 posterior_c1: 0.8304122810200124

RA32
y_true: 1 y_pred: 1 posterior_c1: 0.5818559949210795

RA40
y_true: 1 y_pred: 0 posterior_c1: 0.2642936404254288

HC8
y_true: 0 y_pred: 1 posterior_c1: 0.758694520026563

RA