In [1]:
import matplotlib
# Select the non-interactive 'Agg' backend before pyplot is imported.
# NOTE(review): the `%matplotlib inline` magic further down this cell also
# configures a backend, so this call may be redundant — confirm which one is intended.
matplotlib.use('Agg')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Helpers
import sys
sys.path.insert(0,'../../')
from utils import data_path,results_path,grid_search,estimator_result,cross_validate,evaluate_param
from scipy.sparse import csr_matrix,save_npz,load_npz
from sklearn.model_selection import cross_val_score,LeaveOneOut,StratifiedKFold
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import pickle

# Feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFECV,VarianceThreshold

# Algorithm
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Ignore warnings
# NOTE: this installs a blanket 'ignore' filter with no category or module
# restriction, so warnings raised by every library used in later cells are
# hidden as well (including deprecation notices).
import warnings
warnings.filterwarnings('ignore')

In [44]:
# Load the incidences and p-values reported in the paper.
# The file is opened through a context manager so the handle is closed as soon
# as unpickling finishes, instead of staying open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open(data_path+'MAP_estimator/'+'inc_paper.pkl','rb') as inc_paper_file:
    inc_paper = pickle.load(inc_paper_file)
# Preview the first rows of the paper's table.
inc_paper.head()

Unnamed: 0,V family,V gene,V allele,CDR3,J family,J gene,J allele,Incidence in CMV+ subjects,Incidence in CMV- subjects,P-value,HLA restriction,TCR
0,TCRBV09,TCRBV09-01,,CASSGQGAYEQYF,TCRBJ02,TCRBJ02-07,1,61,11,2.16e-13,,"(TCRBV09, TCRBV09-01, null, CASSGQGAYEQYF, TCR..."
1,TCRBV19,TCRBV19-01,,CASSIGPLEHNEQFF,TCRBJ02,TCRBJ02-01,1,30,0,1.75e-11,A1,"(TCRBV19, TCRBV19-01, null, CASSIGPLEHNEQFF, T..."
2,TCRBV05,TCRBV05-01,1.0,CASSPDRVGQETQYF,TCRBJ02,TCRBJ02-05,1,33,1,2.66e-11,,"(TCRBV05, TCRBV05-01, 01, CASSPDRVGQETQYF, TCR..."
3,TCRBV07,TCRBV07-02,1.0,CASSLEAEYEQYF,TCRBJ02,TCRBJ02-07,1,30,1,3.2e-10,B8,"(TCRBV07, TCRBV07-02, 01, CASSLEAEYEQYF, TCRBJ..."
4,TCRBV28,TCRBV28-01,1.0,CASSIEGNQPQHF,TCRBJ01,TCRBJ01-05,1,26,0,5.31e-10,,"(TCRBV28, TCRBV28-01, 01, CASSIEGNQPQHF, TCRBJ..."


In [20]:
# Load the statistics we got: a table with one column per TCR and the rows
# 'CMV+', 'CMV-' and 'p_value' (see the rendered output below).
# NOTE(review): the meaning of the CMV+/CMV- counts (presumably incidence per
# subject group, mirroring the paper's table) is not established in this cell — confirm.
inc = pd.read_pickle(data_path+'inc_p_values.pkl')
# Bare last expression so the notebook renders the frame.
inc

Unnamed: 0_level_0,TCRBV01,TCRBV01,TCRBV01,TCRBV01,TCRBV01,TCRBV01,TCRBV01,TCRBV01,TCRBV01,TCRBV01,...,null,null,null,null,null,null,null,null,null,null
Unnamed: 0_level_1,TCRBV01-01,TCRBV01-01,TCRBV01-01,TCRBV01-01,TCRBV01-01,TCRBV01-01,TCRBV01-01,TCRBV01-01,TCRBV01-01,TCRBV01-01,...,unresolved,unresolved,unresolved,unresolved,unresolved,unresolved,unresolved,unresolved,unresolved,unresolved
Unnamed: 0_level_2,01,01,01,01,01,01,01,01,01,01,...,null,null,null,null,null,null,null,null,null,null
Unnamed: 0_level_3,CTFQETQYF,CTHGGHSLPTPSNQPQHF,CTRGQGGTEAFF,CTSGEQYF,CTSGGLAESTDTQYF,CTSGPSNQPQHF,CTSGPYEQYF,CTSGYEQYF,CTSILTRYNSNQPQHF,CTSNPTEAFF,...,CASSSSGLAGGRSSYNEQFF,CASSSSPGLAGGSSYNEQFF,CASSSTGAGNQPQHF,CASSSYNLRGGGRGGRNEQFF,CASSTGLAGGLSSGANVLTF,CASSTPGLAGGSSSYNEQFF,CASSVGGRGKNTEAFF,CASSYPGLAGGSSSYNEQFF,CASTRTESSYNEQFF,CATGTGDSNQPQHF
Unnamed: 0_level_4,TCRBJ02,TCRBJ01,TCRBJ01,TCRBJ02,TCRBJ02,TCRBJ01,TCRBJ02,TCRBJ02,TCRBJ01,TCRBJ01,...,TCRBJ02,TCRBJ02,TCRBJ01,TCRBJ02,TCRBJ02,TCRBJ02,TCRBJ01,TCRBJ02,TCRBJ02,TCRBJ01
Unnamed: 0_level_5,TCRBJ02-05,TCRBJ01-05,TCRBJ01-01,TCRBJ02-07,TCRBJ02-03,TCRBJ01-05,TCRBJ02-07,TCRBJ02-07,TCRBJ01-05,TCRBJ01-01,...,TCRBJ02-01,TCRBJ02-01,TCRBJ01-05,TCRBJ02-01,TCRBJ02-06,TCRBJ02-01,TCRBJ01-01,TCRBJ02-01,TCRBJ02-01,TCRBJ01-05
Unnamed: 0_level_6,01,01,01,01,01,01,01,01,01,01,...,01,01,01,01,01,01,01,01,01,01
CMV+,2.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0,...,1.0,1.0,2.0,0.0,2.0,1.0,0.0,2.0,0.0,1.0
CMV-,0.0,2.0,2.0,1.0,1.0,0.0,1.0,3.0,2.0,1.0,...,1.0,1.0,0.0,2.0,0.0,1.0,2.0,0.0,2.0,1.0
p_value,0.202886,1.0,1.0,0.69883,0.69883,0.202886,0.69883,1.0,1.0,0.69883,...,0.69883,0.69883,0.202886,1.0,0.202886,0.69883,1.0,0.202886,1.0,0.69883


In [49]:
# Show the 166 TCRs with the smallest p-values.
# Transposing puts one TCR per row; ascending order is the default for
# sort_values, and head(166) keeps the same leading rows as the slice [:166].
inc.T.sort_values('p_value').head(166)

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,CMV+,CMV-,p_value
TCRBV09,TCRBV09-01,,CASSGQGAYEQYF,TCRBJ02,TCRBJ02-07,01,61.0,12.0,7.958014e-13
TCRBV19,TCRBV19-01,,CASSIGPLEHNEQFF,TCRBJ02,TCRBJ02-01,01,30.0,0.0,1.751455e-11
TCRBV05,TCRBV05-01,01,CASSPDRVGQETQYF,TCRBJ02,TCRBJ02-05,01,33.0,1.0,2.664239e-11
TCRBV07,TCRBV07-02,01,CASSLEAEYEQYF,TCRBJ02,TCRBJ02-07,01,30.0,1.0,3.202211e-10
TCRBV28,TCRBV28-01,01,CASSIEGNQPQHF,TCRBJ01,TCRBJ01-05,01,26.0,0.0,5.306200e-10
TCRBV24,unresolved,,CATSDGDEQFF,TCRBJ02,TCRBJ02-01,01,41.0,6.0,5.866204e-10
TCRBV05,TCRBV05-06,01,CASSLVAGGRETQYF,TCRBJ02,TCRBJ02-05,01,50.0,11.0,6.102922e-10
TCRBV07,TCRBV07-06,01,CASSRGRQETQYF,TCRBJ02,TCRBJ02-05,01,37.0,5.0,2.560004e-09
TCRBV09,TCRBV09-01,,CASSAGQGVTYEQYF,TCRBJ02,TCRBJ02-07,01,24.0,0.0,2.882701e-09
TCRBV04,TCRBV04-03,01,CASSPQRNTEAFF,TCRBJ01,TCRBJ01-01,01,29.0,2.0,6.654355e-09


In [21]:
# Collect the TCR identifiers from both sources.
# Our TCRs: column labels of `inc`, ordered by increasing value in the
# 'p_value' row (ascending is the sort_values default).
TCRs_inc = inc.sort_values('p_value', axis=1).columns.values
# Paper TCRs: the 'TCR' column of the paper's table.
TCRs_paper = inc_paper.loc[:, 'TCR']

In [45]:
# Count how many of the paper's TCRs are among our top-164 and top-165
# ranked TCRs; one number is printed per cut-off.
paper_tcr_set = set(TCRs_paper)
for top_n in (164, 165):
    print(len(paper_tcr_set & set(TCRs_inc[:top_n])))

163
164


In [32]:
# TCRs that are in our top 165 but are not listed by the paper.
TCR_diff = list(set(TCRs_inc[:165]).difference(TCRs_paper))
# Display our statistics (CMV+, CMV-, p_value) for those TCRs.
inc[TCR_diff]

Unnamed: 0_level_0,TCRBV07
Unnamed: 0_level_1,TCRBV07-02
Unnamed: 0_level_2,01
Unnamed: 0_level_3,CASSPRTGYEQYF
Unnamed: 0_level_4,TCRBJ02
Unnamed: 0_level_5,TCRBJ02-07
Unnamed: 0_level_6,01
CMV+,29.0
CMV-,9.0
p_value,5.7e-05


In [42]:
# Zero-based position of the first differing TCR within our p-value ranking.
list(TCRs_inc).index(TCR_diff[0])

120