In [1]:
# Handle table-like data and matrices
import numpy as np
import pandas as pd
from sympy import *
from scipy.stats import fisher_exact,uniform
# Helpers
import os
import sys
sys.path.insert(0,'../')
from scipy.special import digamma,betaln
import time
from scipy.optimize import minimize
# Prediction
from classifiers import MAP_estimator,cal_p_value,TCRs_selection,LOOCV_MAP

from numpy import random
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score,confusion_matrix,log_loss
from sklearn.model_selection import LeaveOneOut,KFold,StratifiedKFold

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

# import plotly
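# plotly.tools.set_credentials_file(username=os.environ['PLOTLY_USERNAME'], api_key=os.environ['PLOTLY_API_KEY'])  # credentials redacted: read them from the environment, never commit them (rotate the previously committed key)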
# import plotly.plotly as py
# import plotly.graph_objs as go

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Per-sample training table; later cells read its 'sample_name', 'phenotype_status' and 'unique_TCRs' columns.
train_origin = pd.read_csv('../data/'+'train.csv')
# Per-sample count table; later cells use 'sample_name' and 'phenotype_status' and treat every other column as a TCR candidate.
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
count_df = pd.read_pickle('../data/'+'count_df.pkl')

In [3]:
def objective(priors,n,k):
    '''
    Evaluate the beta-prior objective for the samples of one class:

        sum_i betaln(k[i] + a, n[i] - k[i] + b)  -  len(n) * betaln(a, b)

    Args:
        priors - pair [a, b], the parameters of the beta prior
        n - per-sample number of unique_TCRs
        k - per-sample number of phenotype_associated_TCRs

        NB: n and k cover the samples of a single class only, not the
        complete training set
    Returns:
        the objective value, in whatever shape betaln produces for a and b
    '''
    alpha, beta = priors[0], priors[1]
    n_samples = len(n)

    # Data term: one log-beta value per sample, accumulated in sample order
    data_term = sum(
        betaln(k[idx] + alpha, n[idx] - k[idx] + beta)
        for idx in range(n_samples)
    )

    # Normalisation term: each sample contributes the same -betaln(a, b)
    return (-n_samples * betaln(alpha, beta)) + data_term
    
def _grid_search_prior(n, k, a_exp_range, b_exp_range, grid_num):
    '''
    Return the [a, b] point of a log-spaced grid that maximises objective().

    Args:
        n, k - per-sample lists of one class, passed straight to objective()
        a_exp_range - (min, max) base-10 exponents spanned by the a axis
        b_exp_range - (min, max) base-10 exponents spanned by the b axis
        grid_num - number of grid points along each axis
    '''
    a_grid, b_grid = np.meshgrid(np.logspace(a_exp_range[0], a_exp_range[1], grid_num),
                                 np.logspace(b_exp_range[0], b_exp_range[1], grid_num))
    obj = objective([a_grid, b_grid], n, k) # evaluated on the whole grid in one call
    row, col = np.unravel_index(obj.argmax(), obj.shape) # flat argmax -> 2-D grid position
    return [a_grid[row][col], b_grid[row][col]]


def prior_init(train_df, a_exp_range=(0.1, 1.5), b_exp_range=(4, 6.5), grid_num=50):
    '''
    Pick starting beta priors for both classes by a coarse grid search.

    For each class (phenotype_status 0, then 1) the objective() value is
    evaluated on a log-spaced (a, b) grid and the grid point with the largest
    value is returned.

    Args:
        train_df - DataFrame with 'phenotype_status', 'unique_TCRs' and
                   'phenotype_associated_TCRs' columns
        a_exp_range - (min, max) base-10 exponents of the grid over a
        b_exp_range - (min, max) base-10 exponents of the grid over b
        grid_num - number of grid points per axis
    Returns:
        [prior_c0_init, prior_c1_init], each an [a, b] pair
    '''
    priors = []
    for label in (0, 1): # negative class first, then positive class
        class_df = train_df[train_df['phenotype_status']==label]
        n = class_df['unique_TCRs'].tolist()
        k = class_df['phenotype_associated_TCRs'].tolist()
        priors.append(_grid_search_prior(n, k, a_exp_range, b_exp_range, grid_num))

    return priors

In [4]:
 def MAP_predict(train,test,prior_c0,prior_c1):
        '''
        Fit a MAP_estimator on `train`, starting from the given priors, and
        predict `test`.

        Prints the starting priors and the priors reported by MAP.priors()
        after fitting, together with the objective() value of each, per class.

        Args:
            train - training DataFrame with 'phenotype_status', 'unique_TCRs'
                    and 'phenotype_associated_TCRs' columns
            test - DataFrame of testing sample(s); only the first prediction
                   is returned
            prior_c0 - [a, b] starting prior of the negative class (status 0)
            prior_c1 - [a, b] starting prior of the positive class (status 1)
        Returns:
            (y_pred, y_proba) - the first element of MAP.predict(test) and of
            MAP.predict_proba_c1(test)
        '''
        status = train['phenotype_status']

        # Per-class statistics; only used to report objective values below
        negatives = train[status==0]
        n_c0, k_c0 = negatives['unique_TCRs'].tolist(), negatives['phenotype_associated_TCRs'].tolist()
        positives = train[status==1]
        n_c1, k_c1 = positives['unique_TCRs'].tolist(), positives['phenotype_associated_TCRs'].tolist()

        # Report the starting point
        print('prior init:',[prior_c0,prior_c1])
        print('prior init obj:',objective(prior_c0,n_c0,k_c0),objective(prior_c1,n_c1,k_c1))

        # Fit on the training set, then report where the priors ended up
        estimator = MAP_estimator(prior_c0,prior_c1)
        estimator.fit(train)
        fitted_priors = estimator.priors()
        print('After opt,priors:',fitted_priors)
        print('After opt, prior objs:',objective(fitted_priors[0],n_c0,k_c0),objective(fitted_priors[1],n_c1,k_c1))

        # Predicted label first, then the positive-class posterior probability
        return estimator.predict(test)[0], estimator.predict_proba_c1(test)[0]

In [5]:
# Fixed reference priors: [a, b] for class 0, then [a, b] for class 1.
# The LOOCV cell passes them to MAP_predict as a baseline starting point.
# NOTE(review): the provenance of these values is not recorded in this notebook.
ref = [
    [2.05, 20217.54],
    [18.66, 12616.69],
]

In [8]:
verbose = True # NOTE(review): not read anywhere in this cell; kept in case other cells use it
sample_name = []
y_true = []
y_pred = []
y_proba = [] # positive-class posterior probability, rounded to 3 d.p. (reporting only)
y_proba_raw = [] # the same probabilities, unrounded; the CV metrics below are computed from these
# y_proba_v2 = [] # used to test the method predict_proba

ref_proba_l = [] # unrounded positive-class probabilities obtained with the reference priors
TCRs = count_df.drop(['sample_name','phenotype_status'],axis=1).columns.values # a list of TCR candidates


def _count_associated_TCRs(count_part, samples, TCRs_asso):
    '''
    For every sample name, count the non-zero entries among the selected TCR columns.

    Args:
        count_part - count DataFrame with a 'sample_name' column and the TCRs_asso columns
        samples - list of sample names, in the order the counts are wanted
        TCRs_asso - the selected TCR column labels
    Returns:
        list of counts, one per entry of `samples`
    '''
    counts = []
    for name in samples:
        sample_counts = count_part.loc[count_part.sample_name==name] # count rows of this sample
        counts.append(np.count_nonzero(sample_counts[TCRs_asso].values))
    return counts


kf = LeaveOneOut()
train = train_origin.copy(deep=True) # one working copy is enough: each fold below is copied again
for train_index,test_index in kf.split(train_origin): # for each cv round

    # Explicit copies, so that adding a column never writes to a slice of `train`
    train_cv, test_cv = train.iloc[train_index].copy(), train.iloc[test_index].copy()

    # Select a list of associated TCRs based on count df of training samples and threshold
    count_train = count_df[count_df['sample_name'].isin(train_cv['sample_name'])] # count df of training samples
    count_test = count_df[count_df['sample_name'].isin(test_cv['sample_name'])] # count df of the testing sample

    TCRs_asso = TCRs_selection(count_train,TCRs,0.2) # selection sees the training fold only

    # Statistics: number of phenotype_associated_TCRs of each sample
    # training set
    train_sample = train_cv['sample_name'].tolist()
    train_asso = _count_associated_TCRs(count_train, train_sample, TCRs_asso)
    train_cv['phenotype_associated_TCRs'] = train_asso

    # testing set (in LOOCV, only one sample)
    test_sample = test_cv['sample_name'].tolist()
    test_asso = _count_associated_TCRs(count_test, test_sample, TCRs_asso)
    test_cv['phenotype_associated_TCRs'] = test_asso

    # Train the estimator from grid-searched priors, predict the testing sample
    priors_init_value = prior_init(train_cv)
    prior_c0 = priors_init_value[0]
    prior_c1 = priors_init_value[1]

    test_pred, test_proba = MAP_predict(train_cv,test_cv,prior_c0,prior_c1)

    # append results to lists
    sample_name.append(test_cv['sample_name'].tolist()[0])
    y_true.append(test_cv['phenotype_status'].tolist()[0])
    y_pred.append(test_pred)
    y_proba.append(round(test_proba,3)) # rounded copy, for reporting
    # Unrounded copy for scoring: rounding creates ties and can turn a
    # probability into exactly 0 or 1, and the reference run is not rounded.
    y_proba_raw.append(test_proba)
    # y_proba_v2.append(round(test_proba_v2,3))

    # Results of this round
    print('y_true:',test_cv['phenotype_status'].tolist()[0],' y_pred:',test_pred,
        ' y_proba_c1: %.3f'%test_proba)

    print('\nReference:')

    # Same fold again, this time starting from the fixed reference priors
    ref_c0 = ref[0]
    ref_c1 = ref[1]
    ref_pred,ref_proba = MAP_predict(train_cv,test_cv,ref_c0,ref_c1)
    ref_proba_l.append(ref_proba)
    print('y_true:',test_cv['phenotype_status'].tolist()[0],' y_pred:',ref_pred,
        ' y_proba_c1: %.3f'%ref_proba)
    print('*'*100)

# Both runs are scored on unrounded probabilities so the comparison is like for like
print('cv auroc:',roc_auc_score(y_true,y_proba_raw))
print('cv log_loss:',log_loss(y_true,y_proba_raw))

print('\nReference:')
print('cv auroc:',roc_auc_score(y_true,ref_proba_l))
print('cv log_loss:',log_loss(y_true,ref_proba_l))
print()

prior init: [[1.5336077187700117, 40949.15062380427], [25.95886586126395, 40949.15062380427]]
prior init obj: -166.8989436897682 -4409.619491830817
After opt,priors: [array([1.52889130e+00, 4.09491506e+04]), array([2.62335318e+01, 4.09490295e+04])]
After opt, prior objs: -166.89889658044558 -4409.59847316402
y_true: 1  y_pred: 1  y_proba_c1: 0.874

Reference:
prior init: [[2.05, 20217.54], [18.66, 12616.69]]
prior init obj: -172.78902336815372 -4531.616002201263
After opt,priors: [array([8.09625828e-01, 2.02175400e+04]), array([8.39553565e+00, 1.26166663e+04])]
After opt, prior objs: -167.11672014329815 -4411.208596981873
y_true: 1  y_pred: 1  y_proba_c1: 0.871
****************************************************************************************************
prior init: [[1.2589254117941673, 51794.74679231213], [31.622776601683793, 93193.95762340775]]
prior init obj: -103.40452490956523 -2566.654057172942
After opt,priors: [array([1.11285560e+00, 5.17947467e+04]), array([3.16227665e+

prior init: [[1.5336077187700117, 40949.15062380427], [22.758459260747887, 36410.31949310677]]
prior init obj: -166.8989436897682 -4368.317717176571
After opt,priors: [array([1.52889130e+00, 4.09491506e+04]), array([2.29892079e+01, 3.64102112e+04])]
After opt, prior objs: -166.89889658044558 -4368.299901429797
y_true: 1  y_pred: 1  y_proba_c1: 0.987

Reference:
prior init: [[2.05, 20217.54], [18.66, 12616.69]]
prior init obj: -172.78902336815372 -4494.54225064769
After opt,priors: [array([8.09625828e-01, 2.02175400e+04]), array([8.25455753e+00, 1.26166658e+04])]
After opt, prior objs: -167.11672014329815 -4369.656631351041
y_true: 1  y_pred: 1  y_proba_c1: 0.975
****************************************************************************************************
prior init: [[1.5336077187700117, 40949.15062380427], [22.758459260747887, 36410.31949310677]]
prior init obj: -166.8989436897682 -4314.028246618924
After opt,priors: [array([1.52889130e+00, 4.09491506e+04]), array([2.28053261e+

prior init: [[1.5336077187700117, 40949.15062380427], [27.724079967417744, 46053.78255822417]]
prior init obj: -166.8989436897682 -4267.894113936578
After opt,priors: [array([1.52889130e+00, 4.09491506e+04]), array([2.79523000e+01, 4.60536595e+04])]
After opt, prior objs: -166.89889658044558 -4267.881562988157
y_true: 1  y_pred: 1  y_proba_c1: 1.000

Reference:
prior init: [[2.05, 20217.54], [18.66, 12616.69]]
prior init obj: -172.78902336815372 -4404.185652614411
After opt,priors: [array([8.09625828e-01, 2.02175400e+04]), array([7.95975994e+00, 1.26166677e+04])]
After opt, prior objs: -167.11672014329815 -4269.840480503466
y_true: 1  y_pred: 1  y_proba_c1: 1.000
****************************************************************************************************
prior init: [[1.5336077187700117, 40949.15062380427], [22.758459260747887, 36410.31949310677]]
prior init obj: -166.8989436897682 -4316.49860408355
After opt,priors: [array([1.52889130e+00, 4.09491506e+04]), array([2.26420944e+

prior init: [[1.2589254117941673, 36410.31949310677], [16.378937069540648, 28786.155923545713]]
prior init obj: -156.32420255732723 -3961.2534018288134
After opt,priors: [array([1.22684685e+00, 3.64103194e+04]), array([1.65198750e+01, 2.87861389e+04])]
After opt, prior objs: -156.32132043957245 -3961.242154253414
y_true: 1  y_pred: 1  y_proba_c1: 0.950

Reference:
prior init: [[2.05, 20217.54], [18.66, 12616.69]]
prior init obj: -163.04023632145254 -4111.355449169059
After opt,priors: [array([7.19087469e-01, 2.02175401e+04]), array([7.47428516e+00, 1.26166702e+04])]
After opt, prior objs: -156.38097173860297 -3962.4843814124615
y_true: 1  y_pred: 1  y_proba_c1: 0.934
****************************************************************************************************
prior init: [[1.2589254117941673, 58251.36712468927], [29.60932939627084, 82864.27728546843]]
prior init obj: -103.63512666162569 -2638.3636196429143
After opt,priors: [array([1.22414327e+00, 5.82513670e+04]), array([2.9127

prior init: [[1.5336077187700117, 40949.15062380427], [19.952623149688797, 32374.575428176468]]
prior init obj: -166.8989436897682 -4247.232377416978
After opt,priors: [array([1.52889130e+00, 4.09491506e+04]), array([1.98910943e+01, 3.23745651e+04])]
After opt, prior objs: -166.89889658044558 -4247.230853267654
y_true: 1  y_pred: 1  y_proba_c1: 1.000

Reference:
prior init: [[2.05, 20217.54], [18.66, 12616.69]]
prior init obj: -172.78902336815372 -4380.546154034775
After opt,priors: [array([8.09625828e-01, 2.02175400e+04]), array([8.00945120e+00, 1.26166673e+04])]
After opt, prior objs: -167.11672014329815 -4248.747752467112
y_true: 1  y_pred: 1  y_proba_c1: 0.998
****************************************************************************************************
prior init: [[1.5336077187700117, 40949.15062380427], [16.378937069540648, 25595.479226995332]]
prior init obj: -166.8989436897682 -4439.5870508138905
After opt,priors: [array([1.52889130e+00, 4.09491506e+04]), array([1.637412

prior init: [[1.5336077187700117, 40949.15062380427], [18.682223847710375, 32374.575428176468]]
prior init obj: -166.8989436897682 -4012.8416748907766
After opt,priors: [array([1.52889130e+00, 4.09491506e+04]), array([1.88677880e+01, 3.23745065e+04])]
After opt, prior objs: -166.89889658044558 -4012.8262910136255
y_true: 1  y_pred: 1  y_proba_c1: 0.999

Reference:
prior init: [[2.05, 20217.54], [18.66, 12616.69]]
prior init obj: -172.78902336815372 -4158.250525175012
After opt,priors: [array([8.09625828e-01, 2.02175400e+04]), array([7.62706966e+00, 1.26166695e+04])]
After opt, prior objs: -167.11672014329815 -4014.2073216588324
y_true: 1  y_pred: 1  y_proba_c1: 0.996
****************************************************************************************************
prior init: [[1.2589254117941673, 36410.31949310677], [22.758459260747887, 36410.31949310677]]
prior init obj: -156.32420255732723 -4283.174339151708
After opt,priors: [array([1.22684685e+00, 3.64103194e+04]), array([2.2654

prior init: [[1.5336077187700117, 40949.15062380427], [24.30604433384409, 40949.15062380427]]
prior init obj: -166.8989436897682 -4091.8488185824244
After opt,priors: [array([1.52889130e+00, 4.09491506e+04]), array([2.45174219e+01, 4.09490408e+04])]
After opt, prior objs: -166.89889658044558 -4091.835441626492
y_true: 1  y_pred: 1  y_proba_c1: 0.998

Reference:
prior init: [[2.05, 20217.54], [18.66, 12616.69]]
prior init obj: -172.78902336815372 -4229.097948608527
After opt,priors: [array([8.09625828e-01, 2.02175400e+04]), array([7.87148155e+00, 1.26166681e+04])]
After opt, prior objs: -167.11672014329815 -4093.345459246324
y_true: 1  y_pred: 1  y_proba_c1: 0.992
****************************************************************************************************
prior init: [[1.5336077187700117, 40949.15062380427], [22.758459260747887, 36410.31949310677]]
prior init obj: -166.8989436897682 -4332.883336872794
After opt,priors: [array([1.52889130e+00, 4.09491506e+04]), array([2.27094030e

prior init: [[1.2589254117941673, 28786.155923545713], [29.60932939627084, 65512.85568595509]]
prior init obj: -163.49867277487647 -3351.4195544170216
After opt,priors: [array([1.13140158e+00, 2.87861558e+04]), array([3.01720205e+01, 6.55127046e+04])]
After opt, prior objs: -163.44805739592994 -3351.3651365202386
y_true: 0  y_pred: 1  y_proba_c1: 0.990

Reference:
prior init: [[2.05, 20217.54], [18.66, 12616.69]]
prior init obj: -168.25135818126728 -3558.605257878924
After opt,priors: [array([8.18563975e-01, 2.02175399e+04]), array([   13.66000171, 12616.69413033])]
After opt, prior objs: -163.28774154552957 -3445.176559011903
y_true: 0  y_pred: 0  y_proba_c1: 0.335
****************************************************************************************************
prior init: [[1.5336077187700117, 40949.15062380427], [25.95886586126395, 46053.78255822417]]
prior init obj: -166.8989436897682 -3826.624077100074
After opt,priors: [array([1.52889130e+00, 4.09491506e+04]), array([2.5778361