In [1]:
# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Helpers
from helpers import *
import sys
import pickle

# model selection
from sklearn.model_selection import LeaveOneOut,KFold,StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,roc_auc_score,log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Algorithm
from MAP_estimator import MAP_estimator
import lightgbm as lgb

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
def fit_evaluate_model_lgbm(X_train, Y_train, X_valid, Y_valid, train_data_columns, importance=False):
    
    X_trainDF = pd.DataFrame(X_train, columns=train_data_columns)
    X_validDF = pd.DataFrame(X_valid, columns=train_data_columns)
    train_dataset = lgb.Dataset(X_trainDF, Y_train.reshape(Y_train.shape[0]))
    test_dataset = lgb.Dataset(X_validDF, Y_valid.reshape(Y_valid.shape[0]))
    
    # Fit
    params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting': 'gbdt'
    }
    gbm = lgb.train(params, train_dataset)
    # Evaluate
    predict_y_proba_gbm = gbm.predict(X_valid, num_iteration=gbm.best_iteration) # Proba of class 1
    predict_y_gbm = np.where(predict_y_proba_gbm.reshape((predict_y_proba_gbm.shape[0])) > 0.5, 1, 0)

    
    if (importance == True):
        ax = lgb.plot_importance(gbm, max_num_features=20, figsize=(16, 5))
        plt.show()
    
    # gbmDF = pd.DataFrame([tuple(gbm.feature_importance())], columns= gbm.feature_name())
    # gbmDF.sort_index(axis=1, inplace=True)
    gbmDF = pd.DataFrame({'TCR': gbm.feature_name(),'Feature_Importance':gbm.feature_imporatance()})
    
    return score, gbmDF, gbm

In [2]:
clf = lgb.LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                        learning_rate=0.1, max_depth=-1, min_child_samples=20,
                        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
                        n_jobs=-1, num_leaves=31, objective='binary', random_state=0,
                        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
                        subsample_for_bin=200000, subsample_freq=1)
LOO = LeaveOneOut()
version = ['count','bin','freq']

In [4]:
for v in version:
    print('Version:',v)
    data = pd.read_pickle('../data/' + v + '_df.pkl')
    LOO_CV(clf,data,v,verbose=False)

Version: count
loocv auroc: 0.6569230769230768
loocv accuracy: 0.6941176470588235

Version: bin
loocv auroc: 0.6715384615384616
loocv accuracy: 0.6941176470588235

Version: freq
loocv auroc: 0.6561538461538462
loocv accuracy: 0.6941176470588235



In [5]:
for v in version:
    print('Version:',v)
    data = pd.read_pickle('../data/' + v + '_df.pkl')
    if v == 'freq':
        drop_cols = ['sample_name', 'phenotype_status','total_TCRs']
    else:
        drop_cols = ['sample_name', 'phenotype_status']
    X = data.drop(drop_cols, axis=1).values
    y = data['phenotype_status']
    print('The number of correctly classified samples:',
          np.count_nonzero(cross_val_score(clf,X,y,cv=LOO,scoring='accuracy')))

Version: count
The number of correctly classified samples: 59
Version: bin
The number of correctly classified samples: 59
Version: freq
The number of correctly classified samples: 59


In [3]:
version = ['bin']
for v in version:
    print('Version:',v)
    data = pd.read_pickle('../data/' + v + '_df.pkl')
    LOO_CV(clf,data,v,verbose=True)

Version: bin
RA47
y_true: 1 y_pred: 1 posterior_c1: 0.599

HC9
y_true: 0 y_pred: 1 posterior_c1: 0.716

RA29
y_true: 1 y_pred: 1 posterior_c1: 0.929

RA8
y_true: 1 y_pred: 1 posterior_c1: 0.987

RA63
y_true: 1 y_pred: 1 posterior_c1: 0.977

RA33
y_true: 1 y_pred: 1 posterior_c1: 0.988

HC17
y_true: 0 y_pred: 0 posterior_c1: 0.426

HC3
y_true: 0 y_pred: 1 posterior_c1: 0.895

RA17
y_true: 1 y_pred: 1 posterior_c1: 0.935

RA46
y_true: 1 y_pred: 0 posterior_c1: 0.432

HC6
y_true: 0 y_pred: 0 posterior_c1: 0.289

RA26
y_true: 1 y_pred: 0 posterior_c1: 0.442

RA15
y_true: 1 y_pred: 1 posterior_c1: 0.941

RA53
y_true: 1 y_pred: 1 posterior_c1: 0.843

RA32
y_true: 1 y_pred: 0 posterior_c1: 0.373

RA40
y_true: 1 y_pred: 1 posterior_c1: 0.687

HC8
y_true: 0 y_pred: 1 posterior_c1: 0.936

RA51
y_true: 1 y_pred: 1 posterior_c1: 0.631

RA68
y_true: 1 y_pred: 1 posterior_c1: 0.904

RA21
y_true: 1 y_pred: 1 posterior_c1: 0.942

RA13
y_true: 1 y_pred: 1 posterior_c1: 0.887

RA3
y_true: 1 y_pred: 1 po

## Feature Selection

In [3]:
data = pd.read_pickle('../data/' + 'count' + '_df.pkl')

In [5]:
drop_cols = ['sample_name', 'phenotype_status']
kf = LeaveOneOut()
for train_index, test_index in kf.split(data):
    train_cv, test_cv = data.iloc[train_index], data.iloc[test_index]

    X_train = train_cv.drop(drop_cols, axis=1)
    X_test = test_cv.drop(drop_cols, axis=1)
    break

In [6]:
clf.fit(X_train, train_cv['phenotype_status'])

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective='binary', random_state=0,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)

In [8]:
np.where(clf.feature_importances_!=0)

(array([ 2438,  2631,  2641,  3508,  5683,  7250, 10086, 11026, 11116,
        11568, 11695, 13452, 13467, 26767, 26880, 33858]),)

In [9]:
X_train2 = lgb.Dataset(X_train,train_cv['phenotype_status'])
X_test2 = lgb.Dataset(X_test,test_cv['phenotype_status'])

In [15]:
params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting': 'gbdt'
        }

In [17]:
gbm = lgb.train(params,X_train2)

In [45]:
gbm.feature_importance_

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
gbmDF = pd.DataFrame({'TCR': gbm.feature_name(),'Feature_Importance':gbm.feature_importance})

In [24]:
np.where(gbm.feature_importance()!=0)

(array([ 2438,  2631,  2641,  3508,  5683,  7250, 10086, 11026, 11116,
        11568, 11695, 13452, 13467, 26767, 26880, 33858]),)

In [28]:
gbm.predict(X_test.values,num_iteration=gbm.best_iteration)

array([0.59556484])

In [34]:
gbmDF.sort_index(axis=1, inplace=True)

In [36]:
gbmDF.T.sort_values()

Unnamed: 0,0
"('CAAGGSSYEQYF',_'TCRBV07',_'TCRBV07-08',_'01',_'TCRBJ02',_'TCRBJ02-07',_'01')",0
"('CACLPGQTSYEQYF',_'TCRBV30',_'TCRBV30-01',_'01',_'TCRBJ02',_'TCRBJ02-07',_'01')",0
"('CACRGEGGNTIYF',_'TCRBV30',_'TCRBV30-01',_'01',_'TCRBJ01',_'TCRBJ01-03',_'01')",0
"('CAFGQEGQPQHF',_'TCRBV30',_'TCRBV30-01',_'01',_'TCRBJ01',_'TCRBJ01-05',_'01')",0
"('CAGETAEAFF',_'TCRBV06',_'TCRBV06-01',_'01',_'TCRBJ01',_'TCRBJ01-01',_'01')",0
"('CAGGMNTEAFF',_'TCRBV30',_'TCRBV30-01',_'01',_'TCRBJ01',_'TCRBJ01-01',_'01')",0
"('CAGGNTEAFF',_'TCRBV02',_'TCRBV02-01',_'01',_'TCRBJ01',_'TCRBJ01-01',_'01')",0
"('CAGGRAGGTDTQYF',_'TCRBV07',_'TCRBV07-09',_'null',_'TCRBJ02',_'TCRBJ02-03',_'01')",0
"('CAGGTEAFF',_'TCRBV06',_'TCRBV06-01',_'01',_'TCRBJ01',_'TCRBJ01-01',_'01')",0
"('CAGGTGDSNQPQHF',_'TCRBV10',_'TCRBV10-03',_'01',_'TCRBJ01',_'TCRBJ01-05',_'01')",0
