In [1]:
# Select the non-interactive 'Agg' backend right after importing matplotlib.
# NOTE(review): the `%matplotlib inline` magic further down also configures a
# backend, so the two settings may conflict - confirm which one is intended.
import matplotlib
matplotlib.use('Agg')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Helpers
import sys
# Prepend the directory two levels up to the module search path; presumably
# this is where the project-local `utils` module imported below lives.
sys.path.insert(0,'../../')
from utils import data_path,results_path,grid_search,estimator_result,cross_validate,evaluate_param
from scipy.sparse import csr_matrix,save_npz,load_npz
from sklearn.model_selection import cross_val_score,LeaveOneOut,StratifiedKFold
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import pickle

# Feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFECV,VarianceThreshold

# Algorithm
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Ignore warnings
# NOTE(review): this filter silences every warning category for the rest of
# the session, not just a specific noisy one.
import warnings
warnings.filterwarnings('ignore')

## Load data

In [2]:
# Read the 'train' table from the version_1 HDF5 store.
# The context manager closes the file even if listing the keys or the lookup
# raises, and mode='r' makes a missing file fail immediately instead of being
# silently created as an empty store by the default append mode.
with pd.HDFStore(data_path+'version_1/'+'data.h5', mode='r') as store:
    print(store.keys())
    train = store['train']

['/test', '/train']


In [3]:
# Remove the sample identifier and the label column from `train` in place,
# then keep the remaining column labels as `TCRs`.
train.drop(columns=['sample_name','CMV_status'], inplace=True)
TCRs = train.columns.values

In [4]:
# Load the pickled ordering of TCR column indices and keep its first p entries.
# The dense sub-matrices for these indices are built once the sparse matrices
# have been loaded.
# NOTE(review): pickle.load executes code embedded in the file - only use it on
# trusted, project-generated data.
tcr_order_path = data_path + 'sorted_TCRs_ind.pkl'
with open(tcr_order_path, 'rb') as pkl_file:
    TCRs_ind = pickle.load(pkl_file)

p = 70  # number of leading entries of TCRs_ind to keep
ind = TCRs_ind[:p]

In [5]:
# Load the saved sparse train/test matrices from the version_1 directory.
version_dir = data_path + 'version_1/'
train_X = load_npz(version_dir + 'train_bin.npz')
test_X = load_npz(version_dir + 'test_bin.npz')

In [6]:
# Slice the selected columns out of each sparse matrix and densify them.
train_X_sub, test_X_sub = (
    matrix[:, ind].toarray() for matrix in (train_X, test_X)
)

In [7]:
# Wrap the dense training sub-matrix in a DataFrame whose columns carry the
# labels of the selected indices.
selected_names = np.array(TCRs)[list(ind)]
df = pd.DataFrame(train_X_sub, columns=selected_names)

In [8]:
# Per-column tally of each distinct value in the training frame.
df.apply(lambda column: column.value_counts())

Unnamed: 0,"(TCRBV09, TCRBV09-01, null, CASSGQGAYEQYF, TCRBJ02, TCRBJ02-07, 01)","(TCRBV19, TCRBV19-01, null, CASSIGPLEHNEQFF, TCRBJ02, TCRBJ02-01, 01)","(TCRBV05, TCRBV05-01, 01, CASSPDRVGQETQYF, TCRBJ02, TCRBJ02-05, 01)","(TCRBV07, TCRBV07-02, 01, CASSLEAEYEQYF, TCRBJ02, TCRBJ02-07, 01)","(TCRBV28, TCRBV28-01, 01, CASSIEGNQPQHF, TCRBJ01, TCRBJ01-05, 01)","(TCRBV24, unresolved, null, CATSDGDEQFF, TCRBJ02, TCRBJ02-01, 01)","(TCRBV05, TCRBV05-06, 01, CASSLVAGGRETQYF, TCRBJ02, TCRBJ02-05, 01)","(TCRBV07, TCRBV07-06, 01, CASSRGRQETQYF, TCRBJ02, TCRBJ02-05, 01)","(TCRBV09, TCRBV09-01, null, CASSAGQGVTYEQYF, TCRBJ02, TCRBJ02-07, 01)","(TCRBV04, TCRBV04-03, 01, CASSPQRNTEAFF, TCRBJ01, TCRBJ01-01, 01)",...,"(TCRBV12, unresolved, null, CASSLTGGRNQPQHF, TCRBJ01, TCRBJ01-05, 01)","(TCRBV05, TCRBV05-08, 01, CASSIQGYSNQPQHF, TCRBJ01, TCRBJ01-05, 01)","(TCRBV19, TCRBV19-01, null, CASSTTGGDGYTF, TCRBJ01, TCRBJ01-02, 01)","(TCRBV09, TCRBV09-01, null, CASSVTGGTDTQYF, TCRBJ02, TCRBJ02-03, 01)","(TCRBV09, TCRBV09-01, null, CASSVLAGPTDTQYF, TCRBJ02, TCRBJ02-03, 01)","(TCRBV07, TCRBV07-09, null, CASSHRDRNYEQYF, TCRBJ02, TCRBJ02-07, 01)","(TCRBV04, TCRBV04-03, 01, CASSPSRNTEAFF, TCRBJ01, TCRBJ01-01, 01)","(TCRBV12, unresolved, null, CASSLGGPGDTQYF, TCRBJ02, TCRBJ02-03, 01)","(TCRBV06, TCRBV06-01, 01, CASSEARGGVEKLFF, TCRBJ01, TCRBJ01-04, 01)","(TCRBV06, TCRBV06-04, null, CASRSDSGANVLTF, TCRBJ02, TCRBJ02-06, 01)"
0,568,611,607,610,615,594,580,599,617,610,...,594,626,607,525,610,619,619,613,616,623
1,73,30,34,31,26,47,61,42,24,31,...,47,15,34,116,31,22,22,28,25,18


In [9]:
# Read the CMV_status label column for the train and test splits.
train_y, test_y = (
    pd.read_csv(data_path + csv_name)['CMV_status']
    for csv_name in ('train_Y.csv', 'test_Y.csv')
)

In [10]:
# Read the 'test' table from the version_1 HDF5 store.
# The context manager closes the file even if the lookup raises, and mode='r'
# makes a missing file fail immediately instead of being silently created as
# an empty store by the default append mode.
with pd.HDFStore(data_path+'version_1/'+'data.h5', mode='r') as store:
    test = store['test']

In [11]:
# Remove the sample identifier and the label column from `test` in place,
# rebind `TCRs` to the remaining test column labels, and label the dense test
# sub-matrix with the labels at the selected indices.
test.drop(columns=['sample_name','CMV_status'], inplace=True)
TCRs = test.columns.values
test_selected_names = TCRs[list(ind)]
df_test = pd.DataFrame(test_X_sub, columns=test_selected_names)

In [12]:
# Per-column tally of each distinct value in the test frame.
df_test.apply(lambda column: column.value_counts())

Unnamed: 0,"(TCRBV09, TCRBV09-01, null, CASSGQGAYEQYF, TCRBJ02, TCRBJ02-07, 01)","(TCRBV19, TCRBV19-01, null, CASSIGPLEHNEQFF, TCRBJ02, TCRBJ02-01, 01)","(TCRBV05, TCRBV05-01, 01, CASSPDRVGQETQYF, TCRBJ02, TCRBJ02-05, 01)","(TCRBV07, TCRBV07-02, 01, CASSLEAEYEQYF, TCRBJ02, TCRBJ02-07, 01)","(TCRBV28, TCRBV28-01, 01, CASSIEGNQPQHF, TCRBJ01, TCRBJ01-05, 01)","(TCRBV24, unresolved, null, CATSDGDEQFF, TCRBJ02, TCRBJ02-01, 01)","(TCRBV05, TCRBV05-06, 01, CASSLVAGGRETQYF, TCRBJ02, TCRBJ02-05, 01)","(TCRBV07, TCRBV07-06, 01, CASSRGRQETQYF, TCRBJ02, TCRBJ02-05, 01)","(TCRBV09, TCRBV09-01, null, CASSAGQGVTYEQYF, TCRBJ02, TCRBJ02-07, 01)","(TCRBV04, TCRBV04-03, 01, CASSPQRNTEAFF, TCRBJ01, TCRBJ01-01, 01)",...,"(TCRBV12, unresolved, null, CASSLTGGRNQPQHF, TCRBJ01, TCRBJ01-05, 01)","(TCRBV05, TCRBV05-08, 01, CASSIQGYSNQPQHF, TCRBJ01, TCRBJ01-05, 01)","(TCRBV19, TCRBV19-01, null, CASSTTGGDGYTF, TCRBJ01, TCRBJ01-02, 01)","(TCRBV09, TCRBV09-01, null, CASSVTGGTDTQYF, TCRBJ02, TCRBJ02-03, 01)","(TCRBV09, TCRBV09-01, null, CASSVLAGPTDTQYF, TCRBJ02, TCRBJ02-03, 01)","(TCRBV07, TCRBV07-09, null, CASSHRDRNYEQYF, TCRBJ02, TCRBJ02-07, 01)","(TCRBV04, TCRBV04-03, 01, CASSPSRNTEAFF, TCRBJ01, TCRBJ01-01, 01)","(TCRBV12, unresolved, null, CASSLGGPGDTQYF, TCRBJ02, TCRBJ02-03, 01)","(TCRBV06, TCRBV06-01, 01, CASSEARGGVEKLFF, TCRBJ01, TCRBJ01-04, 01)","(TCRBV06, TCRBV06-04, null, CASRSDSGANVLTF, TCRBJ02, TCRBJ02-06, 01)"
0,107,119,114,114,116,116,115,110,115,115,...,112,117,115,100,115,114,117,120.0,117,118
1,13,1,6,6,4,4,5,10,5,5,...,8,3,5,20,5,6,3,,3,2


## Clustering

In [13]:
from sklearn.cluster import KMeans

In [14]:
# Cluster the columns of df (rows of df.T) into 69 groups with a fixed seed.
kmeans = KMeans(n_clusters=69, random_state=0)
kmeans.fit(df.T)

In [15]:
# Refit on the transposed frame, take the per-column cluster labels, and
# count how many columns landed in each cluster.
y_pred = kmeans.fit(df.T).labels_
unique_elements, counts_elements = np.unique(y_pred, return_counts=True)

In [18]:
# Report only the clusters that contain more than one member.
shared = counts_elements > 1
for label, size in zip(unique_elements[shared], counts_elements[shared]):
    print(label, size)

18 2


In [19]:
# Positions of the columns assigned to cluster 18.
np.nonzero(y_pred == 18)

(array([18, 26]),)

In [32]:
# Cluster the columns of train_X_sub (rows after transposing) into 68 groups
# and list every cluster that holds more than one column.
# fit_predict both fits the model and returns the labels, so a single call
# replaces the separate fit that would otherwise train the same model twice.
kmeans = KMeans(n_clusters=68, random_state=0)
y_pred = kmeans.fit_predict(train_X_sub.T)
unique_elements, counts_elements = np.unique(y_pred, return_counts=True)
for e,c in zip(unique_elements,counts_elements):
    if c >1 :
        print(e,c)

2 2
18 2


In [34]:
# Positions of the columns assigned to cluster 2.
np.nonzero(y_pred == 2)

(array([17, 28]),)

In [35]:
# Positions of the columns assigned to cluster 18.
np.nonzero(y_pred == 18)

(array([18, 26]),)

### Feature construction

In [20]:
# Rebind TCRs to the current column labels of df so the cells below can look
# columns up by their position in df (e.g. TCRs[18]).
# NOTE(review): TCRs was previously bound to the full train/test column sets;
# from here on it indexes only df's columns - keep cell order in mind.
TCRs = df.columns.values

In [21]:
# Add a combined feature: the element-wise sum of the columns at positions 18
# and 26 (the pair that shared a cluster), in both the train and test frames.
first_name, second_name = TCRs[18], TCRs[26]
for frame in (df, df_test):
    frame['18_26'] = frame[first_name] + frame[second_name]

In [41]:
# Add a combined feature: the element-wise sum of the columns at positions 17
# and 28 (the pair that shared a cluster), in both the train and test frames.
first_name, second_name = TCRs[17], TCRs[28]
for frame in (df, df_test):
    frame['17_28'] = frame[first_name] + frame[second_name]

In [58]:
# Remove the '18_26' combined feature from both frames in place.
for frame in (df, df_test):
    frame.drop(columns=['18_26'], inplace=True)

In [59]:
# L1-penalised logistic regression evaluated on the engineered frames.
# The solver is pinned to 'liblinear': it supports the L1 penalty and is the
# only solver for which intercept_scaling has an effect. Relying on the
# library default breaks on scikit-learn releases whose default solver
# (lbfgs) rejects penalty='l1'.
lr = LogisticRegression(random_state=0,penalty='l1',solver='liblinear',intercept_scaling=0.5)
estimator_result(lr,df.values,train_y,df_test.values,test_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=0.5, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Cross validation:
accuracy score 0.8891277472527473
AUROC 0.9445980921104074
________________________________________________________________________________
Training set:
accuracy score 0.9297971918876755
AUROC 0.9661892497640767
log-loss: 0.2313099088455185
________________________________________________________________________________
Testing set;
accuracy score: 0.8916666666666667
AUROC 0.9136118215402101
log-loss: 0.3729052318933802
classification_report
             precision    recall  f1-score   support

          0       0.87      0.96      0.91        69
          1       0.93      0.80      0.86        51

avg / total       0.90      0.89      0.89       120

Confusion matrix:
      CMV-  CMV+
CMV-    66     3
CMV

In [60]:
# Default (L2) penalised logistic regression, evaluated first on the
# engineered frames and then on the raw selected sub-matrices for comparison.
# The solver is pinned to 'liblinear' because intercept_scaling only takes
# effect with that solver; with a different library default the setting is
# ignored and the scores no longer match the ones recorded below.
lr2 = LogisticRegression(random_state=0,solver='liblinear',intercept_scaling=0.5)
estimator_result(lr2,df.values,train_y,df_test.values,test_y)
print()
estimator_result(lr2,train_X_sub,train_y,test_X_sub,test_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=0.5, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Cross validation:
accuracy score 0.900090048840049
AUROC 0.9529681757760574
________________________________________________________________________________
Training set:
accuracy score 0.9297971918876755
AUROC 0.9666070305127399
log-loss: 0.24513092352395663
________________________________________________________________________________
Testing set;
accuracy score: 0.8916666666666667
AUROC 0.9241261722080136
log-loss: 0.3560160187941102
classification_report
             precision    recall  f1-score   support

          0       0.87      0.96      0.91        69
          1       0.93      0.80      0.86        51

avg / total       0.90      0.89      0.89       120

Confusion matrix:
      CMV-  CMV+
CMV-    66     3
CMV