This notebook classifies T-cell receptors (TCRs) by their antigen specificity using a k-nearest-neighbours classifier. Given the size of the dataset and the complexity of TCRs, for human data we keep the 10 most frequent antigen epitopes (for both genes). For mouse data, we select the 10 most frequent antigen epitopes separately for each gene.

In [129]:
import pandas as pd
import numpy as np
from tcrdist.repertoire import TCRrep
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [185]:
def sample(dataset, total_samples=15000, random_state=None):
    """Draw a proportionally stratified sample of rows by 'antigen.epitope'.

    Each epitope receives ``floor(share * total_samples)`` rows, where
    ``share`` is that epitope's fraction of the rows in ``dataset``. A group
    is never asked for more rows than it contains, so a dataset smaller than
    ``total_samples`` is returned in full (rows shuffled within each group).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain an 'antigen.epitope' column.
    total_samples : int, default 15000
        Target size of the sample. Because of the per-group floor, the
        result can be slightly smaller than this.
    random_state : int, numpy RandomState/Generator or None, default None
        Seed or generator for reproducible draws. ``None`` keeps the
        original behaviour (numpy's global random state).

    Returns
    -------
    pandas.DataFrame
        Sampled rows with all original columns (including
        'antigen.epitope') and their original index labels, ordered by
        epitope.
    """
    group_sizes = dataset['antigen.epitope'].value_counts(normalize=True)
    samples_per_group = np.floor(group_sizes * total_samples).astype(int)

    # Turn an integer seed into one generator shared by all groups, so the
    # whole draw is reproducible without every group reusing the same seed.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # Iterate the groups explicitly instead of using groupby(...).apply(...):
    # apply() on the grouping column is deprecated in pandas and newer
    # versions exclude that column, which would drop 'antigen.epitope'.
    pieces = [
        group.sample(
            n=min(len(group), samples_per_group.loc[epitope]),
            random_state=rng,
        )
        for epitope, group in dataset.groupby('antigen.epitope')
    ]

    # pd.concat refuses an empty list; an empty input yields an empty sample.
    if not pieces:
        return dataset.iloc[0:0]

    return pd.concat(pieces)

In [186]:
# Load the tab-separated VDJdb export and discard columns that are not used below.
unused_columns = [
    'web.method', 'web.method.seq', 'web.cdr3fix.nc', 'web.cdr3fix.unmp',
    'reference.id', 'method', 'meta', 'cdr3fix',
]
data = pd.read_csv('vdjdb.txt', sep='\t').drop(columns=unused_columns)

# Keep only rows that have a value in every remaining column.
data_clean = data.dropna()

# Per-chain column names (alpha: *_a_*, beta: *_b_*) used by the TCRrep cells below.
alpha_renames = {'gene': 'gene_a', 'cdr3': 'cdr3_a_aa', 'v.segm': 'v_a_gene', 'j.segm': 'j_a_gene'}
beta_renames = {'gene': 'gene_b', 'cdr3': 'cdr3_b_aa', 'v.segm': 'v_b_gene', 'j.segm': 'j_b_gene'}

# Row masks over data_clean.
is_tra = data_clean['gene'] == 'TRA'
is_trb = data_clean['gene'] == 'TRB'
is_human = data_clean['species'] == 'HomoSapiens'
is_mouse = data_clean['species'] == 'MusMusculus'
# NOTE(review): presumably complex.id == 0 marks records without a paired chain —
# confirm against the VDJdb column documentation. Only the human subsets use this
# filter; the mouse subsets below do not.
has_complex = data_clean['complex.id'] != 0

# Human alpha: rows with a non-zero complex.id, restricted to the 10 most frequent
# epitopes, then down-sampled with sample().
data_alpha_human_test = data_clean[is_tra & has_complex & is_human].rename(columns=alpha_renames)
top10_alpha = data_alpha_human_test['antigen.epitope'].value_counts().index[:10]
alpha_top10_classification = sample(
    data_alpha_human_test[data_alpha_human_test['antigen.epitope'].isin(top10_alpha)]
)

# Human beta: same procedure on the TRB rows.
data_test_human_beta = data_clean[is_trb & has_complex & is_human].rename(columns=beta_renames)
top10_beta = data_test_human_beta['antigen.epitope'].value_counts().index[:10]
beta_top10_classification = sample(
    data_test_human_beta[data_test_human_beta['antigen.epitope'].isin(top10_beta)]
)

# Mouse alpha: all TRA rows (no complex.id filter, no down-sampling), top 10 epitopes.
data_alpha = data_clean[is_tra]
data_alpha_mouse = data_clean[is_tra & is_mouse].rename(columns=alpha_renames)
top10_mouse_alpha = data_alpha_mouse['antigen.epitope'].value_counts().index[:10]
data_alpha_mouse = data_alpha_mouse[data_alpha_mouse['antigen.epitope'].isin(top10_mouse_alpha)]

# Mouse beta: all TRB rows (no complex.id filter, no down-sampling), top 10 epitopes.
data_beta = data_clean[is_trb]
data_beta_mouse = data_clean[is_trb & is_mouse].rename(columns=beta_renames)
top10_mouse_beta = data_beta_mouse['antigen.epitope'].value_counts().index[:10]
data_beta_mouse = data_beta_mouse[data_beta_mouse['antigen.epitope'].isin(top10_mouse_beta)]

  stratified_sample = dataset.groupby('antigen.epitope', group_keys=False).apply(
  stratified_sample = dataset.groupby('antigen.epitope', group_keys=False).apply(


# Alpha human

In [193]:
# Hold out 40% of the sampled human alpha-chain rows for testing; the fixed seed
# makes the split repeatable for a given input frame.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    alpha_top10_classification,
    alpha_top10_classification['antigen.epitope'],
    test_size=0.4,
    random_state=22,
)

In [194]:
# Training repertoire for the human alpha chain, built from the training rows.
Xtrain_ = TCRrep(
    cell_df=Xtrain,
    organism='human',
    chains=['alpha'],
    db_file='alphabeta_gammadelta_db.tsv',
)

# Test repertoire: no distances at construction time (compute_distances=False);
# instead the test clones are compared against the training clones.
Xtest_ = TCRrep(
    cell_df=Xtest,
    organism='human',
    chains=['alpha'],
    compute_distances=False,
)
Xtest_.compute_rect_distances(df=Xtest_.clone_df, df2=Xtrain_.clone_df)

# From here on Xtrain/Xtest hold the alpha-chain distance matrices. Labels are read
# back from each clone_df rather than from the split.
# NOTE(review): presumably clone_df rows can differ from the cell_df passed in
# (rows are grouped into clones), so this keeps labels aligned — confirm.
Xtrain, ytrain = Xtrain_.pw_alpha, Xtrain_.clone_df['antigen.epitope']
Xtest, ytest = Xtest_.rw_alpha, Xtest_.clone_df['antigen.epitope']


  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()

  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [195]:
# k-NN on precomputed distances: the matrices passed to fit() and predict() are
# treated as distances (metric='precomputed'), and weights='distance' is used for
# neighbour voting.
knn = KNeighborsClassifier(
    n_neighbors=35,
    weights='distance',
    metric='precomputed',
)
model = knn.fit(Xtrain, ytrain)

# Predict epitope labels for the held-out test set.
Ypred = model.predict(Xtest)

# Overall accuracy and macro-averaged F1 (every epitope counts equally).
f1 = f1_score(ytest, Ypred, average='macro')
accuracy = accuracy_score(ytest, Ypred)

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')

# Per-epitope precision / recall / F1.
print(classification_report(ytest, Ypred))

Accuracy: 0.71
F1 Score: 0.46
                precision    recall  f1-score   support

    AVFDRKSDAK       0.09      0.03      0.05       498
     GILGFVFTL       0.75      0.69      0.71       568
     IVTDFSVIK       0.50      0.22      0.31       218
     KLGGALQAK       0.74      0.91      0.82      3758
     NLVPMVATV       0.93      0.41      0.57       164
      RAKFKQLL       0.63      0.43      0.51       325
     RLRAEAQVK       0.00      0.00      0.00       114
     SPRWYFYYL       0.87      0.18      0.29       114
TFEYVSQPFLMDLE       0.83      0.95      0.89       112
     YLQPRTFLL       0.61      0.40      0.49       127

      accuracy                           0.71      5998
     macro avg       0.60      0.42      0.46      5998
  weighted avg       0.66      0.71      0.67      5998



# beta human

In [199]:
# Hold out 40% of the sampled human beta-chain rows for testing; the fixed seed
# makes the split repeatable for a given input frame.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    beta_top10_classification,
    beta_top10_classification['antigen.epitope'],
    test_size=0.4,
    random_state=12,
)

In [200]:
# Training repertoire for the human beta chain, built from the training rows.
Xtrain_ = TCRrep(
    cell_df=Xtrain,
    organism='human',
    chains=['beta'],
    db_file='alphabeta_gammadelta_db.tsv',
)

# Test repertoire: no distances at construction time (compute_distances=False);
# instead the test clones are compared against the training clones.
Xtest_ = TCRrep(
    cell_df=Xtest,
    organism='human',
    chains=['beta'],
    compute_distances=False,
)
Xtest_.compute_rect_distances(df=Xtest_.clone_df, df2=Xtrain_.clone_df)

# From here on Xtrain/Xtest hold the beta-chain distance matrices. Labels are read
# back from each clone_df rather than from the split.
# NOTE(review): presumably clone_df rows can differ from the cell_df passed in
# (rows are grouped into clones), so this keeps labels aligned — confirm.
Xtrain, ytrain = Xtrain_.pw_beta, Xtrain_.clone_df['antigen.epitope']
Xtest, ytest = Xtest_.rw_beta, Xtest_.clone_df['antigen.epitope']


  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()

  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [201]:
# k-NN on precomputed distances: the matrices passed to fit() and predict() are
# treated as distances (metric='precomputed'), and weights='distance' is used for
# neighbour voting.
knn = KNeighborsClassifier(
    n_neighbors=13,
    weights='distance',
    metric='precomputed',
)
model = knn.fit(Xtrain, ytrain)

# Predict epitope labels for the held-out test set.
Ypred = model.predict(Xtest)

# Overall accuracy and macro-averaged F1 (every epitope counts equally).
f1 = f1_score(ytest, Ypred, average='macro')
accuracy = accuracy_score(ytest, Ypred)

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')

# Per-epitope precision / recall / F1.
print(classification_report(ytest, Ypred))

Accuracy: 0.71
F1 Score: 0.43
                precision    recall  f1-score   support

    AVFDRKSDAK       0.09      0.04      0.05       475
     GILGFVFTL       0.75      0.74      0.75       571
     IVTDFSVIK       0.50      0.28      0.36       201
     KLGGALQAK       0.74      0.91      0.82      3785
     NLVPMVATV       0.90      0.40      0.56       161
      RAKFKQLL       0.53      0.40      0.46       327
     RLRAEAQVK       0.00      0.00      0.00       114
     SPRWYFYYL       0.81      0.21      0.34       123
TFEYVSQPFLMDLE       0.74      0.32      0.44       111
     YLQPRTFLL       0.77      0.45      0.57       130

      accuracy                           0.71      5998
     macro avg       0.58      0.37      0.43      5998
  weighted avg       0.66      0.71      0.67      5998



# alpha mouse

In [138]:
# Hold out 20% of the mouse alpha-chain rows for testing; the fixed seed makes the
# split repeatable for a given input frame.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    data_alpha_mouse,
    data_alpha_mouse['antigen.epitope'],
    test_size=0.2,
    random_state=42,
)

In [139]:
# Training repertoire for the mouse alpha chain, built from the training rows.
Xtrain_ = TCRrep(
    cell_df=Xtrain,
    organism='mouse',
    chains=['alpha'],
    db_file='alphabeta_gammadelta_db.tsv',
)

# Test repertoire: no distances at construction time (compute_distances=False);
# instead the test clones are compared against the training clones.
Xtest_ = TCRrep(
    cell_df=Xtest,
    organism='mouse',
    chains=['alpha'],
    compute_distances=False,
)
Xtest_.compute_rect_distances(df=Xtest_.clone_df, df2=Xtrain_.clone_df)

# From here on Xtrain/Xtest hold the alpha-chain distance matrices. Labels are read
# back from each clone_df rather than from the split.
# NOTE(review): presumably clone_df rows can differ from the cell_df passed in
# (rows are grouped into clones), so this keeps labels aligned — confirm.
Xtrain, ytrain = Xtrain_.pw_alpha, Xtrain_.clone_df['antigen.epitope']
Xtest, ytest = Xtest_.rw_alpha, Xtest_.clone_df['antigen.epitope']


  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()

  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [140]:
# k-NN on precomputed distances: the matrices passed to fit() and predict() are
# treated as distances (metric='precomputed'), and weights='distance' is used for
# neighbour voting.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='precomputed',
)
model = knn.fit(Xtrain, ytrain)

# Predict epitope labels for the held-out test set.
Ypred = model.predict(Xtest)

# Overall accuracy and macro-averaged F1 (every epitope counts equally).
f1 = f1_score(ytest, Ypred, average='macro')
accuracy = accuracy_score(ytest, Ypred)

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')

# Per-epitope precision / recall / F1.
print(classification_report(ytest, Ypred))

Accuracy: 0.68
F1 Score: 0.60
              precision    recall  f1-score   support

   ASNENMETM       0.58      0.63      0.61        63
   HGIRNASFI       0.73      0.61      0.67        59
   KAVYNFATC       0.75      0.21      0.33        14
   LSLRNPILV       0.37      0.44      0.40        25
   SQLLNAKYL       0.57      0.50      0.53         8
  SSLENFRAYV       0.68      0.76      0.72        92
    SSPPMFRV       0.71      0.66      0.68        38
   SSYRRPVGI       0.80      0.80      0.80       120
    TVYGFCLL       0.59      0.65      0.62        20

    accuracy                           0.68       439
   macro avg       0.64      0.59      0.60       439
weighted avg       0.69      0.68      0.68       439



# beta mouse

In [141]:
# Hold out 20% of the mouse beta-chain rows for testing; the fixed seed makes the
# split repeatable for a given input frame.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    data_beta_mouse,
    data_beta_mouse['antigen.epitope'],
    test_size=0.2,
    random_state=42,
)

In [142]:
# Training repertoire for the mouse beta chain, built from the training rows.
Xtrain_ = TCRrep(
    cell_df=Xtrain,
    organism='mouse',
    chains=['beta'],
    db_file='alphabeta_gammadelta_db.tsv',
)

# Test repertoire: no distances at construction time (compute_distances=False);
# instead the test clones are compared against the training clones.
Xtest_ = TCRrep(
    cell_df=Xtest,
    organism='mouse',
    chains=['beta'],
    compute_distances=False,
)
Xtest_.compute_rect_distances(df=Xtest_.clone_df, df2=Xtrain_.clone_df)

# From here on Xtrain/Xtest hold the beta-chain distance matrices. Labels are read
# back from each clone_df rather than from the split.
# NOTE(review): presumably clone_df rows can differ from the cell_df passed in
# (rows are grouped into clones), so this keeps labels aligned — confirm.
Xtrain, ytrain = Xtrain_.pw_beta, Xtrain_.clone_df['antigen.epitope']
Xtest, ytest = Xtest_.rw_beta, Xtest_.clone_df['antigen.epitope']


  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()

  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [143]:
# k-NN on precomputed distances: the matrices passed to fit() and predict() are
# treated as distances (metric='precomputed'), and weights='distance' is used for
# neighbour voting.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='precomputed',
)
model = knn.fit(Xtrain, ytrain)

# Predict epitope labels for the held-out test set.
Ypred = model.predict(Xtest)

# Overall accuracy and macro-averaged F1 (every epitope counts equally).
f1 = f1_score(ytest, Ypred, average='macro')
accuracy = accuracy_score(ytest, Ypred)

print(f'Accuracy: {accuracy:.2f}')
print(f'F1 Score: {f1:.2f}')

# Per-epitope precision / recall / F1.
print(classification_report(ytest, Ypred))

Accuracy: 0.78
F1 Score: 0.72
              precision    recall  f1-score   support

   ASNENMETM       0.84      0.72      0.77        71
   HGIRNASFI       0.66      0.78      0.71        59
   KAVYNFATC       0.62      0.42      0.50        12
   LSLRNPILV       0.73      0.79      0.76        28
    RALEYKNL       0.65      0.50      0.56        40
   SQLLNAKYL       0.75      0.96      0.84        28
  SSLENFRAYV       0.87      0.76      0.81        96
    SSPPMFRV       0.86      0.92      0.89        66
   SSYRRPVGI       0.80      0.87      0.83       127
    TVYGFCLL       0.64      0.50      0.56        14

    accuracy                           0.78       541
   macro avg       0.74      0.72      0.72       541
weighted avg       0.78      0.78      0.78       541

