This notebook classifies T-cell receptors (TCRs) by their epitope specificity using a k-nearest-neighbours classifier. Given the dataset size and the complexity of TCRs, we restrict each task to a small set of epitopes: for human, we keep the 10 most frequent antigen epitopes for each of the two genes (TRA and TRB); for mouse, we likewise pick the 10 most frequent antigen epitopes separately for each gene.

In [1]:
import pandas as pd
import numpy as np
from tcrdist.repertoire import TCRrep
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [2]:
# Load the VDJdb export and drop the columns this notebook does not use.
data = pd.read_csv('vdjdb.txt', sep='\t')
data = data.drop(columns=['web.method', 'web.method.seq', 'web.cdr3fix.nc', 'web.cdr3fix.unmp',
                          'reference.id', 'method', 'meta', 'cdr3fix'])
# Keep only rows that have no missing value in any remaining column.
data_clean = data.dropna()


def select_top_epitopes(chain_df, species, chain_letter, n_top=10):
    """Restrict one gene's records to a species and its most frequent epitopes.

    Parameters
    ----------
    chain_df : pandas.DataFrame
        Records for a single gene. Must contain the 'species' and
        'antigen.epitope' columns; 'gene', 'cdr3', 'v.segm' and 'j.segm'
        are renamed when present.
    species : str
        Value of the 'species' column to keep (e.g. 'HomoSapiens').
    chain_letter : str
        Suffix letter used in the renamed columns ('a' or 'b'), giving
        'gene_<x>', 'cdr3_<x>_aa', 'v_<x>_gene' and 'j_<x>_gene'.
    n_top : int, default 10
        Number of most frequent epitopes to keep.

    Returns
    -------
    tuple
        (species_df, top_epitopes, top_df): the renamed records of the
        species, the index of its n_top most frequent epitopes, and the
        records restricted to those epitopes.
    """
    species_df = chain_df[chain_df['species'] == species].rename(columns={
        'gene': f'gene_{chain_letter}',
        'cdr3': f'cdr3_{chain_letter}_aa',
        'v.segm': f'v_{chain_letter}_gene',
        'j.segm': f'j_{chain_letter}_gene',
    })
    top_epitopes = species_df['antigen.epitope'].value_counts().head(n_top).index
    top_df = species_df[species_df['antigen.epitope'].isin(top_epitopes)]
    return species_df, top_epitopes, top_df


# Human subsets are additionally restricted to complex.id == 0; the mouse
# subsets below do not apply that filter.
# NOTE(review): presumably complex.id == 0 marks records without a paired
# chain -- confirm against the VDJdb documentation.

# human alpha
data_alpha = data_clean[(data_clean['gene'] == 'TRA') & (data_clean['complex.id'] == 0)]
data_alpha_human_test, top10, alpha_top10_classification = select_top_epitopes(
    data_alpha, 'HomoSapiens', 'a')

# human beta (top10 is recomputed here, so it ends up holding the beta epitopes)
data_beta = data_clean[(data_clean['gene'] == 'TRB') & (data_clean['complex.id'] == 0)]
data_test_human_beta, top10, beta_top10_classification = select_top_epitopes(
    data_beta, 'HomoSapiens', 'b')

# mouse alpha (data_alpha is rebound to all TRA records, without the complex.id filter)
data_alpha = data_clean[data_clean['gene'] == 'TRA']
top10_mouse_alpha, data_alpha_mouse = select_top_epitopes(
    data_alpha, 'MusMusculus', 'a')[1:]

# mouse beta (data_beta is rebound to all TRB records, without the complex.id filter)
data_beta = data_clean[data_clean['gene'] == 'TRB']
top10_mouse_beta, data_beta_mouse = select_top_epitopes(
    data_beta, 'MusMusculus', 'b')[1:]

# Alpha human

In [3]:
# Hold out 20% of the human alpha records (fixed seed for reproducibility).
# The full frame is passed as X because the next cell builds TCRrep objects
# from it; ytrain/ytest returned here are overwritten in that cell.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    alpha_top10_classification,
    alpha_top10_classification['antigen.epitope'],
    test_size=0.2,
    random_state=12,
)

In [4]:
# NOTE: this cell rebinds Xtrain/Xtest from the DataFrames produced by the
# split cell to distance matrices, so re-run the split cell before
# re-running this one.

# Training repertoire; the pairwise train-vs-train distances are read from
# pw_alpha right after construction.
Xtrain_ = TCRrep(cell_df=Xtrain,
                 organism='human',
                 chains=['alpha'],
                 db_file='alphabeta_gammadelta_db.tsv')

# Labels are taken from clone_df rather than from the split, because the
# warning output shows cell_df rows being grouped into clones -- presumably
# clone_df rows are what the distance-matrix rows correspond to.
ytrain = Xtrain_.clone_df['antigen.epitope']
Xtrain = Xtrain_.pw_alpha

# Test repertoire: distances are not computed at construction time. The same
# db_file as for the training repertoire is passed explicitly so both sides
# cannot silently end up with different gene reference files.
Xtest_ = TCRrep(cell_df=Xtest,
                organism='human',
                chains=['alpha'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)

# Rectangular test-vs-train distances, as needed by a precomputed-metric
# classifier at prediction time.
Xtest_.compute_rect_distances(df=Xtest_.clone_df, df2=Xtrain_.clone_df)

Xtest = Xtest_.rw_alpha
ytest = Xtest_.clone_df['antigen.epitope']


  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()

  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [5]:
# k-NN on the precomputed distance matrices, with distance-weighted votes.
knn = KNeighborsClassifier(
    n_neighbors=15,
    weights='distance',
    metric='precomputed',
)

# Fit on the train-vs-train distance matrix.
model = knn.fit(Xtrain, ytrain)

# Predict epitopes for the held-out test set (test-vs-train distances).
Ypred = model.predict(Xtest)

# Overall accuracy and macro-averaged F1 (average='macro').
accuracy = accuracy_score(ytest, Ypred)
f1 = f1_score(ytest, Ypred, average='macro')

print(f'Accuracy: {accuracy:.2f}', f'F1 Score: {f1:.2f}', sep='\n')

# Per-epitope precision / recall / F1.
print(classification_report(ytest, Ypred))

Accuracy: 0.62
F1 Score: 0.57
                      precision    recall  f1-score   support

FRDYVDRFYKTLRAEQASQE       0.79      0.91      0.85        58
           GILGFVFTL       0.68      0.61      0.64       500
      GLIYNRMGAVTTEV       0.60      0.31      0.41        29
           LLLGIGILV       0.58      0.53      0.56        85
           LLWNGPMAV       0.64      0.45      0.53        20
            NEGVKAAW       0.89      0.65      0.75        49
           NLVPMVATV       0.53      0.65      0.58       384
       PKYVKQNTLKLAT       0.33      0.12      0.17        17
    QARQMVQAMRTIGTHP       0.45      0.47      0.46        19
           YLQPRTFLL       0.76      0.74      0.75        81

            accuracy                           0.62      1242
           macro avg       0.63      0.55      0.57      1242
        weighted avg       0.63      0.62      0.62      1242



# beta human

In [6]:
# Hold out 30% of the human beta records (fixed seed for reproducibility).
# The full frame is passed as X because the next cell builds TCRrep objects
# from it; ytrain/ytest returned here are overwritten in that cell.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    beta_top10_classification,
    beta_top10_classification['antigen.epitope'],
    test_size=0.3,
    random_state=12,
)

In [7]:
# NOTE: this cell rebinds Xtrain/Xtest from the DataFrames produced by the
# split cell to distance matrices, so re-run the split cell before
# re-running this one.

# Training repertoire; the pairwise train-vs-train distances are read from
# pw_beta right after construction.
Xtrain_ = TCRrep(cell_df=Xtrain,
                 organism='human',
                 chains=['beta'],
                 db_file='alphabeta_gammadelta_db.tsv')

# Labels are taken from clone_df rather than from the split, because the
# warning output shows cell_df rows being grouped into clones -- presumably
# clone_df rows are what the distance-matrix rows correspond to.
ytrain = Xtrain_.clone_df['antigen.epitope']
Xtrain = Xtrain_.pw_beta

# Test repertoire: distances are not computed at construction time. The same
# db_file as for the training repertoire is passed explicitly so both sides
# cannot silently end up with different gene reference files.
Xtest_ = TCRrep(cell_df=Xtest,
                organism='human',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)

# Rectangular test-vs-train distances, as needed by a precomputed-metric
# classifier at prediction time.
Xtest_.compute_rect_distances(df=Xtest_.clone_df, df2=Xtrain_.clone_df)

Xtest = Xtest_.rw_beta
ytest = Xtest_.clone_df['antigen.epitope']


  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()

  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [8]:
# k-NN on the precomputed distance matrices, with distance-weighted votes.
knn = KNeighborsClassifier(
    n_neighbors=25,
    weights='distance',
    metric='precomputed',
)

# Fit on the train-vs-train distance matrix.
model = knn.fit(Xtrain, ytrain)

# Predict epitopes for the held-out test set (test-vs-train distances).
Ypred = model.predict(Xtest)

# Overall accuracy and macro-averaged F1 (average='macro').
accuracy = accuracy_score(ytest, Ypred)
f1 = f1_score(ytest, Ypred, average='macro')

print(f'Accuracy: {accuracy:.2f}', f'F1 Score: {f1:.2f}', sep='\n')

# Per-epitope precision / recall / F1.
print(classification_report(ytest, Ypred))

Accuracy: 0.66
F1 Score: 0.58
                      precision    recall  f1-score   support

          ELAGIGILTV       0.84      0.73      0.78       471
FRDYVDRFYKTLRAEQASQE       0.80      0.95      0.87       146
           GILGFVFTL       0.80      0.57      0.66       890
           GLCTLVAML       0.65      0.33      0.44       302
         KAFSPEVIPMF       0.81      0.60      0.69        96
          KRWIILGLNK       0.72      0.53      0.61       173
     LLQTGIHVRVSQPSL       0.75      0.04      0.07        85
           NLVPMVATV       0.56      0.83      0.67      1361
       PKYVKQNTLKLAT       0.74      0.15      0.24        96
           YLQPRTFLL       0.70      0.77      0.73       180

            accuracy                           0.66      3800
           macro avg       0.74      0.55      0.58      3800
        weighted avg       0.70      0.66      0.65      3800



# alpha mouse

In [9]:
# Hold out 20% of the mouse alpha records (fixed seed for reproducibility).
# The full frame is passed as X because the next cell builds TCRrep objects
# from it; ytrain/ytest returned here are overwritten in that cell.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    data_alpha_mouse,
    data_alpha_mouse['antigen.epitope'],
    test_size=0.2,
    random_state=42,
)

In [10]:
# NOTE: this cell rebinds Xtrain/Xtest from the DataFrames produced by the
# split cell to distance matrices, so re-run the split cell before
# re-running this one.

# Training repertoire; the pairwise train-vs-train distances are read from
# pw_alpha right after construction.
Xtrain_ = TCRrep(cell_df=Xtrain,
                 organism='mouse',
                 chains=['alpha'],
                 db_file='alphabeta_gammadelta_db.tsv')

# Labels are taken from clone_df rather than from the split, because the
# warning output shows cell_df rows being grouped into clones -- presumably
# clone_df rows are what the distance-matrix rows correspond to.
ytrain = Xtrain_.clone_df['antigen.epitope']
Xtrain = Xtrain_.pw_alpha

# Test repertoire: distances are not computed at construction time. The same
# db_file as for the training repertoire is passed explicitly so both sides
# cannot silently end up with different gene reference files.
Xtest_ = TCRrep(cell_df=Xtest,
                organism='mouse',
                chains=['alpha'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)

# Rectangular test-vs-train distances, as needed by a precomputed-metric
# classifier at prediction time.
Xtest_.compute_rect_distances(df=Xtest_.clone_df, df2=Xtrain_.clone_df)

Xtest = Xtest_.rw_alpha
ytest = Xtest_.clone_df['antigen.epitope']


  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()

  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [11]:
# k-NN on the precomputed distance matrices, with distance-weighted votes.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='precomputed',
)

# Fit on the train-vs-train distance matrix.
model = knn.fit(Xtrain, ytrain)

# Predict epitopes for the held-out test set (test-vs-train distances).
Ypred = model.predict(Xtest)

# Overall accuracy and macro-averaged F1 (average='macro').
accuracy = accuracy_score(ytest, Ypred)
f1 = f1_score(ytest, Ypred, average='macro')

print(f'Accuracy: {accuracy:.2f}', f'F1 Score: {f1:.2f}', sep='\n')

# Per-epitope precision / recall / F1.
print(classification_report(ytest, Ypred))

Accuracy: 0.68
F1 Score: 0.60
              precision    recall  f1-score   support

   ASNENMETM       0.58      0.63      0.61        63
   HGIRNASFI       0.73      0.61      0.67        59
   KAVYNFATC       0.75      0.21      0.33        14
   LSLRNPILV       0.37      0.44      0.40        25
   SQLLNAKYL       0.57      0.50      0.53         8
  SSLENFRAYV       0.68      0.76      0.72        92
    SSPPMFRV       0.71      0.66      0.68        38
   SSYRRPVGI       0.80      0.80      0.80       120
    TVYGFCLL       0.59      0.65      0.62        20

    accuracy                           0.68       439
   macro avg       0.64      0.59      0.60       439
weighted avg       0.69      0.68      0.68       439



# beta mouse

In [12]:
# Hold out 20% of the mouse beta records (fixed seed for reproducibility).
# The full frame is passed as X because the next cell builds TCRrep objects
# from it; ytrain/ytest returned here are overwritten in that cell.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    data_beta_mouse,
    data_beta_mouse['antigen.epitope'],
    test_size=0.2,
    random_state=42,
)

In [13]:
# NOTE: this cell rebinds Xtrain/Xtest from the DataFrames produced by the
# split cell to distance matrices, so re-run the split cell before
# re-running this one.

# Training repertoire; the pairwise train-vs-train distances are read from
# pw_beta right after construction.
Xtrain_ = TCRrep(cell_df=Xtrain,
                 organism='mouse',
                 chains=['beta'],
                 db_file='alphabeta_gammadelta_db.tsv')

# Labels are taken from clone_df rather than from the split, because the
# warning output shows cell_df rows being grouped into clones -- presumably
# clone_df rows are what the distance-matrix rows correspond to.
ytrain = Xtrain_.clone_df['antigen.epitope']
Xtrain = Xtrain_.pw_beta

# Test repertoire: distances are not computed at construction time. The same
# db_file as for the training repertoire is passed explicitly so both sides
# cannot silently end up with different gene reference files.
Xtest_ = TCRrep(cell_df=Xtest,
                organism='mouse',
                chains=['beta'],
                db_file='alphabeta_gammadelta_db.tsv',
                compute_distances=False)

# Rectangular test-vs-train distances, as needed by a precomputed-metric
# classifier at prediction time.
Xtest_.compute_rect_distances(df=Xtest_.clone_df, df2=Xtrain_.clone_df)

Xtest = Xtest_.rw_beta
ytest = Xtest_.clone_df['antigen.epitope']


  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()

  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [14]:
# k-NN on the precomputed distance matrices, with distance-weighted votes.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',
    metric='precomputed',
)

# Fit on the train-vs-train distance matrix.
model = knn.fit(Xtrain, ytrain)

# Predict epitopes for the held-out test set (test-vs-train distances).
Ypred = model.predict(Xtest)

# Overall accuracy and macro-averaged F1 (average='macro').
accuracy = accuracy_score(ytest, Ypred)
f1 = f1_score(ytest, Ypred, average='macro')

print(f'Accuracy: {accuracy:.2f}', f'F1 Score: {f1:.2f}', sep='\n')

# Per-epitope precision / recall / F1.
print(classification_report(ytest, Ypred))

Accuracy: 0.78
F1 Score: 0.72
              precision    recall  f1-score   support

   ASNENMETM       0.84      0.72      0.77        71
   HGIRNASFI       0.66      0.78      0.71        59
   KAVYNFATC       0.62      0.42      0.50        12
   LSLRNPILV       0.73      0.79      0.76        28
    RALEYKNL       0.65      0.50      0.56        40
   SQLLNAKYL       0.75      0.96      0.84        28
  SSLENFRAYV       0.87      0.76      0.81        96
    SSPPMFRV       0.86      0.92      0.89        66
   SSYRRPVGI       0.80      0.87      0.83       127
    TVYGFCLL       0.64      0.50      0.56        14

    accuracy                           0.78       541
   macro avg       0.74      0.72      0.72       541
weighted avg       0.78      0.78      0.78       541

