**Note:** This notebook was developed in a Python 3.7 environment. To run it correctly, I suggest configuring a dedicated virtual environment for it. The data used here comes from VDJdb.

##  Overall Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tab-separated VDJdb export and discard the bookkeeping columns
# that are not used anywhere in this analysis.
unused_columns = [
    'web.method', 'web.method.seq', 'web.cdr3fix.nc', 'web.cdr3fix.unmp',
    'reference.id', 'method', 'meta', 'cdr3fix',
]
df = pd.read_csv('mini_project/vdjdb.txt', sep='\t').drop(columns=unused_columns)

In [3]:
# Keep only the records in which every remaining column is populated.
df_clean = df.dropna(axis=0, how='any')

In [4]:
# Split the cleaned records by chain type: TRA = alpha chain, TRB = beta chain.
gene_column = df_clean['gene']
df_alpha = df_clean.loc[gene_column.eq('TRA')]
df_beta = df_clean.loc[gene_column.eq('TRB')]

In [5]:
# Discard every record whose complex.id is 0.
# NOTE(review): complex.id == 0 presumably marks chains without a recorded
# partner chain in VDJdb — confirm against the VDJdb column description.
df_alpha = df_alpha.loc[df_alpha['complex.id'].ne(0)]
df_beta = df_beta.loc[df_beta['complex.id'].ne(0)]

**Note:** The rhesus macaque (MacacaMulatta) entries do not have paired alpha and beta chains, so that species is not taken into consideration.

## Human

#### Data preparation for human

In [6]:
# Restrict both chain tables to TCRs whose host species is human.
human_species = 'HomoSapiens'
df_alpha_human = df_alpha.loc[df_alpha['species'].eq(human_species)]
df_beta_human = df_beta.loc[df_beta['species'].eq(human_species)]

In [7]:
# For the alpha chain keep only the pairing key (complex.id) and the
# CDR3 / V segment / J segment columns; everything else is dropped.
alpha_unused_columns = [
    'gene', 'species', 'mhc.a', 'mhc.b', 'mhc.class',
    'antigen.epitope', 'antigen.gene', 'antigen.species', 'vdjdb.score',
]
df_alpha_human = df_alpha_human.drop(columns=alpha_unused_columns)
df_alpha_human.head()


Unnamed: 0,complex.id,cdr3,v.segm,j.segm
0,1,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01
3,2,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01
7,4,CAYRPPGTYKYIF,TRAV38-2/DV8*01,TRAJ40*01
9,5,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01
12,6,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01


In [8]:
# Same trimming for the beta chain, except that 'antigen.species' is kept:
# it is the label used later to group the paired TCRs.
beta_unused_columns = [
    'gene', 'species', 'mhc.a', 'mhc.b', 'mhc.class',
    'antigen.epitope', 'antigen.gene', 'vdjdb.score',
]
df_beta_human = df_beta_human.drop(columns=beta_unused_columns)
df_beta_human.head()

Unnamed: 0,complex.id,cdr3,v.segm,j.segm,antigen.species
1,1,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HIV-1
4,2,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HIV-1
6,3,CASSYEPGQVSHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HIV-1
8,4,CASSALASLNEQFF,TRBV14*01,TRBJ2-1*01,HIV-1
10,5,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HIV-1


In [9]:
# Pair each alpha chain with the beta chain that shares its complex.id;
# once the rows are joined the key itself is no longer needed.
df_merge = (
    df_alpha_human
    .merge(df_beta_human, on='complex.id')
    .drop(columns=['complex.id'])
)

In [10]:
# The merge tags clashing column names with '_x' (left = alpha table) and
# '_y' (right = beta table); swap those suffixes for explicit chain names.
chain_suffixes = {'_x': '_alpha', '_y': '_beta'}
column_renames = {
    f'{field}{merge_suffix}': f'{field}{chain_suffix}'
    for merge_suffix, chain_suffix in chain_suffixes.items()
    for field in ('cdr3', 'v.segm', 'j.segm')
}
df_merge = df_merge.rename(columns=column_renames)
df_merge.head()

Unnamed: 0,cdr3_alpha,v.segm_alpha,j.segm_alpha,cdr3_beta,v.segm_beta,j.segm_beta,antigen.species
0,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HIV-1
1,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HIV-1
2,CAYRPPGTYKYIF,TRAV38-2/DV8*01,TRAJ40*01,CASSALASLNEQFF,TRBV14*01,TRBJ2-1*01,HIV-1
3,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HIV-1
4,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HIV-1


In [11]:
# Size of the paired alpha/beta table as (rows, columns).
df_merge.shape

(27550, 7)

In [12]:
# Distinct antigen species labels found among the paired human TCRs.
antigen_species = pd.unique(df_merge['antigen.species'])
print(antigen_species)

['HIV-1' 'HomoSapiens' 'EBV' 'M.tuberculosis' 'HTLV-1' 'InfluenzaA' 'CMV'
 'SaccharomycesCerevisiae' 'HCV' 'E.Coli' 'HHV' 'synthetic'
 'TriticumAestivum' 'DENV1' 'DENV3/4' 'PseudomonasFluorescens'
 'PseudomonasAeruginosa' 'SARS-CoV-2' 'HIV1' 'Homo sapiens' 'HSV-2' 'YFV'
 'HPV-16' 'MCPyV' 'HPV' 'StreptomycesKanamyceticus' 'Wheat' 'HCoV-HKU1']


Group the paired alpha and beta chains by antigen species. Each antigen species gets its own folder, named after the species, and the paired alpha and beta chains for that species are saved inside it as a .tsv file.

In [13]:
# Map each antigen species label to the sub-table of paired TCRs carrying it.
antigen_species_dict = {}

for antigen in antigen_species:
    # The keys are later used as folder and file names, and '/' is a path
    # separator, so 'DENV3/4' is stored under a slash-free key.
    dict_key = 'DENV3or4' if antigen == 'DENV3/4' else antigen
    antigen_species_dict[dict_key] = df_merge[df_merge['antigen.species'] == antigen]
        


If you are running this notebook for a second time, you do not need to run this cell again: the folders and files it writes already exist.

In [None]:
import os

# Root folder for the per-antigen exports; this is the directory that is
# later handed to DeepTCR's Get_Data.
folder_path = './alpha_beta_directory'

# makedirs(..., exist_ok=True) does not raise when the folder is already
# there, so this cell can safely be executed again.
os.makedirs(folder_path, exist_ok=True)

# The loop variable is deliberately not called `df`, so the raw VDJdb frame
# loaded at the top of the notebook is not overwritten.
for key, antigen_df in antigen_species_dict.items():
    # One sub-folder per antigen species, holding a single <species>.tsv file.
    path = os.path.join(folder_path, key)
    os.makedirs(path, exist_ok=True)
    file_path = os.path.join(path, f"{key}.tsv")
    antigen_df.to_csv(file_path, sep='\t', index=False)

### DeepTCR

In [14]:
from DeepTCR.DeepTCR import DeepTCR_U

# Zero-based positions of each field inside the exported .tsv files; they
# follow the column order of df_merge (alpha CDR3, V, J, then beta CDR3, V, J).
aa_column_alpha, v_alpha_column, j_alpha_column = 0, 1, 2   # alpha: CDR3 amino acids, V, J
aa_column_beta, v_beta_column, j_beta_column = 3, 4, 5      # beta: CDR3 amino acids, V, J

# Root folder that holds one sub-folder per antigen species.
alpha_beta_directory = './alpha_beta_directory'

# DeepTCR_U model object for the human data, registered under the name 'human'.
human = DeepTCR_U('human')



Load data

In [15]:
# Load every exported antigen folder into the model. The column positions
# are the named indices defined together with the model (0-5), so the
# layout of the .tsv files is described in exactly one place.
human.Get_Data(
    directory=alpha_beta_directory,
    Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_alpha=aa_column_alpha,
    v_alpha_column=v_alpha_column,
    j_alpha_column=j_alpha_column,
    aa_column_beta=aa_column_beta,
    v_beta_column=v_beta_column,
    j_beta_column=j_beta_column,
)

Loading Data...
Embedding Sequences...
Data Loaded


Train a variational autoencoder (VAE) that takes the CDR3 sequences from both the α- and β-chains, along with their corresponding V, D, and J gene usage, and learns a joint representation of these inputs. For more information, please read 'DeepTCR is a deep learning framework for revealing sequence concepts within T-cell repertoires' by Sidhom et al.


In [16]:
# Fit the DeepTCR VAE on the loaded human data; the loss and reconstruction
# accuracy of every iteration are logged in the output below.
# NOTE(review): Load_Prev_Data=False presumably forces a fresh training run
# instead of reusing saved results — confirm in the DeepTCR documentation.
human.Train_VAE(Load_Prev_Data=False)

  kernel_regularizer=tf.keras.regularizers.l2(l2_reg))
  return layer.apply(inputs)
  conv_out = tf.compat.v1.layers.flatten(tf.reduce_max(input_tensor=conv, axis=2))
  return layer.apply(inputs)
  conv = tf.compat.v1.layers.dropout(conv, prob)
  return layer.apply(inputs, training=training)
  kernel_regularizer=tf.keras.regularizers.l2(l2_reg))
  conv = tf.compat.v1.layers.dropout(conv, prob)
  conv_3_out = tf.compat.v1.layers.flatten(tf.reduce_max(input_tensor=conv_3,axis=2))
  return tf.compat.v1.layers.flatten(conv_3),conv_out,indices
  fc = tf.compat.v1.layers.dense(GO.Features, 256)
  return layer.apply(inputs)
  fc = tf.compat.v1.layers.dense(fc, latent_dim)
  z_log_var = tf.compat.v1.layers.dense(fc, latent_dim, activation=tf.nn.softplus, name='z_log_var')
  fc_up = tf.compat.v1.layers.dense(z, 128)
  fc_up = tf.compat.v1.layers.dense(fc_up, 256)
  upsample_beta = tf.compat.v1.layers.conv2d_transpose(upsample_beta, units[-1-_], (1, 3), (1, 2),activation=tf.nn.relu)
  return lay

Epoch = 0, Iteration = 0 Total Loss: 17.66009: Recon Loss: 17.57899: Latent Loss: 0.08111: Sparsity Loss: 0.00000: Recon Accuracy: 0.03441
Epoch = 0, Iteration = 1 Total Loss: 17.15816: Recon Loss: 17.07820: Latent Loss: 0.07996: Sparsity Loss: 0.00000: Recon Accuracy: 0.04558
Epoch = 0, Iteration = 2 Total Loss: 16.83628: Recon Loss: 16.75318: Latent Loss: 0.08310: Sparsity Loss: 0.00000: Recon Accuracy: 0.06174
Epoch = 1, Iteration = 0 Total Loss: 16.54614: Recon Loss: 16.44696: Latent Loss: 0.09919: Sparsity Loss: 0.00000: Recon Accuracy: 0.08205
Epoch = 1, Iteration = 1 Total Loss: 16.37320: Recon Loss: 16.22873: Latent Loss: 0.14447: Sparsity Loss: 0.00000: Recon Accuracy: 0.10015
Epoch = 1, Iteration = 2 Total Loss: 16.15807: Recon Loss: 16.02844: Latent Loss: 0.12963: Sparsity Loss: 0.00000: Recon Accuracy: 0.11792
Epoch = 2, Iteration = 0 Total Loss: 16.03386: Recon Loss: 15.92674: Latent Loss: 0.10712: Sparsity Loss: 0.00000: Recon Accuracy: 0.13238
Epoch = 2, Iteration = 1 To

Epoch = 20, Iteration = 0 Total Loss: 5.02523: Recon Loss: 4.59836: Latent Loss: 0.42687: Sparsity Loss: 0.00000: Recon Accuracy: 0.65062
Epoch = 20, Iteration = 1 Total Loss: 4.89519: Recon Loss: 4.45718: Latent Loss: 0.43801: Sparsity Loss: 0.00000: Recon Accuracy: 0.65759
Epoch = 20, Iteration = 2 Total Loss: 4.74789: Recon Loss: 4.30239: Latent Loss: 0.44550: Sparsity Loss: 0.00000: Recon Accuracy: 0.66406
Epoch = 21, Iteration = 0 Total Loss: 4.50551: Recon Loss: 4.04323: Latent Loss: 0.46228: Sparsity Loss: 0.00000: Recon Accuracy: 0.67590
Epoch = 21, Iteration = 1 Total Loss: 4.34421: Recon Loss: 3.86575: Latent Loss: 0.47845: Sparsity Loss: 0.00000: Recon Accuracy: 0.68659
Epoch = 21, Iteration = 2 Total Loss: 4.15407: Recon Loss: 3.66490: Latent Loss: 0.48917: Sparsity Loss: 0.00000: Recon Accuracy: 0.69926
Epoch = 22, Iteration = 0 Total Loss: 4.02793: Recon Loss: 3.52493: Latent Loss: 0.50300: Sparsity Loss: 0.00000: Recon Accuracy: 0.70442
Epoch = 22, Iteration = 1 Total Lo

Epoch = 40, Iteration = 0 Total Loss: 1.82529: Recon Loss: 1.40490: Latent Loss: 0.42039: Sparsity Loss: 0.00000: Recon Accuracy: 0.81326
Epoch = 40, Iteration = 1 Total Loss: 1.80029: Recon Loss: 1.38153: Latent Loss: 0.41876: Sparsity Loss: 0.00000: Recon Accuracy: 0.81449
Epoch = 40, Iteration = 2 Total Loss: 1.79274: Recon Loss: 1.37740: Latent Loss: 0.41533: Sparsity Loss: 0.00000: Recon Accuracy: 0.81355
Epoch = 41, Iteration = 0 Total Loss: 1.79062: Recon Loss: 1.37758: Latent Loss: 0.41303: Sparsity Loss: 0.00000: Recon Accuracy: 0.81447
Epoch = 41, Iteration = 1 Total Loss: 1.78602: Recon Loss: 1.37667: Latent Loss: 0.40935: Sparsity Loss: 0.00000: Recon Accuracy: 0.81472
Epoch = 41, Iteration = 2 Total Loss: 1.78098: Recon Loss: 1.37200: Latent Loss: 0.40897: Sparsity Loss: 0.00000: Recon Accuracy: 0.81608
Epoch = 42, Iteration = 0 Total Loss: 1.77789: Recon Loss: 1.37032: Latent Loss: 0.40757: Sparsity Loss: 0.00000: Recon Accuracy: 0.81603
Epoch = 42, Iteration = 1 Total Lo

Epoch = 60, Iteration = 0 Total Loss: 1.57421: Recon Loss: 1.22860: Latent Loss: 0.34561: Sparsity Loss: 0.00000: Recon Accuracy: 0.82842
Epoch = 60, Iteration = 1 Total Loss: 1.57019: Recon Loss: 1.22479: Latent Loss: 0.34540: Sparsity Loss: 0.00000: Recon Accuracy: 0.82869
Epoch = 60, Iteration = 2 Total Loss: 1.56330: Recon Loss: 1.21994: Latent Loss: 0.34337: Sparsity Loss: 0.00000: Recon Accuracy: 0.82969
Epoch = 61, Iteration = 0 Total Loss: 1.56796: Recon Loss: 1.22440: Latent Loss: 0.34357: Sparsity Loss: 0.00000: Recon Accuracy: 0.82911
Epoch = 61, Iteration = 1 Total Loss: 1.56430: Recon Loss: 1.22150: Latent Loss: 0.34280: Sparsity Loss: 0.00000: Recon Accuracy: 0.82975
Epoch = 61, Iteration = 2 Total Loss: 1.54851: Recon Loss: 1.20570: Latent Loss: 0.34280: Sparsity Loss: 0.00000: Recon Accuracy: 0.83037
Epoch = 62, Iteration = 0 Total Loss: 1.56030: Recon Loss: 1.21779: Latent Loss: 0.34250: Sparsity Loss: 0.00000: Recon Accuracy: 0.82982
Epoch = 62, Iteration = 1 Total Lo

Epoch = 80, Iteration = 0 Total Loss: 1.43528: Recon Loss: 1.11046: Latent Loss: 0.32483: Sparsity Loss: 0.00000: Recon Accuracy: 0.84516
Epoch = 80, Iteration = 1 Total Loss: 1.42894: Recon Loss: 1.10374: Latent Loss: 0.32521: Sparsity Loss: 0.00000: Recon Accuracy: 0.84599
Epoch = 80, Iteration = 2 Total Loss: 1.42328: Recon Loss: 1.09871: Latent Loss: 0.32457: Sparsity Loss: 0.00000: Recon Accuracy: 0.84590
Epoch = 81, Iteration = 0 Total Loss: 1.42933: Recon Loss: 1.10663: Latent Loss: 0.32270: Sparsity Loss: 0.00000: Recon Accuracy: 0.84611
Epoch = 81, Iteration = 1 Total Loss: 1.41954: Recon Loss: 1.09668: Latent Loss: 0.32286: Sparsity Loss: 0.00000: Recon Accuracy: 0.84654
Epoch = 81, Iteration = 2 Total Loss: 1.42881: Recon Loss: 1.10553: Latent Loss: 0.32328: Sparsity Loss: 0.00000: Recon Accuracy: 0.84683
Epoch = 82, Iteration = 0 Total Loss: 1.41925: Recon Loss: 1.09722: Latent Loss: 0.32203: Sparsity Loss: 0.00000: Recon Accuracy: 0.84653
Epoch = 82, Iteration = 1 Total Lo

Epoch = 100, Iteration = 0 Total Loss: 1.34147: Recon Loss: 1.02803: Latent Loss: 0.31344: Sparsity Loss: 0.00000: Recon Accuracy: 0.85756
Epoch = 100, Iteration = 1 Total Loss: 1.33579: Recon Loss: 1.02245: Latent Loss: 0.31333: Sparsity Loss: 0.00000: Recon Accuracy: 0.85861
Epoch = 100, Iteration = 2 Total Loss: 1.33558: Recon Loss: 1.02242: Latent Loss: 0.31315: Sparsity Loss: 0.00000: Recon Accuracy: 0.85870
Epoch = 101, Iteration = 0 Total Loss: 1.33055: Recon Loss: 1.01732: Latent Loss: 0.31323: Sparsity Loss: 0.00000: Recon Accuracy: 0.85922
Epoch = 101, Iteration = 1 Total Loss: 1.33152: Recon Loss: 1.01818: Latent Loss: 0.31334: Sparsity Loss: 0.00000: Recon Accuracy: 0.85893
Epoch = 101, Iteration = 2 Total Loss: 1.34052: Recon Loss: 1.02688: Latent Loss: 0.31364: Sparsity Loss: 0.00000: Recon Accuracy: 0.85809
Epoch = 102, Iteration = 0 Total Loss: 1.33086: Recon Loss: 1.01745: Latent Loss: 0.31342: Sparsity Loss: 0.00000: Recon Accuracy: 0.85949
Epoch = 102, Iteration = 1 

Epoch = 119, Iteration = 2 Total Loss: 1.27792: Recon Loss: 0.96765: Latent Loss: 0.31027: Sparsity Loss: 0.00000: Recon Accuracy: 0.86798
Epoch = 120, Iteration = 0 Total Loss: 1.27261: Recon Loss: 0.96325: Latent Loss: 0.30936: Sparsity Loss: 0.00000: Recon Accuracy: 0.86829
Epoch = 120, Iteration = 1 Total Loss: 1.27447: Recon Loss: 0.96550: Latent Loss: 0.30897: Sparsity Loss: 0.00000: Recon Accuracy: 0.86831
Epoch = 120, Iteration = 2 Total Loss: 1.26882: Recon Loss: 0.96025: Latent Loss: 0.30857: Sparsity Loss: 0.00000: Recon Accuracy: 0.86874
Epoch = 121, Iteration = 0 Total Loss: 1.26735: Recon Loss: 0.95850: Latent Loss: 0.30886: Sparsity Loss: 0.00000: Recon Accuracy: 0.86936
Epoch = 121, Iteration = 1 Total Loss: 1.26936: Recon Loss: 0.95934: Latent Loss: 0.31002: Sparsity Loss: 0.00000: Recon Accuracy: 0.86919
Epoch = 121, Iteration = 2 Total Loss: 1.26944: Recon Loss: 0.96042: Latent Loss: 0.30902: Sparsity Loss: 0.00000: Recon Accuracy: 0.86890
Epoch = 122, Iteration = 0 

Epoch = 139, Iteration = 1 Total Loss: 1.21679: Recon Loss: 0.90831: Latent Loss: 0.30849: Sparsity Loss: 0.00000: Recon Accuracy: 0.87779
Epoch = 139, Iteration = 2 Total Loss: 1.22014: Recon Loss: 0.91081: Latent Loss: 0.30933: Sparsity Loss: 0.00000: Recon Accuracy: 0.87731
Epoch = 140, Iteration = 0 Total Loss: 1.21465: Recon Loss: 0.90608: Latent Loss: 0.30858: Sparsity Loss: 0.00000: Recon Accuracy: 0.87776
Epoch = 140, Iteration = 1 Total Loss: 1.22128: Recon Loss: 0.91261: Latent Loss: 0.30868: Sparsity Loss: 0.00000: Recon Accuracy: 0.87730
Epoch = 140, Iteration = 2 Total Loss: 1.21625: Recon Loss: 0.90834: Latent Loss: 0.30792: Sparsity Loss: 0.00000: Recon Accuracy: 0.87874
Epoch = 141, Iteration = 0 Total Loss: 1.21650: Recon Loss: 0.90852: Latent Loss: 0.30798: Sparsity Loss: 0.00000: Recon Accuracy: 0.87727
Epoch = 141, Iteration = 1 Total Loss: 1.21414: Recon Loss: 0.90732: Latent Loss: 0.30682: Sparsity Loss: 0.00000: Recon Accuracy: 0.87776
Epoch = 141, Iteration = 2 

Epoch = 159, Iteration = 0 Total Loss: 1.16425: Recon Loss: 0.85429: Latent Loss: 0.30996: Sparsity Loss: 0.00000: Recon Accuracy: 0.88622
Epoch = 159, Iteration = 1 Total Loss: 1.17429: Recon Loss: 0.86320: Latent Loss: 0.31109: Sparsity Loss: 0.00000: Recon Accuracy: 0.88562
Epoch = 159, Iteration = 2 Total Loss: 1.16593: Recon Loss: 0.85640: Latent Loss: 0.30953: Sparsity Loss: 0.00000: Recon Accuracy: 0.88623
Epoch = 160, Iteration = 0 Total Loss: 1.15990: Recon Loss: 0.85088: Latent Loss: 0.30901: Sparsity Loss: 0.00000: Recon Accuracy: 0.88730
Epoch = 160, Iteration = 1 Total Loss: 1.16166: Recon Loss: 0.85351: Latent Loss: 0.30815: Sparsity Loss: 0.00000: Recon Accuracy: 0.88605
Epoch = 160, Iteration = 2 Total Loss: 1.17072: Recon Loss: 0.86129: Latent Loss: 0.30944: Sparsity Loss: 0.00000: Recon Accuracy: 0.88633
Epoch = 161, Iteration = 0 Total Loss: 1.17068: Recon Loss: 0.85960: Latent Loss: 0.31109: Sparsity Loss: 0.00000: Recon Accuracy: 0.88632
Epoch = 161, Iteration = 1 

Epoch = 178, Iteration = 2 Total Loss: 1.11145: Recon Loss: 0.80444: Latent Loss: 0.30701: Sparsity Loss: 0.00000: Recon Accuracy: 0.89510
Epoch = 179, Iteration = 0 Total Loss: 1.11704: Recon Loss: 0.81113: Latent Loss: 0.30591: Sparsity Loss: 0.00000: Recon Accuracy: 0.89399
Epoch = 179, Iteration = 1 Total Loss: 1.11972: Recon Loss: 0.81336: Latent Loss: 0.30637: Sparsity Loss: 0.00000: Recon Accuracy: 0.89429
Epoch = 179, Iteration = 2 Total Loss: 1.12101: Recon Loss: 0.81453: Latent Loss: 0.30649: Sparsity Loss: 0.00000: Recon Accuracy: 0.89463
Epoch = 180, Iteration = 0 Total Loss: 1.11375: Recon Loss: 0.80771: Latent Loss: 0.30604: Sparsity Loss: 0.00000: Recon Accuracy: 0.89459
Epoch = 180, Iteration = 1 Total Loss: 1.11618: Recon Loss: 0.81121: Latent Loss: 0.30498: Sparsity Loss: 0.00000: Recon Accuracy: 0.89508
Epoch = 180, Iteration = 2 Total Loss: 1.11092: Recon Loss: 0.80601: Latent Loss: 0.30490: Sparsity Loss: 0.00000: Recon Accuracy: 0.89453
Epoch = 181, Iteration = 0 

Epoch = 198, Iteration = 1 Total Loss: 1.07964: Recon Loss: 0.77686: Latent Loss: 0.30279: Sparsity Loss: 0.00000: Recon Accuracy: 0.90021
Epoch = 198, Iteration = 2 Total Loss: 1.07610: Recon Loss: 0.77286: Latent Loss: 0.30324: Sparsity Loss: 0.00000: Recon Accuracy: 0.90084
Epoch = 199, Iteration = 0 Total Loss: 1.07704: Recon Loss: 0.77479: Latent Loss: 0.30226: Sparsity Loss: 0.00000: Recon Accuracy: 0.90015
Epoch = 199, Iteration = 1 Total Loss: 1.07340: Recon Loss: 0.77104: Latent Loss: 0.30236: Sparsity Loss: 0.00000: Recon Accuracy: 0.90046
Epoch = 199, Iteration = 2 Total Loss: 1.07789: Recon Loss: 0.77488: Latent Loss: 0.30301: Sparsity Loss: 0.00000: Recon Accuracy: 0.90033
Epoch = 200, Iteration = 0 Total Loss: 1.07210: Recon Loss: 0.76798: Latent Loss: 0.30413: Sparsity Loss: 0.00000: Recon Accuracy: 0.90046
Epoch = 200, Iteration = 1 Total Loss: 1.08177: Recon Loss: 0.77630: Latent Loss: 0.30547: Sparsity Loss: 0.00000: Recon Accuracy: 0.90053
Epoch = 200, Iteration = 2 

Extract features.

In [33]:
# Feature matrix stored on the model after training; .shape reports
# (number of rows, number of feature dimensions).
human_features = human.features
human_features.shape

(26249, 256)

The final feature matrix has only 26,249 rows, whereas the merged table had 27,550, so about 1,300 rows are missing. The reasons may be somewhat complex. I think the main reason is that some TCRs share the same CDR3 sequences and V/J segment information, or that they were found to carry similar information when the neural network processed them (note that the data were loaded with `aggregate_by_aa=True`, which may collapse identical sequences). Overall, these are only my own assumptions, and the true causes need further investigation.


Distance

In [34]:
from scipy.spatial import distance

# Pairwise Euclidean distance between every pair of human feature vectors.
# With 26249 rows the float64 result has 26249 x 26249 entries (about 5.5 GB),
# so make sure enough memory is available before running this cell.
human_distance_matrix = distance.cdist(
    XA=human_features,
    XB=human_features,
    metric='euclidean',
)

In [35]:
# Show a truncated view of the human pairwise distance matrix.
print(human_distance_matrix)

[[ 0.         20.24218222 20.30243341 ... 26.33789178 26.25996382
  29.282374  ]
 [20.24218222  0.         19.96508601 ... 22.67847028 22.65769975
  23.74018358]
 [20.30243341 19.96508601  0.         ... 24.37955368 24.35895216
  27.1676564 ]
 ...
 [26.33789178 22.67847028 24.37955368 ...  0.          0.29928855
  25.03429252]
 [26.25996382 22.65769975 24.35895216 ...  0.29928855  0.
  24.99474488]
 [29.282374   23.74018358 27.1676564  ... 25.03429252 24.99474488
   0.        ]]


## Mouse

#### Data preparation for mouse

For other species, the procedure is the same as for human.

In [17]:
# Restrict both chain tables to TCRs whose host species is mouse.
mouse_species = 'MusMusculus'
df_alpha_mouse = df_alpha.loc[df_alpha['species'].eq(mouse_species)]
df_beta_mouse = df_beta.loc[df_beta['species'].eq(mouse_species)]

In [18]:
# For the alpha chain keep only the pairing key (complex.id) and the
# CDR3 / V segment / J segment columns.
alpha_mouse_unused_columns = [
    'gene', 'species', 'mhc.a', 'mhc.b', 'mhc.class',
    'antigen.epitope', 'antigen.gene', 'antigen.species', 'vdjdb.score',
]
df_alpha_mouse = df_alpha_mouse.drop(columns=alpha_mouse_unused_columns)

In [19]:
# Same trimming for the beta chain, but 'antigen.species' stays because it
# is the label used later to group the paired TCRs.
beta_mouse_unused_columns = [
    'gene', 'species', 'mhc.a', 'mhc.b', 'mhc.class',
    'antigen.epitope', 'antigen.gene', 'vdjdb.score',
]
df_beta_mouse = df_beta_mouse.drop(columns=beta_mouse_unused_columns)

In [20]:
# Pair each mouse alpha chain with the beta chain sharing its complex.id,
# then drop the key, which is not needed after the join.
df_merge_mouse = (
    df_alpha_mouse
    .merge(df_beta_mouse, on='complex.id')
    .drop(columns=['complex.id'])
)

In [21]:
# Replace the merge suffixes ('_x' = alpha table, '_y' = beta table) with
# explicit chain names.
mouse_column_renames = {}
for field in ('cdr3', 'v.segm', 'j.segm'):
    mouse_column_renames[f'{field}_x'] = f'{field}_alpha'
    mouse_column_renames[f'{field}_y'] = f'{field}_beta'

df_merge_mouse = df_merge_mouse.rename(columns=mouse_column_renames)
df_merge_mouse.head()

Unnamed: 0,cdr3_alpha,v.segm_alpha,j.segm_alpha,cdr3_beta,v.segm_beta,j.segm_beta,antigen.species
0,CAMRGDYGGSGNKLIF,TRAV16*01,TRAJ32*01,CTCSADRVGNTLYF,TRBV1*01,TRBJ1-3*01,MusMusculus
1,CAVSGFASALTF,TRAV9-4*01,TRAJ35*01,CASGGGGTLYF,TRBV13-2*01,TRBJ2-4*01,Synthetic
2,CAARYQGGRALIF,TRAV14-1*01,TRAJ15*01,CTCSAAPDWGASAETLYF,TRBV1*01,TRBJ2-3*01,MusMusculus
3,CAVSGFASALTF,TRAV9-4*01,TRAJ35*01,CASGGGGTLYF,TRBV13-2*01,TRBJ2-4*01,MusMusculus
4,CAMRGDYGGSGNKLIF,TRAV16*01,TRAJ32*01,CTCSADRVGNTLYF,TRBV1*01,TRBJ1-3*01,VSV


In [22]:
# Distinct antigen species labels found among the paired mouse TCRs.
antigen_species_mouse = pd.unique(df_merge_mouse['antigen.species'])
print(antigen_species_mouse)

['MusMusculus' 'Synthetic' 'VSV' 'GallusGallus' 'InfluenzaA'
 'ManducaSexta' 'HomoSapiens' 'synthetic' 'LCMV' 'PlasmodiumBerghei'
 'MCMV']


"Synthetic" and "synthetic" are the same antigen.

In [None]:
# Unify the two spellings of the synthetic antigen label.
# antigen_species_mouse is a plain array of labels, so it cannot be indexed
# by column name; the relabelling has to happen on the 'antigen.species'
# column of the paired table, after which the label list is rebuilt from it.
df_merge_mouse = df_merge_mouse.replace({'antigen.species': {'synthetic': 'Synthetic'}})
antigen_species_mouse = df_merge_mouse['antigen.species'].unique()

In [23]:
# Map each mouse antigen species label to the sub-table of paired TCRs carrying it.
antigen_species_mouse_dict = {}

for antigen in antigen_species_mouse:
    # 'synthetic' and 'Synthetic' differ only by case; the lower-case label is
    # stored under 'Synthetic--' so that the two stay distinguishable.
    # NOTE(review): presumably needed because the keys become folder names and
    # some filesystems are case-insensitive — confirm.
    dict_key = 'Synthetic--' if antigen == 'synthetic' else antigen
    antigen_species_mouse_dict[dict_key] = df_merge_mouse[df_merge_mouse['antigen.species'] == antigen]
        

If you are running this notebook for a second time, you do not need to run this cell again: the folders and files it writes already exist.

In [None]:
# Imported here as well so this cell does not depend on the human export
# cell having been executed first.
import os

# Root folder for the per-antigen mouse exports; this is the directory that
# is later handed to DeepTCR's Get_Data.
folder_path = './alpha_beta_mouse_directory'

# makedirs(..., exist_ok=True) does not raise when the folder is already
# there, so this cell can safely be executed again.
os.makedirs(folder_path, exist_ok=True)

# The loop variable is deliberately not called `df`, so the raw VDJdb frame
# loaded at the top of the notebook is not overwritten.
for key, antigen_df in antigen_species_mouse_dict.items():
    # One sub-folder per antigen species, holding a single <species>.tsv file.
    path = os.path.join(folder_path, key)
    os.makedirs(path, exist_ok=True)
    file_path = os.path.join(path, f"{key}.tsv")
    antigen_df.to_csv(file_path, sep='\t', index=False)

### DeepTCR

In [24]:
# Zero-based positions of each field inside the exported .tsv files; they
# follow the column order of df_merge_mouse (alpha CDR3, V, J, then beta CDR3, V, J).
aa_column_alpha, v_alpha_column, j_alpha_column = 0, 1, 2   # alpha: CDR3 amino acids, V, J
aa_column_beta, v_beta_column, j_beta_column = 3, 4, 5      # beta: CDR3 amino acids, V, J

# Root folder that holds one sub-folder per antigen species.
alpha_beta_mouse_directory = './alpha_beta_mouse_directory'

# DeepTCR_U model object for the mouse data, registered under the name 'mouse'.
mouse = DeepTCR_U('mouse')


In [25]:
# Load every exported mouse antigen folder into the model. The column
# positions are the named indices defined together with the model (0-5).
mouse.Get_Data(
    directory=alpha_beta_mouse_directory,
    Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_alpha=aa_column_alpha,
    v_alpha_column=v_alpha_column,
    j_alpha_column=j_alpha_column,
    aa_column_beta=aa_column_beta,
    v_beta_column=v_beta_column,
    j_beta_column=j_beta_column,
)

Loading Data...
Embedding Sequences...
Data Loaded


In [32]:
# Fit the DeepTCR VAE on the loaded mouse data; the loss and reconstruction
# accuracy of every iteration are logged in the output below.
# NOTE(review): Load_Prev_Data=False presumably forces a fresh training run
# instead of reusing saved results — confirm in the DeepTCR documentation.
mouse.Train_VAE(Load_Prev_Data=False)

  kernel_regularizer=tf.keras.regularizers.l2(l2_reg))
  return layer.apply(inputs)
  conv_out = tf.compat.v1.layers.flatten(tf.reduce_max(input_tensor=conv, axis=2))
  return layer.apply(inputs)
  conv = tf.compat.v1.layers.dropout(conv, prob)
  return layer.apply(inputs, training=training)
  kernel_regularizer=tf.keras.regularizers.l2(l2_reg))
  conv = tf.compat.v1.layers.dropout(conv, prob)
  conv_3_out = tf.compat.v1.layers.flatten(tf.reduce_max(input_tensor=conv_3,axis=2))
  return tf.compat.v1.layers.flatten(conv_3),conv_out,indices
  fc = tf.compat.v1.layers.dense(GO.Features, 256)
  return layer.apply(inputs)
  fc = tf.compat.v1.layers.dense(fc, latent_dim)
  z_log_var = tf.compat.v1.layers.dense(fc, latent_dim, activation=tf.nn.softplus, name='z_log_var')
  fc_up = tf.compat.v1.layers.dense(z, 128)
  fc_up = tf.compat.v1.layers.dense(fc_up, 256)
  upsample_beta = tf.compat.v1.layers.conv2d_transpose(upsample_beta, units[-1-_], (1, 3), (1, 2),activation=tf.nn.relu)
  return lay

Epoch = 0, Iteration = 0 Total Loss: 17.14811: Recon Loss: 17.06717: Latent Loss: 0.08093: Sparsity Loss: 0.00000: Recon Accuracy: 0.05702
Epoch = 1, Iteration = 0 Total Loss: 16.64240: Recon Loss: 16.56120: Latent Loss: 0.08119: Sparsity Loss: 0.00000: Recon Accuracy: 0.06925
Epoch = 2, Iteration = 0 Total Loss: 16.24068: Recon Loss: 16.15475: Latent Loss: 0.08594: Sparsity Loss: 0.00000: Recon Accuracy: 0.09522
Epoch = 3, Iteration = 0 Total Loss: 15.84255: Recon Loss: 15.73997: Latent Loss: 0.10258: Sparsity Loss: 0.00000: Recon Accuracy: 0.12500
Epoch = 4, Iteration = 0 Total Loss: 15.58827: Recon Loss: 15.43892: Latent Loss: 0.14935: Sparsity Loss: 0.00000: Recon Accuracy: 0.15001
Epoch = 5, Iteration = 0 Total Loss: 15.20001: Recon Loss: 15.06377: Latent Loss: 0.13624: Sparsity Loss: 0.00000: Recon Accuracy: 0.18345
Epoch = 6, Iteration = 0 Total Loss: 14.88828: Recon Loss: 14.75295: Latent Loss: 0.13534: Sparsity Loss: 0.00000: Recon Accuracy: 0.20258
Epoch = 7, Iteration = 0 To

Epoch = 60, Iteration = 0 Total Loss: 4.70896: Recon Loss: 4.34056: Latent Loss: 0.36840: Sparsity Loss: 0.00000: Recon Accuracy: 0.71264
Epoch = 61, Iteration = 0 Total Loss: 4.61321: Recon Loss: 4.24071: Latent Loss: 0.37250: Sparsity Loss: 0.00000: Recon Accuracy: 0.71565
Epoch = 62, Iteration = 0 Total Loss: 4.50757: Recon Loss: 4.13044: Latent Loss: 0.37713: Sparsity Loss: 0.00000: Recon Accuracy: 0.72245
Epoch = 63, Iteration = 0 Total Loss: 4.39807: Recon Loss: 4.01757: Latent Loss: 0.38050: Sparsity Loss: 0.00000: Recon Accuracy: 0.72758
Epoch = 64, Iteration = 0 Total Loss: 4.30393: Recon Loss: 3.92263: Latent Loss: 0.38130: Sparsity Loss: 0.00000: Recon Accuracy: 0.72877
Epoch = 65, Iteration = 0 Total Loss: 4.25531: Recon Loss: 3.86789: Latent Loss: 0.38742: Sparsity Loss: 0.00000: Recon Accuracy: 0.73504
Epoch = 66, Iteration = 0 Total Loss: 4.09908: Recon Loss: 3.71274: Latent Loss: 0.38634: Sparsity Loss: 0.00000: Recon Accuracy: 0.73833
Epoch = 67, Iteration = 0 Total Lo

Epoch = 120, Iteration = 0 Total Loss: 1.37626: Recon Loss: 0.95340: Latent Loss: 0.42286: Sparsity Loss: 0.00000: Recon Accuracy: 0.88086
Epoch = 121, Iteration = 0 Total Loss: 1.34665: Recon Loss: 0.92531: Latent Loss: 0.42134: Sparsity Loss: 0.00000: Recon Accuracy: 0.88364
Epoch = 122, Iteration = 0 Total Loss: 1.36002: Recon Loss: 0.94117: Latent Loss: 0.41884: Sparsity Loss: 0.00000: Recon Accuracy: 0.88280
Epoch = 123, Iteration = 0 Total Loss: 1.33945: Recon Loss: 0.92321: Latent Loss: 0.41624: Sparsity Loss: 0.00000: Recon Accuracy: 0.88442
Epoch = 124, Iteration = 0 Total Loss: 1.30420: Recon Loss: 0.89165: Latent Loss: 0.41255: Sparsity Loss: 0.00000: Recon Accuracy: 0.88579
Epoch = 125, Iteration = 0 Total Loss: 1.32391: Recon Loss: 0.91435: Latent Loss: 0.40956: Sparsity Loss: 0.00000: Recon Accuracy: 0.88366
Epoch = 126, Iteration = 0 Total Loss: 1.30700: Recon Loss: 0.89880: Latent Loss: 0.40820: Sparsity Loss: 0.00000: Recon Accuracy: 0.88614
Epoch = 127, Iteration = 0 

Epoch = 179, Iteration = 0 Total Loss: 0.98848: Recon Loss: 0.66295: Latent Loss: 0.32554: Sparsity Loss: 0.00000: Recon Accuracy: 0.90805
Epoch = 180, Iteration = 0 Total Loss: 0.97828: Recon Loss: 0.65343: Latent Loss: 0.32485: Sparsity Loss: 0.00000: Recon Accuracy: 0.90953
Epoch = 181, Iteration = 0 Total Loss: 0.98260: Recon Loss: 0.65893: Latent Loss: 0.32367: Sparsity Loss: 0.00000: Recon Accuracy: 0.90872
Epoch = 182, Iteration = 0 Total Loss: 0.96776: Recon Loss: 0.64549: Latent Loss: 0.32226: Sparsity Loss: 0.00000: Recon Accuracy: 0.90932
Epoch = 183, Iteration = 0 Total Loss: 0.98173: Recon Loss: 0.66013: Latent Loss: 0.32161: Sparsity Loss: 0.00000: Recon Accuracy: 0.90808
Epoch = 184, Iteration = 0 Total Loss: 0.97671: Recon Loss: 0.65641: Latent Loss: 0.32029: Sparsity Loss: 0.00000: Recon Accuracy: 0.90916
Epoch = 185, Iteration = 0 Total Loss: 0.98636: Recon Loss: 0.66660: Latent Loss: 0.31976: Sparsity Loss: 0.00000: Recon Accuracy: 0.90863
Epoch = 186, Iteration = 0 

Epoch = 238, Iteration = 0 Total Loss: 0.84122: Recon Loss: 0.53971: Latent Loss: 0.30151: Sparsity Loss: 0.00000: Recon Accuracy: 0.92301
Epoch = 239, Iteration = 0 Total Loss: 0.84640: Recon Loss: 0.54572: Latent Loss: 0.30068: Sparsity Loss: 0.00000: Recon Accuracy: 0.92310
Epoch = 240, Iteration = 0 Total Loss: 0.84674: Recon Loss: 0.54602: Latent Loss: 0.30072: Sparsity Loss: 0.00000: Recon Accuracy: 0.92214
Epoch = 241, Iteration = 0 Total Loss: 0.84022: Recon Loss: 0.53880: Latent Loss: 0.30142: Sparsity Loss: 0.00000: Recon Accuracy: 0.92365
Epoch = 242, Iteration = 0 Total Loss: 0.82827: Recon Loss: 0.52650: Latent Loss: 0.30176: Sparsity Loss: 0.00000: Recon Accuracy: 0.92439
Epoch = 243, Iteration = 0 Total Loss: 0.81928: Recon Loss: 0.51849: Latent Loss: 0.30079: Sparsity Loss: 0.00000: Recon Accuracy: 0.92544
Epoch = 244, Iteration = 0 Total Loss: 0.81793: Recon Loss: 0.51974: Latent Loss: 0.29819: Sparsity Loss: 0.00000: Recon Accuracy: 0.92484
Epoch = 245, Iteration = 0 

Epoch = 297, Iteration = 0 Total Loss: 0.73994: Recon Loss: 0.45407: Latent Loss: 0.28586: Sparsity Loss: 0.00000: Recon Accuracy: 0.93449
Epoch = 298, Iteration = 0 Total Loss: 0.73378: Recon Loss: 0.44765: Latent Loss: 0.28613: Sparsity Loss: 0.00000: Recon Accuracy: 0.93555
Epoch = 299, Iteration = 0 Total Loss: 0.73711: Recon Loss: 0.45043: Latent Loss: 0.28668: Sparsity Loss: 0.00000: Recon Accuracy: 0.93579
Epoch = 300, Iteration = 0 Total Loss: 0.73938: Recon Loss: 0.45349: Latent Loss: 0.28588: Sparsity Loss: 0.00000: Recon Accuracy: 0.93497
Epoch = 301, Iteration = 0 Total Loss: 0.72932: Recon Loss: 0.44481: Latent Loss: 0.28451: Sparsity Loss: 0.00000: Recon Accuracy: 0.93546
Epoch = 302, Iteration = 0 Total Loss: 0.73621: Recon Loss: 0.45259: Latent Loss: 0.28361: Sparsity Loss: 0.00000: Recon Accuracy: 0.93527
Epoch = 303, Iteration = 0 Total Loss: 0.73347: Recon Loss: 0.45009: Latent Loss: 0.28338: Sparsity Loss: 0.00000: Recon Accuracy: 0.93524
Epoch = 304, Iteration = 0 

Epoch = 356, Iteration = 0 Total Loss: 0.65810: Recon Loss: 0.37394: Latent Loss: 0.28416: Sparsity Loss: 0.00000: Recon Accuracy: 0.94539
Epoch = 357, Iteration = 0 Total Loss: 0.65025: Recon Loss: 0.36571: Latent Loss: 0.28453: Sparsity Loss: 0.00000: Recon Accuracy: 0.94575
Epoch = 358, Iteration = 0 Total Loss: 0.66058: Recon Loss: 0.37613: Latent Loss: 0.28445: Sparsity Loss: 0.00000: Recon Accuracy: 0.94433
Epoch = 359, Iteration = 0 Total Loss: 0.68297: Recon Loss: 0.39862: Latent Loss: 0.28435: Sparsity Loss: 0.00000: Recon Accuracy: 0.94337
Epoch = 360, Iteration = 0 Total Loss: 0.66091: Recon Loss: 0.37709: Latent Loss: 0.28381: Sparsity Loss: 0.00000: Recon Accuracy: 0.94530
Epoch = 361, Iteration = 0 Total Loss: 0.67250: Recon Loss: 0.38943: Latent Loss: 0.28307: Sparsity Loss: 0.00000: Recon Accuracy: 0.94531
Epoch = 362, Iteration = 0 Total Loss: 0.64417: Recon Loss: 0.36196: Latent Loss: 0.28220: Sparsity Loss: 0.00000: Recon Accuracy: 0.94589
Epoch = 363, Iteration = 0 

Epoch = 415, Iteration = 0 Total Loss: 0.60745: Recon Loss: 0.32678: Latent Loss: 0.28067: Sparsity Loss: 0.00000: Recon Accuracy: 0.95158
Epoch = 416, Iteration = 0 Total Loss: 0.60482: Recon Loss: 0.32378: Latent Loss: 0.28103: Sparsity Loss: 0.00000: Recon Accuracy: 0.95265
Epoch = 417, Iteration = 0 Total Loss: 0.60970: Recon Loss: 0.32889: Latent Loss: 0.28081: Sparsity Loss: 0.00000: Recon Accuracy: 0.95209
Epoch = 418, Iteration = 0 Total Loss: 0.61153: Recon Loss: 0.33134: Latent Loss: 0.28019: Sparsity Loss: 0.00000: Recon Accuracy: 0.95175
Epoch = 419, Iteration = 0 Total Loss: 0.60616: Recon Loss: 0.32610: Latent Loss: 0.28006: Sparsity Loss: 0.00000: Recon Accuracy: 0.95192
Epoch = 420, Iteration = 0 Total Loss: 0.60351: Recon Loss: 0.32336: Latent Loss: 0.28014: Sparsity Loss: 0.00000: Recon Accuracy: 0.95197
Epoch = 421, Iteration = 0 Total Loss: 0.60331: Recon Loss: 0.32266: Latent Loss: 0.28065: Sparsity Loss: 0.00000: Recon Accuracy: 0.95267
Epoch = 422, Iteration = 0 

Epoch = 474, Iteration = 0 Total Loss: 0.54785: Recon Loss: 0.27599: Latent Loss: 0.27186: Sparsity Loss: 0.00000: Recon Accuracy: 0.95870
Epoch = 475, Iteration = 0 Total Loss: 0.56783: Recon Loss: 0.29669: Latent Loss: 0.27114: Sparsity Loss: 0.00000: Recon Accuracy: 0.95766
Epoch = 476, Iteration = 0 Total Loss: 0.55458: Recon Loss: 0.28406: Latent Loss: 0.27052: Sparsity Loss: 0.00000: Recon Accuracy: 0.95778
Epoch = 477, Iteration = 0 Total Loss: 0.56574: Recon Loss: 0.29544: Latent Loss: 0.27030: Sparsity Loss: 0.00000: Recon Accuracy: 0.95699
Epoch = 478, Iteration = 0 Total Loss: 0.55430: Recon Loss: 0.28345: Latent Loss: 0.27085: Sparsity Loss: 0.00000: Recon Accuracy: 0.95811
Epoch = 479, Iteration = 0 Total Loss: 0.55732: Recon Loss: 0.28578: Latent Loss: 0.27154: Sparsity Loss: 0.00000: Recon Accuracy: 0.95701
Epoch = 480, Iteration = 0 Total Loss: 0.55551: Recon Loss: 0.28351: Latent Loss: 0.27200: Sparsity Loss: 0.00000: Recon Accuracy: 0.95805
Epoch = 481, Iteration = 0 

Epoch = 533, Iteration = 0 Total Loss: 0.52692: Recon Loss: 0.25317: Latent Loss: 0.27375: Sparsity Loss: 0.00000: Recon Accuracy: 0.96236
Epoch = 534, Iteration = 0 Total Loss: 0.52384: Recon Loss: 0.25045: Latent Loss: 0.27339: Sparsity Loss: 0.00000: Recon Accuracy: 0.96219
Epoch = 535, Iteration = 0 Total Loss: 0.53513: Recon Loss: 0.26211: Latent Loss: 0.27301: Sparsity Loss: 0.00000: Recon Accuracy: 0.96216
Epoch = 536, Iteration = 0 Total Loss: 0.52948: Recon Loss: 0.25641: Latent Loss: 0.27306: Sparsity Loss: 0.00000: Recon Accuracy: 0.96287
Epoch = 537, Iteration = 0 Total Loss: 0.52041: Recon Loss: 0.24713: Latent Loss: 0.27328: Sparsity Loss: 0.00000: Recon Accuracy: 0.96304
Epoch = 538, Iteration = 0 Total Loss: 0.53796: Recon Loss: 0.26404: Latent Loss: 0.27392: Sparsity Loss: 0.00000: Recon Accuracy: 0.96213
Epoch = 539, Iteration = 0 Total Loss: 0.51579: Recon Loss: 0.24143: Latent Loss: 0.27437: Sparsity Loss: 0.00000: Recon Accuracy: 0.96316
Epoch = 540, Iteration = 0 

Extract features

In [36]:
# Feature matrix stored on the model after training; .shape reports
# (number of rows, number of feature dimensions).
mouse_features = mouse.features
mouse_features.shape

(1919, 256)

Distance(Mouse)

In [37]:
# Pairwise Euclidean distance between every pair of mouse feature vectors.
mouse_distance_matrix = distance.cdist(
    XA=mouse_features,
    XB=mouse_features,
    metric='euclidean',
)
print(mouse_distance_matrix)

[[ 0.         28.4559206  26.98228627 ... 16.34020913 21.4748581
  17.19929817]
 [28.4559206   0.          6.74056713 ... 26.24602918 23.39512832
  24.98490273]
 [26.98228627  6.74056713  0.         ... 25.94427404 24.72914067
  24.75897953]
 ...
 [16.34020913 26.24602918 25.94427404 ...  0.         11.59791024
  19.59761429]
 [21.4748581  23.39512832 24.72914067 ... 11.59791024  0.
  23.69723838]
 [17.19929817 24.98490273 24.75897953 ... 19.59761429 23.69723838
   0.        ]]


## Conclusion

### Human

In [38]:
# Final result for human: truncated view of the pairwise distance matrix.
print(human_distance_matrix)

[[ 0.         20.24218222 20.30243341 ... 26.33789178 26.25996382
  29.282374  ]
 [20.24218222  0.         19.96508601 ... 22.67847028 22.65769975
  23.74018358]
 [20.30243341 19.96508601  0.         ... 24.37955368 24.35895216
  27.1676564 ]
 ...
 [26.33789178 22.67847028 24.37955368 ...  0.          0.29928855
  25.03429252]
 [26.25996382 22.65769975 24.35895216 ...  0.29928855  0.
  24.99474488]
 [29.282374   23.74018358 27.1676564  ... 25.03429252 24.99474488
   0.        ]]


### Mouse

In [39]:
# Final result for mouse: truncated view of the pairwise distance matrix.
print(mouse_distance_matrix)

[[ 0.         28.4559206  26.98228627 ... 16.34020913 21.4748581
  17.19929817]
 [28.4559206   0.          6.74056713 ... 26.24602918 23.39512832
  24.98490273]
 [26.98228627  6.74056713  0.         ... 25.94427404 24.72914067
  24.75897953]
 ...
 [16.34020913 26.24602918 25.94427404 ...  0.         11.59791024
  19.59761429]
 [21.4748581  23.39512832 24.72914067 ... 11.59791024  0.
  23.69723838]
 [17.19929817 24.98490273 24.75897953 ... 19.59761429 23.69723838
   0.        ]]


### Save

In [40]:
# Bundle both distance matrices into a single .npz archive; they are stored
# under the keys 'matrix1' (human) and 'matrix2' (mouse).
distance_matrices = {
    'matrix1': human_distance_matrix,
    'matrix2': mouse_distance_matrix,
}
np.savez('distance_combined.npz', **distance_matrices)