# Get distance matrices

We need to get different distance matrices (alpha, beta, and combined).

In [309]:
import pandas as pd
df = pd.read_csv('./data/vdjdb.txt', sep="\t")

In [310]:
# Select the columns we need
selected_features = df[['complex.id', 'gene','cdr3','v.segm','j.segm','species','mhc.a','mhc.b','mhc.class','antigen.epitope','antigen.species','vdjdb.score']]

In [311]:
# Select all human data
human_data = selected_features[(selected_features['species'] == 'HomoSapiens') & (selected_features['vdjdb.score'] > 0)]

# Drop duplicate rows
human_data = human_data.drop_duplicates()

# Delete rows with null values
human_data  = human_data.dropna()
# Print all data
human_data.head()

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,antigen.species,vdjdb.score
0,1,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
1,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
2,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
3,2,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
4,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2


# Distance matrix for beta chains

In [312]:
# get beta chains only
TRB = human_data[human_data['gene'] =='TRB']
# rename the columns for our beta chain matrix calculation
beta_chains = TRB[['cdr3', 'v.segm', 'j.segm']]
beta_chains.rename(columns={'cdr3':'cdr3_b_aa','v.segm':'v_b_gene', 'j.segm':'j_b_gene'}, inplace=True)
beta_chains

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  beta_chains.rename(columns={'cdr3':'cdr3_b_aa','v.segm':'v_b_gene', 'j.segm':'j_b_gene'}, inplace=True)


Unnamed: 0,cdr3_b_aa,v_b_gene,j_b_gene
1,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01
2,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01
4,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01
6,CASSYEPGQVSHYSNQPQHF,TRBV13*01,TRBJ1-5*01
8,CASSALASLNEQFF,TRBV14*01,TRBJ2-1*01
...,...,...,...
92682,CASSLRATDTQYF,TRBV7-2*01,TRBJ2-3*01
92684,CATSRAGGGGEKLFF,TRBV15*01,TRBJ1-4*01
92686,CASSQGSGGNEQFF,TRBV4-3*01,TRBJ2-1*01
92690,CASSIVGSGGYNEQFF,TRBV19*01,TRBJ2-1*01


In [313]:
from tcrdist.repertoire import TCRrep
# calculate distances for our beta chains
tr = TCRrep(cell_df = beta_chains, 
            organism = 'human', 
            chains = ['beta'], 
            db_file = 'alphabeta_gammadelta_db.tsv')


  self._validate_cell_df()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.cell_df['count'] = 1
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [314]:
beta_chain_matrix = tr.pw_cdr3_b_aa # fetch the beta chains for the matrix

In [315]:
beta_distances = pd.DataFrame(beta_chain_matrix) # create matrix

In [316]:
beta_distances # print matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4457,4458,4459,4460,4461,4462,4463,4464,4465,4466
0,0,38,36,32,36,38,34,36,36,36,...,31,34,36,20,46,36,23,40,34,33
1,38,0,23,26,24,7,31,32,28,36,...,26,24,27,28,56,32,24,40,32,36
2,36,23,0,30,24,20,23,34,24,32,...,27,18,26,24,54,16,19,52,36,26
3,32,26,30,0,32,26,24,28,27,36,...,30,28,32,31,54,27,28,42,38,35
4,36,24,24,32,0,24,24,32,28,32,...,24,20,24,20,55,28,24,48,28,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4462,36,32,16,27,28,32,23,27,27,32,...,27,22,24,28,51,0,20,50,29,20
4463,23,24,19,28,24,24,28,28,27,32,...,28,22,31,13,53,20,0,51,32,30
4464,40,40,52,42,48,39,48,52,49,47,...,47,52,47,47,38,50,51,0,35,47
4465,34,32,36,38,28,36,26,36,35,32,...,28,36,28,35,42,29,32,35,0,32


# Distance matrix for alpha chains

In [317]:
# Do the same as above but for alpha chains

TRA = human_data[human_data['gene'] =='TRA']
alpha_chains = TRA[['cdr3', 'v.segm', 'j.segm']]
alpha_chains.rename(columns={'cdr3':'cdr3_a_aa','v.segm':'v_a_gene', 'j.segm':'j_a_gene'}, inplace=True)

tr_alpha = TCRrep(cell_df = alpha_chains, 
            organism = 'human', 
            chains = ['alpha'], 
            db_file = 'alphabeta_gammadelta_db.tsv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alpha_chains.rename(columns={'cdr3':'cdr3_a_aa','v.segm':'v_a_gene', 'j.segm':'j_a_gene'}, inplace=True)

  self._validate_cell_df()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.cell_df['count'] = 1
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [318]:
# get alpha chain calculations and print them.
alpha_chain_matrix = tr_alpha.pw_cdr3_a_aa
alpha_distances = pd.DataFrame(alpha_chain_matrix)
alpha_distances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700
0,0,30,24,3,32,55,36,48,34,28,...,30,28,36,27,31,31,32,33,28,36
1,30,0,23,30,28,40,32,45,24,32,...,31,30,36,32,24,32,36,31,32,35
2,24,23,0,24,24,44,24,45,22,24,...,28,12,28,24,22,23,27,25,30,31
3,3,30,24,0,31,54,36,51,34,28,...,27,28,36,24,30,31,31,33,31,36
4,32,28,24,31,0,40,16,40,17,18,...,23,28,24,19,24,20,16,24,25,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1696,31,32,23,31,20,48,15,38,18,20,...,32,27,24,20,20,0,24,24,26,27
1697,32,36,27,31,16,51,28,48,27,24,...,30,31,28,27,27,24,0,32,25,32
1698,33,31,25,33,24,52,20,50,24,24,...,36,29,20,28,28,24,32,0,32,19
1699,28,32,30,31,25,50,30,44,30,22,...,29,30,34,25,26,26,25,32,0,35


# distances for alpha and beta pairs

In [319]:
# Let's get all the IDs for the TCRs (A & B pairs should have the same ID)
_ids = human_data['complex.id']
_ids

0            1
1            1
2            0
3            2
4            2
         ...  
92686    30552
92689    30554
92690    30554
92767    30593
92768    30593
Name: complex.id, Length: 7799, dtype: int64

In [320]:
list_to_combine = []
checked_ids = []
def process_row(row):
    # check the complex id not already checked
    if row['complex.id'] not in checked_ids:
        # find matching rows
        matched_rows = human_data[human_data['complex.id'] == row['complex.id']]
        # should be two (some rows have only 1 match)
        if len(matched_rows) == 2:
            # get the tcra row
            tra_row = matched_rows.iloc[0]
            # get the tcrb row
            trb_row = matched_rows.iloc[1]
            # add to list as a combined row
            list_to_combine.append({'tcr_id_a':tra_row['complex.id'], 'tcr_id_b':trb_row['complex.id'], 
                                    'cdr3_a_aa': tra_row['cdr3'], 'cdr3_b_aa': trb_row['cdr3'],
                                    'v_b_gene' :trb_row['v.segm'],
                                    'j_b_gene':trb_row['j.segm'],
                                    'v_a_gene':tra_row['v.segm'],
                                    'j_a_gene':tra_row['j.segm'],
                                   })
            # we've checked this id now, so we need to make sure we don't have to check it again.
            checked_ids.append(row['complex.id'])
human_data.apply(process_row, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
92686    None
92689    None
92690    None
92767    None
92768    None
Length: 7799, dtype: object

In [321]:
# Create table representing paired tcr rows
paired_table = pd.DataFrame(list_to_combine)
paired_table

Unnamed: 0,tcr_id_a,tcr_id_b,cdr3_a_aa,cdr3_b_aa,v_b_gene,j_b_gene,v_a_gene,j_a_gene
0,1,1,CIVRAPGRADMRF,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,TRAV26-1*01,TRAJ43*01
1,2,2,CAVPSGAGSYQLTF,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,TRAV20*01,TRAJ28*01
2,4,4,CAYRPPGTYKYIF,CASSALASLNEQFF,TRBV14*01,TRBJ2-1*01,TRAV38-2/DV8*01,TRAJ40*01
3,5,5,CIVRAPGRADMRF,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,TRAV26-1*01,TRAJ43*01
4,6,6,CAVPSGAGSYQLTF,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,TRAV20*01,TRAJ28*01
...,...,...,...,...,...,...,...,...
1863,30550,30550,CIALNARLMF,CASSLRATDTQYF,TRBV7-2*01,TRBJ2-3*01,TRAV26-1*01,TRAJ31*01
1864,30551,30551,CAMREGRYSSASKIIF,CATSRAGGGGEKLFF,TRBV15*01,TRBJ1-4*01,TRAV14/DV4*01,TRAJ3*01
1865,30552,30552,CLVGDGDGGATNKLIF,CASSQGSGGNEQFF,TRBV4-3*01,TRBJ2-1*01,TRAV4*01,TRAJ32*01
1866,30554,30554,CAASVLYGSSNTGKLIF,CASSIVGSGGYNEQFF,TRBV19*01,TRBJ2-1*01,TRAV29/DV5*01,TRAJ37*01


In [322]:
# Get paired distances

tr_paired = TCRrep(cell_df = paired_table, 
            organism = 'human', 
            chains = ['alpha','beta'], 
            db_file = 'alphabeta_gammadelta_db.tsv')


  self._validate_cell_df()
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [323]:
# get alpha chain distance calculations for paired tcrs and print them.
paired_matrix_alpha_chain = tr_paired.pw_alpha 
paired_alpha_distances = pd.DataFrame(paired_matrix_alpha_chain)
paired_alpha_distances


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1858,1859,1860,1861,1862,1863,1864,1865,1866,1867
0,0,138,110,0,138,0,138,0,138,158,...,151,93,150,72,90,78,164,159,194,149
1,138,0,131,138,0,138,0,138,0,126,...,146,147,102,147,129,141,145,141,152,132
2,110,131,0,110,131,110,131,110,131,168,...,178,137,131,146,122,140,137,166,173,129
3,0,138,110,0,138,0,138,0,138,158,...,151,93,150,72,90,78,164,159,194,149
4,138,0,131,138,0,138,0,138,0,126,...,146,147,102,147,129,141,145,141,152,132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863,78,141,140,78,141,78,141,78,141,170,...,157,72,138,54,42,0,167,144,182,128
1864,164,145,137,164,145,164,145,164,145,168,...,177,170,146,176,164,167,0,149,160,159
1865,159,141,166,159,141,159,141,159,141,154,...,152,153,149,144,138,144,149,0,141,133
1866,194,152,173,194,152,194,152,194,152,143,...,190,197,154,194,176,182,160,141,0,158


In [324]:
# get beta chain distance calculations for paired tcrs and print them.
paired_matrix_beta_chain = tr_paired.pw_beta
paired_beta_distances = pd.DataFrame(paired_matrix_beta_chain)
paired_beta_distances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1858,1859,1860,1861,1862,1863,1864,1865,1866,1867
0,0,39,179,0,39,0,39,0,39,220,...,180,193,237,200,191,209,183,193,197,177
1,39,0,179,39,0,39,0,39,0,202,...,159,193,225,188,176,197,171,181,179,165
2,179,179,0,179,179,179,179,179,179,138,...,125,93,124,100,100,97,156,100,124,125
3,0,39,179,0,39,0,39,0,39,220,...,180,193,237,200,191,209,183,193,197,177
4,39,0,179,39,0,39,0,39,0,202,...,159,193,225,188,176,197,171,181,179,165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863,209,197,97,209,197,209,197,209,197,153,...,97,141,143,15,21,0,155,123,147,97
1864,183,171,156,183,171,183,171,183,171,130,...,131,157,129,155,155,155,0,98,131,119
1865,193,181,100,193,181,193,181,193,181,129,...,112,130,122,114,114,123,98,0,121,97
1866,197,179,124,197,179,197,179,197,179,115,...,155,120,159,141,153,147,131,121,0,122
