## Preprocess data

We need to fetch the data and get specific columns. We'll then try and create some different tables to see how many records we have for each epitope. This way we can use them for different classifiers or one multiclassifier (combine them into one table).

In [124]:
import pandas as pd
df = pd.read_csv('./data/vdjdb.txt', sep="\t")

In [125]:
# Select the columns we need
selected_features = df[['gene','cdr3','v.segm','j.segm','species','mhc.a','mhc.b','mhc.class','antigen.epitope','antigen.species','vdjdb.score']]

In [126]:
# Select all human data
human_data = selected_features[(selected_features['species'] == 'HomoSapiens') & (selected_features['vdjdb.score'] > 0)]

# Drop duplicate rows
human_data = human_data.drop_duplicates()

# Delete rows with null values
human_data  = human_data.dropna()
# Print all data
human_data

Unnamed: 0,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,antigen.species,vdjdb.score
0,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
2,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
3,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
4,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,HIV-1,2
...,...,...,...,...,...,...,...,...,...,...,...
92686,TRB,CASSQGSGGNEQFF,TRBV4-3*01,TRBJ2-1*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,FPQPEQPFPWQP,Wheat,2
92689,TRA,CAASVLYGSSNTGKLIF,TRAV29/DV5*01,TRAJ37*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,QLQPFPQPELPY,Wheat,2
92690,TRB,CASSIVGSGGYNEQFF,TRBV19*01,TRBJ2-1*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,QLQPFPQPELPY,Wheat,2
92767,TRA,CAPQGATNKLIF,TRAV12-2*01,TRAJ32*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQQPFPQPEQPFP,Wheat,2


In [127]:
# get beta chains only
TRB = human_data[human_data['gene'] =='TRB']
# rename the columns for our beta chain matrix calculation
beta_chains = TRB[['cdr3', 'v.segm', 'j.segm']]
beta_chains.rename(columns={'cdr3':'cdr3_b_aa','v.segm':'v_b_gene', 'j.segm':'j_b_gene'}, inplace=True)
beta_chains

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  beta_chains.rename(columns={'cdr3':'cdr3_b_aa','v.segm':'v_b_gene', 'j.segm':'j_b_gene'}, inplace=True)


Unnamed: 0,cdr3_b_aa,v_b_gene,j_b_gene
1,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01
2,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01
4,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01
6,CASSYEPGQVSHYSNQPQHF,TRBV13*01,TRBJ1-5*01
8,CASSALASLNEQFF,TRBV14*01,TRBJ2-1*01
...,...,...,...
92682,CASSLRATDTQYF,TRBV7-2*01,TRBJ2-3*01
92684,CATSRAGGGGEKLFF,TRBV15*01,TRBJ1-4*01
92686,CASSQGSGGNEQFF,TRBV4-3*01,TRBJ2-1*01
92690,CASSIVGSGGYNEQFF,TRBV19*01,TRBJ2-1*01


In [128]:
from tcrdist.repertoire import TCRrep
# calculate distances for our beta chains
tr = TCRrep(cell_df = beta_chains, 
            organism = 'human', 
            chains = ['beta'], 
            db_file = 'alphabeta_gammadelta_db.tsv')


  self._validate_cell_df()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.cell_df['count'] = 1
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [129]:
beta_chain_matrix = tr.pw_cdr3_b_aa # fetch the beta chains for the matrix

In [130]:
beta_distances = pd.DataFrame(beta_chain_matrix) # create matrix

In [131]:
beta_distances # print matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4457,4458,4459,4460,4461,4462,4463,4464,4465,4466
0,0,38,36,32,36,38,34,36,36,36,...,31,34,36,20,46,36,23,40,34,33
1,38,0,23,26,24,7,31,32,28,36,...,26,24,27,28,56,32,24,40,32,36
2,36,23,0,30,24,20,23,34,24,32,...,27,18,26,24,54,16,19,52,36,26
3,32,26,30,0,32,26,24,28,27,36,...,30,28,32,31,54,27,28,42,38,35
4,36,24,24,32,0,24,24,32,28,32,...,24,20,24,20,55,28,24,48,28,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4462,36,32,16,27,28,32,23,27,27,32,...,27,22,24,28,51,0,20,50,29,20
4463,23,24,19,28,24,24,28,28,27,32,...,28,22,31,13,53,20,0,51,32,30
4464,40,40,52,42,48,39,48,52,49,47,...,47,52,47,47,38,50,51,0,35,47
4465,34,32,36,38,28,36,26,36,35,32,...,28,36,28,35,42,29,32,35,0,32


In [132]:
# Do the same as above but for alpha chains

TRA = human_data[human_data['gene'] =='TRA']
alpha_chains = TRA[['cdr3', 'v.segm', 'j.segm']]
alpha_chains.rename(columns={'cdr3':'cdr3_a_aa','v.segm':'v_a_gene', 'j.segm':'j_a_gene'}, inplace=True)

tr_alpha = TCRrep(cell_df = alpha_chains, 
            organism = 'human', 
            chains = ['alpha'], 
            db_file = 'alphabeta_gammadelta_db.tsv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alpha_chains.rename(columns={'cdr3':'cdr3_a_aa','v.segm':'v_a_gene', 'j.segm':'j_a_gene'}, inplace=True)

  self._validate_cell_df()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.cell_df['count'] = 1
  clones = cell_df.groupby(index_cols)['count'].agg(np.sum).reset_index()


In [133]:
# get alpha chain calculations and print them.
alpha_chain_matrix = tr_alpha.pw_cdr3_a_aa
alpha_distances = pd.DataFrame(alpha_chain_matrix)
alpha_distances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700
0,0,30,24,3,32,55,36,48,34,28,...,30,28,36,27,31,31,32,33,28,36
1,30,0,23,30,28,40,32,45,24,32,...,31,30,36,32,24,32,36,31,32,35
2,24,23,0,24,24,44,24,45,22,24,...,28,12,28,24,22,23,27,25,30,31
3,3,30,24,0,31,54,36,51,34,28,...,27,28,36,24,30,31,31,33,31,36
4,32,28,24,31,0,40,16,40,17,18,...,23,28,24,19,24,20,16,24,25,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1696,31,32,23,31,20,48,15,38,18,20,...,32,27,24,20,20,0,24,24,26,27
1697,32,36,27,31,16,51,28,48,27,24,...,30,31,28,27,27,24,0,32,25,32
1698,33,31,25,33,24,52,20,50,24,24,...,36,29,20,28,28,24,32,0,32,19
1699,28,32,30,31,25,50,30,44,30,22,...,29,30,34,25,26,26,25,32,0,35
