Process the McPAS data used by CLAIRE into a format that can be used by DePTH

In [1]:
import math
import os
import pickle
from collections import Counter
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Directory (relative to this notebook) that holds the pickled CLAIRE McPAS splits.
data_dir = "../data/CLAIRE_mcpas_data/TCR-HLA-binding-predictor-for-PTH-datasets/"

In [3]:
# Open the three pickled McPAS splits (train / validation / test) in binary read mode.
# The handles are consumed by pickle.load in the following cell.
# NOTE(review): the handles are not closed in this cell.
train_sample = open(data_dir + "mcpas_train_samples.pickle", "rb")
valid_sample = open(data_dir + "mcpas_validation_samples.pickle", "rb")
test_sample = open(data_dir + "mcpas_test_samples.pickle", "rb")

In [4]:
# Deserialize each split. File objects are context managers, so loading inside
# `with` closes the three handles opened in the previous cell as soon as the data
# is in memory, instead of leaving them open for the life of the kernel.
# NOTE(review): pickle.load can execute arbitrary code; only run this on trusted files.
with train_sample, valid_sample, test_sample:
    train_pairs = pickle.load(train_sample)
    valid_pairs = pickle.load(valid_sample)
    test_pairs = pickle.load(test_sample)

# Number of (TCR, HLA, sign) records in each split.
print(len(train_pairs))
print(len(valid_pairs))
print(len(test_pairs))

15060
3259
3259


In [5]:
train_pairs[0]

{'tcra': 'UNK',
 'tcrb': 'CASSLMTDQPQHF',
 'va': nan,
 'ja': nan,
 'vb': 'TRBV07-09',
 'jb': 'TRBJ01',
 't_cell_type': 'CD8',
 'mhc': 'HLA-A02:01',
 'sign': 0}

In [None]:
# Based on all HLAs and TCRs in train + valid + test together,
# build dictionaries to:
#   - keep only the HLAs that we can deal with;
#   - translate V genes into the format that we have sequence information for.
# For V genes that we do not have sequence information for, we write the V gene
# as "not_found" and translate it to a sequence of all "." with the corresponding
# length in later steps.
# For V genes that are NA, we likewise write the V gene as "not_found" and translate
# it to a sequence of all "." with the corresponding length in later steps.

In [None]:
# first, only keep the pairs having v gene beta information and
# CDR3 sequences without strange characters

# separate into pairs involving HLA-I alleles 
# and those involving HLA-II alleles

In [6]:
# Pool the three splits so the HLA / V-gene summaries below cover every pair.
all_pairs = train_pairs + valid_pairs + test_pairs
len(all_pairs)

21578

In [7]:
all_pairs[0]

{'tcra': 'UNK',
 'tcrb': 'CASSLMTDQPQHF',
 'va': nan,
 'ja': nan,
 'vb': 'TRBV07-09',
 'jb': 'TRBJ01',
 't_cell_type': 'CD8',
 'mhc': 'HLA-A02:01',
 'sign': 0}

In [8]:
Counter([x['mhc'] for x in all_pairs])

Counter({'HLA-A02:01': 8462,
         'HLA-B07:01': 4371,
         'HLA-A01:01': 827,
         'HLA-B07:02': 1108,
         'HLA-C07:02': 1560,
         'HLA-DQ02:05': 19,
         'HLA-DR01:04': 965,
         'HLA-B35:01': 94,
         'HLA-B27:05': 220,
         'HLA-B08:01': 699,
         'HLA-DQ02:01': 321,
         'HLA-B42:01': 671,
         'HLA-DR11:01': 197,
         'HLA-B57:01': 212,
         'HLA-A24:02': 479,
         'HLA-A02:02': 3,
         'HLA-DR01:15': 19,
         'HLA-A11:01': 238,
         'HLA-DR05:01': 200,
         'HLA-B57:03': 174,
         'HLA-DR15:01': 261,
         'HLA-DQ01:06': 167,
         'HLA-A02:15': 4,
         'HLA-DR01:01': 106,
         'HLA-C16:01': 13,
         'HLA-DQ08:01': 31,
         'HLA-B14:01': 26,
         'HLA-B44:05': 8,
         'HLA-A02:05': 5,
         'HLA-B15:01': 14,
         'HLA-DR03:02': 5,
         'HLA-DR04:01': 11,
         'HLA-A02:17': 4,
         'HLA-A02:04': 2,
         'HLA-DQ01:01': 12,
         'HLA-A02:08': 4,


In [9]:
Counter([x['vb'] for x in all_pairs])

Counter({'TRBV07-09': 2017,
         'TRBV07-08': 1227,
         'TRBV02-01': 1163,
         'TRBV20-01': 1103,
         'TRBV09-01': 677,
         nan: 1,
         'TRBV03-01': 398,
         'TRBV13-01': 232,
         'TRBV04-01': 913,
         'TRBV06-05': 505,
         'TRBV29-01': 909,
         'TRBV10-03': 184,
         'TRBV15-01': 308,
         'TRBV01-01': 307,
         'TRBV07-03': 343,
         'TRBV25-01': 185,
         'TRBV06-01': 728,
         nan: 1,
         'TRBV11-02': 214,
         'TRBV19-01': 1346,
         'TRBV04-03': 719,
         'TRBV05-01': 382,
         'TRBV30-01': 533,
         nan: 1,
         'TRBV28-01': 758,
         'TRBV07-02': 1175,
         'TRBV26-01': 10,
         'TRBV27-01': 851,
         'TRBV12-03': 502,
         'TRBV12-04': 149,
         'TRBV04-02': 337,
         nan: 1,
         'TRBV06-09': 17,
         nan: 1,
         nan: 1,
         'TRBV05-05': 64,
         'TRBV06-02': 426,
         'TRBV10-02': 52,
         'TRBV07-04': 24,
      

In [10]:
set("".join([x['tcrb'] for x in all_pairs]))

{'A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'Y'}

In [11]:
# The 20 standard amino-acid one-letter codes, in the order
# A R N D C / Q E G H I / L K M F P / S T W Y V.
AA_SYMOLS = list("ARNDC" "QEGHI" "LKMFP" "STWYV")

In [12]:
# There is no strange character in CDR3 part
set("".join([x['tcrb'] for x in all_pairs])) == set(AA_SYMOLS)

True

In [13]:
# Reduce every record to a (V beta gene, CDR3 beta, HLA allele) triple.
kept_pairs = [
    tuple(pair[field] for field in ('vb', 'tcrb', 'mhc'))
    for pair in all_pairs
]
len(kept_pairs)

21578

In [14]:
# Distinct HLA alleles and V beta genes seen across all pairs.
# Each missing V gene is its own NaN object and NaN != NaN, so the NaNs are not
# de-duplicated; that is why len(v_beta_vec) is far larger than the number of
# genuinely distinct V gene names.
hla_vec = {hla for _, _, hla in kept_pairs}
v_beta_vec = {v_beta for v_beta, _, _ in kept_pairs}

In [15]:
len(hla_vec)

49

In [16]:
len(v_beta_vec)

550

In [17]:
len(set([x[0] for x in kept_pairs if x[0]==x[0]]))

83

In [18]:
set([len(x[1]) for x in kept_pairs])

{6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}

In [19]:
hla_vec

{'HLA-A01:01',
 'HLA-A02:01',
 'HLA-A02:02',
 'HLA-A02:03',
 'HLA-A02:04',
 'HLA-A02:05',
 'HLA-A02:06',
 'HLA-A02:07',
 'HLA-A02:08',
 'HLA-A02:09',
 'HLA-A02:10',
 'HLA-A02:11',
 'HLA-A02:12',
 'HLA-A02:13',
 'HLA-A02:14',
 'HLA-A02:15',
 'HLA-A02:16',
 'HLA-A02:17',
 'HLA-A11:01',
 'HLA-A24:02',
 'HLA-B07:01',
 'HLA-B07:02',
 'HLA-B08:01',
 'HLA-B14:01',
 'HLA-B15:01',
 'HLA-B27:01',
 'HLA-B27:05',
 'HLA-B35:01',
 'HLA-B35:02',
 'HLA-B42:01',
 'HLA-B44:05',
 'HLA-B57:01',
 'HLA-B57:03',
 'HLA-C07:02',
 'HLA-C16:01',
 'HLA-DQ01:01',
 'HLA-DQ01:06',
 'HLA-DQ02:01',
 'HLA-DQ02:05',
 'HLA-DQ05:01',
 'HLA-DQ08:01',
 'HLA-DR01:01',
 'HLA-DR01:04',
 'HLA-DR01:15',
 'HLA-DR03:02',
 'HLA-DR04:01',
 'HLA-DR05:01',
 'HLA-DR11:01',
 'HLA-DR15:01'}

In [20]:
# Load the HLA-I and HLA-II alleles that we have pseudo sequences for.
# Both tables expose the allele name in an `hla` column (used in later cells).
hla_i_pseudo = pd.read_csv("../data/for_encoders/HLA_I_pseudo_40.csv", header = 0)
hla_ii_pseudo = pd.read_csv("../data/for_encoders/HLA_II_pseudo_45.csv", header = 0)

In [21]:
set(hla_i_pseudo.hla)

{'HLA-A*01:01',
 'HLA-A*02:01',
 'HLA-A*02:05',
 'HLA-A*02:06',
 'HLA-A*03:01',
 'HLA-A*03:02',
 'HLA-A*11:01',
 'HLA-A*23:01',
 'HLA-A*24:02',
 'HLA-A*24:03',
 'HLA-A*25:01',
 'HLA-A*26:01',
 'HLA-A*29:01',
 'HLA-A*29:02',
 'HLA-A*30:01',
 'HLA-A*30:02',
 'HLA-A*31:01',
 'HLA-A*32:01',
 'HLA-A*33:01',
 'HLA-A*33:03',
 'HLA-A*34:01',
 'HLA-A*66:01',
 'HLA-A*68:01',
 'HLA-A*68:02',
 'HLA-B*07:02',
 'HLA-B*07:05',
 'HLA-B*08:01',
 'HLA-B*13:02',
 'HLA-B*14:01',
 'HLA-B*14:02',
 'HLA-B*15:01',
 'HLA-B*15:03',
 'HLA-B*15:07',
 'HLA-B*15:17',
 'HLA-B*15:18',
 'HLA-B*18:01',
 'HLA-B*27:05',
 'HLA-B*35:01',
 'HLA-B*35:02',
 'HLA-B*35:03',
 'HLA-B*35:08',
 'HLA-B*37:01',
 'HLA-B*38:01',
 'HLA-B*38:02',
 'HLA-B*39:01',
 'HLA-B*39:06',
 'HLA-B*40:01',
 'HLA-B*40:02',
 'HLA-B*40:06',
 'HLA-B*41:01',
 'HLA-B*41:02',
 'HLA-B*44:02',
 'HLA-B*44:03',
 'HLA-B*45:01',
 'HLA-B*48:01',
 'HLA-B*49:01',
 'HLA-B*50:01',
 'HLA-B*51:01',
 'HLA-B*52:01',
 'HLA-B*53:01',
 'HLA-B*55:01',
 'HLA-B*56:01',
 'HLA-B*

In [22]:
set(hla_ii_pseudo.hla)

{'HLA-DPAB*01:03_01:01',
 'HLA-DPAB*01:03_02:01',
 'HLA-DPAB*01:03_03:01',
 'HLA-DPAB*01:03_04:01',
 'HLA-DPAB*01:03_04:02',
 'HLA-DPAB*01:03_05:01',
 'HLA-DPAB*01:03_10:01',
 'HLA-DPAB*01:03_11:01',
 'HLA-DPAB*01:03_13:01',
 'HLA-DPAB*01:03_17:01',
 'HLA-DPAB*02:01_01:01',
 'HLA-DPAB*02:01_02:01',
 'HLA-DPAB*02:01_03:01',
 'HLA-DPAB*02:01_04:01',
 'HLA-DPAB*02:01_04:02',
 'HLA-DPAB*02:01_05:01',
 'HLA-DPAB*02:01_10:01',
 'HLA-DPAB*02:01_11:01',
 'HLA-DPAB*02:01_13:01',
 'HLA-DPAB*02:01_17:01',
 'HLA-DPAB*02:02_01:01',
 'HLA-DPAB*02:02_02:01',
 'HLA-DPAB*02:02_04:01',
 'HLA-DPAB*02:02_04:02',
 'HLA-DPAB*02:02_05:01',
 'HLA-DQAB*01:01_02:02',
 'HLA-DQAB*01:01_03:01',
 'HLA-DQAB*01:01_03:02',
 'HLA-DQAB*01:01_05:01',
 'HLA-DQAB*01:01_05:03',
 'HLA-DQAB*01:01_06:02',
 'HLA-DQAB*01:01_06:03',
 'HLA-DQAB*01:02_02:01',
 'HLA-DQAB*01:02_02:02',
 'HLA-DQAB*01:02_03:01',
 'HLA-DQAB*01:02_03:02',
 'HLA-DQAB*01:02_03:03',
 'HLA-DQAB*01:02_04:02',
 'HLA-DQAB*01:02_05:01',
 'HLA-DQAB*01:02_05:02',


In [23]:
# Split the observed alleles by class: names whose first five characters are
# "HLA-A", "HLA-B" or "HLA-C" are class I; everything else is treated as class II.
# (First step of translating the Glazer 2022 allele format into our format.)

hla_i_prefixes = {"HLA-A", "HLA-B", "HLA-C"}

hla_vec_list = list(hla_vec)
hla_i_vec_list = [allele for allele in hla_vec_list if allele[:5] in hla_i_prefixes]
hla_ii_vec_list = [allele for allele in hla_vec_list if allele[:5] not in hla_i_prefixes]

hla_i_vec_list

['HLA-B35:02',
 'HLA-A01:01',
 'HLA-A02:05',
 'HLA-B07:01',
 'HLA-C16:01',
 'HLA-A02:02',
 'HLA-A02:16',
 'HLA-B07:02',
 'HLA-A02:11',
 'HLA-B27:01',
 'HLA-A02:10',
 'HLA-A11:01',
 'HLA-A02:14',
 'HLA-A02:07',
 'HLA-B08:01',
 'HLA-A02:17',
 'HLA-B35:01',
 'HLA-A02:13',
 'HLA-A02:04',
 'HLA-C07:02',
 'HLA-A02:15',
 'HLA-A02:09',
 'HLA-A02:12',
 'HLA-B42:01',
 'HLA-B44:05',
 'HLA-B14:01',
 'HLA-A02:01',
 'HLA-A02:03',
 'HLA-A02:06',
 'HLA-A24:02',
 'HLA-B27:05',
 'HLA-A02:08',
 'HLA-B57:03',
 'HLA-B15:01',
 'HLA-B57:01']

In [24]:
hla_ii_vec_list

['HLA-DR03:02',
 'HLA-DQ01:01',
 'HLA-DR01:15',
 'HLA-DR04:01',
 'HLA-DQ02:01',
 'HLA-DR11:01',
 'HLA-DQ02:05',
 'HLA-DR01:04',
 'HLA-DR05:01',
 'HLA-DQ08:01',
 'HLA-DQ01:06',
 'HLA-DQ05:01',
 'HLA-DR01:01',
 'HLA-DR15:01']

In [25]:
# HLA-I alleles first: insert "*" after the five-character locus prefix so that
# e.g. "HLA-A02:01" becomes "HLA-A*02:01", the form used in the pseudo-sequence table.
hla_i_vec_list_start = [f"{allele[:5]}*{allele[5:]}" for allele in hla_i_vec_list]
len(hla_i_vec_list_start)

35

In [26]:
# HLA-I alleles that occur in the data AND have a pseudo sequence available.
hla_i_inter = set(hla_i_vec_list_start) & set(hla_i_pseudo.hla)
hla_i_inter

{'HLA-A*01:01',
 'HLA-A*02:01',
 'HLA-A*02:05',
 'HLA-A*02:06',
 'HLA-A*11:01',
 'HLA-A*24:02',
 'HLA-B*07:02',
 'HLA-B*08:01',
 'HLA-B*14:01',
 'HLA-B*15:01',
 'HLA-B*27:05',
 'HLA-B*35:01',
 'HLA-B*35:02',
 'HLA-B*57:01',
 'HLA-C*07:02',
 'HLA-C*16:01'}

In [27]:
len(hla_i_inter)

16

In [28]:
# Next, translate the V genes to the format that we have sequence
# information for. combo_xcr.tsv is a tab-separated reference table; later cells
# use its `id`, `organism`, `chain`, `region` and `cdrs` columns.

df_combo = pd.read_csv("../data/for_encoders/combo_xcr.tsv", sep = "\t", header = 0)
df_combo.shape

(1793, 9)

In [29]:
df_combo[:2]

Unnamed: 0,id,organism,chain,region,nucseq,frame,aligned_protseq,cdr_columns,cdrs
0,TRAV1*01,mouse,A,V,ggacagggcgtggagcagcctgacaacttgatgtctgtagagggaa...,1,GQGVEQ.P.DNLMSVEGTFARVNCTYSTSG......FNGLSWYQQR...,28-39;57-66;82-88;106-111,TSG......FNG;VVL....DGL;SRSN.GY;CAVR..
1,TRAV1*02,mouse,A,V,ggacagggtgtggagcagcctgccaaattgatgtctgtggagggaa...,1,GQGVEQ.P.AKLMSVEGTFARVNCTYSTSG......FNGLSWYQQR...,28-39;57-66;82-88;106-111,TSG......FNG;VVL....DGL;SRSN.GY;CAVR..


In [30]:
Counter(df_combo.organism)

Counter({'mouse': 395,
         'human': 332,
         'mouse_gd': 301,
         'human_gd': 140,
         'human_ig': 625})

In [31]:
Counter(df_combo.frame)

Counter({1: 1588, 2: 80, 3: 125})

In [32]:
# IDs of every human, beta-chain, V-region entry in the reference table.
human_beta_v_rows = (
    (df_combo["organism"] == "human")
    & (df_combo["chain"] == "B")
    & (df_combo["region"] == "V")
)
combo_v_gene = df_combo.loc[human_beta_v_rows, "id"].tolist()

In [33]:
combo_v_gene

['TRBV1*01',
 'TRBV10-1*01',
 'TRBV10-1*02',
 'TRBV10-2*01',
 'TRBV10-2*02',
 'TRBV10-3*01',
 'TRBV10-3*02',
 'TRBV10-3*03',
 'TRBV10-3*04',
 'TRBV11-1*01',
 'TRBV11-2*01',
 'TRBV11-2*02',
 'TRBV11-2*03',
 'TRBV11-3*01',
 'TRBV11-3*02',
 'TRBV11-3*03',
 'TRBV12-1*01',
 'TRBV12-2*01',
 'TRBV12-3*01',
 'TRBV12-4*01',
 'TRBV12-4*02',
 'TRBV12-5*01',
 'TRBV13*01',
 'TRBV13*02',
 'TRBV14*01',
 'TRBV14*02',
 'TRBV15*01',
 'TRBV15*02',
 'TRBV15*03',
 'TRBV16*01',
 'TRBV16*02',
 'TRBV16*03',
 'TRBV17*01',
 'TRBV18*01',
 'TRBV19*01',
 'TRBV19*02',
 'TRBV19*03',
 'TRBV2*01',
 'TRBV2*02',
 'TRBV2*03',
 'TRBV20-1*01',
 'TRBV20-1*02',
 'TRBV20-1*03',
 'TRBV20-1*04',
 'TRBV20-1*05',
 'TRBV20-1*06',
 'TRBV20-1*07',
 'TRBV20/OR9-2*01',
 'TRBV20/OR9-2*02',
 'TRBV20/OR9-2*03',
 'TRBV21-1*01',
 'TRBV21/OR9-2*01',
 'TRBV23-1*01',
 'TRBV23/OR9-2*01',
 'TRBV23/OR9-2*02',
 'TRBV24-1*01',
 'TRBV24/OR9-2*01',
 'TRBV25-1*01',
 'TRBV26*01',
 'TRBV26/OR9-2*01',
 'TRBV26/OR9-2*02',
 'TRBV27*01',
 'TRBV28*01',
 'TR

In [34]:
# 'TRBV201-01' is malformed and has to be mapped explicitly to a valid V gene name
[v for v in v_beta_vec if v==v]

['TRBV07-07',
 'TRBV07-05',
 'TRBV07-09',
 'TRBV12-03',
 'TRBV23-01',
 'TRBV02-05',
 'TRBV02-01',
 'TRBV25-01',
 'TRBV05-03',
 'TRBV05-07',
 'TRBV06-04',
 'TRBV12-02',
 'TRBV09-02',
 'TRBV14-01',
 'TRBV08-03',
 'TRBV08-02',
 'TRBV06-08',
 'TRBV01-01',
 'TRBV06-01',
 'TRBV02-07',
 'TRBV201-01',
 'TRBV12-01',
 'TRBV12-05',
 'TRBV27-01',
 'TRBV06-05',
 'TRBV07-04',
 'TRBV18-01',
 'TRBV06-07',
 'TRBV07-06',
 'TRBV11-03',
 'TRBV29-05',
 'TRBV08-01',
 'TRBV15-02',
 'TRBV11-01',
 'TRBV07-03',
 'TRBV01-05',
 'TRBV05-08',
 'TRBV21-03',
 'TRBV06-02',
 'TRBV10-03',
 'TRBV04-01',
 'TRBV02-02',
 'TRBV06-06',
 'TRBV22-01',
 'TRBV05-06',
 'TRBV19-01',
 'TRBV20-01',
 'TRBV28-01',
 'TRBV06-03',
 'TRBV08-04',
 'TRBV10-01',
 'TRBV13-06',
 'TRBV15-01',
 'TRBV04-02',
 'TRBV05-01',
 'TRBV04-03',
 'TRBV02-03',
 'TRBV05-04',
 'TRBV23-06',
 'TRBV13-02',
 'TRBV26-02',
 'TRBV13-01',
 'TRBV07-08',
 'TRBV07-02',
 'TRBV21-01',
 'TRBV17-01',
 'TRBV11-02',
 'TRBV05-05',
 'TRBV03-01',
 'TRBV05-02',
 'TRBV06-09',
 'TRB

Write a dictionary that translates each V gene into the naming format used in

    ../data/for_encoders/combo_xcr.tsv

Reference resource for matching:

https://www.imgt.org/IMGTrepertoire/index.php?section=LocusGenes&repertoire=genetable&species=human&group=TRBV

In [35]:
# Translation table: V beta gene name as written in the McPAS/CLAIRE data
# (zero-padded, e.g. 'TRBV07-09') -> allele identifier used in combo_xcr.tsv
# (e.g. 'TRBV7-9*01').
#
# Kept as a defaultdict(str) for compatibility with existing code; always test
# membership with `in` before indexing, because indexing a missing key would
# silently insert an empty-string entry.
#
# Fix: the malformed name that actually occurs in the data is 'TRBV201-01'.
# The table previously used the key 'TRBV201-1', which never matched, so those
# TCRs were written out as "not_found". The number of entries is unchanged.
#
# NOTE(review): for TRBV2, TRBV9, TRBV13 and TRBV15 the trailing "-02"/"-03" of
# the input name is mapped to an allele number (*02/*03) -- confirm this matches
# the naming used by the data source.
v_trans_dict = defaultdict(str, {
    'TRBV01-01': 'TRBV1*01',

    'TRBV10-01': 'TRBV10-1*01',
    'TRBV10-02': 'TRBV10-2*01',
    'TRBV10-03': 'TRBV10-3*01',

    'TRBV11-01': 'TRBV11-1*01',
    'TRBV11-02': 'TRBV11-2*01',
    'TRBV11-03': 'TRBV11-3*01',

    'TRBV12-01': 'TRBV12-1*01',
    'TRBV12-02': 'TRBV12-2*01',
    'TRBV12-03': 'TRBV12-3*01',
    'TRBV12-04': 'TRBV12-4*01',
    'TRBV12-05': 'TRBV12-5*01',

    'TRBV13-01': 'TRBV13*01',
    'TRBV13-02': 'TRBV13*02',

    'TRBV14-01': 'TRBV14*01',

    'TRBV15-01': 'TRBV15*01',
    'TRBV15-02': 'TRBV15*02',

    'TRBV16-01': 'TRBV16*01',

    'TRBV17-01': 'TRBV17*01',

    'TRBV18-01': 'TRBV18*01',

    'TRBV19-01': 'TRBV19*01',

    'TRBV02-01': 'TRBV2*01',
    'TRBV02-02': 'TRBV2*02',
    'TRBV02-03': 'TRBV2*03',

    'TRBV20-01': 'TRBV20-1*01',
    'TRBV201-01': 'TRBV20-1*01',  # malformed spelling present in the data

    'TRBV21-01': 'TRBV21-1*01',

    'TRBV23-01': 'TRBV23-1*01',

    'TRBV24-01': 'TRBV24-1*01',

    'TRBV25-01': 'TRBV25-1*01',

    'TRBV26-01': 'TRBV26*01',

    'TRBV27-01': 'TRBV27*01',

    'TRBV28-01': 'TRBV28*01',

    'TRBV29-01': 'TRBV29-1*01',

    'TRBV03-01': 'TRBV3-1*01',
    'TRBV03-02': 'TRBV3-2*01',

    'TRBV30-01': 'TRBV30*01',

    'TRBV04-01': 'TRBV4-1*01',
    'TRBV04-02': 'TRBV4-2*01',
    'TRBV04-03': 'TRBV4-3*01',

    'TRBV05-01': 'TRBV5-1*01',
    'TRBV05-04': 'TRBV5-4*01',
    'TRBV05-05': 'TRBV5-5*01',
    'TRBV05-06': 'TRBV5-6*01',
    'TRBV05-07': 'TRBV5-7*01',
    'TRBV05-08': 'TRBV5-8*01',

    'TRBV06-01': 'TRBV6-1*01',
    'TRBV06-02': 'TRBV6-2*01',
    'TRBV06-03': 'TRBV6-3*01',
    'TRBV06-04': 'TRBV6-4*01',
    'TRBV06-05': 'TRBV6-5*01',
    'TRBV06-06': 'TRBV6-6*01',
    'TRBV06-07': 'TRBV6-7*01',
    'TRBV06-08': 'TRBV6-8*01',
    'TRBV06-09': 'TRBV6-9*01',

    'TRBV07-01': 'TRBV7-1*01',
    'TRBV07-02': 'TRBV7-2*01',
    'TRBV07-03': 'TRBV7-3*01',
    'TRBV07-04': 'TRBV7-4*01',
    'TRBV07-06': 'TRBV7-6*01',
    'TRBV07-07': 'TRBV7-7*01',
    'TRBV07-08': 'TRBV7-8*01',
    'TRBV07-09': 'TRBV7-9*01',

    'TRBV09-01': 'TRBV9*01',
    'TRBV09-02': 'TRBV9*02',
})

In [36]:
len(v_trans_dict)

65

In [37]:
# How many positive / negative training pairs involve an HLA-I allele and carry a
# V beta gene, if we do NOT require the allele to have a known pseudo sequence.
# `pair['vb'] == pair['vb']` is False only for NaN, so it filters out missing V genes.

attempt_train_pairs_pos = [
    pair for pair in train_pairs
    if pair['mhc'][:5] in {"HLA-A", "HLA-B", "HLA-C"}
    and pair['vb'] == pair['vb']
    and pair['sign'] == 1
]
attempt_train_pairs_neg = [
    pair for pair in train_pairs
    if pair['mhc'][:5] in {"HLA-A", "HLA-B", "HLA-C"}
    and pair['vb'] == pair['vb']
    and pair['sign'] == 0
]

print(len(attempt_train_pairs_pos))
print(len(attempt_train_pairs_neg))

6789
6454


In [38]:
# Same count as above, but also keeping the pairs whose V beta gene is missing:
# positive / negative training pairs that involve any HLA-I allele.

for wanted_sign in (1, 0):
    print(sum(
        1 for pair in train_pairs
        if pair['mhc'][:5] in {"HLA-A", "HLA-B", "HLA-C"} and pair['sign'] == wanted_sign
    ))


6852
6609


In [45]:
# define a function to prepare the data for training, validation and testing

def extract_data(cur_pairs):
    """Split raw pair records into positive and negative (tcr, hla) tuples.

    Only records whose 'mhc' starts with "HLA-A", "HLA-B" or "HLA-C" and whose
    starred allele name (e.g. "HLA-A*02:01") is in the global `hla_i_inter`
    are kept. The V beta gene is translated through the global `v_trans_dict`;
    a V gene that is missing from that table (including NaN) becomes "not_found".

    Parameters
    ----------
    cur_pairs : iterable of dict
        Records with the keys 'mhc', 'vb', 'tcrb' and 'sign'.

    Returns
    -------
    (list, list)
        Positive pairs (sign == 1) and negative pairs (sign == 0), each a list of
        ("<v gene>,<CDR3 beta>", "<starred HLA allele>") tuples in input order.
        Records with any other sign value are dropped.
    """
    class_i_prefixes = {"HLA-A", "HLA-B", "HLA-C"}
    positives = []
    negatives = []

    for pair in cur_pairs:
        allele = pair['mhc']
        if allele[:5] not in class_i_prefixes:
            continue

        allele_star = allele[:5] + "*" + allele[5:]
        if allele_star not in hla_i_inter:
            continue

        # Covers both an unknown V gene name and a missing (NaN) V gene.
        if pair['vb'] in v_trans_dict:
            v_gene = v_trans_dict[pair['vb']]
        else:
            v_gene = "not_found"

        label = pair['sign']
        if label == 1:
            target = positives
        elif label == 0:
            target = negatives
        else:
            continue
        target.append((v_gene + "," + pair['tcrb'], allele_star))

    return positives, negatives

In [46]:
# Apply the same filtering / translation to each split; every call returns
# (positive pairs, negative pairs).
train_pos_pairs, train_neg_pairs = extract_data(train_pairs)
valid_pos_pairs, valid_neg_pairs = extract_data(valid_pairs)
test_pos_pairs, test_neg_pairs = extract_data(test_pairs)

In [47]:
len(train_pos_pairs)

5090

In [48]:
len(train_neg_pairs)

4691

In [49]:
len(valid_pos_pairs)

1102

In [50]:
len(valid_neg_pairs)

994

In [51]:
len(test_pos_pairs)

1045

In [52]:
len(test_neg_pairs)

1041

In [54]:
# One two-column frame ('tcr', 'hla_allele') per split and label, built in the
# same order as the names on the left-hand side.
(df_train_pos, df_train_neg,
 df_valid_pos, df_valid_neg,
 df_test_pos, df_test_neg) = (
    pd.DataFrame(pair_list, columns=['tcr', 'hla_allele'])
    for pair_list in (train_pos_pairs, train_neg_pairs,
                      valid_pos_pairs, valid_neg_pairs,
                      test_pos_pairs, test_neg_pairs)
)

In [55]:
# Write every split to CSV (no index column).
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folders exist first; otherwise a fresh checkout fails with an OSError.
csv_outputs = [
    (df_train_pos, "../data/HLA_I_full_mcpas/train_valid/train_pos.csv"),
    (df_train_neg, "../data/HLA_I_full_mcpas/train_valid/train_neg.csv"),
    (df_valid_pos, "../data/HLA_I_full_mcpas/train_valid/valid_pos.csv"),
    (df_valid_neg, "../data/HLA_I_full_mcpas/train_valid/valid_neg.csv"),
    (df_test_pos, "../data/HLA_I_full_mcpas/test/test_pos.csv"),
    (df_test_neg, "../data/HLA_I_full_mcpas/test/test_neg.csv"),
]

for split_frame, csv_path in csv_outputs:
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    split_frame.to_csv(csv_path, index = False)

In [56]:
# Every "v_gene,cdr3" string that was written out, across all six frames
# (train, valid, test; positives before negatives within each split).
tcrs_used = [
    tcr
    for split_frame in (df_train_pos, df_train_neg,
                        df_valid_pos, df_valid_neg,
                        df_test_pos, df_test_neg)
    for tcr in split_frame.tcr.tolist()
]

In [57]:
# Each tcr string has the form "<v gene>,<CDR3>"; keep the part after the comma.
cdr3s_used = [x.split(",")[1] for x in tcrs_used]

In [58]:
min([len(x) for x in cdr3s_used])

6

In [59]:
max([len(x) for x in cdr3s_used])

24

In [60]:
# Restrict the reference table to human, beta-chain, V-region entries.
is_human_beta_v = (
    df_combo["organism"].eq("human")
    & df_combo["chain"].eq("B")
    & df_combo["region"].eq("V")
)
df_sub_combo = df_combo.loc[is_human_beta_v]

In [61]:
df_sub_combo.shape

(142, 9)

In [62]:
# The `cdrs` column packs several ';'-separated fields; pull out the first three
# for every row. The third is named cdr25 here (presumably CDR2.5 -- TODO confirm).
cdr1s, cdr2s, cdr25s = (
    [cdr_string.split(";")[position] for cdr_string in df_sub_combo.cdrs.tolist()]
    for position in range(3)
)

In [63]:
set(list("".join(cdr1s)))

{'*',
 '.',
 'A',
 'D',
 'E',
 'F',
 'G',
 'H',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'Y'}

In [64]:
set(list("".join(cdr2s)))

{'*',
 '.',
 'A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'Y'}

In [65]:
set(list("".join(cdr25s)))

{'.',
 'A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'Y'}

In [66]:
# Distinct reference allele IDs that the translation table can produce.
trans_value_set = set(v_trans_dict.values())

In [67]:
trans_value_set

{'TRBV1*01',
 'TRBV10-1*01',
 'TRBV10-2*01',
 'TRBV10-3*01',
 'TRBV11-1*01',
 'TRBV11-2*01',
 'TRBV11-3*01',
 'TRBV12-1*01',
 'TRBV12-2*01',
 'TRBV12-3*01',
 'TRBV12-4*01',
 'TRBV12-5*01',
 'TRBV13*01',
 'TRBV13*02',
 'TRBV14*01',
 'TRBV15*01',
 'TRBV15*02',
 'TRBV16*01',
 'TRBV17*01',
 'TRBV18*01',
 'TRBV19*01',
 'TRBV2*01',
 'TRBV2*02',
 'TRBV2*03',
 'TRBV20-1*01',
 'TRBV21-1*01',
 'TRBV23-1*01',
 'TRBV24-1*01',
 'TRBV25-1*01',
 'TRBV26*01',
 'TRBV27*01',
 'TRBV28*01',
 'TRBV29-1*01',
 'TRBV3-1*01',
 'TRBV3-2*01',
 'TRBV30*01',
 'TRBV4-1*01',
 'TRBV4-2*01',
 'TRBV4-3*01',
 'TRBV5-1*01',
 'TRBV5-4*01',
 'TRBV5-5*01',
 'TRBV5-6*01',
 'TRBV5-7*01',
 'TRBV5-8*01',
 'TRBV6-1*01',
 'TRBV6-2*01',
 'TRBV6-3*01',
 'TRBV6-4*01',
 'TRBV6-5*01',
 'TRBV6-6*01',
 'TRBV6-7*01',
 'TRBV6-8*01',
 'TRBV6-9*01',
 'TRBV7-1*01',
 'TRBV7-2*01',
 'TRBV7-3*01',
 'TRBV7-4*01',
 'TRBV7-6*01',
 'TRBV7-7*01',
 'TRBV7-8*01',
 'TRBV7-9*01',
 'TRBV9*01',
 'TRBV9*02'}

In [69]:
# Look into the CDR strings of the human TRBV reference entries.
# NOTE(review): no filter is applied here, so this is the `cdrs` value of every
# row of df_sub_combo, not only of the V alleles in trans_value_set. Confirm
# whether restricting to trans_value_set was intended.
involved_cdrs = df_sub_combo["cdrs"].tolist()

In [70]:
# First ';'-separated field of each CDR string.
involved_cdr1s = [x.split(";")[0] for x in involved_cdrs]

In [71]:
involved_cdr1s

['GH........DS',
 'WNH.......NN',
 'WNH.......NN',
 'WSH.......SY',
 'WSH.......SY',
 'ENH.......RY',
 'ENH.......RY',
 'ENH.......RY',
 'ENH.......RY',
 'SGH.......AT',
 'SGH.......AT',
 'SGH.......AT',
 'SGH.......AT',
 'SGH.......NT',
 'SGH.......NT',
 'SGH.......NT',
 'SGH.......ND',
 'FGH.......NF',
 'SGH.......NS',
 'SGH.......DY',
 'SGH.......DY',
 'LGH.......NT',
 'PRH.......DT',
 'PRH.......DT',
 'SGH.......DN',
 'SGH.......DN',
 'LNH.......NV',
 'LNH.......NV',
 'LNH.......NV',
 'KGH.......SY',
 'KGH.......S*',
 'KGH.......SY',
 'SGH.......MF',
 'KGH.......SH',
 'LNH.......DA',
 'LNH.......DA',
 'LNH.......DA',
 'SNH.......LY',
 'SNH.......LY',
 'SNH.......LY',
 'DFQ......ATT',
 'DFQ......ATT',
 'DFQ......ATT',
 'DFQ......ATT',
 'DFQ......ATT',
 'DFQ......ATT',
 'DFQ......ATT',
 'DFQ......ATT',
 'DFQ......ATT',
 'DFQ......ATT',
 'KAH.......SY',
 'KRH.......SY',
 'KGH.......TF',
 'NGH.......TF',
 'NGH.......TF',
 'KGH.......DR',
 'KGH.......DR',
 'MGH.......DK',
 'MNH.......VT