Process the Szeto 2021 pairs:

    pad the CDR3 with a leading C and a trailing F unless it already starts with C and ends with F
    exclude pairs with a non-human MHC or with the unexpected V gene TRGV8*01 (a gamma-chain gene, not a Vb gene)
    keep only pairs whose HLA-I allele is among the 85 alleles from the Emerson data
    replace any Vb gene not listed in combo_xcr.tsv with "not_found"

In [1]:
import pandas as pd
import numpy as np

from collections import defaultdict
from collections import Counter

In [2]:
# Input location, split into base directory / dataset folder / file name.
data_dir, data_folder, data_file = (
    "../data/",
    "Szeto_2020/",
    "Szeto_2020_data.csv",
)

In [3]:
# Read the raw table; the first row of the file supplies the column names.
szeto_csv_path = "".join([data_dir, data_folder, data_file])
df = pd.read_csv(szeto_csv_path, header=0)

In [4]:
def convert_cdr3(x):
    """Return the CDR3 sequence ``x`` bracketed by a leading "C" and a trailing "F".

    A sequence that already starts with "C" AND ends with "F" is returned
    unchanged. Every other sequence -- including one that carries only one of
    the two anchor residues -- gets both a "C" prepended and an "F" appended.

    Parameters
    ----------
    x : str
        CDR3 amino-acid sequence.

    Returns
    -------
    str
        The original or padded sequence.
    """
    # startswith/endswith are safe on the empty string, whereas indexing the
    # first/last character raises IndexError.
    if x.startswith("C") and x.endswith("F"):
        return x
    return "C" + x + "F"

In [5]:
# Build (MHC, TRBV, CDR3) triples, with the CDR3 padded by convert_cdr3.
# Rows are dropped when the MHC is not an HLA allele or when the V gene is
# TRGV8*01.
# The MHC test matches the full "HLA" prefix: checking only that the second
# character is "L" raises IndexError on names shorter than two characters and
# accepts any name that merely has "L" in that position.
raw_pairs = [
    (mhc, v_gene, convert_cdr3(cdr3))
    for mhc, v_gene, cdr3 in zip(df.MHC, df.TRBV, df.CDR3)
    if mhc.startswith("HLA") and v_gene != 'TRGV8*01'
]

In [6]:
# Number of triples in raw_pairs (duplicates are still included at this point).
len(raw_pairs)

68

In [7]:
# Drop duplicate triples.
# list(set(...)) would yield an order that depends on string hashing, which
# is randomised per interpreter session, so the row order of everything
# derived from this list (including the CSV written at the end) would change
# between runs. dict.fromkeys de-duplicates while keeping first-occurrence
# order, which makes the notebook reproducible.
unique_raw_pairs = list(dict.fromkeys(raw_pairs))
len(unique_raw_pairs)

59

In [59]:
# Display the de-duplicated (MHC, V gene, CDR3) triples.
unique_raw_pairs

[('HLA-A*02:01', '6-5*01', 'CASSYQGTEAFF'),
 ('HLA-B*35:01', '6-1*01', 'CASRTRGGTLIEQYF'),
 ('HLA-A*24:02', '30*01', 'CAWSVSVGAGVPTIYF'),
 ('HLA-A*24:02', '4-1*01', 'CASSPTSGIYEQYF'),
 ('HLA-B*08:01', '6-2*01', 'CASGQGNFDIQYF'),
 ('HLA-A*02:01', '6-4*01', 'CASSLSFGTEAFF'),
 ('HLA-A*02:01', '7-9*03', 'CASSLGFGRDVMRFF'),
 ('HLA-A*02:01', '6-5*01', 'CASRPGLAGGRPEQYF'),
 ('HLA-A*24:02', '27*01', 'CASSGASHEQYF'),
 ('HLA-B*08:01', '11-2*01', 'CASSFTWTSGGATDTQYF'),
 ('HLA-A*11:01', '11-2*01', 'CASSLGQGLLYGYTF'),
 ('HLA-A*02:01', '13-3*01', 'CASSDWVSYEQYF'),
 ('HLA-E*01:03', '9*01', 'CASSANPGDSSNEKLFF'),
 ('HLA-A*02:01', '10-3*01', 'CAISEVGVGQPQHF'),
 ('HLA-A*02:01', '6-5*01', 'CASSYPGGGFYEQYF'),
 ('HLA-B*51:01', '7-3*01', 'CASSLTGGGELFF'),
 ('HLA-A*02:01', '6-1*01', 'CASSEGLWQVGDEQYFF'),
 ('HLA-A*02:01', '6-5*01', 'CASSYLNRDSALDF'),
 ('HLA-A*02:01', '20-1*01', 'CSARDGTGNGYTF'),
 ('HLA-A*02:01', '29-1*01', 'CSVGGSGGADTQYF'),
 ('HLA-B*27:05', '6-5*01', 'CASREGLGGTEAFF'),
 ('HLA-A*02:01', '27*01

In [8]:
# Distribution of CDR3 lengths (the CDR3 is the third element of each triple).
Counter(len(cdr3) for _, _, cdr3 in unique_raw_pairs)

Counter({14: 15, 12: 6, 13: 14, 17: 3, 19: 1, 15: 13, 16: 5, 11: 1, 18: 1})

In [9]:
# Only V alleles listed in the encoder reference table can be encoded, so
# load that table and keep the rows for organism "human", chain "B",
# region "V".
V_info = pd.read_csv("../data/for_encoders/combo_xcr.tsv", sep='\t')
is_human_b_chain_v = (
    V_info["organism"].eq('human')
    & V_info["chain"].eq('B')
    & V_info["region"].eq('V')
)
V_sub_info = V_info.loc[is_human_b_chain_v]

V_sub_info.head(2)

Unnamed: 0,id,organism,chain,region,nucseq,frame,aligned_protseq,cdr_columns,cdrs
566,TRBV1*01,human,B,V,gatactggaattacccagacaccaaaatacctggtcacagcaatgg...,1,DTGITQTPKYLVTAMGSKRTMKREHLGH........DSMYWYRQKA...,27-38;56-65;81-86;104-109,GH........DS;YNC....KEF;P.DSSR;CTSSQ.
567,TRBV10-1*01,human,B,V,gatgctgaaatcacccagagcccaagacacaagatcacagagacag...,1,DAEITQSPRHKITETGRQVTLACHQTWNH.......NNMFWYRQDL...,27-38;56-65;81-86;104-109,WNH.......NN;SYG....VQD;S.NTED;CASSE.


In [10]:
# Distinct allele ids in the filtered reference table.
set(V_sub_info.id)

{'TRBV1*01',
 'TRBV10-1*01',
 'TRBV10-1*02',
 'TRBV10-2*01',
 'TRBV10-2*02',
 'TRBV10-3*01',
 'TRBV10-3*02',
 'TRBV10-3*03',
 'TRBV10-3*04',
 'TRBV11-1*01',
 'TRBV11-2*01',
 'TRBV11-2*02',
 'TRBV11-2*03',
 'TRBV11-3*01',
 'TRBV11-3*02',
 'TRBV11-3*03',
 'TRBV12-1*01',
 'TRBV12-2*01',
 'TRBV12-3*01',
 'TRBV12-4*01',
 'TRBV12-4*02',
 'TRBV12-5*01',
 'TRBV13*01',
 'TRBV13*02',
 'TRBV14*01',
 'TRBV14*02',
 'TRBV15*01',
 'TRBV15*02',
 'TRBV15*03',
 'TRBV16*01',
 'TRBV16*02',
 'TRBV16*03',
 'TRBV17*01',
 'TRBV18*01',
 'TRBV19*01',
 'TRBV19*02',
 'TRBV19*03',
 'TRBV2*01',
 'TRBV2*02',
 'TRBV2*03',
 'TRBV20-1*01',
 'TRBV20-1*02',
 'TRBV20-1*03',
 'TRBV20-1*04',
 'TRBV20-1*05',
 'TRBV20-1*06',
 'TRBV20-1*07',
 'TRBV20/OR9-2*01',
 'TRBV20/OR9-2*02',
 'TRBV20/OR9-2*03',
 'TRBV21-1*01',
 'TRBV21/OR9-2*01',
 'TRBV23-1*01',
 'TRBV23/OR9-2*01',
 'TRBV23/OR9-2*02',
 'TRBV24-1*01',
 'TRBV24/OR9-2*01',
 'TRBV25-1*01',
 'TRBV26*01',
 'TRBV26/OR9-2*01',
 'TRBV26/OR9-2*02',
 'TRBV27*01',
 'TRBV28*01',
 'TR

In [11]:
# V alleles seen in the pairs (after adding the "TRBV" prefix) that are
# absent from the filtered reference table.
# NOTE(review): the recorded output lists 'TRBV19-01', which looks like a
# misspelling of 'TRBV19*01' in the input data -- confirm against the source.
{"TRBV" + v_gene for _, v_gene, _ in unique_raw_pairs}.difference(V_sub_info.id)

{'TRBV13-3*01', 'TRBV19-01'}

In [12]:
# HLA table used for filtering: the cell that builds kept_raw_pairs keeps
# only alleles found in its `hla` column.
HLA_pseudo = pd.read_csv(
    "../data/for_encoders/HLA_I_pseudo_40.csv",
    header=0,
    sep=',',
)
HLA_pseudo.head(2)

Unnamed: 0,hla,seq
0,HLA-B*08:01,YDSEYRNQIFTNTDESNLYLSYNYYTWAARVDAYETEWRY
1,HLA-A*24:02,YSAMYEEGKVAHTDENIAYLMFHYYTWAAHVQAYETDGRY


In [13]:
# Keep only pairs whose HLA allele appears in the HLA_pseudo table, and
# prefix the V gene with "TRBV" so it can be compared with the reference ids.
# The allele set is built once here rather than once per pair inside the
# comprehension.
known_hla = set(HLA_pseudo.hla)
kept_raw_pairs = [
    (hla, "TRBV" + v_gene, cdr3)
    for hla, v_gene, cdr3 in unique_raw_pairs
    if hla in known_hla
]
len(kept_raw_pairs)

54

In [69]:
# Display the HLA-filtered triples (V genes now carry the "TRBV" prefix).
kept_raw_pairs

[('HLA-A*02:01', 'TRBV6-5*01', 'CASSYQGTEAFF'),
 ('HLA-B*35:01', 'TRBV6-1*01', 'CASRTRGGTLIEQYF'),
 ('HLA-A*24:02', 'TRBV30*01', 'CAWSVSVGAGVPTIYF'),
 ('HLA-A*24:02', 'TRBV4-1*01', 'CASSPTSGIYEQYF'),
 ('HLA-B*08:01', 'TRBV6-2*01', 'CASGQGNFDIQYF'),
 ('HLA-A*02:01', 'TRBV6-4*01', 'CASSLSFGTEAFF'),
 ('HLA-A*02:01', 'TRBV7-9*03', 'CASSLGFGRDVMRFF'),
 ('HLA-A*02:01', 'TRBV6-5*01', 'CASRPGLAGGRPEQYF'),
 ('HLA-A*24:02', 'TRBV27*01', 'CASSGASHEQYF'),
 ('HLA-B*08:01', 'TRBV11-2*01', 'CASSFTWTSGGATDTQYF'),
 ('HLA-A*11:01', 'TRBV11-2*01', 'CASSLGQGLLYGYTF'),
 ('HLA-A*02:01', 'TRBV13-3*01', 'CASSDWVSYEQYF'),
 ('HLA-A*02:01', 'TRBV10-3*01', 'CAISEVGVGQPQHF'),
 ('HLA-A*02:01', 'TRBV6-5*01', 'CASSYPGGGFYEQYF'),
 ('HLA-B*51:01', 'TRBV7-3*01', 'CASSLTGGGELFF'),
 ('HLA-A*02:01', 'TRBV6-1*01', 'CASSEGLWQVGDEQYFF'),
 ('HLA-A*02:01', 'TRBV6-5*01', 'CASSYLNRDSALDF'),
 ('HLA-A*02:01', 'TRBV20-1*01', 'CSARDGTGNGYTF'),
 ('HLA-A*02:01', 'TRBV29-1*01', 'CSVGGSGGADTQYF'),
 ('HLA-B*27:05', 'TRBV6-5*01', 'CASREGLG

In [71]:
# encode the v genes not in combo_xcr.tsv as 'not_found'
# (the set of known ids is built once, not once per pair)
known_v_ids = set(V_sub_info.id)
kept_pairs = [
    pair if pair[1] in known_v_ids else (pair[0], 'not_found', pair[2])
    for pair in kept_raw_pairs
]
kept_pairs

[('HLA-A*02:01', 'TRBV6-5*01', 'CASSYQGTEAFF'),
 ('HLA-B*35:01', 'TRBV6-1*01', 'CASRTRGGTLIEQYF'),
 ('HLA-A*24:02', 'TRBV30*01', 'CAWSVSVGAGVPTIYF'),
 ('HLA-A*24:02', 'TRBV4-1*01', 'CASSPTSGIYEQYF'),
 ('HLA-B*08:01', 'TRBV6-2*01', 'CASGQGNFDIQYF'),
 ('HLA-A*02:01', 'TRBV6-4*01', 'CASSLSFGTEAFF'),
 ('HLA-A*02:01', 'TRBV7-9*03', 'CASSLGFGRDVMRFF'),
 ('HLA-A*02:01', 'TRBV6-5*01', 'CASRPGLAGGRPEQYF'),
 ('HLA-A*24:02', 'TRBV27*01', 'CASSGASHEQYF'),
 ('HLA-B*08:01', 'TRBV11-2*01', 'CASSFTWTSGGATDTQYF'),
 ('HLA-A*11:01', 'TRBV11-2*01', 'CASSLGQGLLYGYTF'),
 ('HLA-A*02:01', 'not_found', 'CASSDWVSYEQYF'),
 ('HLA-A*02:01', 'TRBV10-3*01', 'CAISEVGVGQPQHF'),
 ('HLA-A*02:01', 'TRBV6-5*01', 'CASSYPGGGFYEQYF'),
 ('HLA-B*51:01', 'TRBV7-3*01', 'CASSLTGGGELFF'),
 ('HLA-A*02:01', 'TRBV6-1*01', 'CASSEGLWQVGDEQYFF'),
 ('HLA-A*02:01', 'TRBV6-5*01', 'CASSYLNRDSALDF'),
 ('HLA-A*02:01', 'TRBV20-1*01', 'CSARDGTGNGYTF'),
 ('HLA-A*02:01', 'TRBV29-1*01', 'CSVGGSGGADTQYF'),
 ('HLA-B*27:05', 'TRBV6-5*01', 'CASREGLGGT

In [79]:
# Distinct CDR3 lengths among the kept pairs.
{len(cdr3) for _, _, cdr3 in kept_pairs}

{12, 13, 14, 15, 16, 17, 18, 19}

In [75]:
# Reshape each (HLA, V gene, CDR3) triple into ("<V gene>,<CDR3>", HLA), the
# layout that can be passed directly to the model.

kept_pairs_reformat = [
    (",".join((v_gene, cdr3)), hla) for hla, v_gene, cdr3 in kept_pairs
]
kept_pairs_reformat

[('TRBV6-5*01,CASSYQGTEAFF', 'HLA-A*02:01'),
 ('TRBV6-1*01,CASRTRGGTLIEQYF', 'HLA-B*35:01'),
 ('TRBV30*01,CAWSVSVGAGVPTIYF', 'HLA-A*24:02'),
 ('TRBV4-1*01,CASSPTSGIYEQYF', 'HLA-A*24:02'),
 ('TRBV6-2*01,CASGQGNFDIQYF', 'HLA-B*08:01'),
 ('TRBV6-4*01,CASSLSFGTEAFF', 'HLA-A*02:01'),
 ('TRBV7-9*03,CASSLGFGRDVMRFF', 'HLA-A*02:01'),
 ('TRBV6-5*01,CASRPGLAGGRPEQYF', 'HLA-A*02:01'),
 ('TRBV27*01,CASSGASHEQYF', 'HLA-A*24:02'),
 ('TRBV11-2*01,CASSFTWTSGGATDTQYF', 'HLA-B*08:01'),
 ('TRBV11-2*01,CASSLGQGLLYGYTF', 'HLA-A*11:01'),
 ('not_found,CASSDWVSYEQYF', 'HLA-A*02:01'),
 ('TRBV10-3*01,CAISEVGVGQPQHF', 'HLA-A*02:01'),
 ('TRBV6-5*01,CASSYPGGGFYEQYF', 'HLA-A*02:01'),
 ('TRBV7-3*01,CASSLTGGGELFF', 'HLA-B*51:01'),
 ('TRBV6-1*01,CASSEGLWQVGDEQYFF', 'HLA-A*02:01'),
 ('TRBV6-5*01,CASSYLNRDSALDF', 'HLA-A*02:01'),
 ('TRBV20-1*01,CSARDGTGNGYTF', 'HLA-A*02:01'),
 ('TRBV29-1*01,CSVGGSGGADTQYF', 'HLA-A*02:01'),
 ('TRBV6-5*01,CASREGLGGTEAFF', 'HLA-B*27:05'),
 ('TRBV27*01,CASSIQQGADTQYF', 'HLA-A*02:01'),
 ('TRB

In [76]:
# Two-column frame: the combined "V gene,CDR3" string and the HLA allele.
output_columns = ["tcr", "hla_allele"]
df_kept_pairs = pd.DataFrame(data=kept_pairs_reformat, columns=output_columns)
df_kept_pairs

Unnamed: 0,tcr,hla_allele
0,"TRBV6-5*01,CASSYQGTEAFF",HLA-A*02:01
1,"TRBV6-1*01,CASRTRGGTLIEQYF",HLA-B*35:01
2,"TRBV30*01,CAWSVSVGAGVPTIYF",HLA-A*24:02
3,"TRBV4-1*01,CASSPTSGIYEQYF",HLA-A*24:02
4,"TRBV6-2*01,CASGQGNFDIQYF",HLA-B*08:01
5,"TRBV6-4*01,CASSLSFGTEAFF",HLA-A*02:01
6,"TRBV7-9*03,CASSLGFGRDVMRFF",HLA-A*02:01
7,"TRBV6-5*01,CASRPGLAGGRPEQYF",HLA-A*02:01
8,"TRBV27*01,CASSGASHEQYF",HLA-A*24:02
9,"TRBV11-2*01,CASSFTWTSGGATDTQYF",HLA-B*08:01


In [77]:
# (rows, columns) of the frame about to be written to disk.
df_kept_pairs.shape

(54, 2)

In [78]:
# Write the pairs into the dataset folder, without the index column.
# The path is assembled from the data_dir / data_folder settings defined near
# the top of the notebook instead of repeating "../data/Szeto_2020/" as a
# literal, so relocating the data only requires changing those settings.
df_kept_pairs.to_csv(
    data_dir + data_folder + "HLA_I_szeto_2020_compatible_pairs.csv",
    index=False,
)