Convert the candidate TCRs from the Zheng 2021 single-cell TCR data into the format needed to get predicted scores, in preparation for computing correlations.

Each TCR should have a V allele that is one of the 142 human beta-chain V alleles in

    ../data/for_encoders/combo_xcr.tsv
    
The V part in the original data has a different format from the one we used for the DePTH model, so we need to transform it.

The amino acid sequences should not have strange characters

The amino acid sequences should not have length exceeding what we have encoded for

In [1]:
import pandas as pd
import numpy as np

from collections import defaultdict
from collections import Counter

In [2]:
# Folder with the Zheng 2021 TCR files, followed by the names of the
# positive and negative CD8 input files inside that folder.
data_dir = "../data/Zheng_2021/"

file_pos, file_neg = (
    "TCRs_CD8_pos_1_with_v_gene_weighted.csv",
    "TCRs_CD8_neg_1_with_v_gene_weighted.csv",
)

In [3]:
# Read the positive and negative TCR tables; row 0 of each file is the header.
df_pos = pd.read_csv(f"{data_dir}{file_pos}", header=0)
df_neg = pd.read_csv(f"{data_dir}{file_neg}", header=0)

# Print both shapes first, then the first six rows of each table.
for frame in (df_pos, df_neg):
    print(frame.shape)

for frame in (df_pos, df_neg):
    print(frame.head(6))

(10008, 1)
(25081, 1)
                   tcr
0  TRBV13,CASSLTGNEQFF
1  TRBV13,CASSLTGNEQFF
2  TRBV13,CASSLTGNEQFF
3  TRBV13,CASSLTGNEQFF
4  TRBV13,CASSLTGNEQFF
5  TRBV13,CASSLTGNEQFF
                    tcr
0  TRBV19,CASSGRSTDTQYF
1  TRBV19,CASSGRSTDTQYF
2  TRBV19,CASSGRSTDTQYF
3  TRBV19,CASSGRSTDTQYF
4  TRBV19,CASSGRSTDTQYF
5  TRBV19,CASSGRSTDTQYF


In [4]:
# Every entry of the `tcr` column looks like "<V gene>,<amino acid sequence>".
# Split each entry once, then pull out the two parts as parallel lists.
pos_fields = [entry.split(",") for entry in df_pos.tcr.tolist()]
pos_v = [fields[0] for fields in pos_fields]
pos_aa = [fields[1] for fields in pos_fields]

neg_fields = [entry.split(",") for entry in df_neg.tcr.tolist()]
neg_v = [fields[0] for fields in neg_fields]
neg_aa = [fields[1] for fields in neg_fields]

In [5]:
# Distribution of amino acid sequence lengths, positive set first, then negative.
for aa_seqs in (pos_aa, neg_aa):
    print(Counter(map(len, aa_seqs)))

Counter({15: 2782, 14: 2660, 13: 1980, 16: 1693, 12: 893})
Counter({15: 7081, 14: 6680, 13: 4766, 16: 4249, 12: 2305})


In [6]:
# Collect the distinct V gene names seen in either file. These names carry no
# allele suffix; the goal is to add "*01" to get the format allowed in tcrdist.
# TODO: confirm why the tcrdist format is the target (is it the format used in combo_xcr.tsv?)

unique_v_genes = set(pos_v + neg_v)
all_vs = list(unique_v_genes)
len(all_vs)

48

In [7]:
# Put the V gene names in sorted order and display them.
all_vs = sorted(all_vs)
all_vs

['TRBV10-1',
 'TRBV10-2',
 'TRBV10-3',
 'TRBV11-1',
 'TRBV11-2',
 'TRBV11-3',
 'TRBV12-3',
 'TRBV12-4',
 'TRBV12-5',
 'TRBV13',
 'TRBV14',
 'TRBV15',
 'TRBV16',
 'TRBV18',
 'TRBV19',
 'TRBV2',
 'TRBV20-1',
 'TRBV21-1',
 'TRBV24-1',
 'TRBV25-1',
 'TRBV27',
 'TRBV28',
 'TRBV29-1',
 'TRBV3-1',
 'TRBV30',
 'TRBV4-1',
 'TRBV4-2',
 'TRBV4-3',
 'TRBV5-1',
 'TRBV5-4',
 'TRBV5-5',
 'TRBV5-6',
 'TRBV5-7',
 'TRBV5-8',
 'TRBV6-1',
 'TRBV6-3',
 'TRBV6-4',
 'TRBV6-5',
 'TRBV6-6',
 'TRBV6-7',
 'TRBV6-9',
 'TRBV7-2',
 'TRBV7-3',
 'TRBV7-6',
 'TRBV7-7',
 'TRBV7-8',
 'TRBV7-9',
 'TRBV9']

In [8]:
# Load the combo_xcr.tsv table, whose ids are the target format for the V gene
# names, and keep only the rows for human, chain B, region V.
V_info = pd.read_csv("../data/for_encoders/combo_xcr.tsv", sep='\t')

is_human = V_info.organism == 'human'
is_beta_chain = V_info.chain == 'B'
is_v_region = V_info.region == 'V'
V_sub_info = V_info.loc[is_human & is_beta_chain & is_v_region]

V_sub_info.head(6)

Unnamed: 0,id,organism,chain,region,nucseq,frame,aligned_protseq,cdr_columns,cdrs
566,TRBV1*01,human,B,V,gatactggaattacccagacaccaaaatacctggtcacagcaatgg...,1,DTGITQTPKYLVTAMGSKRTMKREHLGH........DSMYWYRQKA...,27-38;56-65;81-86;104-109,GH........DS;YNC....KEF;P.DSSR;CTSSQ.
567,TRBV10-1*01,human,B,V,gatgctgaaatcacccagagcccaagacacaagatcacagagacag...,1,DAEITQSPRHKITETGRQVTLACHQTWNH.......NNMFWYRQDL...,27-38;56-65;81-86;104-109,WNH.......NN;SYG....VQD;S.NTED;CASSE.
568,TRBV10-1*02,human,B,V,gatgctgaaatcacccagagcccaagacacaagatcacagagacag...,1,DAEITQSPRHKITETGRQVTLACHQTWNH.......NNMFWYRQDL...,27-38;56-65;81-86;104-109,WNH.......NN;SYG....VHD;S.NTED;CASSE.
569,TRBV10-2*01,human,B,V,gatgctggaatcacccagagcccaagatacaagatcacagagacag...,1,DAGITQSPRYKITETGRQVTLMCHQTWSH.......SYMFWYRQDL...,27-38;56-65;81-86;104-109,WSH.......SY;SAA....ADI;S.KTEN;CASSE.
570,TRBV10-2*02,human,B,V,aaggcaggtgaccttgatgtgtcaccagacttggagccacagctat...,2,................RQVTLMCHQTWSH.......SYMFWYRQDL...,27-38;56-65;81-86;104-109,WSH.......SY;SAA....ADI;S.KTEN;......
571,TRBV10-3*01,human,B,V,gatgctggaatcacccagagcccaagacacaaggtcacagagacag...,1,DAGITQSPRHKVTETGTPVTLRCHQTENH.......RYMYWYRQDP...,27-38;56-65;81-86;104-109,ENH.......RY;SYG....VKD;S.KTED;CAISE.


In [9]:
# Sorted list of the distinct ids among the human beta-chain V rows of combo_xcr.tsv.
combo_xcr_ids = sorted(set(V_sub_info.id))
combo_xcr_ids

['TRBV1*01',
 'TRBV10-1*01',
 'TRBV10-1*02',
 'TRBV10-2*01',
 'TRBV10-2*02',
 'TRBV10-3*01',
 'TRBV10-3*02',
 'TRBV10-3*03',
 'TRBV10-3*04',
 'TRBV11-1*01',
 'TRBV11-2*01',
 'TRBV11-2*02',
 'TRBV11-2*03',
 'TRBV11-3*01',
 'TRBV11-3*02',
 'TRBV11-3*03',
 'TRBV12-1*01',
 'TRBV12-2*01',
 'TRBV12-3*01',
 'TRBV12-4*01',
 'TRBV12-4*02',
 'TRBV12-5*01',
 'TRBV13*01',
 'TRBV13*02',
 'TRBV14*01',
 'TRBV14*02',
 'TRBV15*01',
 'TRBV15*02',
 'TRBV15*03',
 'TRBV16*01',
 'TRBV16*02',
 'TRBV16*03',
 'TRBV17*01',
 'TRBV18*01',
 'TRBV19*01',
 'TRBV19*02',
 'TRBV19*03',
 'TRBV2*01',
 'TRBV2*02',
 'TRBV2*03',
 'TRBV20-1*01',
 'TRBV20-1*02',
 'TRBV20-1*03',
 'TRBV20-1*04',
 'TRBV20-1*05',
 'TRBV20-1*06',
 'TRBV20-1*07',
 'TRBV20/OR9-2*01',
 'TRBV20/OR9-2*02',
 'TRBV20/OR9-2*03',
 'TRBV21-1*01',
 'TRBV21/OR9-2*01',
 'TRBV23-1*01',
 'TRBV23/OR9-2*01',
 'TRBV23/OR9-2*02',
 'TRBV24-1*01',
 'TRBV24/OR9-2*01',
 'TRBV25-1*01',
 'TRBV26*01',
 'TRBV26/OR9-2*01',
 'TRBV26/OR9-2*02',
 'TRBV27*01',
 'TRBV28*01',
 'TR

In [10]:
# Mapping from each V gene name found in the Zheng 2021 files to the allele id
# used in combo_xcr.tsv.
#
# This is deliberately a plain dict rather than defaultdict(str): with
# defaultdict(str), looking up a V gene that has no entry would silently return
# "" (and insert that bogus key), so an unmapped gene would end up as an empty
# `v` value in the output files. With a plain dict the same lookup raises
# KeyError, which makes a missing mapping impossible to overlook.
to_combo_dict = {
    'TRBV10-1': 'TRBV10-1*01',
    'TRBV10-2': 'TRBV10-2*01',
    'TRBV10-3': 'TRBV10-3*01',

    'TRBV11-1': 'TRBV11-1*01',
    'TRBV11-2': 'TRBV11-2*01',
    'TRBV11-3': 'TRBV11-3*01',

    'TRBV12-3': 'TRBV12-3*01',
    'TRBV12-4': 'TRBV12-4*01',
    'TRBV12-5': 'TRBV12-5*01',

    'TRBV13': 'TRBV13*01',
    'TRBV14': 'TRBV14*01',
    'TRBV15': 'TRBV15*01',
    'TRBV16': 'TRBV16*01',
    'TRBV18': 'TRBV18*01',
    'TRBV19': 'TRBV19*01',

    'TRBV2': 'TRBV2*01',
    'TRBV20-1': 'TRBV20-1*01',
    'TRBV21-1': 'TRBV21-1*01',
    'TRBV24-1': 'TRBV24-1*01',
    'TRBV25-1': 'TRBV25-1*01',
    'TRBV27': 'TRBV27*01',
    'TRBV28': 'TRBV28*01',
    'TRBV29-1': 'TRBV29-1*01',

    'TRBV3-1': 'TRBV3-1*01',
    'TRBV30': 'TRBV30*01',

    'TRBV4-1': 'TRBV4-1*01',
    'TRBV4-2': 'TRBV4-2*01',
    'TRBV4-3': 'TRBV4-3*01',

    'TRBV5-1': 'TRBV5-1*01',
    'TRBV5-4': 'TRBV5-4*01',
    'TRBV5-5': 'TRBV5-5*01',
    'TRBV5-6': 'TRBV5-6*01',
    'TRBV5-7': 'TRBV5-7*01',
    'TRBV5-8': 'TRBV5-8*01',

    'TRBV6-1': 'TRBV6-1*01',
    'TRBV6-3': 'TRBV6-3*01',
    'TRBV6-4': 'TRBV6-4*01',
    'TRBV6-5': 'TRBV6-5*01',
    'TRBV6-6': 'TRBV6-6*01',
    'TRBV6-7': 'TRBV6-7*01',
    'TRBV6-9': 'TRBV6-9*01',

    'TRBV7-2': 'TRBV7-2*01',
    'TRBV7-3': 'TRBV7-3*01',
    'TRBV7-6': 'TRBV7-6*01',
    'TRBV7-7': 'TRBV7-7*01',
    'TRBV7-8': 'TRBV7-8*01',
    'TRBV7-9': 'TRBV7-9*01',

    'TRBV9': 'TRBV9*01',
}

In [11]:
# Number of V genes that have a mapping; compare with the number of distinct V genes in all_vs.
len(to_combo_dict)

48

In [12]:
# The mapped V genes must be exactly the V genes that occur in the data.
set(to_combo_dict) == set(all_vs)

True

In [13]:
# Check that every mapping is simply the V gene name with "*01" appended.
# The displayed value is the fraction of V genes for which that holds.
flag_list = [to_combo_dict[v] == v + "*01" for v in all_vs]

sum(flag_list)/len(flag_list)

1.0

In [14]:
# Mapped allele ids that are NOT among the ids in combo_xcr.tsv (empty set means all are valid).
set(to_combo_dict.values()).difference(V_sub_info.id)

set()

In [15]:
# Tally every character that occurs in the amino acid sequences, to spot any unexpected ones.
aa_char_counts = Counter()
for aa_seq in pos_aa + neg_aa:
    aa_char_counts.update(aa_seq)
aa_char_counts

Counter({'C': 35133,
         'A': 51229,
         'S': 78656,
         'L': 20235,
         'T': 30283,
         'G': 48494,
         'N': 14223,
         'E': 30700,
         'Q': 34246,
         'F': 52441,
         'D': 14637,
         'R': 15742,
         'V': 9081,
         'P': 14031,
         'H': 5718,
         'Y': 32505,
         'W': 3050,
         'K': 3120,
         'I': 5016,
         'M': 1311})

In [None]:
# By using the CDR1, CDR2 and CDR2.5 sequences for the V part, we can allow more V genes,
# as long as they are allowed by tcrdist.

In [16]:
# Preview the first six rows of the positive table.
df_pos.head(6)

Unnamed: 0,tcr
0,"TRBV13,CASSLTGNEQFF"
1,"TRBV13,CASSLTGNEQFF"
2,"TRBV13,CASSLTGNEQFF"
3,"TRBV13,CASSLTGNEQFF"
4,"TRBV13,CASSLTGNEQFF"
5,"TRBV13,CASSLTGNEQFF"


In [21]:
# Attach two columns to each table: the V gene translated to its combo_xcr.tsv
# allele id ("v_forward") and the amino acid sequence ("aa").
for frame, v_genes, aa_seqs in ((df_pos, pos_v, pos_aa), (df_neg, neg_v, neg_aa)):
    frame["v_forward"] = [to_combo_dict[v_gene] for v_gene in v_genes]
    frame["aa"] = aa_seqs

df_pos[:6]

Unnamed: 0,tcr,v_forward,aa
0,"TRBV13,CASSLTGNEQFF",TRBV13*01,CASSLTGNEQFF
1,"TRBV13,CASSLTGNEQFF",TRBV13*01,CASSLTGNEQFF
2,"TRBV13,CASSLTGNEQFF",TRBV13*01,CASSLTGNEQFF
3,"TRBV13,CASSLTGNEQFF",TRBV13*01,CASSLTGNEQFF
4,"TRBV13,CASSLTGNEQFF",TRBV13*01,CASSLTGNEQFF
5,"TRBV13,CASSLTGNEQFF",TRBV13*01,CASSLTGNEQFF


In [22]:
# Shape of the positive table (rows, columns).
df_pos.shape

(10008, 3)

In [23]:
# Number of distinct (V allele, amino acid sequence) pairs in the positive table.
len(set(zip(df_pos.v_forward.tolist(), df_pos.aa.tolist())))

3514

In [24]:
# Number of rows per mapped V allele in the positive table.
Counter(df_pos.v_forward)

Counter({'TRBV13*01': 109,
         'TRBV7-9*01': 741,
         'TRBV20-1*01': 784,
         'TRBV6-5*01': 380,
         'TRBV5-8*01': 59,
         'TRBV11-1*01': 46,
         'TRBV2*01': 353,
         'TRBV12-3*01': 216,
         'TRBV30*01': 156,
         'TRBV5-4*01': 198,
         'TRBV28*01': 709,
         'TRBV6-1*01': 270,
         'TRBV15*01': 233,
         'TRBV7-8*01': 298,
         'TRBV5-6*01': 323,
         'TRBV10-1*01': 77,
         'TRBV4-1*01': 362,
         'TRBV4-2*01': 151,
         'TRBV12-4*01': 178,
         'TRBV5-5*01': 115,
         'TRBV10-3*01': 111,
         'TRBV29-1*01': 312,
         'TRBV9*01': 614,
         'TRBV11-2*01': 263,
         'TRBV5-1*01': 347,
         'TRBV3-1*01': 162,
         'TRBV7-6*01': 185,
         'TRBV7-2*01': 160,
         'TRBV19*01': 415,
         'TRBV27*01': 447,
         'TRBV6-3*01': 288,
         'TRBV7-3*01': 143,
         'TRBV4-3*01': 189,
         'TRBV21-1*01': 30,
         'TRBV18*01': 82,
         'TRBV24-1*01': 107

In [25]:
# Shape of the negative table (rows, columns).
df_neg.shape

(25081, 3)

In [26]:
# Number of distinct (V allele, amino acid sequence) pairs in the negative table.
len(set(zip(df_neg.v_forward.tolist(), df_neg.aa.tolist())))

14248

In [27]:
# Number of rows per mapped V allele in the negative table.
Counter(df_neg.v_forward)

Counter({'TRBV19*01': 1433,
         'TRBV29-1*01': 813,
         'TRBV4-2*01': 563,
         'TRBV5-6*01': 566,
         'TRBV9*01': 1227,
         'TRBV13*01': 297,
         'TRBV6-6*01': 361,
         'TRBV7-3*01': 476,
         'TRBV6-5*01': 818,
         'TRBV18*01': 351,
         'TRBV6-1*01': 887,
         'TRBV20-1*01': 1845,
         'TRBV28*01': 1350,
         'TRBV6-3*01': 691,
         'TRBV11-3*01': 169,
         'TRBV27*01': 1206,
         'TRBV7-2*01': 634,
         'TRBV14*01': 283,
         'TRBV7-9*01': 1450,
         'TRBV10-3*01': 347,
         'TRBV5-1*01': 954,
         'TRBV7-8*01': 517,
         'TRBV4-3*01': 509,
         'TRBV2*01': 873,
         'TRBV12-3*01': 454,
         'TRBV4-1*01': 1006,
         'TRBV3-1*01': 527,
         'TRBV5-4*01': 387,
         'TRBV25-1*01': 224,
         'TRBV24-1*01': 255,
         'TRBV7-7*01': 96,
         'TRBV11-2*01': 688,
         'TRBV30*01': 544,
         'TRBV5-5*01': 278,
         'TRBV11-1*01': 95,
         'TRBV12-

In [28]:
# Build the output tables: the mapped V allele as column "v" and the amino acid
# sequence as column "aa", one table per set.
output_pos = pd.DataFrame({"v": df_pos.v_forward.tolist(),
                           "aa": df_pos.aa.tolist()})
output_neg = pd.DataFrame({"v": df_neg.v_forward.tolist(),
                           "aa": df_neg.aa.tolist()})

# Write each table as CSV next to the input files, without the index column.
output_pos.to_csv(data_dir + "pos_TCRs_weighted_for_DePTH_comb_xcr_format.csv", index=False)
output_neg.to_csv(data_dir + "neg_TCRs_weighted_for_DePTH_comb_xcr_format.csv", index=False)

In [29]:
# Write one more file holding the positive rows followed by the negative rows.
both_v = df_pos.v_forward.tolist() + df_neg.v_forward.tolist()
both_aa = df_pos.aa.tolist() + df_neg.aa.tolist()
output_both = pd.DataFrame({"v": both_v, "aa": both_aa})

output_both.to_csv(data_dir + "both_TCRs_weighted_for_DePTH_comb_xcr_format.csv", index=False)

In [30]:
# Shape of the combined table; its row count should be the positive plus the negative row counts.
output_both.shape

(35089, 2)