Explore the TCRs provided in Chen et al. 2017

    Chen et al. 2017 for NLV, (HLA)-A2+

Transform them to the V gene / CDR3 naming format used in the Emerson data

Save the subset of TCRs that also appears in the Emerson data

In [1]:
import pandas as pd
import numpy as np
import os

from collections import defaultdict
from collections import Counter

In [2]:
# Locations of the intermediate files and of the raw published data.
inter_dir = "../intermediate_files/Chen_2017"
data_dir = "../conditional_TCR_prediction/data"

# All frequent TCRs from the Emerson data; the first row of the file is the header.
emerson_names_csv = os.path.join(inter_dir, "TCR_names.csv")
df_tcrs = pd.read_csv(emerson_names_csv, header=0)
df_tcrs.shape

(1098738, 1)

In [3]:
# Preview the first six Emerson TCR names.
df_tcrs.head(6)

Unnamed: 0,TCR_name
0,"TCRBV29-01*01,CSVEESYEQYF"
1,"TCRBV05-01*01,CASSLRGSGNTIYF"
2,"TCRBV04-02*01,CASSQEGQSSYEQYF"
3,"TCRBV06-05*01,CASSYSGSNQPQHF"
4,"TCRBV29-01*01,CSVYRGHEQYF"
5,"TCRBV06-04*01,CASSDNSGANVLTF"


In [4]:
# Chen et al. 2017 (Table S3, sheet "ST3") for NLV, (HLA)-A2+.
# Only the first 11637 rows are kept.
# NOTE(review): presumably the rows after that are not TCR records
# (e.g. footnotes) -- confirm against the spreadsheet.
df_chen = pd.read_excel(
    os.path.join(data_dir, "Chen_2017", "Chen_2017_Table_S3.xlsx"),
    sheet_name="ST3",
    skiprows=1,
).iloc[:11637]

In [5]:
# Last five retained rows, to check where the table was cut off.
df_chen.iloc[11632:11637]

Unnamed: 0,Ag,chain,TCRV,CDR3,TCRJ,% of UMIs*,# of Cross**,Ref***,Pair-index****
11632,GIL,beta,TRBV6,SYSAAAGTSLDIQ,TRBJ2-4,,,43,pair216
11633,GIL,beta,TRBV27,SLIFPSGEQ,TRBJ2-7,,,43,pair217
11634,GIL,beta,TRBV19,SIGSYGY,TRBJ1-2,,,43,pair218
11635,GIL,beta,TRBV20-1,RTSGDFGEQ,TRBJ2-1,,,43,pair225
11636,GIL,beta,TRBV19,SIYGNTEA,TRBJ1-1,,,43,pair227


In [6]:
# Row counts per antigen and per chain in the full Chen table.
for column in ("Ag", "chain"):
    print(Counter(df_chen[column]))

Counter({'NLV': 6732, 'GIL': 4905})
Counter({'beta': 6968, 'alpha': 4669})


In [7]:
# Keep only beta-chain TCRs annotated with the NLV antigen.
is_nlv_beta = (df_chen["Ag"] == "NLV") & (df_chen["chain"] == "beta")
df_chen_NLV = df_chen[is_nlv_beta]
df_chen_NLV.shape

(4508, 9)

In [8]:
# Sanity check: the subset should contain a single antigen and a single chain.
for column in ("Ag", "chain"):
    print(Counter(df_chen_NLV[column]))

Counter({'NLV': 4508})
Counter({'beta': 4508})


In [9]:
# Number of distinct V gene labels among the NLV beta-chain TCRs.
len({v_gene for v_gene in df_chen_NLV["TCRV"]})

69

In [10]:
# Distinct "<V gene>,<CDR3>" identifiers of the NLV beta-chain TCRs.
all_NLVs = list({
    ",".join((str(v_gene), str(cdr3)))
    for v_gene, cdr3 in zip(df_chen_NLV["TCRV"].tolist(),
                            df_chen_NLV["CDR3"].tolist())
})
len(all_NLVs)

4492

In [11]:
# Distinct V gene labels (the text before the comma) in all_NLVs.
{name.partition(",")[0] for name in all_NLVs}

{'0',
 'TRBV1',
 'TRBV10-1',
 'TRBV10-2',
 'TRBV10-3',
 'TRBV11',
 'TRBV11-1',
 'TRBV11-2',
 'TRBV11-3',
 'TRBV12-3',
 'TRBV12-4',
 'TRBV12-5',
 'TRBV13',
 'TRBV13-3',
 'TRBV14',
 'TRBV14-1',
 'TRBV15',
 'TRBV15-1',
 'TRBV16',
 'TRBV17-1',
 'TRBV18',
 'TRBV19',
 'TRBV2',
 'TRBV2-1',
 'TRBV20',
 'TRBV20-1',
 'TRBV21',
 'TRBV22',
 'TRBV23',
 'TRBV23-1',
 'TRBV24-1',
 'TRBV25-1',
 'TRBV27',
 'TRBV28',
 'TRBV29',
 'TRBV29-1',
 'TRBV3',
 'TRBV3-1',
 'TRBV30',
 'TRBV4',
 'TRBV4-1',
 'TRBV4-2',
 'TRBV4-3',
 'TRBV5-1',
 'TRBV5-3',
 'TRBV5-4',
 'TRBV5-5',
 'TRBV5-6',
 'TRBV5-7',
 'TRBV5-8',
 'TRBV6',
 'TRBV6-1',
 'TRBV6-2',
 'TRBV6-3',
 'TRBV6-4',
 'TRBV6-5',
 'TRBV6-6',
 'TRBV6-8',
 'TRBV6-9',
 'TRBV7-2',
 'TRBV7-3',
 'TRBV7-4',
 'TRBV7-6',
 'TRBV7-7',
 'TRBV7-8',
 'TRBV7-9',
 'TRBV8',
 'TRBV8-1',
 'TRBV9'}

In [12]:
# Distinct characters used across all CDR3 strings.
{residue for name in all_NLVs for residue in name.split(",")[1]}

{'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'X',
 'Y'}

In [13]:
# Number of distinct characters used across all CDR3 strings.
len({residue for name in all_NLVs for residue in name.split(",")[1]})

22

In [14]:
# Rows whose V gene is recorded as the number 0 instead of a gene name.
df_chen.loc[df_chen["TCRV"].eq(0)]

Unnamed: 0,Ag,chain,TCRV,CDR3,TCRJ,% of UMIs*,# of Cross**,Ref***,Pair-index****
6274,NLV,beta,0,ASQGTAATNTGELF,TRBJ2-2,,,14,
6280,NLV,beta,0,ASRGLSGLTEAF,TRBJ1-1,,,14,
6286,NLV,beta,0,ASRLLAGGDNEQF,0,,,25,
6299,NLV,beta,0,ASSAYTGTVYGYT,0,,,25,
6303,NLV,beta,0,ASSEALLGRANYGYT,0,,,25,
6375,NLV,beta,0,ASSLEQHTEAF,TRBJ1-1,,,14,
6431,NLV,beta,0,ASSPGQTFSNQPQH,0,,,25,
6447,NLV,beta,0,ASSPLTGTGVYGYT,0,,,25,
6462,NLV,beta,0,ASSPTTGTGNYGYT,0,,,25,
6491,NLV,beta,0,ASSSRQGADTGELF,TRBJ2-2,,,14,


In [15]:
# Drop the TCRs whose V gene label is "0".
all_NLVs_kept = [
    name
    for name in all_NLVs
    if name.partition(",")[0] != "0"
]
len(all_NLVs_kept)

4476

In [16]:
# V gene (the text before the comma) of every Emerson TCR name.
all_Emerson_Vs = [tcr_name.partition(",")[0] for tcr_name in df_tcrs["TCR_name"]]
set(all_Emerson_Vs)

{'TCRBV01-01*01',
 'TCRBV02-01*01',
 'TCRBV03-01*01',
 'TCRBV04-01*01',
 'TCRBV04-02*01',
 'TCRBV04-03*01',
 'TCRBV05-01*01',
 'TCRBV05-03*01',
 'TCRBV05-04*01',
 'TCRBV05-05*01',
 'TCRBV05-06*01',
 'TCRBV05-08*01',
 'TCRBV06-01*01',
 'TCRBV06-04*01',
 'TCRBV06-05*01',
 'TCRBV06-06*01',
 'TCRBV06-07*01',
 'TCRBV06-09*01',
 'TCRBV07-02*01',
 'TCRBV07-03*01',
 'TCRBV07-04*01',
 'TCRBV07-06*01',
 'TCRBV07-07*01',
 'TCRBV07-08*01',
 'TCRBV07-08*02',
 'TCRBV07-09*01',
 'TCRBV08-02*01',
 'TCRBV09-01*01',
 'TCRBV10-01*01',
 'TCRBV10-02*01',
 'TCRBV10-03*01',
 'TCRBV11-01*01',
 'TCRBV11-02*02',
 'TCRBV11-03*01',
 'TCRBV12-01*01',
 'TCRBV12-02*01',
 'TCRBV12-05*01',
 'TCRBV13-01*01',
 'TCRBV14-01*01',
 'TCRBV15-01*01',
 'TCRBV16-01*01',
 'TCRBV18-01*01',
 'TCRBV19-01*01',
 'TCRBV20-01*01',
 'TCRBV21-01*01',
 'TCRBV23-01*01',
 'TCRBV25-01*01',
 'TCRBV27-01*01',
 'TCRBV28-01*01',
 'TCRBV29-01*01',
 'TCRBV30-01*01',
 'TCRBV30-01*02'}

In [17]:
# Map each Emerson V family (e.g. "TCRBV07") to the gene numbers seen for it.
# A gene number is appended once per distinct "<family>-<gene>*<allele>" name,
# so a gene with several alleles appears several times in its list.
Emerson_num_dict = defaultdict(list)

temp_list = list(set(all_Emerson_Vs))

for v_name in temp_list:
    family_and_gene = v_name.split("*")[0].split("-")
    Emerson_num_dict[family_and_gene[0]].append(family_and_gene[1])

In [18]:
# Display the V family -> gene numbers mapping.
Emerson_num_dict

defaultdict(list,
            {'TCRBV16': ['01'],
             'TCRBV01': ['01'],
             'TCRBV13': ['01'],
             'TCRBV06': ['04', '07', '05', '06', '09', '01'],
             'TCRBV07': ['02', '08', '07', '08', '03', '04', '09', '06'],
             'TCRBV11': ['03', '01', '02'],
             'TCRBV05': ['03', '06', '05', '08', '04', '01'],
             'TCRBV10': ['01', '03', '02'],
             'TCRBV30': ['01', '01'],
             'TCRBV28': ['01'],
             'TCRBV20': ['01'],
             'TCRBV03': ['01'],
             'TCRBV09': ['01'],
             'TCRBV02': ['01'],
             'TCRBV27': ['01'],
             'TCRBV12': ['05', '01', '02'],
             'TCRBV21': ['01'],
             'TCRBV23': ['01'],
             'TCRBV29': ['01'],
             'TCRBV18': ['01'],
             'TCRBV04': ['02', '01', '03'],
             'TCRBV25': ['01'],
             'TCRBV08': ['02'],
             'TCRBV19': ['01'],
             'TCRBV14': ['01'],
             'TCRBV15': ['

In [19]:
# Two lookup tables derived from the distinct Emerson V gene names:
#   emerson_1st_layer: family ("TCRBV07")      -> gene numbers ("02", "08", ...)
#   emerson_2nd_layer: gene   ("TCRBV07-08")   -> alleles ("01", "02", ...)
emerson_1st_layer = defaultdict(list)
emerson_2nd_layer = defaultdict(list)

for v_name in temp_list:
    gene_and_allele = v_name.split("*")
    family_and_gene = gene_and_allele[0].split("-")
    emerson_1st_layer[family_and_gene[0]].append(family_and_gene[1])
    emerson_2nd_layer[gene_and_allele[0]].append(gene_and_allele[1])

In [20]:
# Display the V family -> gene numbers lookup table.
emerson_1st_layer

defaultdict(list,
            {'TCRBV16': ['01'],
             'TCRBV01': ['01'],
             'TCRBV13': ['01'],
             'TCRBV06': ['04', '07', '05', '06', '09', '01'],
             'TCRBV07': ['02', '08', '07', '08', '03', '04', '09', '06'],
             'TCRBV11': ['03', '01', '02'],
             'TCRBV05': ['03', '06', '05', '08', '04', '01'],
             'TCRBV10': ['01', '03', '02'],
             'TCRBV30': ['01', '01'],
             'TCRBV28': ['01'],
             'TCRBV20': ['01'],
             'TCRBV03': ['01'],
             'TCRBV09': ['01'],
             'TCRBV02': ['01'],
             'TCRBV27': ['01'],
             'TCRBV12': ['05', '01', '02'],
             'TCRBV21': ['01'],
             'TCRBV23': ['01'],
             'TCRBV29': ['01'],
             'TCRBV18': ['01'],
             'TCRBV04': ['02', '01', '03'],
             'TCRBV25': ['01'],
             'TCRBV08': ['02'],
             'TCRBV19': ['01'],
             'TCRBV14': ['01'],
             'TCRBV15': ['

In [21]:
# Translate the V gene names from the Chen et al. 2017 data (e.g. "TRBV6-5")
# into the Emerson naming scheme (e.g. "TCRBV06-05*01"):
#   * the family number is zero-padded to two characters;
#   * if there is no gene number, the smallest gene number listed for that
#     family in the Emerson data is used;
#   * the gene number is zero-padded to two characters;
#   * the allele after "*" is the smallest allele listed for that gene in the
#     Emerson data.
# V genes without a counterpart in the Emerson data get no entry in
# chen_trans_dict.

def _chen_v_to_emerson(chen_v, first_layer, second_layer):
    """Return the Emerson-style name for one Chen V gene, or None if unmatched.

    Parameters
    ----------
    chen_v : str
        V gene label as written in the Chen table, "TRBV<family>" or
        "TRBV<family>-<gene>".
    first_layer : mapping of str -> list of str
        Emerson family ("TCRBV06") to its gene numbers ("01", "05", ...).
    second_layer : mapping of str -> list of str
        Emerson gene ("TCRBV06-05") to its alleles ("01", ...).

    Returns
    -------
    str or None
        "<family>-<gene>*<allele>" in Emerson notation, or None when the
        family or gene does not occur in the Emerson lookup tables.
    """
    parts = chen_v.split("-")
    # Everything after the "TRBV" prefix is the family number.
    family_key = "TCRBV" + parts[0][len("TRBV"):].zfill(2)

    if len(parts) == 1:
        # No gene number given: fall back to the smallest one known for the family.
        if family_key not in first_layer:
            return None
        # min() instead of an in-place sort so the lookup tables stay untouched.
        gene_key = family_key + "-" + min(first_layer[family_key])
    else:
        # zfill pads only single-character gene numbers; unconditionally
        # prepending "0" would turn a two-digit gene number into "0NN".
        gene_key = family_key + "-" + parts[1].zfill(2)

    if gene_key not in second_layer:
        return None
    return gene_key + "*" + min(second_layer[gene_key])


chen_trans_dict = defaultdict(str)

chen_V_list = sorted({name.split(",")[0] for name in all_NLVs_kept})

for chen_v in chen_V_list:
    emerson_v = _chen_v_to_emerson(chen_v, emerson_1st_layer, emerson_2nd_layer)
    if emerson_v is not None:
        chen_trans_dict[chen_v] = emerson_v

In [22]:
# Display the Chen V gene -> Emerson V gene translation table.
chen_trans_dict

defaultdict(str,
            {'TRBV1': 'TCRBV01-01*01',
             'TRBV10-1': 'TCRBV10-01*01',
             'TRBV10-2': 'TCRBV10-02*01',
             'TRBV10-3': 'TCRBV10-03*01',
             'TRBV11': 'TCRBV11-01*01',
             'TRBV11-1': 'TCRBV11-01*01',
             'TRBV11-2': 'TCRBV11-02*02',
             'TRBV11-3': 'TCRBV11-03*01',
             'TRBV12-5': 'TCRBV12-05*01',
             'TRBV13': 'TCRBV13-01*01',
             'TRBV14': 'TCRBV14-01*01',
             'TRBV14-1': 'TCRBV14-01*01',
             'TRBV15': 'TCRBV15-01*01',
             'TRBV15-1': 'TCRBV15-01*01',
             'TRBV16': 'TCRBV16-01*01',
             'TRBV18': 'TCRBV18-01*01',
             'TRBV19': 'TCRBV19-01*01',
             'TRBV2': 'TCRBV02-01*01',
             'TRBV2-1': 'TCRBV02-01*01',
             'TRBV20': 'TCRBV20-01*01',
             'TRBV20-1': 'TCRBV20-01*01',
             'TRBV21': 'TCRBV21-01*01',
             'TRBV23': 'TCRBV23-01*01',
             'TRBV23-1': 'TCRBV23-01*01',
  

In [23]:
# Number of Chen V genes that received an Emerson translation.
len(chen_trans_dict)

57

In [24]:
# Number of distinct Chen V genes that were candidates for translation.
len(chen_V_list)

68

In [25]:
# Chen V genes for which no Emerson counterpart was found.
[
    chen_v
    for chen_v in chen_V_list
    if chen_v not in chen_trans_dict
]

['TRBV12-3',
 'TRBV12-4',
 'TRBV13-3',
 'TRBV17-1',
 'TRBV22',
 'TRBV24-1',
 'TRBV5-7',
 'TRBV6-2',
 'TRBV6-3',
 'TRBV6-8',
 'TRBV8-1']

## The CDR3 sequences need to be padded with a leading "C" and a trailing "F"

Chen et al. 2017 TCRs seem to have left these out. 

If the TCRs are matched directly, without adding these two residues, there is no overlap with the Emerson data.

About "C" and "F", see the question:

    How are the positions of gaps and insertions placed in IMGT Collier de Perles?

from this IMGT webpage:

    https://www.imgt.org/FAQ/

In [26]:
# Rewrite each kept Chen TCR as "<Emerson V gene>,C<CDR3>F"; TCRs whose V gene
# has no entry in chen_trans_dict are dropped.
all_NLVs_kept_translated = [
    chen_trans_dict[fields[0]] + ",C" + fields[1] + "F"
    for fields in (name.split(",") for name in all_NLVs_kept)
    if fields[0] in chen_trans_dict
]
len(all_NLVs_kept_translated)

4068

In [27]:
# Several Chen V genes can translate to the same Emerson V gene, so remove
# any duplicate names created by the translation.
all_NLVs_kept_translated_unique = list({*all_NLVs_kept_translated})
len(all_NLVs_kept_translated_unique)

4068

In [28]:
# Translated Chen TCRs that also occur among the Emerson TCR names.
NLV_common = set(all_NLVs_kept_translated_unique) & set(df_tcrs["TCR_name"])
len(NLV_common)

535

In [29]:
# Distribution of the final character of the Emerson TCR names.
Counter(tcr_name[-1] for tcr_name in df_tcrs["TCR_name"])

Counter({'F': 1098624, 'V': 108, 'P': 2, 'A': 2, 'Q': 1, 'L': 1})

In [30]:
# Save the TCRs shared between the translated Chen data and the Emerson data.
# NLV_common is a set of strings, whose iteration order can differ between
# interpreter runs (string hash randomization); sorting makes the row order of
# the CSV reproducible and hands the DataFrame constructor an ordered list.
df_NLV_common = pd.DataFrame(sorted(NLV_common), columns=["TCR_name"])
df_NLV_common.to_csv(os.path.join(inter_dir, "Chen_2017_TCRs_processed_common.csv"),
                     index=False)