Explore the TCRs provided in Huth et al. 2019

    Huth et al. 2019
    HLA-B*07:02: RPH, TPR
    HLA-C*07:02: CRV, FRC

Transform to the format as in Emerson data

Save out the subsets that appear in Emerson data

In [1]:
import pandas as pd
import numpy as np
import os

from collections import defaultdict
from collections import Counter

In [5]:
inter_dir = "../intermediate_files/Huth_2019"
data_dir = "../../data"

# load all frequent TCRs from Emerson data
df_tcrs = pd.read_csv(os.path.join(inter_dir,"TCR_names.csv"), 
                      header=0)
df_tcrs.shape

(1098738, 1)

In [3]:
df_tcrs[:6]

Unnamed: 0,TCR_name
0,"TCRBV29-01*01,CSVEESYEQYF"
1,"TCRBV05-01*01,CASSLRGSGNTIYF"
2,"TCRBV04-02*01,CASSQEGQSSYEQYF"
3,"TCRBV06-05*01,CASSYSGSNQPQHF"
4,"TCRBV29-01*01,CSVYRGHEQYF"
5,"TCRBV06-04*01,CASSDNSGANVLTF"


In [10]:
# Huth et al. 2019 for RPH, TPR, CRV, FRC
df_CRV = pd.read_excel(os.path.join(data_dir, "Huth_2019", "Huth_2019_supplemental_table_2.xlsx"), 
                        sheet_name = "CRV-specific TCRs", skiprows=3)
print(df_CRV.shape)

df_FRC = pd.read_excel(os.path.join(data_dir, "Huth_2019", "Huth_2019_supplemental_table_2.xlsx"), 
                        sheet_name = "FRC-specific TCRs", skiprows=3)
print(df_FRC.shape)

df_RPH = pd.read_excel(os.path.join(data_dir, "Huth_2019", "Huth_2019_supplemental_table_2.xlsx"), 
                        sheet_name = "RPH-specific TCRs", skiprows=3)
print(df_RPH.shape)

df_TPR = pd.read_excel(os.path.join(data_dir, "Huth_2019", "Huth_2019_supplemental_table_2.xlsx"), 
                        sheet_name = "TPR-specific TCRs", skiprows=3)
print(df_TPR.shape)

(435, 8)
(266, 8)
(191, 8)
(160, 8)


In [11]:
df_CRV[:6]

Unnamed: 0,V.gene,CDR3.sequence,J.gene,Specificity,Donor,Sum.donors,Unnamed: 6,unique TCRs:
0,TRBV25-1,CASSPGDEQFF,TRBJ2-1,CRV,"P01,P05,P06,P07",4,,435.0
1,TRBV25-1,CASTPGDEQFF,TRBJ2-1,CRV,"P03,P06,P07",3,,
2,TRBV20-1,CSAPDWNNEQFF,TRBJ2-1,CRV,"P01,P02",2,,
3,TRBV28,CASSFPDTQYF,TRBJ2-3,CRV,"P01,P02",2,,
4,TRBV28,CASTPWGAEAFF,TRBJ1-1,CRV,"P04,P08",2,,
5,TRBV28,CASSPISNEQFF,TRBJ2-1,CRV,"P01,P07",2,,


In [13]:
V_gene_concat_list = df_CRV["V.gene"].tolist() + \
                     df_FRC["V.gene"].tolist() + \
                     df_RPH["V.gene"].tolist() + \
                     df_TPR["V.gene"].tolist()

len(V_gene_concat_list)

1052

In [15]:
raw_v_genes = list(set(V_gene_concat_list))
raw_v_genes.sort()
len(raw_v_genes)

41

In [16]:
raw_v_genes

['TRBV10-1',
 'TRBV10-2',
 'TRBV10-3',
 'TRBV11-1',
 'TRBV11-2',
 'TRBV11-3',
 'TRBV12-2',
 'TRBV12-3/12-4',
 'TRBV12-5',
 'TRBV13',
 'TRBV14',
 'TRBV15',
 'TRBV18',
 'TRBV19',
 'TRBV2',
 'TRBV20-1',
 'TRBV24-1',
 'TRBV25-1',
 'TRBV27',
 'TRBV28',
 'TRBV29-1',
 'TRBV3-1/3-2',
 'TRBV30',
 'TRBV4-1',
 'TRBV4-2',
 'TRBV4-3',
 'TRBV5-1',
 'TRBV5-4',
 'TRBV5-6',
 'TRBV5-8',
 'TRBV6-1/6-5/6-6',
 'TRBV6-2/6-3',
 'TRBV6-4',
 'TRBV7-2',
 'TRBV7-3',
 'TRBV7-4',
 'TRBV7-6',
 'TRBV7-7',
 'TRBV7-8',
 'TRBV7-9',
 'TRBV9']

In [17]:
all_Emerson_Vs = [x.split(",")[0] for x in df_tcrs["TCR_name"].tolist()]
set(all_Emerson_Vs)

{'TCRBV01-01*01',
 'TCRBV02-01*01',
 'TCRBV03-01*01',
 'TCRBV04-01*01',
 'TCRBV04-02*01',
 'TCRBV04-03*01',
 'TCRBV05-01*01',
 'TCRBV05-03*01',
 'TCRBV05-04*01',
 'TCRBV05-05*01',
 'TCRBV05-06*01',
 'TCRBV05-08*01',
 'TCRBV06-01*01',
 'TCRBV06-04*01',
 'TCRBV06-05*01',
 'TCRBV06-06*01',
 'TCRBV06-07*01',
 'TCRBV06-09*01',
 'TCRBV07-02*01',
 'TCRBV07-03*01',
 'TCRBV07-04*01',
 'TCRBV07-06*01',
 'TCRBV07-07*01',
 'TCRBV07-08*01',
 'TCRBV07-08*02',
 'TCRBV07-09*01',
 'TCRBV08-02*01',
 'TCRBV09-01*01',
 'TCRBV10-01*01',
 'TCRBV10-02*01',
 'TCRBV10-03*01',
 'TCRBV11-01*01',
 'TCRBV11-02*02',
 'TCRBV11-03*01',
 'TCRBV12-01*01',
 'TCRBV12-02*01',
 'TCRBV12-05*01',
 'TCRBV13-01*01',
 'TCRBV14-01*01',
 'TCRBV15-01*01',
 'TCRBV16-01*01',
 'TCRBV18-01*01',
 'TCRBV19-01*01',
 'TCRBV20-01*01',
 'TCRBV21-01*01',
 'TCRBV23-01*01',
 'TCRBV25-01*01',
 'TCRBV27-01*01',
 'TCRBV28-01*01',
 'TCRBV29-01*01',
 'TCRBV30-01*01',
 'TCRBV30-01*02'}

In [18]:
temp_list = list(set(all_Emerson_Vs))
temp_list.sort()

emerson_1st_layer = defaultdict(list)
emerson_2nd_layer = defaultdict(list)

for x in temp_list:
    x_split_2nd = x.split("*")
    x_split_1st = x_split_2nd[0].split("-")
    emerson_1st_layer[x_split_1st[0]] += [x_split_1st[1]]
    emerson_2nd_layer[x_split_2nd[0]] += [x_split_2nd[1]]    

In [19]:
emerson_1st_layer

defaultdict(list,
            {'TCRBV01': ['01'],
             'TCRBV02': ['01'],
             'TCRBV03': ['01'],
             'TCRBV04': ['01', '02', '03'],
             'TCRBV05': ['01', '03', '04', '05', '06', '08'],
             'TCRBV06': ['01', '04', '05', '06', '07', '09'],
             'TCRBV07': ['02', '03', '04', '06', '07', '08', '08', '09'],
             'TCRBV08': ['02'],
             'TCRBV09': ['01'],
             'TCRBV10': ['01', '02', '03'],
             'TCRBV11': ['01', '02', '03'],
             'TCRBV12': ['01', '02', '05'],
             'TCRBV13': ['01'],
             'TCRBV14': ['01'],
             'TCRBV15': ['01'],
             'TCRBV16': ['01'],
             'TCRBV18': ['01'],
             'TCRBV19': ['01'],
             'TCRBV20': ['01'],
             'TCRBV21': ['01'],
             'TCRBV23': ['01'],
             'TCRBV25': ['01'],
             'TCRBV27': ['01'],
             'TCRBV28': ['01'],
             'TCRBV29': ['01'],
             'TCRBV30': ['01', '

In [26]:
# translate the format of V gene from Huth et al. 2019 data
# if first num string has only one character, add 0
# if no second num string, take the first one in the ordered Emerson list
# if second num string has only one character, add 0
# after the second num string, take the first one in the ordered Emerson list for the part after *

huth_trans_dict = defaultdict(str)

for x in raw_v_genes:
    # manually set a few
    if x in ['TRBV12-3/12-4', 'TRBV6-2/6-3']:
        huth_trans_dict[x] = "not_found"
    elif x=='TRBV3-1/3-2':
        huth_trans_dict[x] = "TCRBV03-01*01"    
    elif x=='TRBV6-1/6-5/6-6':
        huth_trans_dict[x] = "TCRBV06-01*01"   
    else:
        # automatically set others
        x_split = x.split("-")
        if len(x_split)==1:
            if len(x_split[0])==5:
                x_key = "TCRBV0"+x[-1]
            else:
                x_key = "TCRBV"+x_split[0][(-2):] 
            if x_key in emerson_1st_layer:
                x_first_vs = emerson_1st_layer[x_key]
                x_first_vs.sort()
                x_key_2nd = x_key+"-"+x_first_vs[0]
                x_second_vs = emerson_2nd_layer[x_key_2nd]
                x_second_vs.sort()
                x_value = x_key_2nd+"*"+x_second_vs[0]
                huth_trans_dict[x] = x_value 
            else:
                huth_trans_dict[x] = "not_found"
        else:
            if len(x_split[0])==5:
                x_key = "TCRBV0"+x_split[0][-1]
            else:
                x_key = "TCRBV"+x_split[0][(-2):] 
            x_key_2nd = x_key+"-0"+x_split[1]
            if x_key_2nd in emerson_2nd_layer:
                x_second_vs = emerson_2nd_layer[x_key_2nd]
                x_second_vs.sort()
                x_value = x_key_2nd+"*"+x_second_vs[0]  
                huth_trans_dict[x] = x_value 
            else:
                huth_trans_dict[x] = "not_found"

In [27]:
len(huth_trans_dict)

41

In [28]:
for x in raw_v_genes:
    print(f"{x}: {huth_trans_dict[x]}")

TRBV10-1: TCRBV10-01*01
TRBV10-2: TCRBV10-02*01
TRBV10-3: TCRBV10-03*01
TRBV11-1: TCRBV11-01*01
TRBV11-2: TCRBV11-02*02
TRBV11-3: TCRBV11-03*01
TRBV12-2: TCRBV12-02*01
TRBV12-3/12-4: not_found
TRBV12-5: TCRBV12-05*01
TRBV13: TCRBV13-01*01
TRBV14: TCRBV14-01*01
TRBV15: TCRBV15-01*01
TRBV18: TCRBV18-01*01
TRBV19: TCRBV19-01*01
TRBV2: TCRBV02-01*01
TRBV20-1: TCRBV20-01*01
TRBV24-1: not_found
TRBV25-1: TCRBV25-01*01
TRBV27: TCRBV27-01*01
TRBV28: TCRBV28-01*01
TRBV29-1: TCRBV29-01*01
TRBV3-1/3-2: TCRBV03-01*01
TRBV30: TCRBV30-01*01
TRBV4-1: TCRBV04-01*01
TRBV4-2: TCRBV04-02*01
TRBV4-3: TCRBV04-03*01
TRBV5-1: TCRBV05-01*01
TRBV5-4: TCRBV05-04*01
TRBV5-6: TCRBV05-06*01
TRBV5-8: TCRBV05-08*01
TRBV6-1/6-5/6-6: TCRBV06-01*01
TRBV6-2/6-3: not_found
TRBV6-4: TCRBV06-04*01
TRBV7-2: TCRBV07-02*01
TRBV7-3: TCRBV07-03*01
TRBV7-4: TCRBV07-04*01
TRBV7-6: TCRBV07-06*01
TRBV7-7: TCRBV07-07*01
TRBV7-8: TCRBV07-08*01
TRBV7-9: TCRBV07-09*01
TRBV9: TCRBV09-01*01


In [29]:
set(huth_trans_dict.values()) - set(temp_list)

{'not_found'}

In [30]:
CDR3_list = df_CRV["CDR3.sequence"].tolist() + \
             df_FRC["CDR3.sequence"].tolist() + \
             df_RPH["CDR3.sequence"].tolist() + \
             df_TPR["CDR3.sequence"].tolist()

len(CDR3_list)

1052

In [31]:
set("".join(CDR3_list))

{'A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'Y'}

In [32]:
len(set("".join(CDR3_list)))

20

In [35]:
tcr_dict = defaultdict(list)


CRV_list = [huth_trans_dict[x]+","+y for x,y in zip(df_CRV["V.gene"].tolist(),
                                                    df_CRV["CDR3.sequence"].tolist())]
CRV_list = list(set(CRV_list))
print(len(CRV_list))

FRC_list = [huth_trans_dict[x]+","+y for x,y in zip(df_FRC["V.gene"].tolist(),
                                                    df_FRC["CDR3.sequence"].tolist())]
FRC_list = list(set(FRC_list))
print(len(FRC_list))

RPH_list = [huth_trans_dict[x]+","+y for x,y in zip(df_RPH["V.gene"].tolist(),
                                                    df_RPH["CDR3.sequence"].tolist())]
RPH_list = list(set(RPH_list))
print(len(RPH_list))

TPR_list = [huth_trans_dict[x]+","+y for x,y in zip(df_TPR["V.gene"].tolist(),
                                                    df_TPR["CDR3.sequence"].tolist())]
TPR_list = list(set(TPR_list))
print(len(TPR_list))

# HLA-B*07:02: RPH, TPR
# HLA-C*07:02: CRV, FRC

B0702_list = list(set(RPH_list + TPR_list))
C0702_list = list(set(CRV_list + FRC_list))

print(len(B0702_list))
print(len(C0702_list))

tcr_dict["B0702"] = B0702_list
tcr_dict["C0702"] = C0702_list

435
266
191
160
351
701


In [36]:
for key in ["B0702", "C0702"]:
    cur_list = tcr_dict[key]
    cur_common = set(cur_list).intersection(set(df_tcrs["TCR_name"].tolist()))
    print(len(cur_common))
    df_common = pd.DataFrame(cur_common, columns=["TCR_name"])
    df_common.to_csv(os.path.join(inter_dir, f"Huth_2019_TCRs_processed_common_{key}.csv"), 
                     index=False)

54
69
