In [1]:
import os
import re
import pickle
import numpy as np
import pandas as pd

from collections import defaultdict
from collections import Counter

In [2]:
# Read the tab-separated allele table into a DataFrame; the path is relative
# to the notebook's working directory.
file_path = '../../data/for_encoders/combo_xcr.tsv'
V_info = pd.read_csv(file_path, sep="\t")

In [3]:
# Tally how many rows of the table carry each `organism` label.
Counter(V_info["organism"].tolist())

Counter({'human': 332,
         'human_gd': 140,
         'human_ig': 625,
         'mouse': 395,
         'mouse_gd': 301})

In [4]:
# Restrict the table to rows whose organism is 'human', chain is 'B' and
# region is 'V', then show the (rows, columns) shape of the result.
V_sub_info = V_info.loc[
    (V_info["organism"] == 'human')
    & (V_info["chain"] == 'B')
    & (V_info["region"] == 'V')
]
V_sub_info.shape

(142, 9)

In [5]:
# Tally the values of the `frame` column in the filtered table.
Counter(V_sub_info["frame"])

Counter({1: 139, 2: 2, 3: 1})

In [6]:
# Preview the first six rows of the filtered table.
V_sub_info.head(6)

Unnamed: 0,id,organism,chain,region,nucseq,frame,aligned_protseq,cdr_columns,cdrs
566,TRBV1*01,human,B,V,gatactggaattacccagacaccaaaatacctggtcacagcaatgg...,1,DTGITQTPKYLVTAMGSKRTMKREHLGH........DSMYWYRQKA...,27-38;56-65;81-86;104-109,GH........DS;YNC....KEF;P.DSSR;CTSSQ.
567,TRBV10-1*01,human,B,V,gatgctgaaatcacccagagcccaagacacaagatcacagagacag...,1,DAEITQSPRHKITETGRQVTLACHQTWNH.......NNMFWYRQDL...,27-38;56-65;81-86;104-109,WNH.......NN;SYG....VQD;S.NTED;CASSE.
568,TRBV10-1*02,human,B,V,gatgctgaaatcacccagagcccaagacacaagatcacagagacag...,1,DAEITQSPRHKITETGRQVTLACHQTWNH.......NNMFWYRQDL...,27-38;56-65;81-86;104-109,WNH.......NN;SYG....VHD;S.NTED;CASSE.
569,TRBV10-2*01,human,B,V,gatgctggaatcacccagagcccaagatacaagatcacagagacag...,1,DAGITQSPRYKITETGRQVTLMCHQTWSH.......SYMFWYRQDL...,27-38;56-65;81-86;104-109,WSH.......SY;SAA....ADI;S.KTEN;CASSE.
570,TRBV10-2*02,human,B,V,aaggcaggtgaccttgatgtgtcaccagacttggagccacagctat...,2,................RQVTLMCHQTWSH.......SYMFWYRQDL...,27-38;56-65;81-86;104-109,WSH.......SY;SAA....ADI;S.KTEN;......
571,TRBV10-3*01,human,B,V,gatgctggaatcacccagagcccaagacacaaggtcacagagacag...,1,DAGITQSPRHKVTETGTPVTLRCHQTENH.......RYMYWYRQDP...,27-38;56-65;81-86;104-109,ENH.......RY;SYG....VKD;S.KTED;CAISE.


In [7]:
# (rows, columns) of the filtered table, for comparison with the id count below.
V_sub_info.shape

(142, 9)

In [8]:
# Number of distinct values in the `id` column; if it equals the row count,
# every allele id occurs exactly once.
len(set(V_sub_info["id"]))

142

In [10]:
# Three independent allele -> CDR-string lookup tables. Each is a
# defaultdict(str), so reading an unknown allele yields '' (and inserts it).
cdr1_dict, cdr2_dict, cdr25_dict = (defaultdict(str) for _ in range(3))

In [11]:
# Fill the per-allele lookup tables from the ';'-separated `cdrs` column:
# field 0 -> cdr1_dict, field 1 -> cdr2_dict, field 2 -> cdr25_dict.
# The fourth field is only checked for presence; it is not stored here.
#
# A malformed row raises ValueError naming the offending allele. Previously
# the loop printed a generic message and stopped early, leaving the three
# dicts silently incomplete for every later cell.
for allele, cdrs in zip(V_sub_info.id.tolist(), V_sub_info.cdrs.tolist()):
    # A missing value is read by pandas as a float NaN, which has no .split().
    if not isinstance(cdrs, str):
        raise ValueError(
            "cdrs entry for allele {!r} is not a string: {!r}".format(allele, cdrs)
        )
    cdr_seqs = cdrs.split(";")
    if len(cdr_seqs) != 4:
        raise ValueError(
            "length of cdr sequences list is not 4 for allele {!r}: {!r}".format(
                allele, cdrs
            )
        )
    cdr1_dict[allele] = cdr_seqs[0]
    cdr2_dict[allele] = cdr_seqs[1]
    cdr25_dict[allele] = cdr_seqs[2]

In [12]:
# Number of alleles stored in cdr1_dict; compare with the row count of V_sub_info.
len(cdr1_dict)

142

In [13]:
# Snapshot of the allele ids currently in cdr1_dict, in insertion order.
cdr1_key_list = list(cdr1_dict)

In [13]:
# Distinct string lengths among the stored cdr1 entries; a single value means
# every entry has the same length.
{len(cdr1_dict[allele]) for allele in cdr1_key_list}

{12}

In [17]:
# Distinct string lengths among the stored cdr2 entries, looked up by the
# allele ids taken from cdr1_dict.
{len(cdr2_dict[allele]) for allele in cdr1_key_list}

{10}

In [15]:
# Distinct string lengths among the stored cdr25 entries, looked up by the
# allele ids taken from cdr1_dict.
{len(cdr25_dict[allele]) for allele in cdr1_key_list}

{6}

Build the mapping from the `v_allele` names used in

    ../../data/intermediate_data/t5_public_allele_level_tcr_filtered_wrt_vf_and_aa.csv

to the allele ids used in

    ../../data/for_encoders/combo_xcr.tsv

according to the IMGT gene table for human TRBV:

http://www.imgt.org/IMGTrepertoire/index.php?section=LocusGenes&repertoire=genetable&species=human&group=TRBV

In [26]:
# Read the allele-level public TCR table; the first line of the file is the
# column header.
public_tcr_file = "../../data/intermediate_data/t5_public_allele_level_tcr_filtered_wrt_vf_and_aa.csv"
public_allele_level_tcr = pd.read_csv(public_tcr_file, header=0)

In [17]:
# Unique v_allele names from the public TCR table, as a sorted list.
v_alleles_new = sorted(set(public_allele_level_tcr["v_allele"]))

In [None]:
# Write the sorted unique v_allele names as a one-column CSV (no index column).
v_alleles_new_df = pd.DataFrame(v_alleles_new, columns=['v_allele_new'])
v_alleles_new_df.to_csv(
    "../../data/intermediate_data/v_alleles_new.csv",
    index=False,
)

A second column, `v_allele_translate`, was later added by hand to the .csv file written above, giving for each `v_allele_new` the matching allele id in `combo_xcr.tsv`. The two columns were saved together as

    ../../data/intermediate_data/v_allele_translate_table_fillin.csv

### Note

During the manual fill-in, the following 5 v_alleles had no corresponding allele in

    ../../data/for_encoders/combo_xcr.tsv

    TCRBV05-02*01
    TCRBV07-05*01
    TCRBV07-05*02
    TCRBV08-02*01
    TCRBVA-or09_02*01

They are mapped to

    "not_found" 

in

    ../../data/intermediate_data/v_allele_translate_table_fillin.csv

In [19]:
# Read the hand-filled translation table (first line is the header) that the
# translation dictionary is built from, and show its first row.
v_allele_trans_table = pd.read_csv(
    "../../data/intermediate_data/v_allele_translate_table_fillin.csv",
    header=0,
)
v_allele_trans_table.head(1)

Unnamed: 0,v_allele_new,v_allele_translate
0,TCRBV01-01*01,TRBV1*01


In [27]:
# Map each `v_allele_new` to its `v_allele_translate`, pairing the two columns
# row by row (a repeated key keeps its last value). The container is a
# defaultdict(str), so looking up an unknown name yields '' and inserts it
# rather than raising KeyError.
v_allele_trans_dict = defaultdict(
    str,
    zip(
        v_allele_trans_table["v_allele_new"].tolist(),
        v_allele_trans_table["v_allele_translate"].tolist(),
    ),
)

In [28]:
# Number of distinct v_allele names in the translation dictionary.
len(v_allele_trans_dict)

65

In [30]:
#Counter(v_allele_trans_table.v_allele_translate.tolist())

Check how often these 5 v_alleles appear in the full allele-level public TCR data.

In [None]:
# Occurrence count of every v_allele value across the public TCR table.
v_allele_big_counter = Counter(public_allele_level_tcr["v_allele"])

In [28]:
#v_allele_big_counter

In [32]:
# Rows of the public TCR table whose v_allele is TCRBV05-02*01 (a Counter returns 0 for an absent key).
v_allele_big_counter['TCRBV05-02*01']

2

In [33]:
# Rows of the public TCR table whose v_allele is TCRBV07-05*01 (a Counter returns 0 for an absent key).
v_allele_big_counter['TCRBV07-05*01']

97

In [34]:
# Rows of the public TCR table whose v_allele is TCRBV07-05*02 (a Counter returns 0 for an absent key).
v_allele_big_counter['TCRBV07-05*02']

30

In [35]:
# Rows of the public TCR table whose v_allele is TCRBV08-02*01 (a Counter returns 0 for an absent key).
v_allele_big_counter['TCRBV08-02*01']

9

In [36]:
# Rows of the public TCR table whose v_allele is TCRBVA-or09_02*01 (a Counter returns 0 for an absent key).
v_allele_big_counter['TCRBVA-or09_02*01']

3