### This file combines the extended pseudo sequences of HLA-II A alleles with HLA-II B alleles

**In addition to the 135 pairs, this version also covers all pairs formed between DPAs & DPBs and between DQAs & DQBs from DeWitt_2018 data**

From the 135 pairs contained in HLA v2 matrix of DeWitt_2018 (original format is 130 items with 5 haplotypes. After separating each haplotype into two pairs we eventually get 135 pairs), extract the unique DRBs, DPAs, DPBs, DQAs and DQBs, and then go through all unique DRBs, all unique (DPA, DPB) and all unique (DQA, DQB).

Input files:

The dictionary of extended pseudo sequences for 17 HLA-II A alleles (all DRAs treated as one), stored in

    ../../data/intermediate_data/t4_HLA_II_v2_alpha_pseudo_22_dict.csv

The dictionary of extended pseudo sequences for 62 HLA-II B alleles stored in

    ../../data/intermediate_data/t4_HLA_II_v2_beta_pseudo_23_dict.csv

The names of HLA-II items from HLA v2 data of DeWitt_2018 stored in 

    ../../data/intermediate_data/DeWitt_2018/HLA_v2_features_row_names.txt

Output file:

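A csv file storing the combined extended pseudo sequences (45 residues each) for 250 HLA-II pairs: the 38 unique DRB1 alleles (each paired with DRA), all 13 x 14 DQA/DQB combinations and all 3 x 10 DPA/DPB combinations. The 135 pairs observed in DeWitt_2018 are a subset of these: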

    ../../data/for_encoders/HLA_II_pseudo_45.csv

In [1]:
import numpy as np
import pandas as pd

from collections import Counter
from collections import defaultdict

In [2]:
# Load the row names of the DeWitt_2018 HLA v2 feature matrix. The file is
# space-separated with no header line, so the two columns are named by hand.
row_names_path = "../../data/intermediate_data/DeWitt_2018/HLA_v2_features_row_names.txt"
HLA_v2_features_row_names = pd.read_csv(row_names_path, sep=" ", header=None)
HLA_v2_features_row_names.columns = ["feature", "hla"]
# Shape in the recorded run: (215, 2)
HLA_v2_features_row_names.head(6)

Unnamed: 0,feature,hla
0,feature:,HLA-DPAB*02:01_04:01
1,feature:,HLA-DQAB*05:05_06:04
2,feature:,HLA-B*08:01
3,feature:,HLA-A*24:02
4,feature:,HLA-A*24:03
5,feature:,HLA-B*38:02


In [3]:
# Keep only the HLA-II entries, recognised by the first seven characters of the name.
hla_ii_prefixes = ("HLA-DPA", "HLA-DQA", "HLA-DRD", "HLA-DRB")
HLA_II_v2_pairs = [
    name
    for name in HLA_v2_features_row_names.hla.tolist()
    if name.startswith(hla_ii_prefixes)
]

In [4]:
# Haplotype entries hold more than two "_"-separated allele fields,
# i.e. they contain at least two "_" characters.
HLA_II_v2_5DRDQ = [name for name in HLA_II_v2_pairs if name.count("_") >= 2]
HLA_II_v2_5DRDQ

['HLA-DRDQ*10:01_01:05_05:01',
 'HLA-DRDQ*03:01_05:01_02:01',
 'HLA-DRDQ*13:01_01:03_06:03',
 'HLA-DRDQ*15:01_01:02_06:02',
 'HLA-DRDQ*09:01_03:02_03:03']

In [5]:
# Split every haplotype name after its 9-character prefix: the field before the
# first "_" becomes a DRB1 entry, everything after it becomes a DQA/DQB entry.
HLA_II_v2_5DRDQ_DRB = [
    "HLA-DRB1*" + name[9:].partition("_")[0] for name in HLA_II_v2_5DRDQ
]
HLA_II_v2_5DRDQ_DQAB = [
    "HLA-DQAB*" + name[9:].partition("_")[2] for name in HLA_II_v2_5DRDQ
]

In [8]:
# Replace the haplotype entries by their separate DRB1 and DQA/DQB entries,
# dropping duplicates, and keep the result as a sorted list.
expanded_names = set(HLA_II_v2_pairs) | set(HLA_II_v2_5DRDQ_DRB) | set(HLA_II_v2_5DRDQ_DQAB)
HLA_II_complete = sorted(expanded_names - set(HLA_II_v2_5DRDQ))
print(len(HLA_II_complete))
# 135 in the recorded run
HLA_II_complete[:6]

135


['HLA-DPAB*01:03_01:01',
 'HLA-DPAB*01:03_02:01',
 'HLA-DPAB*01:03_03:01',
 'HLA-DPAB*01:03_04:01',
 'HLA-DPAB*01:03_04:02',
 'HLA-DPAB*01:03_05:01']

In [10]:
# Display the complete sorted list of HLA-II names for visual inspection.
HLA_II_complete

['HLA-DPAB*01:03_01:01',
 'HLA-DPAB*01:03_02:01',
 'HLA-DPAB*01:03_03:01',
 'HLA-DPAB*01:03_04:01',
 'HLA-DPAB*01:03_04:02',
 'HLA-DPAB*01:03_05:01',
 'HLA-DPAB*01:03_10:01',
 'HLA-DPAB*01:03_11:01',
 'HLA-DPAB*01:03_13:01',
 'HLA-DPAB*01:03_17:01',
 'HLA-DPAB*02:01_01:01',
 'HLA-DPAB*02:01_02:01',
 'HLA-DPAB*02:01_03:01',
 'HLA-DPAB*02:01_04:01',
 'HLA-DPAB*02:01_04:02',
 'HLA-DPAB*02:01_05:01',
 'HLA-DPAB*02:01_10:01',
 'HLA-DPAB*02:01_11:01',
 'HLA-DPAB*02:01_13:01',
 'HLA-DPAB*02:01_17:01',
 'HLA-DPAB*02:02_01:01',
 'HLA-DPAB*02:02_02:01',
 'HLA-DPAB*02:02_04:01',
 'HLA-DPAB*02:02_04:02',
 'HLA-DPAB*02:02_05:01',
 'HLA-DQAB*01:01_02:02',
 'HLA-DQAB*01:01_03:01',
 'HLA-DQAB*01:01_03:02',
 'HLA-DQAB*01:01_05:01',
 'HLA-DQAB*01:01_05:03',
 'HLA-DQAB*01:01_06:02',
 'HLA-DQAB*01:01_06:03',
 'HLA-DQAB*01:02_02:01',
 'HLA-DQAB*01:02_02:02',
 'HLA-DQAB*01:02_03:01',
 'HLA-DQAB*01:02_03:02',
 'HLA-DQAB*01:02_03:03',
 'HLA-DQAB*01:02_04:02',
 'HLA-DQAB*01:02_05:01',
 'HLA-DQAB*01:02_05:02',


In [22]:
# Extract, with repeats and in the order of HLA_II_complete:
#   every DRB allele,
#   every DQA and DQB allele,
#   every DPA and DPB allele.

def _collect_alleles(locus_prefix, gene, chain_index=None):
    """Return "<gene>*<fields>" for each name in HLA_II_complete with the given prefix.

    locus_prefix is compared with the first eight characters of the name.
    With chain_index=None the whole text after the 9-character prefix is used;
    otherwise that text is split on "_" and the field at chain_index is used.
    """
    collected = []
    for pair_name in HLA_II_complete:
        if pair_name[:8] != locus_prefix:
            continue
        fields = pair_name[9:]
        if chain_index is not None:
            fields = fields.split("_")[chain_index]
        collected.append(gene + "*" + fields)
    return collected


DRB_raw = _collect_alleles('HLA-DRB1', "DRB1")
DQA_raw = _collect_alleles('HLA-DQAB', "DQA1", 0)
DQB_raw = _collect_alleles('HLA-DQAB', "DQB1", 1)
DPA_raw = _collect_alleles('HLA-DPAB', "DPA1", 0)
DPB_raw = _collect_alleles('HLA-DPAB', "DPB1", 1)

In [23]:
# Both totals should equal the number of entries in HLA_II_complete:
# once counted through the alpha-side lists, once through the beta-side lists.
n_via_alpha = sum(len(raw) for raw in (DRB_raw, DQA_raw, DPA_raw))
n_via_beta = sum(len(raw) for raw in (DRB_raw, DQB_raw, DPB_raw))
print(n_via_alpha)
print(n_via_beta)

135
135


In [24]:
# De-duplicate each raw allele list and keep it sorted.
DRB_list = sorted(set(DRB_raw))
DQA_list = sorted(set(DQA_raw))
DQB_list = sorted(set(DQB_raw))
DPA_list = sorted(set(DPA_raw))
DPB_list = sorted(set(DPB_raw))

# Number of unique alleles per gene (one per line), followed by their total.
allele_counts = [
    len(DRB_list),
    len(DQA_list),
    len(DQB_list),
    len(DPA_list),
    len(DPB_list),
]
print(*allele_counts, sep="\n")
print(sum(allele_counts))

38
13
14
3
10
78


In [31]:
# Read the extended pseudo sequence tables of the HLA-II A (alpha) and
# HLA-II B (beta) alleles; the first line of each file is the header.
alpha_dict_path = "../../data/intermediate_data/t4_HLA_II_v2_alpha_pseudo_22_dict.csv"
beta_dict_path = "../../data/intermediate_data/t4_HLA_II_v2_beta_pseudo_23_dict.csv"

pseudo_a = pd.read_csv(alpha_dict_path, header=0)
print(pseudo_a.shape)
# (17, 2) in the recorded run
pseudo_b = pd.read_csv(beta_dict_path, header=0)
print(pseudo_b.shape)
# (62, 2) in the recorded run

(17, 2)
(62, 2)


In [37]:
# Allele name -> pseudo sequence lookups. They are defaultdict(str), so an
# allele that is not in the table maps to the empty string.
pseudo_a_dict = defaultdict(str, zip(pseudo_a.allele.tolist(), pseudo_a.seq.tolist()))
pseudo_b_dict = defaultdict(str, zip(pseudo_b.allele.tolist(), pseudo_b.seq.tolist()))

print(len(pseudo_a_dict))
# 17 in the recorded run
print(len(pseudo_b_dict))
# 62 in the recorded run

17
62


Deal with DRB, DPA/DPB and DQA/DQB separately

    add DRA for DRB
    go through pairs formed between all DPAs and all DPBs
    go through pairs formed between all DQAs and all DQBs

In [79]:
# These first two lists hold the alpha / beta allele of each HLA-II pair,
# in the order in which the pairs are generated.
HLA_II_alpha_list = []
HLA_II_beta_list = []
# This list holds the alpha and beta allele names joined by "-", just as a sanity check.
HLA_II_rec_names = []
# The dictionary below maps each pair name to its combined (alpha + beta) pseudo sequence.
all_seq_dict = defaultdict(str)

# Each HLA-II pair is handled as two separate alleles. Per the original note,
# the allele names are meant to match those in the file
# "../../data/intermediate_data/pseudosequence_2016_all_X.dat".


def _lookup_pseudo_seq(pseudo_dict, allele):
    """Return the pseudo sequence stored for `allele`, raising KeyError if absent.

    pseudo_a_dict and pseudo_b_dict are defaultdict(str): indexing them with an
    unknown allele would quietly return "" (and insert that empty entry), which
    would yield a truncated combined sequence instead of an error.
    """
    if allele not in pseudo_dict:
        raise KeyError("no pseudo sequence available for allele " + repr(allele))
    return pseudo_dict[allele]


def _add_pair(alpha, beta, full_name):
    """Record one alpha/beta pair and store its combined pseudo sequence under full_name."""
    # Look both sequences up first so that a missing allele leaves the lists untouched.
    combined_seq = _lookup_pseudo_seq(pseudo_a_dict, alpha) + _lookup_pseudo_seq(pseudo_b_dict, beta)
    HLA_II_alpha_list.append(alpha)
    HLA_II_beta_list.append(beta)
    HLA_II_rec_names.append(alpha + '-' + beta)
    all_seq_dict[full_name] = combined_seq


# DR: every unique DRB allele is paired with the single "DRA" entry.
for drb_allele in DRB_list:
    _add_pair("DRA", drb_allele, "HLA-" + drb_allele)

# DQ, then DP: every alpha allele is combined with every beta allele of the same locus.
for pair_prefix, alpha_alleles, beta_alleles in (
    ("HLA-DQAB*", DQA_list, DQB_list),
    ("HLA-DPAB*", DPA_list, DPB_list),
):
    for alpha_allele in alpha_alleles:
        for beta_allele in beta_alleles:
            pair_name = (pair_prefix + alpha_allele.split("*")[1]
                         + "_" + beta_allele.split("*")[1])
            _add_pair(alpha_allele, beta_allele, pair_name)

In [81]:
# The number of stored pairs should match: DRB + (DQA x DQB) + (DPA x DPB).
n_expected_pairs = (
    len(DRB_list)
    + len(DQA_list) * len(DQB_list)
    + len(DPA_list) * len(DPB_list)
)
print(len(all_seq_dict))
print(n_expected_pairs)

250
250


In [83]:
# Count how many generated pairs use each alpha allele.
Counter(HLA_II_alpha_list)

Counter({'DRA': 38,
         'DQA1*01:01': 14,
         'DQA1*01:02': 14,
         'DQA1*01:03': 14,
         'DQA1*01:04': 14,
         'DQA1*01:05': 14,
         'DQA1*02:01': 14,
         'DQA1*03:01': 14,
         'DQA1*03:02': 14,
         'DQA1*03:03': 14,
         'DQA1*04:01': 14,
         'DQA1*05:01': 14,
         'DQA1*05:05': 14,
         'DQA1*06:01': 14,
         'DPA1*01:03': 10,
         'DPA1*02:01': 10,
         'DPA1*02:02': 10})

In [96]:
# Number of unique DQB and DPB alleles, one per line; each DQA / DPA allele
# is paired with this many beta alleles.
print(len(DQB_list), len(DPB_list), sep="\n")

14
10


In [95]:
# One row per generated pair: alpha allele, beta allele and the recorded joined name.
sanity_rows = list(zip(HLA_II_alpha_list, HLA_II_beta_list, HLA_II_rec_names))
sanity_check_df = pd.DataFrame(sanity_rows, columns=["a_name", "b_name", "rec_name"])
print(sanity_check_df.shape)

# For every pair whose alpha allele is not "DRA", tally whether the recorded
# name equals "<a_name>-<b_name>".
name_columns = (
    sanity_check_df[column].tolist() for column in ("a_name", "b_name", "rec_name")
)
Counter(
    rec_name == "-".join((alpha_name, beta_name))
    for alpha_name, beta_name, rec_name in zip(*name_columns)
    if alpha_name != "DRA"
)

(250, 3)


Counter({True: 212})

In [97]:
# Distribution of combined pseudo sequence lengths across all stored pairs.
Counter(map(len, all_seq_dict.values()))

Counter({45: 250})

In [98]:
# Build a two-column table (pair name, combined pseudo sequence) sorted by pair name.
all_keys = sorted(all_seq_dict)
all_seqs_list = [all_seq_dict[pair_name] for pair_name in all_keys]

df_all_seqs = pd.DataFrame({"hla": all_keys, "seq": all_seqs_list})

# Number of distinct values per column.
df_all_seqs.nunique()

hla    250
seq    169
dtype: int64

In [99]:
# Display the table of pair names and combined pseudo sequences.
df_all_seqs

Unnamed: 0,hla,seq
0,HLA-DPAB*01:03_01:01,YAFFMGQAFSEGGAILNNNTLQYGQFEYFPAYDIEKAVRVHLDVT
1,HLA-DPAB*01:03_02:01,YAFFMGQAFSEGGAILNNNTLQFGQFEYFPDYDIEEAVRMHLGMT
2,HLA-DPAB*01:03_03:01,YAFFMGQAFSEGGAILNNNTLQYLQFEYFPDYDLEKAVRVHLDVT
3,HLA-DPAB*01:03_04:01,YAFFMGQAFSEGGAILNNNTLQFGQFEYFPAYDIEKAVRMHLGMT
4,HLA-DPAB*01:03_04:02,YAFFMGQAFSEGGAILNNNTLQFGQFEYFPDYDIEKAVRMHLGMT
...,...,...
245,HLA-DRB1*15:01,QEFFIGRFASEGAAVDKAEIMKWPRFDYFPDYDIQAAATYHVVFT
246,HLA-DRB1*15:02,QEFFIGRFASEGAAVDKAEIMKWPRFDYFPDYDIQAAATYHVGFT
247,HLA-DRB1*15:03,QEFFIGRFASEGAAVDKAEIMKWPRFDHFPDYDIQAAATYHVVFT
248,HLA-DRB1*16:01,QEFFIGRFASEGAAVDKAEIMKWPRFDYYPDYDFDRAATYHVGFT


In [100]:
# Write the pair-name / pseudo-sequence table to disk, without the row index.
df_all_seqs.to_csv("../../data/for_encoders/HLA_II_pseudo_45.csv", index = False)

In [107]:
# Distribution of sequence lengths in the table that was written out.
Counter(map(len, df_all_seqs.seq.tolist()))

Counter({45: 250})