### This file combines the extended pseudo sequences of HLA-II A alleles with HLA-II B alleles

for the 135 pairs contained in the HLA v2 matrix of DeWitt_2018 (the original format has 130 items, 5 of which are haplotypes; after separating each haplotype into two pairs we end up with 135 pairs). 

Input files:

The dictionary of extended pseudo sequences for 17 HLA-II A alleles (all DRAs treated as one), stored in 

    ../../data/intermediate_data/t4_HLA_II_v2_alpha_pseudo_22_dict.csv

The dictionary of extended pseudo sequences for 62 HLA-II B alleles stored in

    ../../data/intermediate_data/t4_HLA_II_v2_beta_pseudo_23_dict.csv

The names of HLA-II items from HLA v2 data of DeWitt_2018 stored in 

    ../../data/intermediate_data/DeWitt_2018/HLA_v2_features_row_names.txt

Output file:

A csv file storing the combined extended pseudo sequences for 135 HLA-II pairs: 

    ../../data/for_encoders/HLA_II_pseudo_45.csv

In [1]:
import numpy as np
import pandas as pd

from collections import Counter
from collections import defaultdict

In [2]:
# Read the row names of the HLA v2 feature matrix; the file is
# space-separated and has no header line, so the columns are named here.
row_names_path = "../../data/intermediate_data/DeWitt_2018/HLA_v2_features_row_names.txt"
HLA_v2_features_row_names = pd.read_csv(row_names_path, sep=" ", header=None)
HLA_v2_features_row_names.columns = ["feature", "hla"]
HLA_v2_features_row_names.shape
# (215, 2)
HLA_v2_features_row_names.head(6)

Unnamed: 0,feature,hla
0,feature:,HLA-DPAB*02:01_04:01
1,feature:,HLA-DQAB*05:05_06:04
2,feature:,HLA-B*08:01
3,feature:,HLA-A*24:02
4,feature:,HLA-A*24:03
5,feature:,HLA-B*38:02


In [3]:
# Keep only the HLA-II entries, recognised by their 7-character name prefix.
HLA_II_v2_pairs = [
    name
    for name in HLA_v2_features_row_names.hla.tolist()
    if name.startswith(("HLA-DPA", "HLA-DQA", "HLA-DRD", "HLA-DRB"))
]

In [4]:
# Haplotype entries carry three "_"-separated fields, i.e. at least two underscores.
HLA_II_v2_5DRDQ = [name for name in HLA_II_v2_pairs if name.count("_") >= 2]
HLA_II_v2_5DRDQ

['HLA-DRDQ*10:01_01:05_05:01',
 'HLA-DRDQ*03:01_05:01_02:01',
 'HLA-DRDQ*13:01_01:03_06:03',
 'HLA-DRDQ*15:01_01:02_06:02',
 'HLA-DRDQ*09:01_03:02_03:03']

In [5]:
# Split every haplotype name (after its 9-character "HLA-DRDQ*" prefix) at the
# first underscore: the head becomes a DRB1 item, the remainder a DQAB pair.
drdq_fields = [name[9:].partition("_") for name in HLA_II_v2_5DRDQ]
HLA_II_v2_5DRDQ_DRB = ["HLA-DRB1*" + head for head, _, _ in drdq_fields]
HLA_II_v2_5DRDQ_DQAB = ["HLA-DQAB*" + rest for _, _, rest in drdq_fields]

In [7]:
# Add the items obtained by splitting the haplotypes, drop the haplotype
# entries themselves, and de-duplicate through a set.
split_haplotype_items = HLA_II_v2_5DRDQ_DRB + HLA_II_v2_5DRDQ_DQAB
all_item_names = set(HLA_II_v2_pairs + split_haplotype_items)
HLA_II_complete = list(all_item_names.difference(set(HLA_II_v2_5DRDQ)))
len(HLA_II_complete)
# 135
HLA_II_complete[:6]

['HLA-DQAB*01:02_06:04',
 'HLA-DQAB*05:05_03:03',
 'HLA-DQAB*01:02_03:02',
 'HLA-DQAB*05:05_02:02',
 'HLA-DQAB*05:05_03:01',
 'HLA-DPAB*02:01_17:01']

In [8]:
# Number of unique HLA-II items once the haplotypes have been split (135 expected)
len(HLA_II_complete)

135

In [None]:
# Read the two extended pseudo sequence dictionaries (first row is the header):
# one table for the HLA-II A alleles and one for the HLA-II B alleles.
alpha_dict_path = "../../data/intermediate_data/t4_HLA_II_v2_alpha_pseudo_22_dict.csv"
beta_dict_path = "../../data/intermediate_data/t4_HLA_II_v2_beta_pseudo_23_dict.csv"

pseudo_a = pd.read_csv(alpha_dict_path, header=0)
pseudo_a.shape
# (17, 2)
pseudo_b = pd.read_csv(beta_dict_path, header=0)
pseudo_b.shape
# (62, 2)

In [20]:
# Turn each table into an allele -> pseudo sequence lookup.
# Both remain defaultdict(str), so an unknown allele still maps to "".
pseudo_a_dict = defaultdict(str, zip(pseudo_a.allele.tolist(), pseudo_a.seq.tolist()))
pseudo_b_dict = defaultdict(str, zip(pseudo_b.allele.tolist(), pseudo_b.seq.tolist()))

len(pseudo_a_dict)
# 17
len(pseudo_b_dict)
# 62

In [30]:
# Build, for every HLA-II item, the names of its alpha and beta alleles and the
# combined pseudo sequence (alpha pseudo sequence followed by beta pseudo sequence).
#
# Unlike a bare lookup in the defaultdict(str) dictionaries, this cell fails
# loudly when an item has an unexpected prefix or when one of its alleles has
# no pseudo sequence, instead of silently storing a truncated/empty sequence
# or stopping half-way through the list.

# these first two lists hold the corresponding A/B alleles of each HLA-II pair
HLA_II_alpha_list = []
HLA_II_beta_list = []
# this list holds the A/B allele names put together, just as a sanity check
HLA_II_rec_names = []
# the dictionary below holds the combined pseudo sequences
comb_seq_dict = defaultdict(str)

# 8-letter item prefix -> (alpha gene, beta gene) for items that name both alleles
pair_prefix_to_genes = {
    "HLA-DQAB": ("DQA1", "DQB1"),
    "HLA-DPAB": ("DPA1", "DPB1"),
}

# separate the HLA-II pairs into two alleles each
# translate them into the names in file "../../data/intermediate_data/pseudosequence_2016_all_X.dat"
for item in HLA_II_complete:
    prefix = item[:8]
    # everything after the 8-letter prefix and the "*" separator
    allele_fields = item[9:]

    if prefix in pair_prefix_to_genes:
        alpha_gene, beta_gene = pair_prefix_to_genes[prefix]
        fields = allele_fields.split("_")
        if len(fields) < 2:
            raise ValueError("expected '<alpha>_<beta>' after the prefix, got: " + item)
        item_1 = alpha_gene + "*" + fields[0]
        item_2 = beta_gene + "*" + fields[1]
    elif prefix == "HLA-DRB1":
        # DRB1 items only name the beta allele; the alpha chain is the single "DRA" entry
        item_1 = "DRA"
        item_2 = "DRB1" + "*" + allele_fields
    else:
        raise ValueError("error found, first eight letters exception: " + item)

    # membership tests do not insert keys into the defaultdicts
    if item_1 not in pseudo_a_dict:
        raise KeyError("no alpha pseudo sequence for " + item_1 + " (from " + item + ")")
    if item_2 not in pseudo_b_dict:
        raise KeyError("no beta pseudo sequence for " + item_2 + " (from " + item + ")")

    HLA_II_alpha_list.append(item_1)
    HLA_II_beta_list.append(item_2)
    HLA_II_rec_names.append(item_1 + '-' + item_2)
    comb_seq_dict[item] = pseudo_a_dict[item_1] + pseudo_b_dict[item_2]

In [32]:
#sanity_check_df = pd.DataFrame(list(zip(HLA_II_complete, HLA_II_alpha_list, \
#                                        HLA_II_beta_list, HLA_II_rec_names)),\
#                               columns = ["ori_name", "a_name", "b_name", "rec_name"])

In [35]:
# Distribution of combined pseudo sequence lengths (one entry per HLA-II item)
Counter(map(len, comb_seq_dict.values()))

Counter({45: 135})

In [44]:
# Order the item names alphabetically (in place) and collect their combined
# pseudo sequences in that same order.
HLA_II_complete.sort()
comb_seqs_list = [comb_seq_dict[name] for name in HLA_II_complete]

# Two-column table: item name and its combined pseudo sequence
df_combine_seqs = pd.DataFrame({"hla": HLA_II_complete, "seq": comb_seqs_list})

df_combine_seqs.nunique()

hla    135
seq    111
dtype: int64

In [47]:
# Write the combined pseudo sequences to disk without the row index
output_csv_path = "../../data/for_encoders/HLA_II_pseudo_45.csv"
df_combine_seqs.to_csv(output_csv_path, index=False)

In [46]:
# Length distribution of the sequences stored in the output table
Counter(map(len, df_combine_seqs["seq"].tolist()))

Counter({45: 135})